In [2]:
import pandas as pd
import numpy as np

# Step 1: Load a dataset into a Pandas DataFrame
# For the purpose of this example, let's create a sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', None, 'Hannah', 'Isaac', 'Jack'],
    'Age': [25, np.nan, 30, 22, np.nan, 35, 28, 26, 40, 21],
    'City': ['New York', 'Los Angeles', None, 'Chicago', 'Miami', 'New York', 'Chicago', None, 'Miami', 'Los Angeles'],
    'Salary': [70000, 80000, np.nan, 62000, 70000, None, 58000, 85000, 90000, 52000]
}
df = pd.DataFrame(data)

In [3]:
df.head()

Unnamed: 0,Name,Age,City,Salary
0,Alice,25.0,New York,70000.0
1,Bob,,Los Angeles,80000.0
2,Charlie,30.0,,
3,David,22.0,Chicago,62000.0
4,Eva,,Miami,70000.0


In [4]:
df['Age'].fillna(df['Age'].mean(),inplace=True)
df['Salary'].fillna(df['Salary'].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].mean(),inplace=True)


In [6]:
df.drop(columns='City',inplace=True)

In [7]:
df.head()

Unnamed: 0,Name,Age,Salary
0,Alice,25.0,70000.0
1,Bob,28.375,80000.0
2,Charlie,30.0,70875.0
3,David,22.0,62000.0
4,Eva,28.375,70000.0


In [8]:
grouped = df.groupby('Name').agg({
'Age':'mean',
'Salary':'sum'
}).reset_index()

In [9]:
summary_stats = df.describe()

In [10]:
# Step 5: Display the first 10 rows of the DataFrame
print("First 10 rows of the DataFrame:")
print(df.head(10))

# Display grouped data
print("\nGrouped Data:")
print(grouped)

# Display summary statistics
print("\nSummary Statistics:")
print(summary_stats)

# Basic data manipulation
# Sorting by Salary in descending order
sorted_df = df.sort_values(by='Salary', ascending=False)

# Filtering data where Age is greater than 25
filtered_df = df[df['Age'] > 25]

print("\nSorted DataFrame by Salary:")
print(sorted_df)

print("\nFiltered DataFrame (Age > 25):")
print(filtered_df)

First 10 rows of the DataFrame:
      Name     Age   Salary
0    Alice  25.000  70000.0
1      Bob  28.375  80000.0
2  Charlie  30.000  70875.0
3    David  22.000  62000.0
4      Eva  28.375  70000.0
5    Frank  35.000  70875.0
6     None  28.000  58000.0
7   Hannah  26.000  85000.0
8    Isaac  40.000  90000.0
9     Jack  21.000  52000.0

Grouped Data:
      Name     Age   Salary
0    Alice  25.000  70000.0
1      Bob  28.375  80000.0
2  Charlie  30.000  70875.0
3    David  22.000  62000.0
4      Eva  28.375  70000.0
5    Frank  35.000  70875.0
6   Hannah  26.000  85000.0
7    Isaac  40.000  90000.0
8     Jack  21.000  52000.0

Summary Statistics:
             Age        Salary
count  10.000000     10.000000
mean   28.375000  70875.000000
std     5.714261  11789.237088
min    21.000000  52000.000000
25%    25.250000  64000.000000
50%    28.187500  70437.500000
75%    29.593750  77718.750000
max    40.000000  90000.000000

Sorted DataFrame by Salary:
      Name     Age   Salary
8    Isa