# Data Manipulation with Pandas

In [6]:
import pandas as pd
import numpy as np

# Step 1: Build a small housing dataset that deliberately contains gaps
np.random.seed(42)  # Seeds NumPy's global RNG; note that nothing in this cell draws random numbers

# Ten listings described by five columns; np.nan marks a missing entry.
data = dict(
    ID=range(1, 11),
    Price=[3000000, 4500000, np.nan, 5500000, 5000000, np.nan, 6200000, 4700000, 5100000, np.nan],
    Bedrooms=[3, 4, 3, np.nan, 4, 2, 5, 3, np.nan, 3],
    Bathrooms=[2, 3, 2, 2, 3, np.nan, 3, 2, 2, 3],
)
# Added separately because the column name contains a space and so cannot be a keyword argument.
data['Living Area'] = [1200, 1500, 1400, 1600, 1800, 1100, 1900, 1500, 1400, 1300]

df = pd.DataFrame(data=data)


In [7]:

# Step 2: Show the whole frame, NaN entries included, beneath a heading
print("Original dataset with missing values:", df, sep="\n")

Original dataset with missing values:
   ID      Price  Bedrooms  Bathrooms  Living Area
0   1  3000000.0       3.0        2.0         1200
1   2  4500000.0       4.0        3.0         1500
2   3        NaN       3.0        2.0         1400
3   4  5500000.0       NaN        2.0         1600
4   5  5000000.0       4.0        3.0         1800
5   6        NaN       2.0        NaN         1100
6   7  6200000.0       5.0        3.0         1900
7   8  4700000.0       3.0        2.0         1500
8   9  5100000.0       NaN        2.0         1400
9  10        NaN       3.0        3.0         1300


In [8]:

# Step 3: Count the missing entries in each column
# isna() flags every NaN cell; summing those flags column-wise yields the per-column count.
print("\nMissing values in each column:", df.isna().sum(), sep="\n")



Missing values in each column:
ID             0
Price          3
Bedrooms       2
Bathrooms      1
Living Area    0
dtype: int64


In [9]:

# Step 4: Handle missing values
# Both options return NEW frames; `df` itself is not modified, so the cells
# that follow still work on the original data, NaNs included.

# Option 1: Drop every row that contains at least one missing value
df_cleaned_dropna = df.dropna()

# Option 2: Fill each missing value with the mean of its own column.
# numeric_only=True limits the means to numeric columns, so this line keeps
# working (rather than raising TypeError on pandas 2.x) if a text or date
# column is ever added to `df`; such columns are simply left unfilled.
# Note: a column mean need not be a whole number, so count-like columns such
# as 'Bedrooms' can receive fractional fill values.
df_cleaned_fillna = df.fillna(df.mean(numeric_only=True))


In [10]:

# Step 5: Filtering data
# Keep only the houses priced strictly above ₹50 Lakhs (5,000,000).
# A NaN price never satisfies the comparison, so rows without a price are excluded as well.
df_filtered = df.loc[df['Price'].gt(5_000_000)]
print("\nFiltered houses with price > 50 Lakhs:", df_filtered, sep="\n")



Filtered houses with price > 50 Lakhs:
   ID      Price  Bedrooms  Bathrooms  Living Area
3   4  5500000.0       NaN        2.0         1600
6   7  6200000.0       5.0        3.0         1900
8   9  5100000.0       NaN        2.0         1400


In [11]:

# Step 6: Average price for each bedroom count
# Rows with a missing 'Bedrooms' value form no group; a group whose prices are
# all NaN reports NaN as its mean.
df_bedrooms_group = (
    df.groupby(by='Bedrooms')['Price']
    .agg('mean')
    .reset_index()  # turn the 'Bedrooms' index back into an ordinary column
)
print("\nAverage price by number of bedrooms:", df_bedrooms_group, sep="\n")



Average price by number of bedrooms:
   Bedrooms      Price
0       2.0        NaN
1       3.0  3850000.0
2       4.0  4750000.0
3       5.0  6200000.0


In [12]:


# Step 7: Average price for each bathroom count
# Only the two relevant columns are selected before grouping, so the mean is
# computed for 'Price' alone; reset_index() restores 'Bathrooms' as a column.
df_bathrooms_group = df[['Bathrooms', 'Price']].groupby('Bathrooms').mean().reset_index()
print("\nAverage price by number of bathrooms:", df_bathrooms_group, sep="\n")



Average price by number of bathrooms:
   Bathrooms         Price
0        2.0  4.575000e+06
1        3.0  5.233333e+06


In [13]:

# Step 8: Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column
# The 'count' row excludes NaN entries, so it differs between columns.
print("\nSummary statistics for the dataset:", df.describe(), sep="\n")



Summary statistics for the dataset:
             ID         Price  Bedrooms  Bathrooms  Living Area
count  10.00000  7.000000e+00  8.000000   9.000000    10.000000
mean    5.50000  4.857143e+06  3.375000   2.444444  1470.000000
std     3.02765  9.913915e+05  0.916125   0.527046   249.666444
min     1.00000  3.000000e+06  2.000000   2.000000  1100.000000
25%     3.25000  4.600000e+06  3.000000   2.000000  1325.000000
50%     5.50000  5.000000e+06  3.000000   2.000000  1450.000000
75%     7.75000  5.300000e+06  4.000000   3.000000  1575.000000
max    10.00000  6.200000e+06  5.000000   3.000000  1900.000000


In [14]:

# Step 9: Write both cleaned frames to CSV files in the current working directory
# index=False leaves the row index out of each file.
for csv_name, frame in (
    ('cleaned_data_dropna.csv', df_cleaned_dropna),
    ('cleaned_data_fillna.csv', df_cleaned_fillna),
):
    frame.to_csv(csv_name, index=False)

print("\nCleaned data saved as 'cleaned_data_dropna.csv' and 'cleaned_data_fillna.csv'")



Cleaned data saved as 'cleaned_data_dropna.csv' and 'cleaned_data_fillna.csv'
