<a href="https://colab.research.google.com/github/DeepaliSaini4/Machine-Learning/blob/main/MiissingValueDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Fetch the California Housing CSV directly from the handson-ml2 GitHub repository
url = 'https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv'
df = pd.read_csv(filepath_or_buffer=url)

# Quick look at the data: (rows, columns) first, then the first three records
print(f"Dataset shape: {df.shape}")
print(df.iloc[:3])

# Count missing entries per column and keep only the columns that have any
missing_counts = df.isna().sum()
missing_cols = missing_counts.loc[missing_counts.gt(0)]
print(f"Columns with missing values and counts:\n{missing_cols}")

# Split column names by dtype: numeric columns vs. object (string-like) columns
num_cols = df.select_dtypes(include='number').columns.to_list()
cat_cols = df.select_dtypes(include='object').columns.to_list()

# Impute missing values
# SimpleImputer refuses a selection with zero columns, so each imputation step
# runs only when its column list is non-empty. Both imputer objects are still
# created unconditionally so the names remain available afterwards.

# Numerical columns: median imputation
num_imputer = SimpleImputer(strategy='median')
if num_cols:
    df[num_cols] = num_imputer.fit_transform(df[num_cols])

# Categorical columns: mode imputation
cat_imputer = SimpleImputer(strategy='most_frequent')
if cat_cols:
    df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# Sanity check: total count of missing cells left in the whole frame
missing_after = df.isna().sum().sum()
print(f"Total missing values after imputation: {missing_after}")

# Preview the first three rows of the imputed frame
print(df.iloc[:3])



Dataset shape: (20640, 10)
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
Columns with missing values and counts:
total_bedrooms    207
dtype: int64
Total missing values after imputation: 0
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24   