In [1]:
pip install numpy pandas matplotlib seaborn scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

In [3]:
df = pd.read_csv("data/dataset.csv")
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
print("=== DATASET OVERVIEW ===")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

=== DATASET OVERVIEW ===
Shape: (20640, 10)
Columns: ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value', 'ocean_proximity']


In [5]:
print("\n=== DATA TYPES ===")
print(df.dtypes)


=== DATA TYPES ===
longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object


In [6]:
print("\n=== MISSING VALUES ===")
print(df.isnull().sum())


=== MISSING VALUES ===
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64


In [7]:
print(f"\nMissing values percentage:")
print((df.isnull().sum() / len(df) * 100).round(2))


Missing values percentage:
longitude             0.0
latitude              0.0
housing_median_age    0.0
total_rooms           0.0
total_bedrooms        1.0
population            0.0
households            0.0
median_income         0.0
median_house_value    0.0
ocean_proximity       0.0
dtype: float64


In [8]:
print("\n=== DUPLICATE ROWS ===")
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")


=== DUPLICATE ROWS ===
Number of duplicate rows: 0


In [9]:
print("\n=== FIRST 5 ROWS ===")
print(df.head())

print("\n=== STATISTICAL SUMMARY ===")
print(df.describe())


=== FIRST 5 ROWS ===
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  

=== STATISTICAL SUMMARY ===
          longitude     

In [10]:
# Check how many missing values in total_bedrooms
missing_count = df['total_bedrooms'].isnull().sum()
print(f"Missing values in total_bedrooms: {missing_count}")

Missing values in total_bedrooms: 207


In [11]:
# Method A: Median imputation (simplest)
median_bedrooms = df['total_bedrooms'].median()
df['total_bedrooms_median'] = df['total_bedrooms'].fillna(median_bedrooms)
# Replace the original total_bedrooms with median-imputed values
df['total_bedrooms'] = df['total_bedrooms_median']
# Drop the temporary column (optional, for cleanup)
df.drop('total_bedrooms_median', axis=1, inplace=True)

In [12]:
# Verify imputation worked
print(f"Missing values after imputation: {df['total_bedrooms'].isnull().sum()}")
print(f"Total records: {len(df)}")
print(f"Median value used: {median_bedrooms}")


Missing values after imputation: 0
Total records: 20640
Median value used: 435.0


In [13]:
# Check the distribution before and after (if you want to compare)
print("=== IMPUTATION VALIDATION ===")
print("Total_bedrooms statistics after imputation:")
print(df['total_bedrooms'].describe())

# Check if any missing values remain
print(f"\nFinal missing values check across all columns:")
print(df.isnull().sum())

=== IMPUTATION VALIDATION ===
Total_bedrooms statistics after imputation:
count    20640.000000
mean       536.838857
std        419.391878
min          1.000000
25%        297.000000
50%        435.000000
75%        643.250000
max       6445.000000
Name: total_bedrooms, dtype: float64

Final missing values check across all columns:
longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64


In [14]:
# Save the dataset with imputed values
df.to_csv('data/dataset_imputed.csv', index=False)
print("✅ Dataset saved as 'dataset_imputed.csv'")

✅ Dataset saved as 'dataset_imputed.csv'


## Preprocessing: impute remaining missings, encode categoricals, scale numerics, and save processed dataset

In [None]:
# Load the imputed dataset and perform preprocessing
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

df_proc = pd.read_csv('data/dataset_imputed.csv')
print('Initial missing values (per column):')
print(df_proc.isnull().sum())

# Identify numeric and categorical columns
num_cols = df_proc.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df_proc.select_dtypes(include=['object','category']).columns.tolist()
print(f'Numeric cols: {num_cols}')
print(f'Categorical cols: {cat_cols}')

# Impute numeric columns with median if any missing remain
if df_proc[num_cols].isnull().any().any():
    imp = SimpleImputer(strategy='median')
    df_proc[num_cols] = imp.fit_transform(df_proc[num_cols])
    print('Applied median imputation to numeric columns')
else:
    print('No numeric missing values to impute')

# Encode categorical columns (one-hot)
if len(cat_cols) > 0:
    df_proc = pd.get_dummies(df_proc, columns=cat_cols, drop_first  =True)
    print('Applied one-hot encoding to categorical columns')
else:
    print('No categorical columns to encode')

# Scale numeric features
scaler = StandardScaler()
df_proc[num_cols] = scaler.fit_transform(df_proc[num_cols])
print('Scaled numeric features with StandardScaler')

# Save processed dataset
df_proc.to_csv('data/dataset_processed.csv', index=False)
print('Saved processed dataset to data/dataset_processed.csv')
print('Final shape:', df_proc.shape)

Initial missing values (per column):
longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64
Numeric cols: ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']
Categorical cols: ['ocean_proximity']
No numeric missing values to impute
Applied one-hot encoding to categorical columns
Scaled numeric features with StandardScaler
Saved processed dataset to data/dataset_processed.csv
Final shape: (20640, 13)


In [16]:
# Quick peek at processed data
import pandas as pd
df_check = pd.read_csv('data/dataset_processed.csv')
print(df_check.head())
print('Missing values after processing (per column):')
print(df_check.isnull().sum())

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0  -1.327835  1.052548            0.982143    -0.804819       -0.972476   
1  -1.322844  1.043185           -0.607019     2.045890        1.357143   
2  -1.332827  1.038503            1.856182    -0.535746       -0.827024   
3  -1.337818  1.038503            1.856182    -0.624215       -0.719723   
4  -1.337818  1.038503            1.856182    -0.462404       -0.612423   

   population  households  median_income  median_house_value  \
0   -0.974429   -0.977033       2.344766            2.129631   
1    0.861439    1.669961       2.332238            1.314156   
2   -0.820777   -0.843637       1.782699            1.258693   
3   -0.766028   -0.733781       0.932968            1.165100   
4   -0.759847   -0.629157      -0.012881            1.172900   

   ocean_proximity_INLAND  ocean_proximity_ISLAND  ocean_proximity_NEAR BAY  \
0                   False                   False                      True   
1     