In [2]:
import os
import pickle

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Read the cleaned dataset from disk. The path is relative to the directory
# the notebook kernel is running in.
CLEANED_DATA_CSV = '../../data/processed/cleaned_data.csv'
df = pd.read_csv(CLEANED_DATA_CSV)

# Last expression of the cell: preview the first five rows.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.013025,452600.0,NEAR BAY
1,-122.22,37.86,21.0,5698.375,1106.0,2401.0,1092.5,8.013025,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
# Partition the frame's columns by dtype.
#   categorical_cols: object / pandas-categorical columns (to be encoded)
#   numeric_cols:     int64 / float64 columns only (other numeric widths
#                     such as int32 or float32 are not selected)
categorical_cols = df.select_dtypes(
    include=['object', 'category']
).columns.to_list()
numeric_cols = df.select_dtypes(
    include=['int64', 'float64']
).columns.to_list()

# Report both groups so the split can be checked by eye.
print("Categorical Columns:", categorical_cols)
print("Numeric Columns:", numeric_cols)

Categorical Columns: ['ocean_proximity']
Numeric Columns: ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']


In [5]:
# Build the encoding step.
#   drop='first'        -> omit one dummy per categorical feature
#                          (avoids the dummy variable trap)
#   sparse_output=False -> produce a dense array instead of a sparse matrix
one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)

# Only the categorical columns are transformed; every other column in the
# frame is passed through unchanged.
# NOTE(review): `df` is fitted as-is, so the pass-through set includes
# `median_house_value` (it is listed among the numeric columns above). The
# fitted transformer will likely expect that column again at transform time;
# confirm this is intended before reusing the saved object on feature-only data.
preprocessor = ColumnTransformer(
    [('cat', one_hot_encoder, categorical_cols)],
    remainder='passthrough',
)

# Fit on the full frame and encode it in one pass.
X_encoded = preprocessor.fit_transform(df)
print("Shape after encoding:", X_encoded.shape)

Shape after encoding: (20640, 13)


In [6]:

# Persist the fitted preprocessor to disk so it can be reloaded elsewhere.
# `os` comes from the notebook's top import cell rather than being imported
# here, so all dependencies are visible in one place.
#
# The output directory is defined once and the file path is derived from it,
# so the two cannot drift apart. The resulting path string is exactly
# '../src/encoders/encoder.pkl'.
# NOTE(review): the dataset is read from '../../data/...' while the encoder is
# written under '../src/...'; confirm both relative roots are intended.
ENCODER_DIR = '../src/encoders'
ENCODER_PATH = f'{ENCODER_DIR}/encoder.pkl'

# exist_ok=True makes re-running this cell safe when the directory exists.
os.makedirs(ENCODER_DIR, exist_ok=True)

# Binary mode is required for pickle. The pickle should be loaded with a
# compatible scikit-learn version.
with open(ENCODER_PATH, 'wb') as f:
    pickle.dump(preprocessor, f)

print("Encoder saved successfully!")


Encoder saved successfully!
