In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pickle
import os

# ---------------------------------------------------------------------------
# Standardize the numeric feature columns of the cleaned dataset, then persist
# both the fitted scaler and the scaled table.
# ---------------------------------------------------------------------------

# Locations used by this cell.
cleaned_csv = '../../data/processed/cleaned_data.csv'
scaler_dir = '../src/scalers'
scaler_pkl = '../src/scalers/scaler.pkl'
processed_dir = '../../data/processed/'
scaled_csv = '../../data/processed/scaled_data.csv'

# Step 1 - read the cleaned table.
df = pd.read_csv(cleaned_csv)

# Step 2 - collect every int64/float64 column, then remove the target column
# so that only the features get rescaled.
numeric_index = df.select_dtypes(include=['int64', 'float64']).columns
numeric_cols = numeric_index.drop('median_house_value').tolist()
print("Numeric columns:", numeric_cols)

# Step 3 - fit a StandardScaler on the selected columns and write the
# transformed values back into the same columns of `df`.
# NOTE(review): the scaler is fit on the entire frame loaded above; if a
# train/test split is performed later in the pipeline, confirm that fitting
# on all rows is intended.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numeric_cols])
df[numeric_cols] = scaled_values

print("Numeric columns after scaling:\n", df[numeric_cols].head())

# Step 4 - pickle the fitted scaler (creating its folder when missing).
os.makedirs(scaler_dir, exist_ok=True)
with open(scaler_pkl, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

print("Scaler saved successfully!")

# Step 5 - write the scaled table next to the cleaned one, without the index.
os.makedirs(processed_dir, exist_ok=True)
df.to_csv(scaled_csv, index=False)
print("Scaled data saved successfully!")


Numeric columns: ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']
Numeric columns after scaling:
    longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0  -1.327835  1.052548            0.982143    -1.117285       -1.309916   
1  -1.322844  1.043185           -0.607019     2.329936        2.128690   
2  -1.332827  1.038503            1.856182    -0.697327       -1.095223   
3  -1.337818  1.038503            1.856182    -0.835405       -0.936843   
4  -1.337818  1.038503            1.856182    -0.582857       -0.778463   

   population  households  median_income  
0   -1.325821   -1.291972       2.541006  
1    1.389936    2.348314       2.541006  
2   -1.098528   -1.099883       2.085156  
3   -1.017539   -0.941691       1.111288  
4   -1.008395   -0.791033       0.027262  
Scaler saved successfully!
Scaled data saved successfully!
