In [1]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import os

# Load scaled data
df = pd.read_csv('../../data/processed/scaled_data.csv')

# Separate numeric columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-1.327835,1.052548,0.982143,-1.117285,-1.309916,-1.325821,-1.291972,2.541006,2.129631,NEAR BAY
1,-1.322844,1.043185,-0.607019,2.329936,2.12869,1.389936,2.348314,2.541006,1.314156,NEAR BAY
2,-1.332827,1.038503,1.856182,-0.697327,-1.095223,-1.098528,-1.099883,2.085156,1.258693,NEAR BAY
3,-1.337818,1.038503,1.856182,-0.835405,-0.936843,-1.017539,-0.941691,1.111288,1.1651,NEAR BAY
4,-1.337818,1.038503,1.856182,-0.582857,-0.778463,-1.008395,-0.791033,0.027262,1.1729,NEAR BAY


In [2]:
# Create polynomial features (degree=2)
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
df_poly = poly.fit_transform(df[numeric_cols])

print("Original shape:", df[numeric_cols].shape)
print("Polynomial features shape:", df_poly.shape)

Original shape: (20640, 9)
Polynomial features shape: (20640, 54)


In [3]:
skewed_cols = df[numeric_cols].apply(lambda x: x.skew()).sort_values(ascending=False)
skewed_cols = skewed_cols[abs(skewed_cols) > 0.5].index.tolist()  # threshold skew > 0.5

for col in skewed_cols:
    df[col] = df[col].apply(lambda x: np.log1p(x))  # log(1+x) to handle zeros

print("Applied log transformation to skewed columns:", skewed_cols)

Applied log transformation to skewed columns: ['median_house_value', 'total_bedrooms', 'households', 'total_rooms', 'population', 'median_income']


  df[col] = df[col].apply(lambda x: np.log1p(x))  # log(1+x) to handle zeros


In [4]:
os.makedirs('../../data/processed', exist_ok=True)

df.to_csv('../../data/processed/transformed_data.csv', index=False)
print("Transformed dataset saved successfully!")

Transformed dataset saved successfully!
