In [11]:
# All pickles read or written by this notebook share one directory.
_PROCESSED_DIR = "../data/processed"

# input: pickled frame loaded at the start of the notebook
DF_PATH = _PROCESSED_DIR + "/02_preprocessed_df.pkl"

# outputs: scaled train / validation features
EXPORT_PATH = _PROCESSED_DIR + "/03_X_train_scaled_df.pkl"
EXPORT_PATH1 = _PROCESSED_DIR + "/04_X_val_scaled_df.pkl"

# outputs: train / validation targets
EXPORT_PATH2 = _PROCESSED_DIR + "/05_y_train_df.pkl"
EXPORT_PATH3 = _PROCESSED_DIR + "/06_y_val_df.pkl"

# outputs: test features, unscaled and scaled
EXPORT_PATH4 = _PROCESSED_DIR + "/07_test_df.pkl"
EXPORT_PATH5 = _PROCESSED_DIR + "/08_test_scaled_df.pkl"


In [12]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


In [13]:
# Read the pickled frame stored at DF_PATH and preview its first five rows.
df = pd.read_pickle(DF_PATH)
df.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_house,bedrooms_ratio,people_per_house,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,5.817352,0.184458,2.547945,0.0,0.0,0.0,1.0,0.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,6.281853,0.172096,2.181467,0.0,0.0,0.0,1.0,0.0
5,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,4.761658,0.231774,2.139896,0.0,0.0,0.0,1.0,0.0
6,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,4.931907,0.192899,2.128405,0.0,0.0,0.0,1.0,0.0
7,-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,4.797527,0.221327,1.788253,0.0,0.0,0.0,1.0,0.0


--------

### Split data

In [14]:
# Hold out 20% of the rows as the test partition (seeded so the split is repeatable).
train, test = train_test_split(df, random_state=42, test_size=0.2)

# Report the shape of each partition, then the column index of each.
for split_name, split_frame in (("train", train), ("test", test)):
    print(f'--> {split_name}: {split_frame.shape}')

for split_frame in (train, test):
    print(split_frame.columns)

# Test features only: the target column is removed.
test_df = test.drop('median_house_value', axis=1)
print(test_df.shape)

--> train: (14011, 17)
--> test: (3503, 17)
Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'rooms_per_house', 'bedrooms_ratio',
       'people_per_house', 'ocean_proximity_<1H OCEAN',
       'ocean_proximity_INLAND', 'ocean_proximity_ISLAND',
       'ocean_proximity_NEAR BAY', 'ocean_proximity_NEAR OCEAN'],
      dtype='object')
Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'rooms_per_house', 'bedrooms_ratio',
       'people_per_house', 'ocean_proximity_<1H OCEAN',
       'ocean_proximity_INLAND', 'ocean_proximity_ISLAND',
       'ocean_proximity_NEAR BAY', 'ocean_proximity_NEAR OCEAN'],
      dtype='object')
(3503, 16)


##### split train set 

So now let's work with the train data and split it into X and y.

In [15]:
# Separate the target column from the predictors of the training partition.
target_column = 'median_house_value'
y = train[target_column]
X = train.drop(columns=[target_column])

In [16]:
# Carve a 20% validation set out of the training data (seeded with 42).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Print the shape of every resulting piece, one per line.
for part_name, part in (
    ('X_train', X_train),
    ('X_val', X_val),
    ('y_train', y_train),
    ('y_val', y_val),
):
    print(f'--> {part_name}: {part.shape}')

--> X_train: (11208, 16)
--> X_val: (2803, 16)
--> y_train: (11208,)
--> y_val: (2803,)


---------

### Feature Scaling

>* Min-max scaling (many people call this normalization) is quite simple: values are shifted and rescaled so that they end up ranging from 0 to 1. We do this by subtracting the min value and dividing by the max minus the min. Scikit-Learn provides a transformer called MinMaxScaler for this. It has a feature_range hyperparameter that lets you change the range if you don’t want 0–1 for some reason.

>* Standardization is quite different: first it subtracts the mean value (so standardized values always have a zero mean), and then it divides by the standard deviation so that the resulting distribution has unit variance.


Unlike min-max scaling, standardization does not bound values to a specific range, which may be a problem for some algorithms (e.g.,
neural networks often expect an input value ranging from 0 to 1). However, standardization is much less affected by outliers.

In [17]:
# Fit the scaler on the training features only, then apply the same fitted
# transform to the validation and test features so no statistics leak from them.
scaler = StandardScaler()

# StandardScaler returns plain NumPy arrays, which drops the column labels and
# the row index. Wrap each result back into a DataFrame so the scaled features
# keep their feature names and stay index-aligned with y_train / y_val.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_val_scaled = pd.DataFrame(
    scaler.transform(X_val),
    columns=X_val.columns,
    index=X_val.index,
)
test_scaled_df = pd.DataFrame(
    scaler.transform(test_df),
    columns=test_df.columns,
    index=test_df.index,
)

---------

### EXPORT


In [18]:

# Pair every object with its destination, then pickle them in order.
exports = (
    # train / validation data
    (X_train_scaled, EXPORT_PATH),
    (X_val_scaled, EXPORT_PATH1),
    (y_train, EXPORT_PATH2),
    (y_val, EXPORT_PATH3),
    # test data
    (test_df, EXPORT_PATH4),
    (test_scaled_df, EXPORT_PATH5),
)

for payload, destination in exports:
    pd.to_pickle(payload, destination)
