In [1]:
# Dataset is already in repo folder 'Data'

import pandas as pd

# Load the raw dataset and normalise every column header to lower case
raw = pd.read_csv("../Data/00 - raw.csv").rename(columns=str.lower)
raw.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [2]:
# One-hot encode 'rad', then replace the original column with its indicator columns
encoded = pd.get_dummies(raw["rad"], dtype=int)
unencoded = raw.drop(columns="rad")

raw_enc = pd.concat([unencoded, encoded], axis=1)

# The dummy headers are numeric; cast all headers to str because numeric ones
# will throw an error during GridSearch later
raw_enc.columns = raw_enc.columns.astype(str)

# Just to be sure: discard any fully duplicated rows
raw_enc = raw_enc.drop_duplicates()
raw_enc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 22 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   tax      506 non-null    float64
 9   ptratio  506 non-null    float64
 10  b        506 non-null    float64
 11  lstat    506 non-null    float64
 12  medv     506 non-null    float64
 13  1        506 non-null    int64  
 14  2        506 non-null    int64  
 15  3        506 non-null    int64  
 16  4        506 non-null    int64  
 17  5        506 non-null    int64  
 18  6        506 non-null    int64  
 19  7        506 non-null    int64  
 20  8        506 non-null    int64  
 21  24       506 non

In [3]:
# Summary statistics, one row per feature
raw_enc.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
crim,506.0,3.613524,8.601545,0.00632,0.082045,0.25651,3.677083,88.9762
zn,506.0,11.363636,23.322453,0.0,0.0,0.0,12.5,100.0
indus,506.0,11.136779,6.860353,0.46,5.19,9.69,18.1,27.74
chas,506.0,0.06917,0.253994,0.0,0.0,0.0,0.0,1.0
nox,506.0,0.554695,0.115878,0.385,0.449,0.538,0.624,0.871
rm,506.0,6.284634,0.702617,3.561,5.8855,6.2085,6.6235,8.78
age,506.0,68.574901,28.148861,2.9,45.025,77.5,94.075,100.0
dis,506.0,3.795043,2.10571,1.1296,2.100175,3.20745,5.188425,12.1265
tax,506.0,408.237154,168.537116,187.0,279.0,330.0,666.0,711.0
ptratio,506.0,18.455534,2.164946,12.6,17.4,19.05,20.2,22.0


In [4]:
# 'age' is a proportion according to the data dictionary, but it is stored as a
# percentage (0-100) rather than a decimal (0-1).
# Derive the rescaled column from the untouched `raw` frame instead of dividing
# raw_enc['age'] by itself, so re-running this cell can never divide twice.
# The assignment aligns on the index, so only rows still present in raw_enc are filled.
raw_enc["age"] = raw["age"] / 100
raw_enc["age"].describe()

count    506.000000
mean       0.685749
std        0.281489
min        0.029000
25%        0.450250
50%        0.775000
75%        0.940750
max        1.000000
Name: age, dtype: float64

In [11]:
# Subset data into X, y then train/test split
from sklearn.model_selection import train_test_split

# Features are every column except the target 'medv'; the target is 'medv' itself
X, y = raw_enc.drop(columns="medv"), raw_enc["medv"]

# Hold out 20% of the rows for testing; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Log-transform the extremely non-linear features first (before scaling)
from sklearn.preprocessing import FunctionTransformer
import numpy as np

# Features that get a natural-log transform before scaling
log_trans = ["b", "tax"]

# Take the log of the untouched values in `X` (the same rows as X_train) rather than
# of X_train itself: X_train is overwritten here, so logging it in place would apply
# the transform a second time whenever this cell is re-run.
X_train[log_trans] = FunctionTransformer(np.log).fit_transform(X.loc[X_train.index, log_trans])

X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
crim,404.0,3.609125,8.875058,0.00906,0.081437,0.26139,3.202962,88.9762
zn,404.0,11.569307,23.152481,0.0,0.0,0.0,20.0,100.0
indus,404.0,10.98505,6.894618,0.74,5.13,8.56,18.1,27.74
chas,404.0,0.071782,0.258447,0.0,0.0,0.0,0.0,1.0
nox,404.0,0.556484,0.117704,0.385,0.452,0.538,0.631,0.871
rm,404.0,6.315891,0.709452,3.863,5.8905,6.21,6.63675,8.78
age,404.0,0.685564,0.279949,0.029,0.4555,0.777,0.9365,1.0
dis,404.0,3.808195,2.131226,1.1296,2.087875,3.17575,5.4008,12.1265
tax,404.0,5.922294,0.392908,5.231109,5.631212,5.799093,6.50129,6.566672
ptratio,404.0,18.318317,2.228701,12.6,16.8,18.7,20.2,22.0


In [13]:
# Choose the columns that will be scaled.
# Every input set gets transformed, but the scaler is fit on X_train only.
# Only the float (measurement) columns are scaled, with 'age' left out.
nums = list(X_train.select_dtypes(include=float).columns)
nums.remove("age")
X_train[nums].info()

<class 'pandas.core.frame.DataFrame'>
Index: 404 entries, 477 to 102
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     404 non-null    float64
 1   zn       404 non-null    float64
 2   indus    404 non-null    float64
 3   nox      404 non-null    float64
 4   rm       404 non-null    float64
 5   dis      404 non-null    float64
 6   tax      404 non-null    float64
 7   ptratio  404 non-null    float64
 8   b        404 non-null    float64
 9   lstat    404 non-null    float64
dtypes: float64(10)
memory usage: 34.7 KB


In [14]:
# Verify which features are being excluded from scaling:
# every non-float column, plus 'age'
non_nums = [*X_train.select_dtypes(exclude=float).columns, "age"]
X_train[non_nums].info()

<class 'pandas.core.frame.DataFrame'>
Index: 404 entries, 477 to 102
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   chas    404 non-null    int64  
 1   1       404 non-null    int64  
 2   2       404 non-null    int64  
 3   3       404 non-null    int64  
 4   4       404 non-null    int64  
 5   5       404 non-null    int64  
 6   6       404 non-null    int64  
 7   7       404 non-null    int64  
 8   8       404 non-null    int64  
 9   24      404 non-null    int64  
 10  age     404 non-null    float64
dtypes: float64(1), int64(10)
memory usage: 37.9 KB


In [15]:
# Scale X features that are float, since all ints are boolean
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the numeric training columns only; test data is transformed, never fit
scaler = StandardScaler().fit(X_train[nums])

# Work on copies so X_train / X_test keep their unscaled values
scaled_X_train = X_train.copy()
scaled_X_train[nums] = scaler.transform(X_train[nums])

# X_test has not been log-transformed yet, so apply the log step before scaling it
scaled_X_test = X_test.copy()
scaled_X_test[log_trans] = FunctionTransformer(np.log).transform(X_test[log_trans])
scaled_X_test[nums] = scaler.transform(scaled_X_test[nums])

scaled_X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
crim,404.0,-2.6381540000000003e-17,1.00124,-0.406141,-0.397976,-0.377675,-0.045821,9.630689
zn,404.0,0.0,1.00124,-0.50032,-0.50032,-0.50032,0.364589,3.824227
indus,404.0,-4.177077e-17,1.00124,-1.487791,-0.850273,-0.352167,1.033237,2.433163
chas,404.0,0.07178218,0.258447,0.0,0.0,0.0,0.0,1.0
nox,404.0,-5.111423e-16,1.00124,-1.458711,-0.888783,-0.157233,0.633861,2.675394
rm,404.0,-2.616169e-16,1.00124,-3.461733,-0.600349,-0.149443,0.452824,3.477564
age,404.0,0.6855644,0.279949,0.029,0.4555,0.777,0.9365,1.0
dis,404.0,5.2488270000000007e-17,1.00124,-1.258391,-0.808198,-0.29712,0.748198,3.9079
tax,404.0,2.782703e-15,1.00124,-1.761337,-0.741761,-0.313953,1.475444,1.642057
ptratio,404.0,8.519038e-16,1.00124,-2.568944,-0.682101,0.17147,0.845343,1.65399


In [16]:
# Export all datasets
data_path = "../Data/"

# Map each output file name to the object written to it
exports = {
    "X_train.csv": scaled_X_train,
    "X_test.csv": scaled_X_test,
    "y_train.csv": y_train,
    "y_test.csv": y_test,
}

# Write every dataset without its index column
for filename, dataset in exports.items():
    dataset.to_csv(data_path + filename, index=False)