In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

### 1. Data Preparation

1. Split data into train and test sets ('X' and 'y')
2. Filling/inputing or discarding missing values
3. Converting non-numerical values to numerical values aka feature encoding

In [2]:
heart_disease = pd.read_csv("data/heart-disease.csv")
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
X = heart_disease.drop(["target"], axis = 1)
y = heart_disease["target"]

In [4]:
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [5]:
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

#### 1.1 Making everything numerical

In [8]:
car_sales = pd.read_csv("data/car-sales-extended.csv")
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [9]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [10]:
len(car_sales)

1000

In [11]:
X = car_sales.drop("Price", axis = 1)
y = car_sales["Price"]

In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

converting data to number --- needed because models can't work with strings

In [13]:
one_hot = OneHotEncoder()

categorical_features = ["Make", "Colour", "Doors"]

transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder = "passthrough")

In [14]:
transformed_X = pd.DataFrame(transformer.fit_transform(X))
transformed_X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0


Alternative way of converting to numerical data - Here using Pandas One Hot but price column remains same becausee it is numerical rest works the same

In [15]:
dummies = pd.get_dummies(car_sales[categorical_features])
dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,False,True,False,False,False,False,False,False,True
1,5,True,False,False,False,False,True,False,False,False
2,4,False,True,False,False,False,False,False,False,True
3,4,False,False,False,True,False,False,False,False,True
4,3,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...
995,4,False,False,False,True,True,False,False,False,False
996,3,False,False,True,False,False,False,False,False,True
997,4,False,False,True,False,False,True,False,False,False
998,4,False,True,False,False,False,False,False,False,True


In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size = 0.2)

In [17]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.2785696718627533

#### 1.2 Dealing with Missing values

1. Fill them with some value aka imputation
2. Remove samples with missing values all together

In [21]:
car_sales_missing_na = pd.read_csv("data/car-sales-extended-missing-data.csv")
car_sales_missing_na.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           951 non-null    object 
 1   Colour         950 non-null    object 
 2   Odometer (KM)  950 non-null    float64
 3   Doors          950 non-null    float64
 4   Price          950 non-null    float64
dtypes: float64(3), object(2)
memory usage: 39.2+ KB


In [22]:
car_sales_missing_na.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [23]:
X = car_sales_missing_na.drop(["Price"], axis = 1)
y = car_sales_missing_na["Price"]

#### OneHotEncoder will take in NAN values but without preprocessing it, it will give undersied results and it is not recommended

In [26]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

one_hot = OneHotEncoder()
categorical_features = ["Make", "Colour", "Doors"]

tranformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder = "passthrough")

tranformed_X = transformer.fit_transform(X)
transformed_X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [27]:
transformed_X.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
dtype: int64

#### Recommended Method: Filling missing values or Drop then use OneHotEncoder

In [29]:
car_sales_missing_na

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [31]:
# Analyze the most common value
car_sales_missing_na["Doors"].value_counts()

Doors
4.0    811
5.0     75
3.0     64
Name: count, dtype: int64

##### Option 1: Using pandas to fill misisng data

In [34]:
# fill make column 'missing' values
car_sales_missing_na["Make"].fillna("missing", inplace = True)

# fill color column with 'missing' value
car_sales_missing_na["Colour"].fillna("missing", inplace = True)

# fill doors column with the most common value i.e 4
car_sales_missing_na["Doors"].fillna(4, inplace = True)

In [36]:
car_sales_missing_na.isna().sum()

Make              0
Colour            0
Odometer (KM)    50
Doors             0
Price            50
dtype: int64

In [37]:
# fill odometer (KM) with the mean value of the entire dataset
car_sales_missing_na["Odometer (KM)"].fillna(car_sales_missing_na["Odometer (KM)"].mean(), inplace = True)

car_sales_missing_na.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [38]:
# Drop rows with missing value for our target feature i.e. 'Price'
car_sales_missing_na.dropna(inplace = True)

car_sales_missing_na.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [40]:
# Some data lost...
len(car_sales_missing_na)

950

In [45]:
X = car_sales_missing_na.drop(["Price"], axis = 1)
y = car_sales_missing_na["Price"]
X.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431.0,4.0
1,BMW,Blue,192714.0,5.0
2,Honda,White,84714.0,4.0
3,Toyota,White,154365.0,4.0
4,Nissan,Blue,181577.0,3.0


In [49]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder(sparse_output = False)

transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder = "passthrough")
transformed_X = transformer.fit_transform(X)
tranformed_X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
945,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
946,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,155144.0
947,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
948,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,215883.0


##### Option 2: using Sklearn to fill missing data

In [70]:
car_sales_missing = pd.read_csv("data/car-sales-extended-missing-data.csv")
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [71]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [72]:
car_sales_missing.dropna(subset = ["Price"], inplace = True)

car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

##### One must spit X, y into train and test splits before filling missing values inorder to avoid data leaking from test to train splits and vise-versa 

In [73]:
from sklearn.model_selection import train_test_split

X = car_sales_missing.drop(["Price"], axis = 1)
y = car_sales_missing["Price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((760, 4), (190, 4), (760,), (190,))

In [74]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [75]:
# create specific imputer (value assigners) as per value to be assigned

cat_imputer = SimpleImputer(strategy = "constant", fill_value = "missing")
door_imputer = SimpleImputer(strategy = "constant", fill_value = 4)
num_imputer = SimpleImputer(strategy = "mean")

cat_features = ["Make", "Colour"]
door_features = ["Doors"]
num_features = ["Odometer (KM)"]

transformer_na = ColumnTransformer([("cat_imputer", cat_imputer, cat_features),
                                  ("door_imputer", door_imputer, door_features), 
                                  ("num_imputer", num_imputer, num_features)], 
                                 remainder = "passthrough")

##### Transform the Train X and Test X seperately

In [76]:
X_train = transformer_na.fit_transform(X_train)
X_test = transformer_na.fit_transform(X_test)

In [80]:
X_train = pd.DataFrame(X_train, columns = ["Make", "Colour", "Doors", "Odometer (KM)"])

In [81]:
X_test = pd.DataFrame(X_test, columns = ["Make", "Colour", "Doors", "Odometer (KM)"])

In [82]:
# after filling values now convert our data to numerical format

from sklearn.preprocessing import OneHotEncoder

one_hot = OneHotEncoder(sparse_output = False)

categorical_features = ["Make", "Colour", "Doors"]

transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder = "passthrough")

X_train = transformer.fit_transform(X_train)
X_test = transformer.fit_transform(X_test)

In [83]:
from sklearn.ensemble import RandomForestRegressor

In [84]:
model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.3756552782516739

##### Using Normalization to see how the model behaves

In [143]:
car_sales_missing = pd.read_csv("data/car-sales-extended-missing-data.csv")
car_sales_missing.dropna(subset = ["Price"], inplace = True)

from sklearn.model_selection import train_test_split

X = car_sales_missing.drop(["Price"], axis = 1)
y = car_sales_missing["Price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# create specific imputer (value assigners) as per value to be assigned

cat_imputer = SimpleImputer(strategy = "constant", fill_value = "missing")
door_imputer = SimpleImputer(strategy = "constant", fill_value = 4)
num_imputer = SimpleImputer(strategy = "mean")

cat_features = ["Make", "Colour"]
door_features = ["Doors"]
num_features = ["Odometer (KM)"]

transformer_na = ColumnTransformer([("cat_imputer", cat_imputer, cat_features),
                                  ("door_imputer", door_imputer, door_features), 
                                  ("num_imputer", num_imputer, num_features)], 
                                 remainder = "passthrough")

X_train = transformer_na.fit_transform(X_train)
X_test = transformer_na.fit_transform(X_test)

X_train = pd.DataFrame(X_train, columns = ["Make", "Colour", "Doors", "Odometer (KM)"])
X_test = pd.DataFrame(X_test, columns = ["Make", "Colour", "Doors", "Odometer (KM)"])

X_train

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Nissan,Blue,4.0,63686.0
1,BMW,Green,5.0,162607.0
2,Toyota,White,4.0,197393.0
3,BMW,White,3.0,148949.0
4,Toyota,Green,4.0,109868.0
...,...,...,...,...
755,Toyota,White,4.0,176172.0
756,Toyota,Green,4.0,213861.0
757,Honda,White,4.0,52563.0
758,Toyota,Blue,4.0,207048.0


##### Feature Scaling

1. Normalization
2. Standarization

In [144]:
from sklearn.preprocessing import MinMaxScaler

In [145]:
X_train["Odometer (KM)"]

0       63686.0
1      162607.0
2      197393.0
3      148949.0
4      109868.0
         ...   
755    176172.0
756    213861.0
757     52563.0
758    207048.0
759    124924.0
Name: Odometer (KM), Length: 760, dtype: object

In [146]:
scaler_train = MinMaxScaler()
X_train["Odometer (KM)"] = scaler_train.fit_transform(np.array(X_train["Odometer (KM)"]).reshape(-1, 1))
X_train["Odometer (KM)"]

0      0.223343
1      0.636009
2      0.781125
3      0.579032
4      0.415999
         ...   
755    0.692598
756    0.849824
757    0.176941
758    0.821402
759    0.478808
Name: Odometer (KM), Length: 760, dtype: float64

In [147]:
X_test["Odometer (KM)"]

0            73869.0
1            76416.0
2            11119.0
3           231057.0
4           239760.0
           ...      
185    131148.681564
186         178351.0
187    131148.681564
188         121416.0
189          30080.0
Name: Odometer (KM), Length: 190, dtype: object

In [148]:
scaler_test = MinMaxScaler()
X_test["Odometer (KM)"] = scaler_test.fit_transform(np.array(X_test["Odometer (KM)"]).reshape(-1, 1))
X_test["Odometer (KM)"]

0      0.264413
1      0.275117
2      0.000698
3      0.925017
4      0.961592
         ...   
185    0.505138
186    0.703513
187    0.505138
188    0.464236
189    0.080384
Name: Odometer (KM), Length: 190, dtype: float64

In [149]:
one_hot = OneHotEncoder(sparse_output = False)

categorical_features = ["Make", "Colour", "Doors"]

transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder = "passthrough")

X_train = transformer.fit_transform(X_train)
X_test = transformer.fit_transform(X_test)

model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.1718903586257302

### 2. Choosing the right estimator