In [177]:
import numpy as np
import pandas as pd 
%matplotlib inline
import matplotlib.pyplot as plt


In [178]:
heart_disease = pd.read_csv("dataset/heart-disease.csv")

In [179]:
heart_disease.head()

In [180]:
X = heart_disease.drop("target", axis=1)

In [181]:
y = heart_disease["target"]

In [182]:
X.head()

In [None]:
y.head()

In [145]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Fixing random_state makes the shuffle
# repeatable, so a Restart & Run All reproduces the same train/test rows.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

In [146]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

In [147]:
heart_disease.shape

(303, 14)

### 1.1 Make sure the data is all numerical


In [148]:
car_sales = pd.read_csv('dataset/car-sales-extended.csv')

In [149]:
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [150]:
car_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Make           1000 non-null   object
 1   Colour         1000 non-null   object
 2   Odometer (KM)  1000 non-null   int64 
 3   Doors          1000 non-null   int64 
 4   Price          1000 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 39.2+ KB


In [151]:
len(car_sales)

1000

In [152]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [153]:
def feature_label(dataset, label):
    """Separate a DataFrame into its feature columns and its label column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table holding both the features and the label.
    label : str
        Name of the column to use as the label.

    Returns
    -------
    tuple
        ``(features, target)`` where ``features`` is ``dataset`` without the
        ``label`` column and ``target`` is ``dataset[label]``. ``dataset``
        itself is not modified.
    """
    features = dataset.drop(columns=label)
    target = dataset[label]
    return features, target

    

In [154]:
# Split into features (X) and label (y), then hold out 20% for testing.
# random_state pins the shuffle so the split is identical on every run.
X, y = feature_label(car_sales, "Price")
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)



In [155]:
car_sales['Doors'].value_counts()

4    856
5     79
3     65
Name: Doors, dtype: int64

In [156]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Doors is stored as an integer but only takes the values 3, 4 and 5,
# so it is encoded as a category alongside Make and Colour.
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()

# One-hot encode the categorical columns; every other column is passed
# through unchanged.
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

transformed_X = transformer.fit_transform(X)

def turn_categories_to_numbers(categories, X):
    """One-hot encode the ``categories`` columns of ``X``.

    A fresh ``OneHotEncoder`` is fitted on ``X`` for the listed columns; all
    other columns are passed through unchanged.

    Parameters
    ----------
    categories : list
        Column names in ``X`` to one-hot encode.
    X : pandas.DataFrame
        Feature table containing the ``categories`` columns.

    Returns
    -------
    The result of ``ColumnTransformer.fit_transform(X)``. In this notebook
    that has been both a dense array and a sparse matrix, so callers should
    not assume either.
    """
    encoding_step = ("one_hot", OneHotEncoder(), categories)
    column_encoder = ColumnTransformer(
        transformers=[encoding_step],
        remainder="passthrough",
    )
    return column_encoder.fit_transform(X)

In [157]:
pd.DataFrame(transformed_X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0


In [158]:
# pandas alternative to OneHotEncoder. Only the object-dtype columns (Make,
# Colour) are expanded into indicator columns; Doors is int64, so it comes
# back as a single numeric column rather than one column per door count.
dummies = pd.get_dummies(data=car_sales[["Make", "Colour", "Doors"]])
dummies.head()

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0


In [159]:
# Seed NumPy's global RNG so the shuffle inside train_test_split is repeatable,
# then split the encoded features 80/20.
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)


In [160]:
from sklearn.ensemble import RandomForestRegressor

# Re-seed the global RNG right before fitting so the forest's random choices
# are the same on every run.
np.random.seed(42)

# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestRegressor().fit(X_train, y_train)

# For a regressor, score() reports R^2 on the held-out rows.
model.score(X_test, y_test)

0.29444524256551574

### 1.2 What if there were missing values?

In [161]:
missng_car_sales = pd.read_csv("dataset/car-sales-extended-missing-data.csv")
missng_car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [162]:
missng_car_sales.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

### Option 1. Fill missing data with pandas

In [163]:
# Fill every feature column in one DataFrame-level call.
#
# The previous form, `frame[col].fillna(..., inplace=True)`, mutates a column
# selected from the frame. pandas 2.x warns about that pattern and, with
# Copy-on-Write enabled, it no longer updates the frame at all. Filling through
# the frame itself avoids the problem.
missng_car_sales = missng_car_sales.fillna({
    # Categorical gaps get an explicit placeholder category. The same
    # spelling is used for both columns.
    "Make": "missing",
    "Colour": "missing",
    # Numeric gap: use the mean of the observed odometer readings.
    "Odometer (KM)": missng_car_sales["Odometer (KM)"].mean(),
    # 4 is the most common door count in the car-sales data.
    "Doors": 4,
})

In [164]:
missng_car_sales.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [165]:
# Price is the only column that still contains NaNs at this point, so dropping
# every row with any missing value removes exactly the unlabelled rows.
missng_car_sales.dropna(axis=0, how="any", inplace=True)


In [166]:
missng_car_sales.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [167]:
X, y = feature_label(missng_car_sales, "Price")

In [168]:
turn_categories_to_numbers(["Make", "Colour", "Doors"], X)

<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

### Option 2. Fill missing values with Scikit-Learn 

In [169]:
car_sales_missing = pd.read_csv("dataset/car-sales-extended-missing-data.csv")

In [170]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [171]:
# Drop the rows with no labels. Only Price is checked, so rows with gaps in the
# feature columns are kept for the imputers to fill.
car_sales_missing.dropna(axis=0, subset=["Price"], inplace=True)
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [172]:
# Split into X and y using the shared helper instead of repeating the
# drop/select pair by hand.
X, y = feature_label(car_sales_missing, "Price")


In [173]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


# Categorical gaps become the literal "missing", Doors defaults to 4,
# and the numeric column is filled with its mean.
cat_imputer = SimpleImputer(strategy="constant",
                            fill_value="missing")
door_imputer = SimpleImputer(strategy="constant",
                             fill_value=4)
num_imputer = SimpleImputer(strategy='mean')

cat_feautures = ["Make", "Colour"]
door_feature = ["Doors"]
num_features = ['Odometer (KM)']

imputer = ColumnTransformer([("cat_imputer", cat_imputer, cat_feautures),
                             ("num_imputer", num_imputer, num_features),
                             ("door_imputer", door_imputer, door_feature)],
                            remainder="passthrough")

filled_X = imputer.fit_transform(X)

# ColumnTransformer emits its columns in transformer order (cat, num, door),
# so the labels are built from the same lists instead of a hand-typed copy
# that could drift out of sync. The original row index is kept so X still
# lines up with y, whose index has gaps where unlabelled rows were dropped.
X = pd.DataFrame(filled_X,
                 columns=cat_feautures + num_features + door_feature,
                 index=X.index)

In [174]:
X.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431.0,4.0
1,BMW,Blue,192714.0,5.0
2,Honda,White,84714.0,4.0
3,Toyota,White,154365.0,4.0
4,Nissan,Blue,181577.0,3.0


In [175]:
transformed_X = turn_categories_to_numbers(["Make", "Colour", "Doors"], X)

In [176]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Seed once: the split and then the forest both draw from NumPy's global RNG,
# in that order.
np.random.seed(42)

# No test_size is passed here, unlike the earlier 0.2 splits.
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y)

model = RandomForestRegressor(n_estimators=100).fit(X_train, y_train)
model.score(X_test, y_test)

0.21020722035944994

## 2. Choosing the right estimator/algorithm for our problem
