In [5]:
#Prepare data 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline 

In [2]:
steps = ['1. Getting the data ready',
        '2. Choose the right estimator',
        '3. Fit the model/algorithm and use it to make predictions',
        '4. Evaluate model', 
        '5. Improve model',
        '6. Save and load the trained model',
        '7. Put it all together']

In [3]:
heart_disease = pd.read_csv('data\heart-disease.csv')

In [4]:
heart_disease # we want to predict target 

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [5]:
#create X (features matrix) everything except the target 
X = heart_disease.drop('target', axis=1) 

#create Y (labels)
y = heart_disease['target']


In [6]:
#Choose the right model and hyper parameters 
from sklearn.ensemble import RandomForestClassifier 
clf = RandomForestClassifier() 

#we will keep the default hyperparameters 
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [7]:
#fit the model to the training data
from sklearn.model_selection import train_test_split  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)  
#80% of data will be used for training, 20% for testing

In [8]:
X_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
66,51,1,2,100,222,0,1,143,1,1.2,1,0,2
184,50,1,0,150,243,0,0,128,0,2.6,1,0,3
209,59,1,0,140,177,0,1,162,1,0.0,2,1,3
287,57,1,1,154,232,0,0,164,0,0.0,2,1,2
267,49,1,2,118,149,0,0,126,0,0.8,2,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,59,1,3,178,270,0,0,145,0,4.2,0,0,3
207,60,0,0,150,258,0,0,157,0,2.6,1,2,3
54,63,0,2,135,252,0,0,172,0,0.0,2,0,2
45,52,1,1,120,325,0,1,172,0,0.2,2,0,2


In [9]:
clf.fit(X_train, y_train);

In [10]:
y_preds = clf.predict(X_test) 
y_preds

array([1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=int64)

In [11]:
# 4. Evaluate the Model on the training data and test data
clf.score(X_train, y_train) #returns the mean accuracy on the givent test data and labels

1.0

In [12]:
clf.score(X_test, y_test)

0.7377049180327869

In [13]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score 

print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.83      0.61      0.70        31
           1       0.68      0.87      0.76        30

    accuracy                           0.74        61
   macro avg       0.76      0.74      0.73        61
weighted avg       0.76      0.74      0.73        61



In [14]:
accuracy_score(y_test, y_preds)

0.7377049180327869

In [15]:
confusion_matrix(y_test, y_preds)

array([[19, 12],
       [ 4, 26]], dtype=int64)

In [16]:
# 5. Improve a model 
#Try different amount of n_estimators 
np.random.seed(42) 
for i in range(10, 100, 10): 
    print(f"Trying model with {i} estimators...") 
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f'Model accuracy on test set: {clf.score(X_test, y_test) * 100:.2f}%') 
    print('')

Trying model with 10 estimators...
Model accuracy on test set: 68.85%

Trying model with 20 estimators...
Model accuracy on test set: 78.69%

Trying model with 30 estimators...
Model accuracy on test set: 75.41%

Trying model with 40 estimators...
Model accuracy on test set: 70.49%

Trying model with 50 estimators...
Model accuracy on test set: 73.77%

Trying model with 60 estimators...
Model accuracy on test set: 72.13%

Trying model with 70 estimators...
Model accuracy on test set: 73.77%

Trying model with 80 estimators...
Model accuracy on test set: 72.13%

Trying model with 90 estimators...
Model accuracy on test set: 72.13%



We can see from the previous cell that the best model is the one with 20 estimators

In [17]:
#6. save the model and load it 
import pickle 

pickle.dump(clf, open("random_forest_model_1.pk1", "wb"))

In [18]:
loaded_model = pickle.load(open('random_forest_model_1.pk1','rb')) 
loaded_model.score(X_test, y_test)

0.7213114754098361

# Splitting Data

**Three main things we have to do:** 
   1. Split the data into features and labels (usually 'x' & 'y') 
   2. Filling (also called imputing) or disregarding missing values 
   3. Converting non-numerical values to numerical values (also called feature encoding)

In [19]:
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [20]:
X = heart_disease.drop('target',axis=1)

In [21]:
y = heart_disease['target'] 
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

### Split the data into training and test sets

In [22]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [23]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

In [24]:
X.shape[0] * .8

242.4

In [25]:
242 + 61

303

# Converting Data to Numbers 

In [26]:
car_sales = pd.read_csv('data\car-sales-extended.csv')

In [27]:
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [28]:
car_sales.count()

Make             1000
Colour           1000
Odometer (KM)    1000
Doors            1000
Price            1000
dtype: int64

In [29]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [30]:
#split data 
X = car_sales.drop('Price', axis=1) 
y = car_sales['Price'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [31]:
#turn categories into numbers 
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer  

categorical_features = ['Make','Colour','Doors'] 
#Doors are categorical even though they are numerical 
one_hot = OneHotEncoder() 
transformer = ColumnTransformer([('one_hot',
                                   one_hot, 
                                   categorical_features)],
                                   remainder='passthrough') 
transformed_x = transformer.fit_transform(X) 
transformed_x

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [32]:
pd.DataFrame(transformed_x)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [33]:
dummies = pd.get_dummies(car_sales[['Make','Colour','Doors']]) 
dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
995,4,0,0,0,1,1,0,0,0,0
996,3,0,0,1,0,0,0,0,0,1
997,4,0,0,1,0,0,1,0,0,0
998,4,0,1,0,0,0,0,0,0,1


In [35]:
np.random.seed(42) 
X_train, X_test, y_train, y_test = train_test_split(transformed_x, y, test_size = 0.2) 
model = RandomForestRegressor() 
model.fit(X_train, y_train) 

RandomForestRegressor()

In [36]:
model.score(X_test, y_test)

0.3235867221569877

# Handling Missing Values 
the current version of OneHotEncoder() can handle missing values. But always good to know

1. Fill them with some value (also known as imputation) 
2. Remove the samples with missing data altogether

In [37]:
#import car sales missing data 
car_sales_missing = pd.read_csv('data\car-sales-extended-missing-data.csv') 
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [38]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [39]:
# create x and y 
X = car_sales_missing.drop('Price', axis=1) 
y = car_sales_missing['Price']

In [40]:
#convert data to numbers 
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer  

categorical_features = ['Make','Colour','Doors'] 
#Doors are categorical even though they are numerical 
one_hot = OneHotEncoder() 
transformer = ColumnTransformer([('one_hot',
                                   one_hot, 
                                   categorical_features)],
                                   remainder='passthrough') 
transformed_x = transformer.fit_transform(X) 
transformed_x

<1000x16 sparse matrix of type '<class 'numpy.float64'>'
	with 4000 stored elements in Compressed Sparse Row format>

In [41]:
car_sales_missing['Doors'].value_counts()

4.0    811
5.0     75
3.0     64
Name: Doors, dtype: int64

#### option 1: fill missing data with pandas

In [42]:
#fill the make column 
car_sales_missing['Make'].fillna('missing', inplace=True) 

#fill the color column 
car_sales_missing['Colour'].fillna('missing',inplace=True) 

#fill odometer column 
car_sales_missing['Odometer (KM)'].fillna(car_sales_missing['Odometer (KM)'].mean(), inplace=True) 

#fill the Doors columns 
car_sales_missing['Doors'].fillna(4, inplace=True)

In [43]:
car_sales_missing.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [44]:
#Remove rows with missing price value 
car_sales_missing.dropna(inplace=True)

In [45]:
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [46]:
len(car_sales_missing) 

950

In [47]:
X = car_sales_missing.drop('Price',axis=1) 
y = car_sales_missing['Price']

In [48]:
#convert data to numbers 
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer  

categorical_features = ['Make','Colour','Doors'] 
#Doors are categorical even though they are numerical 
one_hot = OneHotEncoder() 
transformer = ColumnTransformer([('one_hot',
                                   one_hot, 
                                   categorical_features)],
                                   remainder='passthrough') 
transformed_x = transformer.fit_transform(car_sales_missing) 
transformed_x

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        3.54310e+04, 1.53230e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        1.92714e+05, 1.99430e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        8.47140e+04, 2.83430e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        6.66040e+04, 3.15700e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.15883e+05, 4.00100e+03],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.48360e+05, 1.27320e+04]])

#### option 2: Fill missing values with sklearn

In [49]:
car_sales_missing = pd.read_csv('data\car-sales-extended-missing-data.csv')

In [50]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [51]:
car_sales_missing.dropna(subset=['Price'], inplace=True) 
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [52]:
#split into X & y 
X = car_sales_missing.drop('Price', axis=1) 
y = car_sales_missing['Price'] 

In [56]:
# Fill missing values with sklearn
from sklearn.impute import SimpleImputer 
from sklearn.compose import ColumnTransformer 

#fill categorical values with 'missing' and numerical values with mean 
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing') 
door_imputer = SimpleImputer(strategy='constant', fill_value=4) 
num_imputer = SimpleImputer(strategy='mean') 

#define columns 
cat_features = ['Make','Colour'] 
door_features = ['Doors'] 
num_features = ['Odometer (KM)'] 

#Create an imputer (something that fills missing data)
imputer = ColumnTransformer([ 
    ('cat_imputer', cat_imputer, cat_features), 
    ('door_imputer', door_imputer, door_features), 
    ('num_imputer', num_imputer, num_features)
]) 

#tranform the data 
filled_X = imputer.fit_transform(X) 
filled_X

array([['Honda', 'White', 4.0, 35431.0],
       ['BMW', 'Blue', 5.0, 192714.0],
       ['Honda', 'White', 4.0, 84714.0],
       ...,
       ['Nissan', 'Blue', 4.0, 66604.0],
       ['Honda', 'White', 4.0, 215883.0],
       ['Toyota', 'Blue', 4.0, 248360.0]], dtype=object)

In [60]:
car_sales_filled = pd.DataFrame(filled_X, columns=['Make','Colour','Doors','Odometer (KM)']) 
car_sales_filled

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Honda,White,4.0,35431.0
1,BMW,Blue,5.0,192714.0
2,Honda,White,4.0,84714.0
3,Toyota,White,4.0,154365.0
4,Nissan,Blue,3.0,181577.0
...,...,...,...,...
945,Toyota,Black,4.0,35820.0
946,missing,White,3.0,155144.0
947,Nissan,Blue,4.0,66604.0
948,Honda,White,4.0,215883.0


In [59]:
car_sales_filled.isna().sum()

make        0
color       0
doors       0
odometer    0
dtype: int64

In [62]:
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer 

categorical_features = ['Make','Colour','Doors'] 
one_hot = OneHotEncoder() 
transformer = ColumnTransformer([('one_hot', 
                                   one_hot,
                                   categorical_features)], 
                                   remainder='passthrough')
transformed_X = transformer.fit_transform(car_sales_filled) 
transformed_X

<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

In [66]:
#Now we have got our data as numbers and filled (no missing values) 
#lets fir a model 
np.random.seed(42) 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(transformed_X, 
                                                    y, 
                                                    test_size=0.2) 
model = RandomForestRegressor() 
model.fit(X_train, y_train) 
model.score(X_test,y_test)                                                   

0.21990196728583944

# Choose the right estimator/algorithm for the problem 

some things to note 

* SKlearn refers to machine learning models, algorithms as estimators 
* Classification problem - predicting a category (one thing or the other) 
* something you will see 'clf' (short for classifier) used as a classification estimator 
* Regression problem - predicting a number (selling price of a car) 
https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html 

### 2.1 picking a machine learning model for a regression problem 

In [1]:
#get cali housing dataset 
from sklearn.datasets import fetch_california_housing 

In [3]:
housing = fetch_california_housing() 
housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [9]:
housing_df = pd.DataFrame(housing['data'], columns=housing['feature_names']) 
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32
