In [1]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## 0. An end to end Scikitlearn workflow

In [2]:
# 1. Get the data ready
import pandas as pd
heart_disease = pd.read_csv('heart_disease.csv')

In [3]:
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
# Create X (feature matrix)
X = heart_disease.drop('target', axis=1)

# Create Y (target vector)
y = heart_disease['target']

In [5]:
# 2 Choose the right model and hyperparameters
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

# Keep the default hyperparameters
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [6]:
# 3 Fit the model to the training data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [7]:
clf.fit(X_train, y_train)

This fitting process might give you some warning, becuase of for example that the default params are changing due to the different versions.
1. You can read the params by clf.get_params(), and search for keys that have warn values. If the parameter have warning value, you can give this paramater to the RandomForestClassifier with some value, and then warning goes away
2. Another way you can solve this is by following cell.
3. Another way to solve the warnings is to install the new packages and uninstall the old packages as the warnings say. This is the best choice

In [8]:
import warnings
warnings.filterwarnings('ignore')
# warnings.filterwarnings('default')  --> changes back to the default

In [9]:
import sklearn
sklearn.show_versions()


System:
    python: 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:17:27) [MSC v.1929 64 bit (AMD64)]
executable: D:\anaconda3\python.exe
   machine: Windows-11-10.0.26100-SP0

Python dependencies:
      sklearn: 1.5.1
          pip: 24.2
   setuptools: 75.1.0
        numpy: 1.26.4
        scipy: 1.13.1
       Cython: None
       pandas: 2.2.2
   matplotlib: 3.9.2
       joblib: 1.4.2
threadpoolctl: 3.5.0

Built with OpenMP: True

threadpoolctl info:
       user_api: blas
   internal_api: mkl
    num_threads: 4
         prefix: mkl_rt
       filepath: D:\anaconda3\Library\bin\mkl_rt.2.dll
        version: 2023.1-Product
threading_layer: intel

       user_api: openmp
   internal_api: openmp
    num_threads: 8
         prefix: vcomp
       filepath: D:\anaconda3\vcomp140.dll
        version: None


In [10]:
# 4 Make a prediction
y_preds = clf.predict(X_test)
y_preds

array([0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1], dtype=int64)

In [11]:
# 5 Evaluate the model on the training data and test data
print(f'score on the training set {clf.score(X_train, y_train)}')
print(f'score on the testing set {clf.score(X_test, y_test)}')

score on the training set 1.0
score on the testing set 0.8032786885245902


In [12]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.84      0.72      0.78        29
           1       0.78      0.88      0.82        32

    accuracy                           0.80        61
   macro avg       0.81      0.80      0.80        61
weighted avg       0.81      0.80      0.80        61



In [13]:
confusion_matrix(y_test, y_preds)

array([[21,  8],
       [ 4, 28]], dtype=int64)

In [14]:
accuracy_score(y_test, y_preds)

0.8032786885245902

In [15]:
# 6 Improve the model
# Try different amount of n_estimators
import numpy as np
np.random.seed(42)
for i in range(10, 100, 10):
    print(f'Trying model with {i} estimators...')
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f'Model accuracy on test set: {clf.score(X_test, y_test) * 100:.2f}%')
    print('')

Trying model with 10 estimators...
Model accuracy on test set: 72.13%

Trying model with 20 estimators...
Model accuracy on test set: 83.61%

Trying model with 30 estimators...
Model accuracy on test set: 81.97%

Trying model with 40 estimators...
Model accuracy on test set: 77.05%

Trying model with 50 estimators...
Model accuracy on test set: 78.69%

Trying model with 60 estimators...
Model accuracy on test set: 80.33%

Trying model with 70 estimators...
Model accuracy on test set: 80.33%

Trying model with 80 estimators...
Model accuracy on test set: 80.33%

Trying model with 90 estimators...
Model accuracy on test set: 77.05%



In [16]:
# 7 Save the model and load it
import pickle
pickle.dump(clf, open('random_forest_model_1.pkl', 'wb'))

In [17]:
# 8 Load the model
loaded_model = pickle.load(open('random_forest_model_1.pkl', 'rb'))
loaded_model.score(X_test, y_test)

0.7704918032786885

## 1. Getting our data ready to be used with machine learning model
Three main things we have to do:
1. Split the data into features and labels
2. Filling (also called imputing) or disregarding missing values
3. Converting non_numerical values to numerical values (also called feature encoding)

In [18]:
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [19]:
X = heart_disease.drop('target', axis=1)
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [20]:
y = heart_disease['target']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [22]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

### 1.1 Make sure it's all numerical

In [23]:
car_sales = pd.read_csv('car-sales-extended.csv')
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [24]:
car_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Make           1000 non-null   object
 1   Colour         1000 non-null   object
 2   Odometer (KM)  1000 non-null   int64 
 3   Doors          1000 non-null   int64 
 4   Price          1000 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 39.2+ KB


In [25]:
X = car_sales.drop('Price', axis=1)
y = car_sales['Price']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [27]:
"""
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)
"""

'\nfrom sklearn.ensemble import RandomForestRegressor\nmodel = RandomForestRegressor()\nmodel.fit(X_train, y_train)\nmodel.score(X_test, y_test)\n'

In [28]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot, categorical_features)], 
                                remainder='passthrough')
transformed_X = transformer.fit_transform(X)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [29]:
pd.DataFrame(transformed_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [30]:
# There is also a function in pandas that can one_hot_encode
dummies = pd.get_dummies(car_sales[['Make', 'Colour', 'Doors']])
dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,False,True,False,False,False,False,False,False,True
1,5,True,False,False,False,False,True,False,False,False
2,4,False,True,False,False,False,False,False,False,True
3,4,False,False,False,True,False,False,False,False,True
4,3,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...
995,4,False,False,False,True,True,False,False,False,False
996,3,False,False,True,False,False,False,False,False,True
997,4,False,False,True,False,False,True,False,False,False
998,4,False,True,False,False,False,False,False,False,True


In [31]:
# Let's refit the model
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)
model.fit(X_train, y_train)

NameError: name 'model' is not defined

In [None]:
model.score(X_test, y_test)

### What if there were missing datas?
1. Fill them with some values (imputation)
2. Remove the samples with the missing the altogether

In [None]:
car_sales_missing = pd.read_csv('car-sales-extended-missing-data.csv')
car_sales_missing.head()

In [None]:
car_sales_missing.isna().sum()

In [None]:
X = car_sales_missing.drop('Price', axis=1)
y = car_sales_missing['Price']

#### Option 1: Fill the missing data with pandas

In [None]:
car_sales_missing['Doors'].value_counts()

In [None]:
car_sales_missing['Make'].fillna('missing', inplace=True)
car_sales_missing['Colour'].fillna('missing', inplace=True)
car_sales_missing['Odometer (KM)'].fillna(car_sales_missing['Odometer (KM)'].mean(), inplace=True)
car_sales_missing['Doors'].fillna(4, inplace=True)

In [None]:
car_sales_missing.isna().sum()

In [None]:
# Remove rows with missing Price values
car_sales_missing.dropna(inplace=True)

In [None]:
car_sales_missing.isna().sum()

In [None]:
X = car_sales_missing.drop('Price', axis=1)
y = car_sales_missing['Price']

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot, categorical_features)], remainder='passthrough')
transformed_X = transformer.fit_transform(car_sales_missing)
transformed_X

#### Option 2: Fill missing values with Scikit-Learn

In [None]:
car_sales_missing = pd.read_csv('car-sales-extended-missing-data.csv')

In [None]:
# Drop the rows with no labels
car_sales_missing.dropna(subset=['Price'], inplace=True)
car_sales_missing.isna().sum()

In [None]:
X = car_sales_missing.drop('Price', axis=1)
y = car_sales_missing['Price']

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

#Fill categorical values with 'missing' and numerical values with mean
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
door_imputer = SimpleImputer(strategy='constant', fill_value=4)
num_imputer = SimpleImputer(strategy='mean')

#Define columns
cat_feature = ['Make', 'Colour']
door_feature = ['Doors']
num_feature = ['Odometer (KM)']

#Create an imputer
imputer = ColumnTransformer([('cat_imputer', cat_imputer, cat_feature),
                             ('door_imputer', door_imputer, door_feature),
                             ('num_imputer', num_imputer, num_feature)])

#Transform the data
filled_X = imputer.fit_transform(X)
filled_X

In [None]:
car_sales_filled = pd.DataFrame(filled_X, columns=['Make', 'Colour', 'Doors', 'Odometer (KM)'])
car_sales_filled.head()

In [None]:
car_sales_filled.isna().sum()

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()

transformer = ColumnTransformer([('one_hot', one_hot, categorical_features)], remainder='passthrough')
transformed_X = transformer.fit_transform(car_sales_filled)
transformed_X

In [None]:
# Now let's fit the model
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)
model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

## 2. Choosing the right estimator/algorithm for our problem
Scikit-Learn uses estimator as a another term for machine learning model or algorithm

In [None]:
# try googling sk-learn ml map

### 2.1 Picking a machine learning model for a regression problem

In [None]:
# Import built-in Boston housing dataset
from sklearn.datasets import fetch_california_housing
california = fetch_california_housing()
california

In [None]:
california_df = pd.DataFrame(california['data'], columns=california['feature_names'])
california_df['target'] = pd.Series(california['target'])
california_df.head()

In [None]:
len(california_df)

In [None]:
#Let's try the Ridge regression model that we found using the map of the sklearn machine learning
from sklearn.linear_model import Ridge

np.random.seed(42)

X = california_df.drop('target', axis=1)
y = california_df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = Ridge()
model.fit(X_train, y_train)
model.score(X_test, y_test)

How do we improve this score?

What if Ridge wasn't working?

Let's refer back to the map... https://scikit-learn.org/stable/machine_learning_map.html

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X = california_df.drop('target', axis=1)
y = california_df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

rf = RandomForestRegressor()
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

Just switching model, we improved our score

### 2.2 Choosing an estimator for a classification problem
Let's go back to the map, and try some models

In [None]:
heart_disease = pd.read_csv('heart_disease.csv')
heart_disease.head()

In [None]:
len(heart_disease)

In [None]:
# Import LinearSVC estimator class
from sklearn.svm import LinearSVC

np.random.seed(42)

X = heart_disease.drop('target', axis=1)
y = heart_disease['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = LinearSVC()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

Pretty good, but let's try go farther and try some estimators

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)

Still performs pretty well

* If you have structured data = prefer ensemble methods
* If you have unstructured data = use deep learning or transfer learning methods

## 3. Fit our model/algorithm on our data and use it to make predictions

In [None]:
rfc.fit(X_train, y_train) #it does the work of fitting

### 3.2 Make predictions based on the trained model
2 ways to make predictions:
1. `predict()`
2. `predict_proba()`

In [None]:
rfc.predict(X_test)

In [None]:
np.array([y_test])

Make predictions with `predict_proba()`

In [None]:
# predict_proba() returns probabilities of a classification label
rfc.predict_proba(X_test[:5])

In [None]:
#Let's predict on the same data
rfc.predict(X_test[:5])

Predicting on regression model

In [None]:
california_df.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X = california_df.drop('target', axis=1)
y = california_df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = RandomForestRegressor().fit(X_train, y_train)

y_preds = model.predict(X_test)
y_preds[:10]

In [None]:
np.array([y_test[:10]])

In [None]:
# Compare the predictions to the truth
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_preds)

## 4. Evaluating a model
Three ways to evaluate ScikitLearn models/estimators:
1. Estimator `score` method
2. The `scoring` parameter
3. Problem-specific metric functions

In [None]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heart_disease.drop('target', axis=1)
y = heart_disease['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf=RandomForestClassifier().fit(X_train, y_train)

#### Evaluating model with score method

In [None]:
clf.score(X_test, y_test)

Scoring on regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X = california_df.drop('target', axis=1)
y = california_df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = RandomForestRegressor().fit(X_train, y_train)

In [None]:
model.score(X_test, y_test) #calculates the score in a different way with default method

#### Evaluating model with a scoring parameter

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heart_disease.drop('target', axis=1)
y = heart_disease['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf=RandomForestClassifier().fit(X_train, y_train)

In [None]:
clf.score(X_test, y_test)

In [None]:
cross_val_score(clf, X, y)

<img src='cross_valid.png'/>

In [None]:
cross_val_score(clf, X, y, cv=10)

In [None]:
np.mean(cross_val_score(clf, X, y))

In [None]:
# Default scoring parameter of classifier = mean accuracy

In [None]:
#Scoring parameter is set to none by default
cross_val_score(clf, X, y, scoring=None)

Cross validation is more accurate than the score method

#### Classification model evaluation metrics
1. Accuracy
2. Area under ROC curve
3. Confusion matrix
4. Classification report

**Accuracy**

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heart_disease.drop('target', axis=1)
y = heart_disease['target']

clf=RandomForestClassifier()
cross_val = cross_val_score(clf, X, y)

In [None]:
print(f'Cross_validate accuracy: {np.mean(cross_val) * 100:.02f}')

**Area under the reciever operating characteristic curve (AUC/ROC)**  
ROC curves are comparison of a model's true positive rate (tpr) versus model's false positive rate (fpr)
* True positive = model predicts 1 when truth is 1
* False positive = model predicts 1 when truth is 0
* True negative = model predicts 0 when truth is 0
* False negative = model predicts 0 when truth is 1

In [None]:
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)

y_probs = clf.predict_proba(X_test)
y_probs[:10]

In [None]:
y_probs_positive = y_probs[:, 1]

In [None]:
# Calculate fpr, tpr and threshold
fpr, tpr, thresholds = roc_curve(y_test, y_probs_positive)

In [None]:
fpr

In [None]:
# Create a function for plotting ROC curve
import matplotlib.pyplot as plt

def plot_roc_curve(fpr, tpr):
    """
    Plots a ROC curve given the false positive rate (fpr)
    and true positive rate (tpr) of a model.
    """
    #Plot roc curve
    plt.plot(fpr, tpr, color='orange', label='ROC')
    #Plot line with no predictive power (baseline)
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--', label='Guessing')

    #Customize the plot
    plt.xlabel("False positive rate (fpr)")
    plt.ylabel('True positive rate (tpr)')
    plt.title('ROC curve')
    plt.legend()
    plt.show()

plot_roc_curve(fpr, tpr)

In [None]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, y_probs_positive)

In [None]:
#Plot perfect ROC curve and AUC score
fpr, tpr, thresholds = roc_curve(y_test, y_test)
plot_roc_curve(fpr, tpr)

In [None]:
# Perfect AUC score
roc_auc_score(y_test, y_test)

**Confusion Matrix**  
A Confusion Matrix is a quick way to compare the labels a model predicts and the actual labels it was supposed to predict.  
In essence, giving you an idea of where the model is getting confused.

In [None]:
from sklearn.metrics import confusion_matrix

y_preds = clf.predict(X_test)
confusion_matrix(y_test, y_preds)

In [None]:
# Visualize confusion matrix with pd.crosstab()
pd.crosstab(y_test, y_preds, rownames=['Actual labels'], colnames=['Predicted labels'])

In [None]:
# Make our confusion matrix more visual with Seaborn's heatmap()
# Seaborn is visualization library built upon matplotlib
import seaborn as sns

# set the font_scale
sns.set(font_scale=1.5)

conf_mat = confusion_matrix(y_test, y_preds)

sns.heatmap(conf_mat)
plt.show()

In [None]:
def plot_conf_mat(conf_mat):
    """
    Plots confusion matrix using seaborn's heatmap()
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    ax = sns.heatmap(conf_mat, 
                     annot=True, # annotate the boxes with conf_mat info
                    cbar=False)
    plt.xlabel('True label')
    plt.ylabel('Predicted label')
    plt.show()

plot_conf_mat(conf_mat)

**Classification Report**

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_preds))

In [None]:
# Where precision and recall metrics become valuable
disease_true = np.zeros(10000)
disease_true[0] = 1 # only one positive case

disease_preds = np.zeros(10000) # model predicts every case as 0

pd.DataFrame(classification_report(disease_true, disease_preds, output_dict=True))

To summarize classification metrics:
* `Accuracy` is a good measure to start with if all classes are balanced (e.g., same amount of sample which are labeled with 0 or 1)
* `Precision` and `recall` become more important when classes are imbalanced

### Regression model evaluation metrics
1. R^2 (r_squared) or coefficient of determination.
2. Mean absolute error (MAE)
3. Mean squared error (MSE)

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X = california_df.drop('target', axis=1)
y = california_df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

rf = RandomForestRegressor()
rf.fit(X_train, y_train)

**R^2**

In [None]:
rf.score(X_test, y_test)

**Mean absolute error (MAE)**

In [None]:
from sklearn.metrics import mean_absolute_error

y_preds = rf.predict(X_test)

mae = mean_absolute_error(y_test, y_preds)
mae

**Mean squared error**

In [None]:
# Mean squared error
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_preds)
mse

### Using the scoring parameter for classification

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

heart_disease = pd.read_csv('heart_disease.csv')
X = heart_disease.drop('target', axis=1)
y = heart_disease['target']
clf = RandomForestClassifier()

**scoring parameter 'accuracy'**

In [None]:
np.random.seed(42)
cv_acc = cross_val_score(clf, X, y, scoring=None)
np.mean(cv_acc)

In [None]:
np.random.seed(42)
cv_acc = cross_val_score(clf, X, y, scoring='accuracy')
np.mean(cv_acc)

**'precision'**

In [None]:
cv_precision = cross_val_score(clf, X, y, scoring='precision')
np.mean(cv_precision)

**'recall'**

In [None]:
cv_recall = cross_val_score(clf, X, y, scoring='recall')
np.mean(cv_recall)

**'f1'**

In [None]:
cv_f1 = cross_val_score(clf, X, y, scoring='f1')
np.mean(cv_f1)

### Using scoring method for Regression model

In [38]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import fetch_california_housing

california_df = fetch_california_housing()
california = pd.DataFrame(california_df['data'], columns=california_df['feature_names'])
california['target'] = pd.Series(california_df['target'])

X = california.drop('target', axis=1)
y = california['target']

model = RandomForestRegressor()

In [39]:
np.random.seed(42)
cv_r2 = cross_val_score(model, X, y, scoring=None)
cv_r2

array([0.51688816, 0.70280719, 0.74200859, 0.61659773, 0.68247339])

In [None]:
np.random.seed(42)
cv_r2 = cross_val_score(model, X, y, scoring='r2')
cv_r2

In [None]:
cv_mae = cross_val_score(model, X, y, scoring='neg_mean_absolute_error')
cv_mae

In [None]:
cv_mse = cross_val_score(model, X, y, scoring='neg_mean_squared_error')
cv_mse

### 4.3 Using different evaluation metrics as Scikit-Learn functions
**Classification evaluation functions**

In [42]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

np.random.seed(42)

X = heart_disease.drop('target', axis=1)
y = heart_disease['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Make some predictions
y_preds = clf.predict(X_test)

# Evaluate the classifier
print('Classifier metrics on the test set')
print(f'Accuracy: {accuracy_score(y_test, y_preds)*100:.2f}%')
print(f'Precision: {precision_score(y_test, y_preds)}')
print(f'Recall: {recall_score(y_test, y_preds)}')
print(f'F1: {f1_score(y_test, y_preds)}')

Classifier metrics on the test set
Accuracy: 85.25%
Precision: 0.8484848484848485
Recall: 0.875
F1: 0.8615384615384616


**Regression evaluation functions**

In [41]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

X = california.drop('target', axis=1)
y = california['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = RandomForestRegressor()
model.fit(X_train, y_train)
y_preds = model.predict(X_test)

print('Regression model metrics on the test set')
print(f'R2: {r2_score(y_test, y_preds)}')
print(f'MAE: {mean_absolute_error(y_test, y_preds)}')
print(f'MSE: {mean_squared_error(y_test, y_preds)}')

Regression model metrics on the test set
R2: 0.8094425973856642
MAE: 0.3268376173691862
MSE: 0.260092917795195


## 5. Improving a model  
First predictions = baseline predictions  
First model = baseline model  

From a data perspective:
* Could we collect more data?
* Could we improve our data?

From a model perspective:
* Is there a better model we could use?
* Could we improve the current model?

Parameters = model find these patterns in data  
Hyperparameters = settings on a model you can adjust

In [None]:
# Finding hyperparameters
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.get_params()

**Three ways to adjust hyperparameters:**
1. By hand
2. Randomly with RandomSearchCV
3. Exhaustively with GridSearchCV

### 5.1 Tuning hyperparameters by hand

<img src='tuning_parameters.png'/>

In [None]:
clf.get_params()

**We're going to try and adjust**
1. max_depth
2. max_features
3. min_samples_leaf
4. min_samples_split
5. n_estimators

if you're unsure about these parameters, you can always refer to its documentation

In [33]:
def evaluate_preds(y_true, y_preds):
    """
    Performs evaluation comparison on y_true labels vs. y_preds labels on a classification model
    """
    accuracy = accuracy_score(y_true, y_preds)
    precision = precision_score(y_true, y_preds)
    recall = recall_score(y_true, y_preds)
    f1 = f1_score(y_true, y_preds)
    metric_dict = {'accuracy': round(accuracy, 2),
                  'precision': round(precision, 2),
                  'recall': round(recall, 2),
                  'f1_score': round(f1, 2)}
    print(f'Accuracy: {accuracy * 100:.2f}%')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1_score: {f1:.2f}')
    return metric_dict

In [34]:
# Let's split the data into 3 sets
X = heart_disease.drop('target', axis=1)
y = heart_disease['target']
X_train, tmp_x, y_train, tmp_y = train_test_split(X, y, test_size=0.3)
X_valid, X_test, y_valid, y_test = train_test_split(tmp_x, tmp_y, test_size=0.5)
len(X_train), len(X_valid), len(X_test)

(212, 45, 46)

In [44]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

#Make baseline predictions
y_preds = clf.predict(X_valid)

#evaluate the classifier on the validation set
baseline_metrics = evaluate_preds(y_valid, y_preds)
baseline_metrics

Accuracy: 97.78%
Precision: 1.00
Recall: 0.96
F1_score: 0.98


{'accuracy': 0.98, 'precision': 1.0, 'recall': 0.96, 'f1_score': 0.98}

In [None]:
np.random.seed(42)

#Create a second classifier with different hyperparameters
clf_2 = RandomForestClassifier(n_estimators=100)
clf_2.fit(X_train, y_train)
y_preds_2 = clf_2.predict(X_valid)
clf_2_metrics = evaluate_preds(y_valid, y_preds_2)
clf_2_metrics

Oops! We made it worse

Just like the above we can manually set the hyperparameters and check it. But this is tedious, and scikit-learn have built-in function that simplifies this process

### 5.2 Hyperparameter tuning with RandomizedSearchCV

In [45]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split

grid = {'n_estimators': [10, 100, 200, 500, 1000, 1200],
        'max_depth': [None, 5, 10, 20, 30],
        'max_features': ['auto', 'sqrt'],
        'min_samples_split': [2, 4, 6],
        'min_samples_leaf': [1, 2, 4]}

np.random.seed(42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = RandomForestClassifier(n_jobs=1) #n_jobs parameter is how much computer process you want to dedicate to the classifier

#Setup RandomizedSearchCV
rs_clf = RandomizedSearchCV(estimator=clf,
                            param_distributions=grid,
                            n_iter=10, #number of models to try
                            cv=5, #no need to create cross-validation set by giving this parameter
                            verbose=2)

#Fit the RandomizedSearchCV version of clf
rs_clf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1200; total time=   2.8s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1200; total time=   3.0s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1200; total time=   3.1s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1200; total time=   4.4s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1200; total time=   2.8s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.0s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.0s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=100

In [None]:
rs_clf.best_params_

In [None]:
# Make predictions with the best hyperparameters
rs_y_preds = rs_clf.predict(X_test)

# Evaluate the predictions
rs_metrics = evaluate_preds(y_test, rs_y_preds)

### 5.3 Hyperparameter tuning with GridSearchCV

The main difference between RandomizedSearchCv and GridSearchCV is that GridSeachCV will go over every single unique combinations of the hyperparameter options

In [46]:
# Since it goes over every single unique combinations we will minimize the size of the grid to save some time and computation
grid_2 = {'n_estimators': [10, 100, 200],
        'max_depth': [5],
        'max_features': ['auto', 'sqrt'],
        'min_samples_split': [4],
        'min_samples_leaf': [1, 2]}

from sklearn.model_selection import GridSearchCV

np.random.seed(42)

#Setup GridSearchCV
gs_clf = GridSearchCV(estimator=clf,
                            param_grid=grid_2,
                            cv=5, #no need to create cross-validation set by giving this parameter
                            verbose=2)

#Fit the GridSearchCV version of clf
gs_clf.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=10; total time=   0.0s
[CV] END max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=10; total time=   0.0s
[CV] END max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=10; total time=   0.0s
[CV] END max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=10; total time=   0.0s
[CV] END max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=10; total time=   0.0s
[CV] END max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time=   0.0s
[CV] END max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time=   0.0s
[CV] END max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time=

In [None]:
gs_clf.best_params_

In [None]:
gs_y_preds = gs_clf.predict(X_test)

gs_metrics = evaluate_preds(y_test, gs_y_preds)

It actually got better

Let's compare our different model metrics

In [None]:
compare_metrics = pd.DataFrame({'baseline': baseline_metrics,
                                'clf_2': clf_2_metrics,
                                'random search': rs_metrics,
                                'grid search': gs_metrics})
compare_metrics

In [None]:
compare_metrics.plot.bar(figsize=(10, 8))
plt.show()

## 6. Saving and loading trained machine learning model.
Two ways:
1. With Python's `pickle` module
2. With the `joblib` module

**Pickle**

In [47]:
import pickle

# Save an existing model to a file
pickle.dump(gs_clf, open('gs_random_forest_model_1.pkl', 'wb')) 

In [48]:
#Load a saved model
load_pickle_model = pickle.load(open('gs_random_forest_model_1.pkl', 'rb'))

In [50]:
#Make some predictions
pickle_y_preds = load_pickle_model.predict(X_test)

In [51]:
evaluate_preds(y_test, pickle_y_preds)

Accuracy: 86.89%
Precision: 0.90
Recall: 0.84
F1_score: 0.87


{'accuracy': 0.87, 'precision': 0.9, 'recall': 0.84, 'f1_score': 0.87}

**Joblib**

In [54]:
from joblib import dump, load

#Save model to file
dump(gs_clf, filename='gs_random_forest_model_1.joblib')

['gs_random_forest_model_1.joblib']

In [55]:
#Import saved joblib model
loaded_job_model = load(filename='gs_random_forest_model_1.joblib')

In [56]:
#Make predictions
joblib_y_preds = loaded_job_model.predict(X_test)

In [57]:
evaluate_preds(y_test, joblib_y_preds)

Accuracy: 86.89%
Precision: 0.90
Recall: 0.84
F1_score: 0.87


{'accuracy': 0.87, 'precision': 0.9, 'recall': 0.84, 'f1_score': 0.87}

if your model is large, prefer joblib over pickle

## 7. Putting it all together

In [59]:
data = pd.read_csv('car-sales-extended-missing-data.csv')
data.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [61]:
data.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [62]:
data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

Steps we want to do (all in one cell):
1. Fill missing data
2. Convert data to numbers
3. Build a model on the data

In [63]:
#Getting data ready
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

#Modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

#Setup random seed
import numpy as np
np.random.seed(42)

#Import data and drop rows with missing labels
data = pd.read_csv('car-sales-extended-missing-data.csv')
data.dropna(subset=['Price'], inplace=True)

#Define different features and transformer pipeline
categorical_features = ['Make', 'Colour']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
     ('onehot', OneHotEncoder(handle_unknown='ignore'))])
door_feature = ['Doors']
door_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=4))])
numeric_features = ['Odometer (KM)']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))])

#Setup preprocessing steps (fill missing values, then convert to numbers)
preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features),
    ('door', door_transformer, door_feature),
    ('num', numeric_transformer, numeric_features)])

#Creating a preprocessing and modelling pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor())])

#Split data
X = data.drop('Price', axis=1)
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#Fit and score the model
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.22188417408787875

In [66]:
# Use GridSearchCV with our regression pipeline
pipe_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'model__n_estimators': [100, 1000],
    'model__max_depth': [None, 5],
    'model__max_features': ['sqrt'],
    'model__min_samples_split': [2, 4]}

gs_model = GridSearchCV(model, pipe_grid, cv=5, verbose=2)
gs_model.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_sampl

In [67]:
gs_model.score(X_test, y_test)

0.2848784564026805

Yes, we improved the model a little bit

In [69]:
from joblib import dump, load
dump(gs_model, filename='gs_model_trained_2.joblib')

['gs_model_trained_2.joblib']