In [1]:
from typing import TypeVar


T = TypeVar('T')

def pr(val : T , title:str|None = None , description: str|None = None) : # type: ignore
    if title != None :
        print(f" ---------------------- {title} ---------------------- ")
        if description != None :
            print(f" <-- {description} -->")
    print('type: ' , type(val))
    display(val)
    # return val


In [2]:
import numpy as np
import numpy.typing as npt
import pandas as pd
from typing import Any
import matplotlib.pyplot as plt
%matplotlib inline


In [3]:
what_we_are_covering = [
    "0. an end to end scikit learn workflow" , 
    "1. getting the data ready" , 
    "2. choose the right estimator/algorithm for our problem" , 
    "3. Fit the model/algorithm and use it to make predictions" ,
    "4. evaluate the model" , 
    "5. improve the model" ,
    "6. save and load a trained model" ,
    "7. putting all together"
]

# Scikit learn introduction

## 0- End to end workflow with scikit learn

In [4]:

dtypes :dict[str, Any] = {
    "age": np.int64,
    "sex": np.int8,
    "cp": np.int8,
    "trestbps": np.int64,
    "chol": np.int64,
    "fbs": np.int8,
    "restecg": np.int8,
    "thalach": np.int64,
    "exang": np.int8,
    "oldpeak": np.float64,
    "slope": np.int8,
    "ca": np.int64,
    "thal": np.int8,
    "target": np.int8,
}
heart_disease =  pd.read_csv('../data/heart-disease.csv' , dtype=dtypes ) # type: ignore
pr(heart_disease.head())
pr(heart_disease.dtypes)

type:  <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


type:  <class 'pandas.core.series.Series'>


age           int64
sex            int8
cp             int8
trestbps      int64
chol          int64
fbs            int8
restecg        int8
thalach       int64
exang          int8
oldpeak     float64
slope          int8
ca            int64
thal           int8
target         int8
dtype: object

1- get the data ready

In [5]:
# create x (features matrix)
x = heart_disease.drop('target' , axis=1 )
pr(x.head())
# create y (label matrix)
y = heart_disease['target']
pr(y.head())

type:  <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


type:  <class 'pandas.core.series.Series'>


0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int8

2- choose the right algorithm/estimator/model for your problem.

In [6]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
# we will keep the default hyper parameters
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

3- fit the model to the training data

In [7]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(x , y , test_size=0.2)
clf.fit(x_train , y_train)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


3- make a prediction 


In [8]:
y_preds = clf.predict(X=x_test)
pr(y_preds)

type:  <class 'numpy.ndarray'>


array([0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1], dtype=int8)

4- Evaluate the model

In [9]:
clf.score(x_test , y_test)

0.7213114754098361

In [10]:
from sklearn.metrics import classification_report , confusion_matrix , accuracy_score
print(classification_report(y_test , y_preds))

              precision    recall  f1-score   support

           0       0.62      0.70      0.65        23
           1       0.80      0.74      0.77        38

    accuracy                           0.72        61
   macro avg       0.71      0.72      0.71        61
weighted avg       0.73      0.72      0.72        61



In [11]:
confusion_matrix(y_test , y_preds)

array([[16,  7],
       [10, 28]])

In [12]:
accuracy_score(y_test , y_preds)

0.7213114754098361

5- improve model

In [13]:
# try different amount of the estimators
np.random.seed(42)
for i in range(10,100,10):
    print(f"Trying a model with {i} estimators...")
    clf = RandomForestClassifier(n_estimators=i)
    clf.fit(x_train , y_train)
    print(f"Model accuracy on test set: {clf.score(x_test, y_test)*100}")
    print('-------------------------')

Trying a model with 10 estimators...
Model accuracy on test set: 75.40983606557377
-------------------------
Trying a model with 20 estimators...
Model accuracy on test set: 78.68852459016394
-------------------------
Trying a model with 30 estimators...
Model accuracy on test set: 70.49180327868852
-------------------------
Trying a model with 40 estimators...
Model accuracy on test set: 67.21311475409836
-------------------------
Trying a model with 50 estimators...
Model accuracy on test set: 73.77049180327869
-------------------------
Trying a model with 60 estimators...
Model accuracy on test set: 73.77049180327869
-------------------------
Trying a model with 70 estimators...
Model accuracy on test set: 75.40983606557377
-------------------------
Trying a model with 80 estimators...
Model accuracy on test set: 77.04918032786885
-------------------------
Trying a model with 90 estimators...
Model accuracy on test set: 70.49180327868852
-------------------------


6- save and model and load it

In [14]:
import pickle
import joblib
# pickle.dump(clf , open('../data/random_forest_model_1.pkl' , 'wb'))
joblib.dump(clf , open('../data/random_forest_model_1.pkl' , 'wb'))

In [15]:
loaded_model = joblib.load(open('../data/random_forest_model_1.pkl' , 'rb'))
loaded_model.score(x_test, y_test)

0.7049180327868853

# getting are data ready to be used with machine learning
There main things we have to do
- Split the data into features and labels usually (`x` and `y`)
- Filling (also called imputing) or discarding missing values
- converting non numerical values to numerical values (aka feature encoding)

In [28]:
from sklearn.model_selection import train_test_split
pr(heart_disease.sample() , 'heart disease data')
x = heart_disease.drop('target' , axis=1)
pr(x.sample() , 'x-train data')
y = heart_disease['target']
pr(y.sample() , 'y-train data')
pr('' , "let's split our data into training and test data sets")
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.2)
pr(x_train.sample() , 'x_train')
pr(x_test.sample() , 'x_test')
pr(y_train.sample() , 'y_train')
pr(y_test.sample() , 'y_test')


 ---------------------- heart disease data ---------------------- 
type:  <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
53,44,0,2,108,141,0,1,175,0,0.6,1,0,2,1


 ---------------------- x-train data ---------------------- 
type:  <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
14,58,0,3,150,283,1,0,162,0,1.0,2,0,2


 ---------------------- y-train data ---------------------- 
type:  <class 'pandas.core.series.Series'>


107    1
Name: target, dtype: int8

 ---------------------- let's split our data into training and test data sets ---------------------- 
type:  <class 'str'>


''

 ---------------------- x_train ---------------------- 
type:  <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
275,52,1,0,125,212,0,1,168,0,1.0,2,2,3


 ---------------------- x_test ---------------------- 
type:  <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
273,58,1,0,100,234,0,1,156,0,0.1,2,1,3


 ---------------------- y_train ---------------------- 
type:  <class 'pandas.core.series.Series'>


49    1
Name: target, dtype: int8

 ---------------------- y_test ---------------------- 
type:  <class 'pandas.core.series.Series'>


209    0
Name: target, dtype: int8