## A simple scikit-learn classification workflow:
1. Getting the data ready
2. Picking a model
3. Fitting the model to the data and making predictions
4. Evaluating
5. Experimenting/tuning
6. Saving

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

## 1. Getting the data ready 

In [2]:
# Load the heart-disease table from a CSV file located relative to the
# notebook's working directory.
hd = pd.read_csv("heart_disease.csv")

In [3]:
# Display the loaded DataFrame as the cell output.
hd

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [4]:
# Features: every column of the table except the label.
x = hd.drop(columns="target")

# Labels: the "target" column on its own.
y = hd.loc[:, "target"]

In [5]:
from sklearn.model_selection import train_test_split

# No test_size is passed, so the library's default split proportion is used.
x_train, x_test, y_train, y_test = train_test_split(x, y)

# Show the shape of each of the four pieces as the cell output.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((227, 13), (76, 13), (227,), (76,))

## 2. Choosing the estimator
https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html


In [6]:
from sklearn.ensemble import RandomForestClassifier
# Create a random-forest classifier; no arguments are passed, so every
# hyperparameter keeps its library default.
model = RandomForestClassifier()

## 3. Fitting the model to the data and using it to make predictions



In [8]:
# Fit the classifier on the training split. The fitted estimator is the last
# expression in the cell, so its repr is shown as the output.
model.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [9]:
# Predict a label for every row of the held-out test features.
y_preds = model.predict(x_test)

In [10]:
# Display the array of predicted labels.
y_preds

array([1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1], dtype=int64)

In [12]:
# Display the test features; the row index shows which original rows the
# split placed in the test set.
x_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
11,48,0,2,130,275,0,1,139,0,0.2,2,0,2
161,55,0,1,132,342,0,1,166,0,1.2,2,0,2
46,44,1,2,140,235,0,0,180,0,0.0,2,0,2
186,60,1,0,130,253,0,1,144,1,1.4,2,1,3
199,65,1,0,110,248,0,0,158,0,0.6,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,35,1,0,126,282,0,0,156,1,0.0,2,0,3
76,51,1,2,125,245,1,0,166,0,2.4,1,0,2
122,41,0,2,112,268,0,0,172,1,0.0,2,0,2
261,52,1,0,112,230,0,1,160,0,0.0,2,1,2


In [16]:
# Make a prediction on a single sample. predict() expects 2-D input, so the
# row is taken with a list selector (iloc[[0]]), which keeps it as a one-row
# DataFrame with its column names. The row is chosen by position because the
# train/test split is random: a fixed index label such as 76 is not
# guaranteed to be in x_test when the notebook is re-run.
model.predict(x_test.iloc[[0]])

array([1], dtype=int64)

In [18]:
# Score on the training set, i.e. the same rows the model was fitted on.
model.score(x_train, y_train)

1.0

In [20]:
# Score on the test set, i.e. rows that were held out of fitting (unseen).
model.score(x_test, y_test)

0.7894736842105263

In [22]:
## 5. Experimenting
# Refit the forest with 10 to 49 trees and report the test-set accuracy of
# each fit. `model` is rebound on every pass, so the last fit is what remains.
np.random.seed(42)
for n_trees in range(10, 50):
    print(f"Trying model with {n_trees} estimators...")
    model = RandomForestClassifier(n_estimators=n_trees)
    model.fit(x_train, y_train)
    test_accuracy = model.score(x_test, y_test)
    print(f"Model accuracy on test set: {test_accuracy}")
    print("")

Trying model with 10 estimators...
Model accuracy on test set: 0.8421052631578947

Trying model with 11 estimators...
Model accuracy on test set: 0.7631578947368421

Trying model with 12 estimators...
Model accuracy on test set: 0.7894736842105263

Trying model with 13 estimators...
Model accuracy on test set: 0.8026315789473685

Trying model with 14 estimators...
Model accuracy on test set: 0.8026315789473685

Trying model with 15 estimators...
Model accuracy on test set: 0.8157894736842105

Trying model with 16 estimators...
Model accuracy on test set: 0.8026315789473685

Trying model with 17 estimators...
Model accuracy on test set: 0.7631578947368421

Trying model with 18 estimators...
Model accuracy on test set: 0.8157894736842105

Trying model with 19 estimators...
Model accuracy on test set: 0.8157894736842105

Trying model with 20 estimators...
Model accuracy on test set: 0.7894736842105263

Trying model with 21 estimators...
Model accuracy on test set: 0.7894736842105263

Tryi

In [28]:
# Try different numbers of estimators, reporting both the single-split test
# accuracy and the mean 5-fold cross-validation accuracy for each.
from sklearn.model_selection import cross_val_score

np.random.seed(42)
for n_trees in range(10, 50):
    print(f"Trying model with {n_trees} estimators...")
    model = RandomForestClassifier(n_estimators=n_trees)
    model.fit(x_train, y_train)
    print(f"model accuracy on test set: {model.score(x_test, y_test)}")
    # Cross-validation runs on the full x, y rather than on the train split.
    cv_percent = np.mean(cross_val_score(model, x, y, cv=5)) * 100
    print(f"Cross-validation score: {cv_percent} %")
    print("")

Trying model with 10 estimators...
model accuracy on test set: 0.8421052631578947
Cross-validation score: 78.53551912568305 %

Trying model with 11 estimators...
model accuracy on test set: 0.8421052631578947
Cross-validation score: 81.8415300546448 %

Trying model with 12 estimators...
model accuracy on test set: 0.8421052631578947
Cross-validation score: 79.85792349726776 %

Trying model with 13 estimators...
model accuracy on test set: 0.8026315789473685
Cross-validation score: 81.16939890710381 %

Trying model with 14 estimators...
model accuracy on test set: 0.8289473684210527
Cross-validation score: 80.18579234972677 %

Trying model with 15 estimators...
model accuracy on test set: 0.8552631578947368
Cross-validation score: 81.8415300546448 %

Trying model with 16 estimators...
model accuracy on test set: 0.8026315789473685
Cross-validation score: 82.1584699453552 %

Trying model with 17 estimators...
model accuracy on test set: 0.7368421052631579
Cross-validation score: 78.87431

In [1]:
## 4.2 Evaluating a model using the scoring parameter


In [5]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split

# Load the data and separate the features from the "target" label.
heart_disease = pd.read_csv("heart_disease.csv")
x = heart_disease.drop(columns="target")
y = heart_disease.loc[:, "target"]

# Seed NumPy's global RNG before the first random step (the split).
np.random.seed(42)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Fit a default random forest. The fit() call is the last expression, so its
# return value is what the cell displays.
clf = RandomForestClassifier()
clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [6]:
# Score the fitted classifier on the held-out test split.
clf.score(x_test,y_test)

0.8524590163934426

In [7]:
# Cross-validate clf on the full x, y. No cv argument is passed, so the
# library's default number of folds is used; one score is returned per fold.
cross_val_score(clf,x,y)

array([0.81967213, 0.86885246, 0.81967213, 0.78333333, 0.76666667])

In [9]:
np.random.seed(42)

# Score from the single train/test split
clf_single_score = clf.score(x_test, y_test)

# Average of the scores from 5-fold cross-validation on the full data
clf_cross_val_score = cross_val_score(clf, x, y, cv=5).mean()

# Show both numbers side by side
(clf_single_score, clf_cross_val_score)

(0.8524590163934426, 0.8248087431693989)

##  4.2.1 Classification model evaluation metrics 


1. Accuracy
2. Area under ROC curve
3. Confusion matrix
4. Classification report

In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

x = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

clf = RandomForestClassifier(n_estimators =100)
cross_val_score= cross_val_score(clf,x,y,cv=5)


In [16]:
np.mean(cross_val_score)

0.8248087431693989

In [19]:
print(f"Heart Disease Classifier Cross-Validated Accuray: {np.mean(cross_val_score)*100:.2f}%")

Heart Disease Classifier Cross-Validated Accuray: 82.48%
