# Introduction to SciKit-Learn


What is covered:

1. Scikit-learn Workflow
2. GEtting the Data ready
3. Choosing the right model/algorithm
4. Fitting the Data to the model
5. Evaluating the Model
6. Improving the Model


## 1. Scikit-learn Workflow

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 2. Getting the Data

In [3]:

heart_disease = pd.read_csv('./sample_data/heart-disease.csv')
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [4]:
 # --> Creating X (features Column)  and  Y(labels) 

X = heart_disease.drop("target",axis=1)

y = heart_disease["target"] 

## 3. Selecting the right model

In [5]:
# Using the random forest classifier
from sklearn.ensemble import RandomForestClassifier

In [6]:
clf = RandomForestClassifier()

# We will keep the default parameters
clf.get_params()  # These are hyper parameters

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

## 4. Fitting the Data

In [7]:
from sklearn.model_selection import train_test_split


In [8]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)  # test_size is 20%

In [9]:
clf.fit(X_train,y_train)

In [10]:
X_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
262,53,1,0,123,282,0,1,95,1,2.0,1,2,3
250,51,1,0,140,298,0,1,122,1,4.2,1,3,3
126,47,1,0,112,204,0,1,143,0,0.1,2,0,2
181,65,0,0,150,225,0,0,114,0,1.0,1,3,3
143,67,0,0,106,223,0,1,142,0,0.3,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16,58,0,2,120,340,0,1,172,0,0.0,2,0,2
40,51,0,2,140,308,0,0,142,0,1.5,2,1,2
80,41,1,2,112,250,0,1,179,0,0.0,2,0,2
203,68,1,2,180,274,1,0,150,1,1.6,1,0,3


In [11]:
# make a prediction
y_pred = clf.predict(X_test)
y_pred

array([0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1], dtype=int64)

## 5. Evaluating the Data

In [12]:
clf.score(X_train,y_train)  #it will work perfect 

1.0

In [13]:
clf.score(X_test,y_test)

0.8524590163934426

In [14]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score 

In [15]:
 # --> Classification report
print(classification_report(y_pred=y_pred,y_true=y_test))

              precision    recall  f1-score   support

           0       0.96      0.75      0.84        32
           1       0.78      0.97      0.86        29

    accuracy                           0.85        61
   macro avg       0.87      0.86      0.85        61
weighted avg       0.87      0.85      0.85        61



In [16]:
 # --> Confusion matrix
print(confusion_matrix(y_true=y_test,y_pred=y_pred))

[[24  8]
 [ 1 28]]


### 5. Improve the model

In [20]:
# Trying with different hyperparameters
best_n_estimator = 0
current_best_score = 0
for i in range(10,100,1):
    print(f"Trying thr model with {i}th estimator")
    clf = RandomForestClassifier(
        n_estimators=i
    )
    clf.fit(X_train,y_train)
    print(f"The Accuracy = {clf.score(X_test,y_test)}")
    if(clf.score(X_test,y_test) > current_best_score):
        current_best_score = clf.score(X_test,y_test)
        best_n_estimator = i

print(f"Best n_estimator is {best_n_estimator} with {current_best_score*100}% accuracy")

Trying thr model with 10th estimator
The Accuracy = 0.8688524590163934
Trying thr model with 11th estimator
The Accuracy = 0.819672131147541
Trying thr model with 12th estimator
The Accuracy = 0.7868852459016393
Trying thr model with 13th estimator
The Accuracy = 0.819672131147541
Trying thr model with 14th estimator
The Accuracy = 0.819672131147541
Trying thr model with 15th estimator
The Accuracy = 0.8360655737704918
Trying thr model with 16th estimator
The Accuracy = 0.819672131147541
Trying thr model with 17th estimator
The Accuracy = 0.8688524590163934
Trying thr model with 18th estimator
The Accuracy = 0.8032786885245902
Trying thr model with 19th estimator
The Accuracy = 0.819672131147541
Trying thr model with 20th estimator
The Accuracy = 0.8360655737704918
Trying thr model with 21th estimator
The Accuracy = 0.8524590163934426
Trying thr model with 22th estimator
The Accuracy = 0.8524590163934426
Trying thr model with 23th estimator
The Accuracy = 0.819672131147541
Trying thr m

### 6. Saving the model

In [21]:
import pickle

pickle.dump(clf,open("first_model_random_forest.pkl","wb"))

In [22]:
#  load the model

loaded_model = pickle.load(open("./first_model_random_forest.pkl","rb"))

loaded_model.score(X_test,y_test)

0.819672131147541