# Introduction to Scikit-Learn (sklearn)
This notebook demonstrates some of the most useful functions of the Scikit-Learn library. 
Scope of the exercise:

0. An end-to-end Scikit-Learn workflow.
1. Getting the data ready.
2. Choose the right model/algorithm for our problem.
3. Fit the model/algorithm and use it to make predictions on our data.
4. Evaluating the model.
5. Improve the model.
6. Save and load the trained model.
7. Putting it all together.

In [80]:
# 1. Get the data ready
import pandas as pd
import numpy as np

# Read the heart-disease table from a CSV file addressed relative to the
# notebook's working directory, then display it as the cell output.
heart_disease = pd.read_csv("original.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [5]:
# Features matrix: every column except the label
X = heart_disease.drop(columns="target")
# Labels: the "target" column on its own
y = heart_disease["target"]

In [6]:
# 2. Choose the right model and hyperparameters (the problem is classification)
from sklearn.ensemble import RandomForestClassifier

# Only n_estimators is set explicitly; all other hyperparameters stay at
# their defaults.
clf = RandomForestClassifier(n_estimators=100)

# Display the full hyperparameter dictionary of the estimator
clf.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [7]:
# 3. Fit the model to the training data
from sklearn.model_selection import train_test_split

# Seed NumPy's global random generator before splitting so the 80/20 split is
# identical on every "Restart & Run All". Without a seed each run draws a
# different test set and the reported scores cannot be reproduced. The same
# seeding pattern is used by the later cells of this notebook.
np.random.seed(42)

# Hold out 20% of the rows for testing; the rest is used for training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [8]:
# Train the classifier on the training split. fit() returns the estimator
# itself, so the cell output is the fitted model's repr.
clf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [9]:
# Make predictions on the held-out test features
import numpy as np

# One predicted class label per row of X_test
y_preds = clf.predict(X_test)
y_preds

array([0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0], dtype=int64)

In [10]:
# Show the true labels of the test split for comparison with y_preds
y_test

300    0
27     1
222    0
171    0
116    1
      ..
158    1
302    0
32     1
252    0
232    0
Name: target, Length: 61, dtype: int64

In [11]:
# 4. Evaluate the model
# score() on a classifier reports accuracy on the given data; the value
# matches accuracy_score(y_test, y_preds) computed further down.
clf.score(X_test,y_test)

0.7704918032786885

In [12]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Per-class precision, recall, F1 and support for the test predictions.
# The report is a formatted string, so print() is used to render its layout.
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.86      0.71      0.77        34
           1       0.70      0.85      0.77        27

    accuracy                           0.77        61
   macro avg       0.78      0.78      0.77        61
weighted avg       0.79      0.77      0.77        61



In [13]:
# Confusion matrix of true vs. predicted labels: rows are the true classes,
# columns the predicted classes (row sums equal the per-class support).
confusion_matrix(y_test,y_preds)

array([[24, 10],
       [ 4, 23]], dtype=int64)

In [14]:
# Fraction of test samples predicted correctly (same value as clf.score above)
accuracy_score(y_test,y_preds)

0.7704918032786885

In [22]:
# 5. Improve the model: try forests of different sizes (10, 20, ..., 90 trees)
# Seed once up front so the sequence of fitted forests is repeatable.
np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators..")
    # Note: `clf` is rebound on every iteration, so after the loop it refers
    # to the last model trained (the 90-estimator forest).
    clf = RandomForestClassifier(n_estimators=i)
    clf.fit(X_train, y_train)
    print(f"Model accuracy on test set: {(clf.score(X_test,y_test))*100:.2f}%")
    print("")
    
    

Trying model with 10 estimators..
Model accuracy on test set: 86.89%

Trying model with 20 estimators..
Model accuracy on test set: 86.89%

Trying model with 30 estimators..
Model accuracy on test set: 88.52%

Trying model with 40 estimators..
Model accuracy on test set: 83.61%

Trying model with 50 estimators..
Model accuracy on test set: 85.25%

Trying model with 60 estimators..
Model accuracy on test set: 83.61%

Trying model with 70 estimators..
Model accuracy on test set: 88.52%

Trying model with 80 estimators..
Model accuracy on test set: 81.97%

Trying model with 90 estimators..
Model accuracy on test set: 85.25%



In [25]:
# 6. Save the model and load it

import pickle

# "wb" means write-binary mode. The `with` block guarantees the file handle is
# flushed and closed even if pickling fails; a bare open(...) passed straight
# to pickle.dump is never explicitly closed.
with open("random_forest_model1.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)

In [38]:
#"rb" mean read binary file
# Open the file in a `with` block so the handle is closed once the model has
# been read. NOTE: only unpickle files you created or trust -- pickle.load can
# execute arbitrary code embedded in the file.
with open("random_forest_model1.pkl", "rb") as model_file:
    load_model = pickle.load(model_file)

# Check that the reloaded model predicts on the test split and report its accuracy
y_preds1 = load_model.predict(X_test)
accuracy_score(y_test, y_preds1)

0.8524590163934426

In [15]:
# Inspect the training features produced by train_test_split
X_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
253,67,1,0,100,299,0,0,125,1,0.9,1,2,2
119,46,0,0,138,243,0,0,152,1,0.0,1,0,2
103,42,1,2,120,240,1,1,194,0,0.8,0,0,3
209,59,1,0,140,177,0,1,162,1,0.0,2,1,3
38,65,0,2,155,269,0,1,148,0,0.8,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
238,77,1,0,125,304,0,0,162,1,0.0,2,3,2
125,34,0,1,118,210,0,1,192,0,0.7,2,0,2
75,55,0,1,135,250,0,0,161,0,1.4,1,0,2
118,46,0,1,105,204,0,1,172,0,0.0,2,0,2


In [16]:
# Inspect the training labels; their index lines up with the rows of X_train
y_train

253    0
119    1
103    1
209    0
38     1
      ..
238    0
125    1
75     1
118    1
47     1
Name: target, Length: 242, dtype: int64

In [17]:
# Confirm the split returned the features as a pandas DataFrame
type(X_train)

pandas.core.frame.DataFrame

### Make sure it is numerical 

In [18]:
# Load the car-sales table, which mixes text and numeric columns
car_sales = pd.read_csv("car-sales-extended.csv")
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043
...,...,...,...,...,...
995,Toyota,Black,35820,4,32042
996,Nissan,White,155144,3,5716
997,Nissan,Blue,66604,4,31570
998,Honda,White,215883,4,4001


In [19]:
# Check each column's dtype: `object` columns (Make, Colour) hold non-numeric
# values and have to be encoded as numbers before a model can be fitted.
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [20]:
# Split into X & y

In [21]:
# Features are every column except the price; the price is the value to predict
X = car_sales.drop(columns="Price")
y = car_sales["Price"]

# 80/20 train/test split of the raw (not yet encoded) features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [30]:
from sklearn.ensemble import RandomForestRegressor

# Price is a continuous value, so a regressor is used instead of a classifier
model = RandomForestRegressor(n_estimators=100)

In [23]:
# Turn categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns to one-hot encode. "Doors" is stored as an integer but is treated
# as a category here.
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()

# Encode the listed columns; remainder="passthrough" keeps every other column
# unchanged in the output.
transformer = ColumnTransformer(
    [("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)
transformed_X = transformer.fit_transform(X)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [24]:
# Wrap the encoded array in a DataFrame purely to view it as a table
pd.DataFrame(transformed_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [31]:
# Refit the model, this time on the numerically encoded features
np.random.seed(42)  # fixed seed so the split and the forest are repeatable
X_train, X_test, y_train, y_test = train_test_split(
    transformed_X, y, test_size=0.2
)
# fit() returns the estimator, so the fitted model's repr is the cell output
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [32]:
# For a regressor, score() reports the coefficient of determination (R^2)
# of the predictions on the given data.
model.score(X_test,y_test)

0.3235867221569877

## What if there are missing values? 
1. Fill them with some value (known as imputation)
2. Remove the samples with missing values 


In [41]:
# Load the variant of the car-sales data that contains missing values
car_sales_missing = pd.read_csv("car-sales-extended-missing-data.csv")

In [42]:
# Display the frame that contains missing values
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [44]:
# Count the missing (NaN) entries in each column
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [45]:
# Fill the missing feature values column by column.
#
# Each result is assigned back to the column instead of calling
# fillna(..., inplace=True) on `car_sales_missing[col]`. The in-place call on
# a column selection is chained assignment: recent pandas versions warn about
# it, and with Copy-on-Write enabled it leaves the original frame unchanged.

# Filling the "Make" column with an explicit placeholder category
car_sales_missing["Make"] = car_sales_missing["Make"].fillna("missing")

# Filling the "Colour" column with an explicit placeholder category
car_sales_missing["Colour"] = car_sales_missing["Colour"].fillna("missing")

# Filling the "Odometer (KM)" column with the mean of its observed values
odometer_mean = car_sales_missing["Odometer (KM)"].mean()
car_sales_missing["Odometer (KM)"] = car_sales_missing["Odometer (KM)"].fillna(odometer_mean)

# Filling the "Doors" column with 4
# NOTE(review): presumably 4 is the most common door count -- confirm.
car_sales_missing["Doors"] = car_sales_missing["Doors"].fillna(4)

In [70]:
# Re-count the missing entries per column after filling
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [47]:
# Remove rows with missing Price values.
# NOTE: dropna() without `subset` drops a row when ANY column is NaN. It only
# targets Price here because the other columns were filled in the cell above.
# The frame is modified in place.
car_sales_missing.dropna(inplace=True)

In [71]:
# Confirm that no missing values remain in any column
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [49]:
# Number of rows left after dropping the rows with missing values
len(car_sales_missing)

950

In [73]:
# Split the cleaned frame into features and the price to predict
X = car_sales_missing.drop(columns="Price")
y = car_sales_missing["Price"]
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode the categorical columns; every other column passes through
# unchanged.
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    [("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)
transformed_X = transformer.fit_transform(X)
transformed_X

<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

In [52]:
# Seed NumPy's global random generator before splitting, as the earlier refit
# cell does, so the split and the fitted forest (and therefore the score
# below) are reproducible from run to run.
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)

# Refit the existing regressor on the cleaned, encoded training split;
# fit() returns the estimator, so its repr is the cell output.
model.fit(X_train, y_train)


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [53]:
# R^2 of the regressor on the held-out split of the cleaned data
model.score(X_test,y_test)

0.10372982918315198

## Choosing the right estimator/algorithm/model for our problem
Go to https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
Tips for classification/regression problems:
1. If you have structured data, use ensemble methods such as random forest
2. If you have unstructured data, use deep learning or transfer learning 

In [54]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [58]:
# Predicted prices for the test split, one value per row of X_test
y_preds1 = model.predict(X_test)
y_preds1

array([15176.89625   , 23160.89      , 22633.35      , 15564.34      ,
       10734.1       , 33379.34      , 14627.81      , 14451.335     ,
        8270.11      , 13981.70526557, 25586.9       , 16511.38      ,
       16273.99      ,  9193.04      ,  9996.83      , 24483.94      ,
        6839.39      , 21702.59      , 10606.54      , 21740.59      ,
       19898.81      , 20603.76      , 24003.04      , 13878.71      ,
       15022.48      , 16232.66      , 22154.71      , 11001.88      ,
       45154.01      , 18889.65      , 12517.38      , 12651.23      ,
       11370.26      , 26588.54      ,  8606.91      ,  9417.88      ,
       10980.94875   , 10639.09      , 14983.56      , 13330.61      ,
       16783.62      , 10608.03      , 11889.64517299, 41558.93      ,
       34184.67      , 14513.86      , 23442.1       , 22568.81      ,
       15392.8       , 14125.21      , 36400.98      , 13443.99      ,
       23980.73      , 23951.18      , 12464.19      , 10549.93      ,
      