## Introduction to Scikit-Learn

## 0. An end-to-end Scikit-Learn workflow

In [2]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [64]:
# 1. Get the data ready
heart_disease = pd.read_csv("data/heart-disease.csv")
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [11]:
# Create X (features matrix)
X = heart_disease.drop("target", axis=1)

# Create Y (labels)
y = heart_disease["target"]

In [50]:
# 2. Choose the right model and hyperparameters
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
# We'll keep the default hyperparameters
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [51]:
# 3. Fit the model to the training data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [62]:
# Fit the model - the model finding patterns in the training data
clf.fit(X_train, y_train);

In [53]:
# make a prediction
y_preds = clf.predict(X_test)
y_preds

array([1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1], dtype=int64)

In [54]:
# 4. Evaluate the model on the training and test data
clf.score(X_train, y_train)

1.0

In [55]:
clf.score(X_test, y_test)

0.7540983606557377

In [56]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.79      0.71      0.75        31
           1       0.73      0.80      0.76        30

    accuracy                           0.75        61
   macro avg       0.76      0.75      0.75        61
weighted avg       0.76      0.75      0.75        61



In [57]:
confusion_matrix(y_test, y_preds)

array([[22,  9],
       [ 6, 24]], dtype=int64)

In [58]:
accuracy_score(y_test, y_preds)

0.7540983606557377

In [59]:
# 5. Improve a model
# Try different amount of estimators
np.random.seed(0)
for i in range(10,100,10):
    print(f"Trying model with {i} estimators")
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f"Model accuracy on test set: {clf.score(X_test, y_test) * 100:.2f}%")
    print("")

Trying model with 10 estimators
Model accuracy on test set: 75.41%

Trying model with 20 estimators
Model accuracy on test set: 80.33%

Trying model with 30 estimators
Model accuracy on test set: 81.97%

Trying model with 40 estimators
Model accuracy on test set: 80.33%

Trying model with 50 estimators
Model accuracy on test set: 81.97%

Trying model with 60 estimators
Model accuracy on test set: 80.33%

Trying model with 70 estimators
Model accuracy on test set: 77.05%

Trying model with 80 estimators
Model accuracy on test set: 80.33%

Trying model with 90 estimators
Model accuracy on test set: 81.97%



In [60]:
# 6. Save a model and load it 
import pickle
pickle.dump(clf, open("random_forest_model_1.pkl", "wb"))

In [61]:
loaded_model = pickle.load(open("random_forest_model_1.pkl", "rb"))
loaded_model.score(X_test, y_test)

0.819672131147541

## 1. Getting our data ready to be used with machine learning
Three main things we have to do:
    
        1. Split the data into features and labels(usually `X` and `y`)
        2. Filling (also called imputing) or disregaring missing values
        3. Converting non-numerical values to numerical values (also called feature encoding)

In [65]:
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [72]:
X = heart_disease.drop("target", axis=1)
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [67]:
y = heart_disease["target"]
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [69]:
# Split the data into training data and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [70]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

## 1.1  Make sure all data is numerical

In [2]:
car_sales = pd.read_csv("data/car-sales-extended.csv")
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [3]:
len(car_sales)

1000

In [4]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [5]:
# Split into X and y
X = car_sales.drop("Price", axis=1)
y = car_sales["Price"]

# Split into training and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
# Build machine learning model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
# model.fit(X_train, y_train)
# model.score(X_test, y_test)

In [7]:
X.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3


In [8]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features =["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                                 remainder = "passthrough")
transformed_X = transformer.fit_transform(X)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [9]:
pd.DataFrame(transformed_X)[:5]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0


In [18]:
dummies = pd.get_dummies(car_sales[["Make", "Doors", "Colour"]])
dummies[:5]

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0


In [11]:
# Refit the model
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)
model.fit(X_train, y_train);

In [12]:
# Evaluate
model.score(X_test, y_test)

0.3235867221569877

## 1.2 What if there are missing values?

1. Fill them with some value (also known as imputation)
2. Remove the samples with missing data altogether

In [2]:
# Import the data
car_sales_missing = pd.read_csv("data/car-sales-missing-data.csv")
car_sales_missing.head(12)

Unnamed: 0,Make,Colour,Odometer,Doors,Price
0,Toyota,White,150043.0,4.0,"$4,000"
1,Honda,Red,87899.0,4.0,"$5,000"
2,Toyota,Blue,,3.0,"$7,000"
3,BMW,Black,11179.0,5.0,"$22,000"
4,Nissan,White,213095.0,4.0,"$3,500"
5,Toyota,Green,,4.0,"$4,500"
6,Honda,,,4.0,"$7,500"
7,Honda,Blue,,4.0,
8,Toyota,White,60000.0,,
9,,White,31600.0,4.0,"$9,700"


In [3]:
car_sales_missing.isna().sum()

Make        1
Colour      1
Odometer    4
Doors       1
Price       2
dtype: int64

In [4]:
car_sales_missing["Odometer"].value_counts()

31600.0     1
213095.0    1
60000.0     1
150043.0    1
11179.0     1
87899.0     1
Name: Odometer, dtype: int64

In [5]:
# Create X and y
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [6]:
# Convert data to numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features =["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                                 remainder = "passthrough")
transformed_X = transformer.fit_transform(car_sales_missing)
transformed_X

ValueError: For a sparse output, all columns should be a numeric or convertible to a numeric.

In [None]:
pd.DataFrame(transformed_X)

### Option 1: Fill missing data with Pandas

In [None]:
# Fill the "Make" column 
car_sales_missing["Make"].fillna("missing", inplace = True)

# Fill the "Colour" column
car_sales_missing["Colour"].fillna("missing", inplace = True)

# Fill the "Odometer (KM)" column 
car_sales_missing["Odometer"].fillna(car_sales_missing["Odometer"].mean(), inplace = True)

# Fill the "Doors" column
car_sales_missing["Doors"].fillna(4, inplace = True)

In [None]:
car_sales_missing.isna().sum()

In [None]:
car_sales_missing["Odometer"] = car_sales_missing["Odometer"].round(2)

In [None]:
car_sales_missing.head(12)

In [None]:
# Remove rows with missing value
car_sales_missing.dropna(inplace=True)

In [None]:
car_sales_missing.head(12)

In [None]:
car_sales_missing.isna().sum()

In [None]:
len(car_sales_missing)

In [7]:
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [8]:
# Convert data to numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features =["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                                 remainder = "passthrough")
transformed_X = transformer.fit_transform(car_sales_missing)
transformed_X

ValueError: For a sparse output, all columns should be a numeric or convertible to a numeric.

In [9]:
pd.DataFrame(transformed_X)

NameError: name 'transformed_X' is not defined

### Option 2: Fill missing values with Scikit-Learn

In [10]:
car_sales_missing = pd.read_csv("data/car-sales-extended-missing-data.csv")
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [11]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [12]:
# Drop the rows with no labels
car_sales_missing.dropna(subset=["Price"], inplace=True)
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [13]:
from sklearn.model_selection import train_test_split

# Split into X and y
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

# Split data into train and test
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.2)

In [14]:
# Check the missing values
X.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
dtype: int64

In [15]:
# Fill missing values with Scikit-Learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with missing & numerical values with mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

# Define columns
cat_features = ["Make", "Colour"]
door_features = ["Doors"]
num_features = ["Odometer (KM)"]

# Create an imputer (something that fills missing data)
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_features),
    ("num_imputer", num_imputer, num_features)
])

# Fill train and test data separately
# fit_transform imputes the missing values from the training set and fills them simultaneously
filled_X_train = imputer.fit_transform(X_train) 
# tranform takes the imputing missing values from the training set and fills the test set with them
filled_X_test = imputer.transform(X_test) 

# Check filled X_train
filled_X_train

array([['Toyota', 'Green', 4.0, 61651.0],
       ['Honda', 'White', 4.0, 43437.0],
       ['Honda', 'Blue', 4.0, 163297.0],
       ...,
       ['Toyota', 'Black', 4.0, 147540.0],
       ['Honda', 'White', 4.0, 31418.0],
       ['Honda', 'White', 4.0, 40134.0]], dtype=object)

In [16]:
# Get our transformed data array's back into DataFrame's
car_sales_filled_train = pd.DataFrame(filled_X_train, 
                                      columns=["Make", "Colour", "Doors", "Odometer (KM)"])

car_sales_filled_test = pd.DataFrame(filled_X_test, 
                                     columns=["Make", "Colour", "Doors", "Odometer (KM)"])

# Check missing data in training set
car_sales_filled_train.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [17]:
# Check missing data in test set
car_sales_filled_test.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [18]:
# Check to see the original... still missing values
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [19]:
# Import OneHotEncoder class from sklearn
from sklearn.preprocessing import OneHotEncoder

# Now let's one hot encode the features with the same code as before 
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", 
                                 one_hot, 
                                 categorical_features)],
                                 remainder="passthrough")

# Fill train and test values separately
transformed_X_train = transformer.fit_transform(car_sales_filled_train) # fit and transform the training data
transformed_X_test = transformer.transform(car_sales_filled_test) # transform the test data

# Check transformed and filled X_train
# transformed_X_train.toarray()

In [20]:
# Now we've transformed X, let's see if we can fit a model
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor

# Setup model
model = RandomForestRegressor()

# Make sure to use transformed (filled and one-hot encoded X data)
model.fit(transformed_X_train, y_train)
model.score(transformed_X_test, y_test)

0.09238288952060358

## 2. Choosing the right estimator or algorithm for our problem
 Scikit-Learn uses estimator as another term for machine learning model or algorithm
* Classification - predicting whether a sample is one thing or another
* Regression - predicting a number

### 2.1 Picking a machine learning model for a regression problem

In [15]:
# Import Boston housing dataset
from sklearn.datasets import load_boston
boston = load_boston()
boston;

In [16]:
# Get the data into a pandas dataframe for good visualition

boston_df = pd.DataFrame(boston["data"], columns=boston["feature_names"])
boston_df["target"] = pd.Series(boston["target"])
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [9]:
# How many samples
len(boston_df)

506

In [10]:
# Trying the Ridge Regression model
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

# Setup random seed
np.random.seed(33)

# Create the data
X = boston_df.drop("target", axis=1)
y = boston_df["target"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# Instantiate the ridge model
model = Ridge()
model.fit(X_train, y_train)

# Check the score of the Ridge model on test data
model.score(X_test, y_test)

0.6833396417695794

How do we improve this score?

What if Ridge Regression wasn't working 

In [32]:
# Try the Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# Setup random seed
np.random.seed(10)

# Create the data
X = boston_df.drop("target", axis=1)
y = boston_df["target"]

# Split the data into trian and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate the model
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Check for the score of the random forest regressor
model.score(X_test, y_test)

0.8621300248033641

### 2.2 Choose an estimator for a classification problem
`Note`: 
    
    * If you have structured data, use ensemble methods
    * If you have unstructured data, use deep learning or transfer learning

In [3]:
# Get the data
heart_disease = pd.read_csv("data/heart-disease.csv")
heart_disease.head();

In [36]:
# Check for null values in the data
heart_disease.isna().sum();

In [37]:
# Check for the length of the data
len(heart_disease)

303

In [40]:
# heart_disease.head(10)

In [65]:
# Trying using SVM model
from sklearn import svm 

# Create X and y
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate the model 
clf = svm.SVC()
clf.fit(X_train, y_train)

# Check the score of the model
clf.score(X_test, y_test)

0.6721311475409836

In [69]:
# Using linear svc
from sklearn.svm import LinearSVC

np.random.seed(42)

# Create X and y
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate the model 
clf = LinearSVC(max_iter=10000000)
clf.fit(X_train, y_train)

# Check the score of the model
clf.score(X_test, y_test)

0.8688524590163934

In [76]:
# Using RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

# Create X and y
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate the model 
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Check the score of the model
clf.score(X_test, y_test)

0.8524590163934426

## 3. Fit the model/algorithm on the data and use it to make predictions

### 3.1 Fitting the model

In [7]:
# Using RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

np.random.seed(42)

# Create X and y
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate the model (find the patterns in the dataset - training the machine learning model)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Check the score of the model (Use the patterns the model has learned)
clf.score(X_test, y_test)

0.8524590163934426

### 3.2 Make predictions using the machine learning model
2 ways to make predictions:
    1. `predict()`
    2. `predict_proba()`

In [9]:
# Use a trained model to make predictions
X_test[:5]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
179,57,1,0,150,276,0,0,112,1,0.6,1,1,1
228,59,1,3,170,288,0,0,159,0,0.2,1,0,3
111,57,1,2,150,126,1,1,173,0,0.2,2,1,3
246,56,0,0,134,409,0,0,150,1,1.9,1,2,3
60,71,0,2,110,265,1,0,130,0,0.0,2,1,2


In [10]:
clf.predict(X_test)

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [11]:
np.array(y_test)

array([0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [12]:
# Compare predictions to truth labels to evaluate the model
y_preds = clf.predict(X_test)
np.mean(y_preds == y_test)

0.8524590163934426

In [13]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_preds)

0.8524590163934426

 Make predictions with `predict_proba()`

In [14]:
# predict_proba returns probabilities of a classification label
clf.predict_proba(X_test[:5])

array([[0.89, 0.11],
       [0.49, 0.51],
       [0.43, 0.57],
       [0.84, 0.16],
       [0.18, 0.82]])

In [16]:
clf.predict(X_test[:5])

array([0, 1, 1, 0, 1], dtype=int64)

`predict` can also be used for regression models

In [28]:
# Try the Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# Setup random seed
np.random.seed(10)

# Create the data
X = boston_df.drop("target", axis=1)
y = boston_df["target"]

# Split the data into trian and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate the model
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Check for the score of the random forest regressor
model.score(X_test, y_test)

0.8621300248033641

In [29]:
# Make predictions
y_preds = model.predict(X_test)

In [31]:
y_preds[:10]

array([24.929, 29.174, 25.804, 28.951, 18.393, 15.041, 45.169, 15.347,
       20.814, 46.461])

In [33]:
np.array(y_test[:10])

array([28.4, 31.1, 23.5, 26.6, 19.6, 14.3, 50. , 14.3, 20.7, 37.6])

In [35]:
# Compare the truths
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_preds)

2.691519607843138

## 4. Evaluating a machine learning model
Three ways to evaluate Scikit-Learn models/estimators:
1. Estimator `score` method
2. The `scoring` parameter
3. Problem-specific metric funtions

### 4.1 Evaluate a model with the `score` method

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
np.random.seed(44)

# Get the data
heart_disease = pd.read_csv("data/heart-disease.csv")

X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Initialize the model
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.819672131147541

In [14]:
clf.score(X_train, y_train)

1.0

In [17]:
# Try the Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# Setup random seed
np.random.seed(10)

# Create the data
X = boston_df.drop("target", axis=1)
y = boston_df["target"]

# Split the data into trian and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate the model
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Check for the score of the random forest regressor
model.score(X_test, y_test)

0.8621300248033641

### 4.2 Evaluating using the `scoring` method

In [19]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
np.random.seed(44)

# Get the data
heart_disease = pd.read_csv("data/heart-disease.csv")

X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Initialize the model
clf = RandomForestClassifier()
clf.fit(X_train, y_train);

In [20]:
clf.score(X_test, y_test)

0.819672131147541

In [22]:
cross_val_score(clf, X, y, cv=10)

array([0.87096774, 0.80645161, 0.83870968, 0.93333333, 0.9       ,
       0.8       , 0.8       , 0.86666667, 0.73333333, 0.83333333])

In [29]:
np.random.seed(22)

# Single training and test split
clf_single_score = clf.score(X_test, y_test)

# Mean of 5-fold cross validation score
clf_cross_val_score = np.mean(cross_val_score(clf, X, y, cv=5))

# Compare the two
clf_single_score, clf_cross_val_score

(0.819672131147541, 0.8183060109289617)

### 4.2.1 Classification model evaluation metrics
1. Accuracy
2. Area under ROC curve
3. Confusion matrix
4. Classification report

**Accuracy**

In [35]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier

np.random.seed(22)

X = heart_disease.drop("target", axis=1)
y =heart_disease["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

clf_cross_val_score = cross_val_score(clf, X, y, cv=5);

In [31]:
np.mean(clf_cross_val_score)

0.8183060109289617

In [38]:
print(f"Heart Disease Classifier Cross Validated Accuracy is: {np.mean(clf_cross_val_score) * 100:.2f}% ")

Heart Disease Classifier Cross Validated Accuracy is: 82.81% 


**Area under receiver operating characteristic curve (AUC/ROC)**