# Introduction to Scikit-Learn

### this notebook shows basic functions for scikit-learn

## Topics covered are:
0. End-to-end scikit-learn workflow
1. getting the data ready
2. Choosing the right estimator/algorithm for problem set
3. fitting the model/algorithm and using it to make predictions on data
4. evaluating a model
5. improving a model
6. saving and loading a trianed model
7. putting it all together

In [1]:
covered_topics = [
"0. End-to-end scikit-learn workflow",
"1. getting the data ready",
"2. Choosing the right estimator/algorithm for problem set",
"3. fitting the model/algorithm and using it to make predictions on data",
"4. evaluating a model",
"5. improving a model",
"6. saving and loading a trianed model",
"7. putting it all together"
]

In [2]:
#standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## 0. End-to-end scikit-learn workflow

In [3]:
#1. getting the data ready
import pandas as pd
import numpy as np
heart_disease = pd.read_csv("heart-disease.csv")
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
#Create x (features matrix)
x = heart_disease.drop("target", axis=1)

#create y (labels)
y = heart_disease["target"]


In [5]:
#2. Choosing the right model and hyperparameters 
from sklearn.ensemble import RandomForestClassifier

#clf stands for classifier
clf = RandomForestClassifier(n_estimators=100)

#keep default hyperparameters, except n_estimators, that hyperparameter is set to 100 to remove error
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [6]:
#3. Fit the model to the data
from sklearn.model_selection import train_test_split

#split the data into training and testing data
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2)


In [7]:
#find patterns in training data using classifier
clf.fit(x_train, y_train);

In [8]:
#make a prediction
y_preds = clf.predict(x_test)
y_preds

array([1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0])

In [9]:
#4. Evaluate the model on training data
clf.score(x_train, y_train)

1.0

In [10]:
#4. Evaluate the model on testing data
clf.score(x_test, y_test)

0.8032786885245902

In [11]:
#more ways to evaluate data with predictions and testing data
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.88      0.72      0.79        32
           1       0.74      0.90      0.81        29

    accuracy                           0.80        61
   macro avg       0.81      0.81      0.80        61
weighted avg       0.82      0.80      0.80        61



In [12]:
confusion_matrix(y_test, y_preds)

array([[23,  9],
       [ 3, 26]])

In [13]:
accuracy_score(y_test, y_preds)

0.8032786885245902

In [14]:
#5. improve the model
#try different amount of n_estimators (a hyperparameter)
np.random.seed(42)
for i in range(10,100, 10):
    print(f"trying model with {i} estimators...")
    clf = RandomForestClassifier(n_estimators=i).fit(x_train, y_train)
    print(f"Model accuracy on test set: {clf.score(x_test, y_test) * 100:.2f}%")

trying model with 10 estimators...
Model accuracy on test set: 81.97%
trying model with 20 estimators...
Model accuracy on test set: 80.33%
trying model with 30 estimators...
Model accuracy on test set: 80.33%
trying model with 40 estimators...
Model accuracy on test set: 83.61%
trying model with 50 estimators...
Model accuracy on test set: 80.33%
trying model with 60 estimators...
Model accuracy on test set: 83.61%
trying model with 70 estimators...
Model accuracy on test set: 83.61%
trying model with 80 estimators...
Model accuracy on test set: 81.97%
trying model with 90 estimators...
Model accuracy on test set: 85.25%


In [15]:
#6. Save a model and load it
#pickle can be used to save and load created models
import pickle
#save the classifier/model as a binary (wb) file
pickle.dump(clf, open("random_forest_model_1.pkl", "wb"))

In [16]:
#load model, then test to make sure it worked
loaded_model = pickle.load(open("random_forest_model_1.pkl", "rb"))
loaded_model.score(x_test, y_test)

0.8524590163934426

## 1. Getting the data ready to be used with Machine Learning

#### Three main things to do:
 1. Split data into features and labels (usually called x and y)
 2. Converting non-numerical values to numerical values (also called feature encoding)
 3. Filling (also called imputing) or disregarding missing values

In [17]:
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [18]:
# 1.1 Split the data into features and labels
x = heart_disease.drop("target", axis=1) #axis 1 is the row
y = heart_disease["target"]


In [19]:
#Split the data into training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

## 1.2 Convert non-numerical data to numerical values (feature encoding)

In [20]:
car_sales = pd.read_csv("car-sales-extended.csv")
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [21]:
#turn non-numerical datatypes into numerical datatypes
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [22]:
#split data into x and y
x = car_sales.drop("Price",axis=1)
y = car_sales["Price"]

In [23]:
#split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=.20)


In [24]:
#build a machine learning model

#randomforestregressor is used for predicting a number, randomforestclassifier is used to predict a category
from sklearn.ensemble import RandomForestRegressor

#choose a model
model = RandomForestRegressor()

#train the model on the training data sets
model.fit(x_train, y_train)

#test the model with the testing data
model.score(x_test, y_test)

# ---- ERROR ---- this won't work because we haven't transformed the data into all numerical data

ValueError: could not convert string to float: 'Toyota'

In [28]:
x.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3


In [29]:
#OneHotEncoder is used to turn non-numerical categories into numerical values
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

#create a list of features that need to be encoded
categorical_features = ["Make", "Colour", "Doors"]

#select encoder
one_hot = OneHotEncoder()

#create a transformer and input information for transforming
transformer = ColumnTransformer([("one_hot", #name of encoder for referrence
                                 one_hot, #actual encoder
                                 categorical_features)], #list of features to encode
                               remainder="passthrough") #allows for all other features to continue uneffected


#create a new variable to hold the transformed version of the x dataframe
transformed_x = transformer.fit_transform(x)
transformed_x

pd.DataFrame(transformed_x).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0


In [30]:
#refit the model after encoding/transforming data to numerical only data

np.random.seed(42)
# split data
                                                    #transformed x instead of normal x
x_train, x_test, y_train, y_test = train_test_split(transformed_x,y, test_size=.2)

#fit the data to model
model.fit(x_train, y_train)

#score the model
model.score(x_test, y_test)

0.3235867221569877

## 1.3 Filling in missing values in a data set

##### 2 main ways of dealing with missing data
1. Fill missing data with values (also known as imputation)
2. Remove the samples with missing data

In [31]:
#import data
car_sales_missing = pd.read_csv("car-sales-extended-missing-data.csv")
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [32]:
#.isna().sum() will show tht total number of missing values per column within the data set
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [33]:
#create x and y
x = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [34]:
#try to convert data to numbers


#OneHotEncoder is used to turn non-numerical categories into numerical values
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

#create a list of features that need to be encoded
categorical_features = ["Make", "Colour", "Doors"]

#select encoder
one_hot = OneHotEncoder()

#create a transformer and input information for transforming
transformer = ColumnTransformer([("one_hot", #name of encoder for referrence
                                 one_hot, #actual encoder
                                 categorical_features)], #list of features to encode
                               remainder="passthrough") #allows for all other features to continue uneffected


#create a new variable to hold the transformed version of the x dataframe
transformed_x = transformer.fit_transform(x)
transformed_x

pd.DataFrame(transformed_x).head()

#ERROR, DATASET CONTAINS VALUES WITHIN DATA SET THAT ARE NOT NUMBERS OR STRINGS (NaN)

ValueError: Input contains NaN

#### Option 1 to fix: fill missing data with pandas values


In [35]:
# Fill the "Make" column (categorical column) with a simple text stating data is missing
car_sales_missing["Make"].fillna("missing", inplace=True)

#Fill the "Colour" column
car_sales_missing["Colour"].fillna("missing", inplace=True)

#Fill the "Odometer" column
car_sales_missing["Odometer (KM)"].fillna(car_sales_missing["Odometer (KM)"].mean(), inplace=True)

#find the most common amount of doors with
#car_sales_missing["Doors"].value_counts() 

#Fill the "Doors" column
car_sales_missing["Doors"].fillna(4, inplace=True)


In [36]:
#check out dataframe again
car_sales_missing.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [37]:
#Remove rows with missing Price values
#since the only places that are missing data are rows with no Price, dropna() will default to drop all of those rows
car_sales_missing.dropna(inplace=True)

In [38]:
#no more missing data
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [39]:
#try to make machine learning model with newly filled data

#first split x and y
x = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [40]:

#OneHotEncoder is used to turn non-numerical categories into numerical values
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

#create a list of features that need to be encoded
categorical_features = ["Make", "Colour", "Doors"]

#select encoder
one_hot = OneHotEncoder()

#create a transformer and input information for transforming
transformer = ColumnTransformer([("one_hot", #name of encoder for referrence
                                 one_hot, #actual encoder
                                 categorical_features)], #list of features to encode
                               remainder="passthrough") #allows for all other features to continue uneffected


#create a new variable to hold the transformed version of the x dataframe
                                          #car_sales_missing data can now be used since it has no missing values
transformed_x = transformer.fit_transform(car_sales_missing)
transformed_x

pd.DataFrame(transformed_x).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,35431.0,15323.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0,19943.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,84714.0,28343.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,154365.0,13434.0
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0,14043.0


note:

Once your data is all in numerical format, there's one more transformation you'll probably want to do to it.

It's called Feature Scaling.

In other words, making sure all of your numerical data is on the same scale.

Normalization (also called min-max scaling) - This rescales all the numerical values to between 0 and 1, with the lowest value being close to 0 and the highest previous value being close to 1. Scikit-Learn provides functionality for this in the MinMaxScalar class.

Standardization - This subtracts the mean value from all of the features (so the resulting features have 0 mean). It then scales the features to unit variance (by dividing the feature by the standard deviation). Scikit-Learn provides functionality for this in the StandardScalar class.

#### Option 2. Fill missing values with Scikit-Learn

In [41]:
#import data
car_sales_missing = pd.read_csv("car-sales-extended-missing-data.csv")
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [42]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [43]:
#drop all rows with no price value
car_sales_missing.dropna(subset=["Price"], inplace = True)
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [44]:
#Split into x and y
x = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]


In [45]:
x.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
dtype: int64

In [48]:
#Filling missing values with scikit-learn simpleImputer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

#Fill categorical values with 'missing' and numerical with mean of data

#strategy being constant means keep the fill_value for every missing 
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
numerical_imputer = SimpleImputer(strategy="mean")

#Define columns
cat_features=["Make", "Colour"]
door_feature = ["Doors"]
numerical_feature = ["Odometer (KM)"]

#Create an imputer (something that fills missing data)
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_feature),
    ("numerical_imputer", numerical_imputer, numerical_feature)
])

#Transform the data
filled_x = imputer.fit_transform(x)

filled_x

array([['Honda', 'White', 4.0, 35431.0],
       ['BMW', 'Blue', 5.0, 192714.0],
       ['Honda', 'White', 4.0, 84714.0],
       ...,
       ['Nissan', 'Blue', 4.0, 66604.0],
       ['Honda', 'White', 4.0, 215883.0],
       ['Toyota', 'Blue', 4.0, 248360.0]], dtype=object)