# Introduction to ScikitLearn (sklearn- indepth)
This notebook demonstrates some of the most useful functions of the SKLean library

In [1]:
what_to_cover = [
"0. An end-to-end sklearn workflow",
"1. Getting the data ready",
"2. Choose the right estimator/algorithm for our problems",
"3. Fit the model/algorithm and use it to make predictions on our data",
"4. Evaluating a model",
"5. Improve the model",
"6. Save and load the trained model",
"7. Putting it all together!" ]
what_to_cover

['0. An end-to-end sklearn workflow',
 '1. Getting the data ready',
 '2. Choose the right estimator/algorithm for our problems',
 '3. Fit the model/algorithm and use it to make predictions on our data',
 '4. Evaluating a model',
 '5. Improve the model',
 '6. Save and load the trained model',
 '7. Putting it all together!']

In [2]:
# Imports--
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.datasets import load_boston
from sklearn.linear_model import Ridge
from sklearn.svm import LinearSVC

## 1. Getting our data ready to be used with Machine Learning
    Three main things to do:
        1. Split data into features and labels ('X' and 'y').
        2. Converting non-numerical values to numerical values (aka feature encoding)
        3. Filling (aka imputing) or disregarding missing values.
        

In [3]:
heart_disease = pd.read_csv("./data/heart-disease.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


### 1.1 Split data into features and labels

In [4]:
X = heart_disease.drop("target", axis = 1) #  every column except target. 
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [5]:
y= heart_disease["target"]
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

In [6]:
# Split the data into training and test sets. (imported train_test_split for this.) 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2)

In [7]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

### 1.2 Make sure it is all numerical

In [8]:
car_sales= pd.read_csv("./data/car-sales-extended.csv")
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [9]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [10]:
#UNDERSTANDING WHY WE NEED NUMERICAL
#split into X/y
X = car_sales.drop("Price", axis=1)
y = car_sales["Price"]
#Split into training and testing
    # X_train, X_test, y_train,y_test = train_test_split(X,y,test_size = 0.2)

#Build ML model-
    # imported RandomForestRegressor for this

    # model = RandomForestRegressor()
    # model.fit(X_train, y_train)
    # model.score(X_test, y_test)
#THis will yeild a ValueError: could not convert string to float: 'Honda'

In [11]:
#CONVERT data to NUMBERS
#imported OneHotEncoder & ColumnTransformer for this
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], 
                                remainder = "passthrough")
transformed_X =transformer.fit_transform(X)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [12]:
pd.DataFrame(transformed_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


### 1.3 Filling- Deal with missing values
1. Fill them with some value (Imputation)
2. Remove samples with missing data.

In [13]:
car_sales_missing = pd.read_csv("./data/car-sales-extended-missing-data.csv")
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [14]:
car_sales_missing.isna().sum() #total missing values

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [15]:
#THIS WILL THROW AN ERROR FORM SOME OLDER SKLEARN VERSIONS
#it doesnt, for me.
#split into X/y
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]
#CONVERT data to NUMBERS
#imported OneHotEncoder & ColumnTransformer for this
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], 
                                remainder = "passthrough")
transformed_X =transformer.fit_transform(X)
transformed_X

<1000x16 sparse matrix of type '<class 'numpy.float64'>'
	with 4000 stored elements in Compressed Sparse Row format>

### Option 1: Fill missing data with Pandas

In [16]:
#FIll make column
car_sales_missing["Make"].fillna("missing", inplace = True)

#Fill Colour column
car_sales_missing["Colour"].fillna("missing", inplace = True)

#Fill Odometer
car_sales_missing["Odometer (KM)"].fillna(car_sales_missing["Odometer (KM)"].mean(), inplace = True)

#FIll Doors
car_sales_missing["Doors"].fillna(4, inplace = True)

In [17]:
car_sales_missing.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [18]:
# Remove rows with missing price value
car_sales_missing.dropna(inplace = True)

In [19]:
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [20]:
len(car_sales_missing)

950

In [21]:
#THIS WILL THROW AN ERROR FORM SOME OLDER SKLEARN VERSIONS
#it doesnt, for me.
#split into X/y
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]
#CONVERT data to NUMBERS
#imported OneHotEncoder & ColumnTransformer for this
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], 
                                remainder = "passthrough")
transformed_X =transformer.fit_transform(X)
transformed_X

<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

### Option 2: Fill missing values with SKLearn

In [22]:
car_sales_missing = pd.read_csv("./data/car-sales-extended-missing-data.csv")
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [23]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [24]:
car_sales_missing.dropna(subset=["Price"], inplace= True)
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [25]:
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [26]:
    # Split into X/y
X= car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]
X,y

(       Make Colour  Odometer (KM)  Doors
 0     Honda  White        35431.0    4.0
 1       BMW   Blue       192714.0    5.0
 2     Honda  White        84714.0    4.0
 3    Toyota  White       154365.0    4.0
 4    Nissan   Blue       181577.0    3.0
 ..      ...    ...            ...    ...
 995  Toyota  Black        35820.0    4.0
 996     NaN  White       155144.0    3.0
 997  Nissan   Blue        66604.0    4.0
 998   Honda  White       215883.0    4.0
 999  Toyota   Blue       248360.0    4.0
 
 [950 rows x 4 columns],
 0      15323.0
 1      19943.0
 2      28343.0
 3      13434.0
 4      14043.0
         ...   
 995    32042.0
 996     5716.0
 997    31570.0
 998     4001.0
 999    12732.0
 Name: Price, Length: 950, dtype: float64)

In [27]:
#Fill missing values with sk-learn
# SimpleImputer & ColumnTransformer for this.

# Fill categorical values with "missing" and numerical values with mean
cat_imputer = SimpleImputer(strategy = "constant", fill_value= "missing")
door_imputer = SimpleImputer(strategy="constant", fill_value= 4)
numerical_imputer= SimpleImputer(strategy= "mean")

#Define columns
cat_features = ["Make", "Colour"]
door_features = ["Doors"]
numerical_features= ["Odometer (KM)"]

#Create an imputer (something that fills missing data.)
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_features),
    ("numerical_imputer", numerical_imputer, numerical_features)
])
#Transform data
filled_X= imputer.fit_transform(X)
filled_X

array([['Honda', 'White', 4.0, 35431.0],
       ['BMW', 'Blue', 5.0, 192714.0],
       ['Honda', 'White', 4.0, 84714.0],
       ...,
       ['Nissan', 'Blue', 4.0, 66604.0],
       ['Honda', 'White', 4.0, 215883.0],
       ['Toyota', 'Blue', 4.0, 248360.0]], dtype=object)

In [28]:
car_sales_filled = pd.DataFrame(filled_X, 
                                columns= ["Make", "Colour", "Doors", "Odometer (KM)"])
car_sales_filled

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Honda,White,4.0,35431.0
1,BMW,Blue,5.0,192714.0
2,Honda,White,4.0,84714.0
3,Toyota,White,4.0,154365.0
4,Nissan,Blue,3.0,181577.0
...,...,...,...,...
945,Toyota,Black,4.0,35820.0
946,missing,White,3.0,155144.0
947,Nissan,Blue,4.0,66604.0
948,Honda,White,4.0,215883.0


In [29]:
car_sales_filled.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [30]:
#Split car_Sales_filled in X/y

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], 
                                remainder = "passthrough")
transformed_X = transformer.fit_transform(car_sales_filled)
transformed_X

<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

In [31]:
#Now we have our data as numbers
#lets fit a model
# using randomforestregressor, train_test_split

X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size= 0.2)
model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.17828009063383765

In [32]:
what_to_cover

['0. An end-to-end sklearn workflow',
 '1. Getting the data ready',
 '2. Choose the right estimator/algorithm for our problems',
 '3. Fit the model/algorithm and use it to make predictions on our data',
 '4. Evaluating a model',
 '5. Improve the model',
 '6. Save and load the trained model',
 '7. Putting it all together!']

## 2. Choosing the right estimator/algorithm/model for our problems

* Classification: Predicting whether a sample is one thing or another.
* Regression: Predicting a number

### 2.1 Picking machine learning model for a regresssion problem


In [33]:
#Using boston housing datasset for this load_boston
boston = load_boston()
bostonDF = pd.DataFrame(boston["data"], columns= boston["feature_names"])
bostonDF["target"] = pd.Series(boston["target"])
bostonDF

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [34]:
len(bostonDF)

506

In [35]:
#trying the Ridge Regression Model
#set up random seed
np.random.seed(42)
X= bostonDF.drop("target", axis = 1)
y = bostonDF["target"]
#split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.2)

#Instantiate ridge model
model = Ridge()
model.fit(X_train, y_train)

#check score of ridge on test
model.score(X_test, y_test)

0.6662221670168522

In [36]:
# how to make this better?
#what if ridge wasnt working?
#we would use Ensemble models- like RandomForestRegressor or RandomForestClassifier
#Ensemble models is a machine learning approach to combine multiple other models in the prediction process.

In [37]:
np.random.seed(42)
X= bostonDF.drop("target", axis=1)
y= bostonDF["target"]
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size= 0.2)
model = RandomForestRegressor()
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.8654448653350507

## 2.2 Picking machine learning model for a classification problem

In [38]:
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [39]:
# Consulting map. it says to try LinearSVC- imported

In [40]:
np.random.seed(42)
X= heart_disease.drop("target", axis= 1)
y = heart_disease["target"]
X_train, X_test, y_train,y_test = train_test_split(X,y, test_size=0.2)
clf = LinearSVC()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)



0.8688524590163934

In [41]:
#Using Ensemble Methods- RandomForestClassifier
np.random.seed(42)
X= heart_disease.drop("target", axis=1)
y = heart_disease["target"]
X_train, X_test, y_train,y_test = train_test_split(X,y, test_size= 0.2)
clf1 = RandomForestClassifier()
clf1.fit(X_train, y_train)
clf1.score(X_test, y_test)

0.8524590163934426

Tidbit: 
    1. if you have structured data use ensemble methods
    2. If you have unstructured data, use deep learning or transfer learning.

## 3. Fit the model/algorithm on our data and use it to make predictions.

### 3.1 Fitting the model to the data
X = features, feature variables, data. 
y = labels, targets, target variables

In [None]:
#Using Ensemble Methods- RandomForestClassifier
np.random.seed(42)
X= heart_disease.drop("target", axis=1)
y = heart_disease["target"]
X_train, X_test, y_train,y_test = train_test_split(X,y, test_size= 0.2)
clf1 = RandomForestClassifier()
#fit the model into the data- Looking at X-data and figuring out we got to y-values
clf1.fit(X_train, y_train)
#Use patterns the model has learned.
clf1.score(X_test, y_test)

### 3.2 Making predictions machine learning models.