# Introduction to Scikit-Learn (sklearn)

0. An end-to-end sklearn workflow
1. => Getting the data ready
2. Choose the right estimator/algorithm for our problems
3. Fit the model/algorithm and use it to make predictions on our data
4. Evaluating the model 
5. Improve the model
6. Save and load trained model
7. Putting it all together!

## 1. Getting the data ready

In [72]:
# Standard imports 
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

Three main steps:
* -1- Split the data into features (X) and labels (y)
* -2- Fill (impute) missing data, or drop the samples that contain it
* -3- Convert non-numerical values to numerical ones (feature encoding)

In [73]:
# Load the heart-disease CSV into a DataFrame (path is relative to the working directory)
heart_disease = pd.read_csv("dataset/heart-disease.csv")

In [74]:
# Preview the first rows to check the columns; "target" is the column used as the label below
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [75]:
# Feature matrix (X): every column except the "target" label
X = heart_disease.drop(columns="target")
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [76]:
# Label vector (y): the "target" column that the model will learn to predict
y = heart_disease['target']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [77]:
# Split the data into train and test sets (80% train / 20% test).
# A fixed random_state makes the shuffle reproducible, so re-running the
# notebook from a fresh kernel yields the same partition every time.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### 1.1 Make sure it is all numerical

In [83]:
# Load the car-sales CSV into a DataFrame and preview the first rows
car_sales = pd.read_csv("dataset/car-sales-extended.csv")
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [84]:
# Inspect the column dtypes: non-numeric (object) columns must be encoded before modelling
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [80]:
# Number of rows (samples) in the car-sales data
len(car_sales)

1000

In [81]:
# How many samples there are for each distinct value of "Make"
car_sales['Make'].value_counts()

Toyota    398
Honda     304
Nissan    198
BMW       100
Name: Make, dtype: int64

In [87]:
# Separate the features (everything except "Price") from the label ("Price")
y = car_sales["Price"]
X = car_sales.drop(columns="Price")


In [88]:
# Turn the categorical columns into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns to treat as categories. "Doors" is stored as an integer but is
# encoded as a category here, alongside "Make" and "Colour".
categorical_features = ["Make", "Colour", "Doors"]

# One-hot encoding replaces a categorical column with one binary column per
# distinct value. Example: colour values ["Blue", "Green", "Black"] are
# encoded as [[1 0 0], [0 1 0], [0 0 1]].
one_hot = OneHotEncoder()

# The ColumnTransformer applies the encoder to the listed columns only;
# remainder="passthrough" keeps every other column unchanged.
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

transformed_X = transformer.fit_transform(X)
pd.DataFrame(transformed_X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0


In [89]:
from sklearn.ensemble import RandomForestRegressor

# Split into training and test sets (80% / 20%).
# random_state pins the shuffle so the split is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    transformed_X, y, test_size=0.2, random_state=42
)

# Choose the model. random_state also fixes the forest's own randomness
# (bootstrap sampling / feature selection) so the score is reproducible.
model = RandomForestRegressor(random_state=42)

# Train on the training split only.
model.fit(X_train, y_train)

# For a regressor, .score() reports R^2 (coefficient of determination) on the
# held-out test split -- it is not a classification accuracy.
model.score(X_test, y_test)

0.26556932840280334

### 1.2 Dealing with missing values

There are two possible solutions:
* 1 Fill them with some value (imputation).
* 2 Remove the samples with missing data.

In [108]:
# Load the car-sales data that contains missing values and preview the first rows
car_sales_missing = pd.read_csv("dataset/car-sales-extended-missing-data.csv")
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [109]:
# Fill the missing feature values.
#
# The fill is done with a single DataFrame.fillna(mapping) call and the result
# is bound back to the name. Calling `df[col].fillna(..., inplace=True)` is a
# chained in-place assignment on a column selection: recent pandas versions
# warn about it, and with copy-on-write it no longer updates the DataFrame.
#
# "Price" is deliberately not filled: it is the label.
odometer_mean = car_sales_missing["Odometer (KM)"].mean()

car_sales_missing = car_sales_missing.fillna(
    {
        "Make": "missing",      # categorical: use an explicit placeholder category
        "Colour": "missing",    # categorical: use an explicit placeholder category
        "Doors": 4,             # NOTE(review): presumably the most common door count -- confirm
        "Odometer (KM)": odometer_mean,  # numeric: use the column mean
    }
)


In [110]:
# Remove the rows that still contain a missing value in any column.
# The result is rebound to the name instead of using inplace=True, so the
# cell produces a new frame rather than mutating the one loaded earlier.
car_sales_missing = car_sales_missing.dropna()

In [111]:
# Count the remaining NaN values per column (every count should be 0 at this point)
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [112]:
# Separate features and label before converting the data to numbers
y = car_sales_missing["Price"]
X = car_sales_missing.drop(columns="Price")

In [113]:
# Turn the categorical columns of the cleaned data into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns that are one-hot encoded; all remaining columns are passed through.
categorical_features = ["Make", "Colour", "Doors"]

# One-hot encoding turns each distinct value of a categorical column into its
# own binary column, e.g. colour values ["Blue", "Green", "Black"] become
# [[1 0 0], [0 1 0], [0 0 1]].
one_hot = OneHotEncoder()

# Apply the encoder to the categorical columns only and leave the rest as-is.
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

transformed_X = transformer.fit_transform(X)


In [115]:
# Split the data into train and test sets (train_test_split's default test size).
# random_state pins the shuffle so the split is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    transformed_X, y, random_state=42
)

# Choose the model; random_state fixes the forest's own randomness so the
# score is reproducible.
model = RandomForestRegressor(random_state=42)

# Train the model on the training split only.
model.fit(X_train, y_train)

# Evaluate on the held-out test split. For a regressor, .score() reports
# R^2 (coefficient of determination), not accuracy.
model.score(X_test, y_test)

0.2855385882765913