# Introduction to Scikit-Learn (sklearn)

0. An end to end sklearn workflow
1. Getting the data ready
2. Choose the right estimator/algorithm for our problems
3. Fit the model/algorithm and use it to make predictions on our data
4. Evaluating the model 
5. =>Improve the model
6. Save and load trained model
7. Putting it all together!

## 5. Improve the model
### There are three ways of tuning hyperparameters
1. By hand
2. Randomly with RandomizedSearchCV
3. Exhaustively with GridSearchCV

In [2]:
# How do we list the hyperparameters of a model?
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np

# Build a regressor with default settings and display its hyperparameters as a dict
model = RandomForestRegressor()
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

#### Tuning with RandomizedSearchCV

We're going to adjust:
- max_depth
- max_features
- min_samples_leaf
- min_samples_split
- n_estimators

In [3]:
from sklearn.metrics import (accuracy_score,
                             f1_score, 
                             recall_score,
                             precision_score,
                             r2_score, 
                             mean_absolute_error, 
                             mean_squared_error)


def evaluate_classiification_model(y_test, y_pred):
    """Score classification predictions, print each metric and return them.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    dict
        Keys 'accuracy', 'precission', 'recall' and 'f1'; every value is the
        score multiplied by 100 and rounded to 2 decimal places.
    """
    # (key, scorer) pairs, listed in the order they are computed and printed.
    scorers = (('accuracy', accuracy_score),
               ('precission', precision_score),
               ('recall', recall_score),
               ('f1', f1_score))

    # Compute every score before printing anything.
    metric_dict = {}
    for label, scorer in scorers:
        metric_dict[label] = round(scorer(y_test, y_pred) * 100, 2)

    for label, value in metric_dict.items():
        print(label + ":", value)
    return metric_dict
    
def evaluate_regression_model(y_test, x_pred):
    """Score regression predictions, print each metric and return them.

    Parameters
    ----------
    y_test : array-like
        True target values.
    x_pred : array-like
        Predicted target values (parameter name kept as-is so existing
        keyword callers keep working).

    Returns
    -------
    dict
        Keys 'r2', 'mae' and 'mse', each rounded to 2 decimal places.
    """
    # Score the predictions that were actually passed in; reading a
    # differently named variable here would pick up a notebook global
    # (or raise NameError) instead of the argument.
    r2 = r2_score(y_test, x_pred)
    mae = mean_absolute_error(y_test, x_pred)
    mse = mean_squared_error(y_test, x_pred)

    metric_dict = {'r2': round(r2, 2),
                   'mae': round(mae, 2),
                   'mse': round(mse, 2)}
    for name, value in metric_dict.items():
        print(name + ":", value)
    return metric_dict


In [4]:
from sklearn.ensemble import RandomForestClassifier

# Read the heart-disease table from disk
heart_disease = pd.read_csv("dataset/heart-disease.csv")

# Features are every column except "target"; the label is the "target" column
X = heart_disease.drop(columns="target")
y = heart_disease["target"]


In [5]:
# split the data into train, validation and test sets
np.random.seed(42)

def train_valid_test_split(X, y, split_ratio=(0.7, .15, .15)):
    """Shuffle X and y together and cut them into train/validation/test parts.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. It is not modified.
    y : pandas.Series or array-like
        Labels for the rows of X. They are attached to a copy of X as a
        temporary 'label' column so both are shuffled with the same order.
    split_ratio : sequence of three numbers
        (train, validation, test) fractions. Only the first two place the
        cut points; the test part is whatever rows remain.

    Returns
    -------
    list
        [X_train, y_train, X_valid, y_valid, X_test, y_test]
    """
    # Attach the labels to a copy: assigning the column on X itself would
    # leave the target inside the caller's feature table after the call.
    combined = X.assign(label=y)

    # Shuffle all rows; uses NumPy's global random state, so seed beforehand.
    combined = combined.sample(frac=1)
    labels = combined['label']
    features = combined.drop('label', axis=1)

    n_rows = len(features)
    train_ratio, valid_ratio, _test_ratio = split_ratio
    train_split = round(train_ratio * n_rows)
    valid_split = round(train_split + valid_ratio * n_rows)

    # Positional slicing: the index is shuffled, so labels must not be used.
    X_train, y_train = features.iloc[:train_split], labels.iloc[:train_split]
    X_valid, y_valid = (features.iloc[train_split:valid_split],
                        labels.iloc[train_split:valid_split])
    X_test, y_test = features.iloc[valid_split:], labels.iloc[valid_split:]

    return [X_train, y_train, X_valid, y_valid, X_test, y_test]
    

In [6]:
# Unpack the six pieces returned by train_valid_test_split (default split_ratio):
# features and labels for the train, validation and test sets, in that order.
X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(X, y)

In [7]:
# Reseed NumPy's global RNG before building the model
# (presumably so the baseline forest is reproducible between runs - confirm)
np.random.seed(42)

# choose a model: a classifier with default hyperparameters serves as the baseline
clf = RandomForestClassifier()

# train the model by fitting it on the training set
clf.fit(X_train, y_train)

# make baseline predictions on the validation set
y_preds = clf.predict(X_valid)

# Evaluate the classifier on the validation set; the helper prints each metric
# and returns them as a dict, kept here as the baseline scores
baseline_metrices = evaluate_classiification_model(y_valid, y_preds) 


accuracy: 82.22
precission: 84.0
recall: 84.0
f1: 84.0


In [None]:
from sklearn.model_selection import RandomizedSearchCV


