# 5. Improving a model

First predictions = baseline predictions
First model = baseline model

From a data perspective 
* Could you collect more data? (Generally, the more data, the better.)
* Could you improve the data?

From a model perspective 
* Could there be a better model to use?
* Could we improve the current model ?

Parameters vs Hyperparameters

* Parameters: the patterns a model finds in the data
* Hyperparameters: settings on a model that you can adjust to improve its ability to find patterns

Ways to adjust hyperparameters

1. by hand
2. Randomly with RandomizedSearchCV
3. Exhaustively with GridSearchCV

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Import boston housing dataset
from sklearn.datasets import load_boston
# NOTE(review): load_boston is deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the environment pins an older version before re-running this cell.
boston = load_boston()

# One column per feature, then attach the target values as a final "target" column.
boston_features = pd.DataFrame(data=boston["data"], columns=boston["feature_names"])
boston_df = boston_features.assign(target=pd.Series(boston["target"]))
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [5]:
from sklearn.ensemble import RandomForestClassifier

# Instantiate a random forest classifier without overriding any hyperparameter
clf = RandomForestClassifier()

In [6]:
# Show every hyperparameter of the classifier together with its current value
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### 5.1 Tuning Hyperparameters by hand

Let's make 3 sets: training, validation and test.

In [7]:
from sklearn.metrics import accuracy_score,precision_score,f1_score,recall_score
def evaluate_preds(Y_true,Y_preds):
    """
    Compare true labels against predicted labels for a classifier.

    Prints the accuracy (as a percentage), precision, recall and F1 score,
    and returns the same four metrics rounded to 2 decimal places in a dict
    keyed by "accuracy", "precision", "recall" and "f1".
    """
    # Raw (unrounded) scores, computed once and reused for printing and returning
    scores = {
        "accuracy": accuracy_score(Y_true, Y_preds),
        "precision": precision_score(Y_true, Y_preds),
        "recall": recall_score(Y_true, Y_preds),
        "f1": f1_score(Y_true, Y_preds),
    }

    # Accuracy is reported as a percentage; the other metrics as plain fractions
    print(f"Accuracy {scores['accuracy']*100:.2f}%")
    for metric_name in ("precision", "recall", "f1"):
        print(f"{metric_name} {scores[metric_name]:.2f}")

    return {metric_name: round(value, 2) for metric_name, value in scores.items()}

In [8]:
# Load the dataset from a CSV file (the path is relative to the working directory)
heart_disease= pd.read_csv("data/original.csv")

In [9]:
from sklearn.ensemble import RandomForestClassifier
np.random.seed(42)

# Shuffle the rows: sample(frac=1) returns every row in random order
heart_disease_shuffle = heart_disease.sample(frac=1)

# Features (every column except the label) and labels
X = heart_disease_shuffle.drop("target", axis=1)
Y = heart_disease_shuffle["target"]

# Positional cut points for a 70% / 15% / 15% train / validation / test split
n_rows = len(heart_disease_shuffle)
train_split = round(0.7 * n_rows)                  # end of the training rows
valid_split = round(train_split + 0.15 * n_rows)   # end of the validation rows

# .iloc makes it explicit that the slices are positional, not label-based
# (the index labels are in shuffled order after sample()).
X_train, Y_train = X.iloc[:train_split], Y.iloc[:train_split]
X_valid, Y_valid = X.iloc[train_split:valid_split], Y.iloc[train_split:valid_split]
X_test, Y_test = X.iloc[valid_split:], Y.iloc[valid_split:]

# Sanity check: the three splits together must cover every row exactly once
assert len(X_train) + len(X_valid) + len(X_test) == n_rows

clf = RandomForestClassifier()
clf.fit(X_train, Y_train)

# Make baseline predictions on the validation set
Y_preds = clf.predict(X_valid)

# Evaluate the classifier on the validation set
baseline_metrics = evaluate_preds(Y_valid, Y_preds)
baseline_metrics

Accuracy 82.22%
precision 0.81
recall 0.88
f1 0.85


{'accuracy': 0.82, 'precision': 0.81, 'recall': 0.88, 'f1': 0.85}

In [10]:
# Show the hyperparameters currently set on the classifier
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [11]:
np.random.seed(42)

# Create a second classifier with a different hyperparameter (1000 trees) and train it
clf_2 = RandomForestClassifier(n_estimators=1000).fit(X_train, Y_train)

# Predict on the validation set and evaluate
Y_preds_2 = clf_2.predict(X_valid)
baseline_metrics_2 = evaluate_preds(Y_valid, Y_preds_2)

Accuracy 82.22%
precision 0.81
recall 0.88
f1 0.85


### 5.2 Hyperparameter tuning with RandomizedSearchCV

In [16]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

# Candidate values the random search draws hyperparameter combinations from
grid = {
    "n_estimators": [10, 100, 200, 1000, 1200],
    "max_depth": [None, 5, 10, 20, 30],
    "max_features": ["auto", "sqrt"],
    "min_samples_split": [2, 4, 6],
    "min_samples_leaf": [1, 2, 4],
}

np.random.seed(42)

# Split into features (X) and labels (Y)
X = heart_disease_shuffle.drop("target", axis=1)
Y = heart_disease_shuffle["target"]

# Hold out 20% of the rows as the test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

clf = RandomForestClassifier(n_jobs=1)

# Try 10 sampled combinations from the grid, each scored with 5-fold cross-validation
rs_clf = RandomizedSearchCV(
    estimator=clf,
    param_distributions=grid,
    n_iter=10,
    cv=5,
    verbose=2,
)

rs_clf.fit(X_train, Y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=10, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=5 
[CV]  n_estimators=10, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=5, total=   0.0s
[CV] n_estimators=10, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=5 
[CV]  n_estimators=10, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=5, total=   0.0s
[CV] n_estimators=10, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=5 
[CV]  n_estimators=10, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=5, total=   0.0s
[CV] n_estimators=10, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=5 
[CV]  n_estimators=10, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=5, total=   0.0s
[CV] n_estimators=10, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=5 
[CV]  n_estimators=10, m

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=10, total=   0.2s
[CV] n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=10 
[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=10, total=   0.2s
[CV] n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=10 
[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=10, total=   0.2s
[CV] n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=10 
[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=10, total=   0.3s
[CV] n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=10 
[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=10, total=   0.2s
[CV] n_estimators=10, min_samples_split=4, min_sampl

[CV]  n_estimators=100, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=20, total=   0.3s
[CV] n_estimators=100, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=20 
[CV]  n_estimators=100, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=20, total=   0.2s
[CV] n_estimators=100, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=20 
[CV]  n_estimators=100, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=20, total=   0.2s
[CV] n_estimators=100, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=20 
[CV]  n_estimators=100, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=20, total=   0.2s
[CV] n_estimators=100, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=20 
[CV]  n_estimators=100, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=20, total=   0.3s


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   41.6s finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [17]:
# Hyperparameter combination selected as best by the randomized search
rs_clf.best_params_

{'n_estimators': 1200,
 'min_samples_split': 4,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 5}

In [22]:
# Predict on the held-out test set using the fitted search object
rs_Y_preds = rs_clf.predict(X_test)

# Evaluate the predictions
rs_metrics = evaluate_preds(Y_test, rs_Y_preds)

Accuracy 83.61%
precision 0.78
recall 0.89
f1 0.83
