## Exercise: Machine Learning Finding Optimal Model and Hyperparameters
**For digits dataset in sklearn.dataset, please try following classifiers and find out the one that gives best performance. Also find the optimal parameters for that classifier.** 

1. from sklearn import svm
2. from sklearn.ensemble import RandomForestClassifier
3. from sklearn.linear_model import LogisticRegression
4. from sklearn.naive_bayes import GaussianNB
5. from sklearn.naive_bayes import MultinomialNB
6. from sklearn.tree import DecisionTreeClassifier

In [4]:
from sklearn.datasets import load_digits
import pandas as pd
digit = load_digits()

In [6]:
dir(digit)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

In [36]:
X = digit.data
y = digit.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
len(X)

In [None]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

In [40]:
model_params = {
    'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'para': {
            'n_estimators': [50, 100, 200],         # number of trees
            'max_depth': [None, 10, 20, 30],        # tree depth
            'min_samples_split': [2, 5, 10],        # min samples to split
            'min_samples_leaf': [1, 2, 4],          # min samples at leaf
            'max_features': ['sqrt', 'log2'],       # features considered at split
            'bootstrap': [True, False] 
        }
    },
    'LogisticRegression':{
        'model': LogisticRegression(),
        'para':{
            'penalty': ['l1', 'l2'],
            'C': [0.01, 0.1, 1, 10, 100],
            'solver': ['liblinear', 'saga'],  # solvers that support L1
            'max_iter': [100, 200, 500]
        }
    },
    'GaussianNB': {
        'model': GaussianNB(),
        'para': {
            'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
        }
    },
    'MultinomialNB': {
        'model': MultinomialNB(),
        'para': {
            'alpha': [0.01, 0.1, 0.5, 1.0, 5.0],
            'fit_prior': [True, False]
        }
    },
    'DecisionTreeClassifier':{
        'model': DecisionTreeClassifier(),
        'para': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [None, 5, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': [None, 'sqrt', 'log2']
        }
    }
}

In [46]:
scores = []

for model_name, mp in model_params.items():
    grid = GridSearchCV(mp['model'], mp['para'], cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_params': grid.best_params_,
        'best_score': grid.best_score_,
        'test_score': grid.score(X_test, y_test)
    })

results = pd.DataFrame(scores)

In [68]:
results


Unnamed: 0,model,best_params,best_score,test_score
0,RandomForestClassifier,"{'bootstrap': False, 'max_depth': 30, 'max_fea...",0.979822,0.972222
1,LogisticRegression,"{'C': 0.1, 'max_iter': 100, 'penalty': 'l1', '...",0.967286,0.961111
2,GaussianNB,{'var_smoothing': 1e-05},0.881032,0.894444
3,MultinomialNB,"{'alpha': 5.0, 'fit_prior': True}",0.897019,0.913889
4,DecisionTreeClassifier,"{'criterion': 'entropy', 'max_depth': 30, 'max...",0.859432,0.9


In [80]:
results.best_params[0]

{'bootstrap': False,
 'max_depth': 30,
 'max_features': 'log2',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 200}

Winner is **RandomForestClassifier with 97% accuracy** , with params 
{'bootstrap': False,
 'max_depth': 30,
 'max_features': 'log2',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 200}