# The problem setup
- A friend tells you they have some data, and ask if you "can do machine learning on it"
- After discussion, you realize they have a classification task (Predicting label given features)
- You say to yourself "aha, I can use a supervised learning classifier to predict the labels from the features", and tell your friend "yes I can use machine learning for this"
- How do you proceed?

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def load_data(n=0):
    if n==0:
        from sklearn.datasets import load_digits
        digits = load_digits()
        return digits.data, digits.target
    else:
        print("Invalid dataset! {}".format(n))

        
X, y = load_data() # get data 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y) 

# scale
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
X

array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ..., 16.,  9.,  0.],
       ...,
       [ 0.,  0.,  1., ...,  6.,  0.,  0.],
       [ 0.,  0.,  2., ..., 12.,  0.,  0.],
       [ 0.,  0., 10., ..., 12.,  1.,  0.]])

In [7]:
y

array([0, 1, 2, ..., 8, 9, 8])

In [13]:
from sklearn.ensemble import RandomForestClassifier

RandomForestClassifier().fit(X_train, y_train).score(X_test, y_test)



0.9407407407407408

In [14]:
RandomForestClassifier(max_depth=2).fit(X_train, y_train).score(X_test, y_test)



0.7703703703703704

In [16]:
from sklearn.model_selection import GridSearchCV

params = {
    "max_depth": [1,2,4,6,8],
    "n_estimators": [10, 100, 250]
}
GridSearchCV(RandomForestClassifier(), params).fit(X_train, y_train).score(X_test, y_test)



0.9740740740740741

In [17]:
gs = GridSearchCV(RandomForestClassifier(), params).fit(X_train, y_train)
gs



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
           

In [18]:
gs.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=8, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [19]:
gs.cv_results_

{'mean_fit_time': array([0.01396569, 0.08177098, 0.1948123 , 0.00930866, 0.09906753,
        0.23038324, 0.01230065, 0.11334205, 0.28557054, 0.01429494,
        0.15491851, 0.368349  , 0.01795189, 0.16257612, 0.40525007]),
 'std_fit_time': array([3.55198391e-03, 2.95606074e-03, 5.54401896e-03, 4.70134046e-04,
        5.44103545e-03, 2.93752498e-03, 4.70249702e-04, 9.55386552e-04,
        5.42207937e-03, 4.70134046e-04, 4.01698491e-03, 1.87963705e-03,
        2.97360213e-07, 2.94131505e-03, 6.53295330e-03]),
 'mean_score_time': array([0.00165876, 0.01098029, 0.02692787, 0.00166273, 0.01263301,
        0.03126009, 0.00166186, 0.01264413, 0.03025182, 0.00199477,
        0.01462785, 0.03326503, 0.00166178, 0.01396203, 0.03291114]),
 'std_score_time': array([9.42355787e-04, 1.38807034e-05, 1.12391596e-07, 4.70696648e-04,
        4.70304900e-04, 2.61245389e-03, 4.69067548e-04, 4.77162939e-04,
        9.39930918e-04, 1.12391596e-07, 9.40795692e-04, 4.79688240e-04,
        4.69853319e-04, 1.03

In [20]:
gs.cv_results_['mean_test_score']

array([0.51710422, 0.72076372, 0.72951472, 0.69689737, 0.81384248,
       0.83452665, 0.85521082, 0.90771679, 0.92044551, 0.90771679,
       0.94351631, 0.9451074 , 0.9132856 , 0.95544948, 0.95465394])