# The problem setup
- A friend tells you they have some data, and ask if you "can do machine learning on it"
- After discussion, you realize they have a classification task (Predicting label given features)
- You say to yourself "aha, I can use a supervised learning classifier to predict the labels from the features", and tell your friend "yes I can use machine learning for this"
- How do you proceed?








In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def load_data(n=0):
    if n==0:
        from sklearn.datasets import load_digits
        digits = load_digits()
        return digits.data, digits.target
    if n==1:
        from sklearn.datasets import load_breast_cancer
        return load_breast_cancer(return_X_y=True)
    else:
        print("Invalid dataset! {}".format(n))

        
X, y = load_data(0) # get data 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y) 

# scale
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [19]:
X

array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ..., 16.,  9.,  0.],
       ...,
       [ 0.,  0.,  1., ...,  6.,  0.,  0.],
       [ 0.,  0.,  2., ..., 12.,  0.,  0.],
       [ 0.,  0., 10., ..., 12.,  1.,  0.]])

In [20]:
y

array([0, 1, 2, ..., 8, 9, 8])

In [34]:
from sklearn.ensemble import RandomForestClassifier

RandomForestClassifier().fit(X_train, y_train).score(X_test, y_test)



0.95

In [32]:
RandomForestClassifier(max_depth=4).fit(X_train, y_train).score(X_test, y_test)



0.8518518518518519

In [38]:
from sklearn.model_selection import GridSearchCV

params = {
    "max_depth": [1,2,4,6,8,10,12],
    "n_estimators": [10, 100, 250]
}
GridSearchCV(RandomForestClassifier(), params).fit(X_train, y_train).score(X_test, y_test)



0.9777777777777777

In [39]:
gs = GridSearchCV(RandomForestClassifier(), params).fit(X_train, y_train)
gs



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
           

In [40]:
gs.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=12, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [37]:
gs.cv_results_

{'mean_fit_time': array([0.01338434, 0.12972275, 0.22018814, 0.010595  , 0.10965165,
        0.26528525, 0.01318034, 0.13924837, 0.32175549, 0.01502037,
        0.15135757, 0.36860498, 0.01648847, 0.17508777, 0.44358524]),
 'std_fit_time': array([0.00232009, 0.03908497, 0.00922452, 0.00245019, 0.02333404,
        0.01803382, 0.00115622, 0.0263095 , 0.02038076, 0.00133703,
        0.00493916, 0.00262192, 0.00066761, 0.01540415, 0.04459966]),
 'mean_score_time': array([0.00205509, 0.01852862, 0.03087529, 0.00115633, 0.01338029,
        0.02945749, 0.00171129, 0.01539469, 0.03129117, 0.00264374,
        0.01467578, 0.04181393, 0.00266751, 0.01412368, 0.03498022]),
 'std_score_time': array([0.00221414, 0.00993426, 0.00236156, 0.0016353 , 0.00127884,
        0.00067228, 0.00242013, 0.00154095, 0.00210027, 0.00125626,
        0.00191588, 0.01394266, 0.00188621, 0.00132664, 0.00231681]),
 'param_max_depth': masked_array(data=[1, 1, 1, 2, 2, 2, 4, 4, 4, 6, 6, 6, 8, 8, 8],
              mask=[F

In [41]:
gs.cv_results_['mean_test_score']

array([0.51392204, 0.69769292, 0.73747017, 0.71996818, 0.82816229,
       0.8265712 , 0.84407319, 0.91010342, 0.91408115, 0.89976134,
       0.95147176, 0.94590294, 0.92362768, 0.95544948, 0.96260939,
       0.93874304, 0.96499602, 0.96499602, 0.93158313, 0.97136038,
       0.96499602])

In [42]:
gs.cv_results_['mean_test_score'].shape

(21,)