In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
candy = pd.read_csv('../candy-data.csv')
X = candy.drop(['competitorname', 'winpercent'], axis=1).values
y = candy['winpercent'].values

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=.8, random_state=111)

In [6]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_estimators=25, random_state=1111)
rfr.fit(X_train, y_train)

RandomForestRegressor(n_estimators=25, random_state=1111)

In [10]:
# Review the parameters of rfr
print(rfr.get_params())

# Maximum Depth
max_depth = [4, 8, 12]

# Minimum samples for a split
min_samples_split = [2, 5, 10]

# Max features 
max_features = [4, 6, 8, 10]

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 25, 'n_jobs': None, 'oob_score': False, 'random_state': 1111, 'verbose': 0, 'warm_start': False}


### Running a model using ranges

You have just finished creating a list of hyperparameters and ranges to use when tuning a predictive model for an assignment. You have used max_depth, min_samples_split, and max_features as your range variable names.
* Instructions

    * Randomly select a max_depth, min_samples_split, and max_features using your range variables.
    * Print out all of the parameters for rfr to see which values were randomly selected.


In [11]:
from sklearn.ensemble import RandomForestRegressor

# Fill in rfr using your variables
rfr = RandomForestRegressor(
    n_estimators=100,
        max_depth=np.random.choice(max_depth),
        min_samples_split=np.random.choice(min_samples_split),
        max_features=np.random.choice(max_features))

# Print out the parameters
print(rfr.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': 12, 'max_features': 8, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


### Preparing for RandomizedSearch

Last semester your professor challenged your class to build a predictive model to predict final exam test scores. You tried running a few different models by randomly selecting hyperparameters. However, running each model required you to code it individually.

After learning about RandomizedSearchCV(), you're revisiting your professors challenge to build the best model. In this exercise, you will prepare the three necessary inputs for completing a random search.
* Instructions

    * Finalize the parameter dictionary by adding a list for the max_depth parameter with options 2, 4, 6, and 8.
    * Create a random forest regression model with ten trees and a random_state of 1111.
    * Create a mean squared error scorer to use.


In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error

# Finish the dictionary by adding the max_depth parameter
param_dist = {"max_depth": [2, 4, 6, 8],
              "max_features": [2, 4, 6, 8, 10],
              "min_samples_split": [2, 4, 8, 16]}

# Create a random forest regression model
rfr = RandomForestRegressor(n_estimators=10, random_state=1111)

# Create a scorer to use (use the mean squared error)
mse = make_scorer(mean_squared_error)

### Implementing RandomizedSearchCV

You are hoping that using a random search algorithm will help you improve predictions for a class assignment. You professor has challenged your class to predict the overall final exam average score.

In preparation for completing a random search, you have created:

* `param_dist`: the hyperparameter distributions
* `rfr`: a random forest regression model
* `scorer`: a scoring method to use


In [17]:
# Import the method for random search
from sklearn.model_selection import RandomizedSearchCV

# Build a random search using param_dist, rfr, and scorer
random_search =\
    RandomizedSearchCV(
        estimator=rfr,
        param_distributions=param_dist,
        n_iter=10,
        cv=5,
        scoring=mse)

In [18]:
random_search.fit(X, y)

RandomizedSearchCV(cv=5,
                   estimator=RandomForestRegressor(n_estimators=10,
                                                   random_state=1111),
                   param_distributions={'max_depth': [2, 4, 6, 8],
                                        'max_features': [2, 4, 6, 8, 10],
                                        'min_samples_split': [2, 4, 8, 16]},
                   scoring=make_scorer(mean_squared_error))

In [20]:
random_search.best_params_, random_search.best_score_

({'min_samples_split': 16, 'max_features': 2, 'max_depth': 2},
 157.10958128701108)

In [21]:
random_search.cv_results_

{'mean_fit_time': array([0.01565943, 0.01094456, 0.01149735, 0.01094909, 0.01079063,
        0.01075001, 0.0105875 , 0.01065345, 0.01175961, 0.01089182]),
 'std_fit_time': array([4.44095528e-03, 3.79393752e-04, 1.05372224e-03, 1.25651468e-04,
        1.40653577e-04, 9.58994569e-05, 1.39963720e-04, 8.39415444e-05,
        1.41356061e-03, 4.20329659e-04]),
 'mean_score_time': array([0.00165858, 0.00128841, 0.00156598, 0.00127311, 0.00131054,
        0.00128212, 0.00127597, 0.00130119, 0.00127263, 0.00136533]),
 'std_score_time': array([3.75253957e-04, 4.74276821e-05, 3.35578174e-04, 2.68736556e-05,
        4.86722989e-05, 5.24151412e-05, 4.43613374e-05, 8.44961095e-05,
        3.14674231e-05, 9.68088181e-05]),
 'param_min_samples_split': masked_array(data=[2, 16, 16, 4, 8, 2, 8, 2, 2, 8],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_max_features': masked_array(dat

In [24]:
max_depth = [item['max_depth'] for item in random_search.cv_results_['params']]
scores = list(random_search.cv_results_['mean_test_score'])
d = pd.DataFrame([max_depth, scores]).T
d.columns = ['Max Depth', 'Score']
d.groupby(['Max Depth']).mean()

Unnamed: 0_level_0,Score
Max Depth,Unnamed: 1_level_1
2.0,147.992448
4.0,125.363437
6.0,141.466611
8.0,143.677738


## Using RandomedSearchCv with Classification problems

### Selecting the best precision model

Your boss has offered to pay for you to see three sports games this year. Of the 41 home games your favorite team plays, you want to ensure you go to three home games that they will definitely win. You build a model to decide which games your team will win.

To do this, you will build a random search algorithm and focus on model precision (to ensure your team wins). You also want to keep track of your best model and best parameters, so that you can use them again next year (if the model does well, of course). You have already decided on using the random forest classification model rfc and generated a parameter distribution param_dist.
- Instructions

    - Create a precision scorer, precision using make_scorer(<scoring_function>).
    - Complete the random search method by using rfc and param_dist.
    - Use rs.cv_results_ to print the mean test scores.
    - Print the best overall score.


In [28]:
tic_tac = pd.read_csv('../tic-tac-toe.csv')
tic_tac.head()

Unnamed: 0,Top-Left,Top-Middle,Top-Right,Middle-Left,Middle-Middle,Middle-Right,Bottom-Left,Bottom-Middle,Bottom-Right,Class
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive


In [29]:
tic_tac['Class'] = tic_tac['Class'].map({
    'positive':1,
    'negative':0
})

In [30]:
 # Create dummy variables using pandas
X = pd.get_dummies(tic_tac.iloc[:,0:9])
y = tic_tac.iloc[:, 9]

In [31]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(random_state=1111)

param_dist = {'max_depth': range(2, 12, 2),
             'min_samples_split': range(2, 12, 2),
             'n_estimators': [10, 25, 50]}


In [33]:
from sklearn.metrics import precision_score, make_scorer

# Create a precision scorer
precision = make_scorer(precision_score)
# Finalize the random search
rs = RandomizedSearchCV(
  estimator=rfc, param_distributions=param_dist,
  scoring = precision,
  cv=5, n_iter=10, random_state=1111)
rs.fit(X, y)

# print the mean test scores:
print('The accuracy for each run was: {}.'.format(rs.cv_results_['mean_test_score']))

print('\n')
# print the best model score:
print('The best accuracy for a single model was: {}'.format(rs.best_score_))

The accuracy for each run was: [0.87614978 0.75561877 0.67740077 0.89141614 0.87024051 0.85772772
 0.68244199 0.82867397 0.88717239 0.91980724].


The best accuracy for a single model was: 0.9198072369317106
