In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

    # Train:Test split
from sklearn.model_selection import train_test_split
    # Scaling
from sklearn.preprocessing import StandardScaler

In [7]:
# Load the combined dataset into a DataFrame. The path is relative to the
# notebook's working directory and carries no file extension; it is parsed as
# CSV with pandas' default settings.
data = pd.read_csv(filepath_or_buffer='../data/processed/combined')

In [8]:
# Target vector: the `winner_is_fighter1` column.
y = data['winner_is_fighter1']

# Feature matrix: every remaining column except the two fighter columns
# (presumably fighter identifiers/names -- confirm against the dataset) and
# the target itself, so the label cannot leak into the features.
X = data.drop(columns=['fighter1', 'fighter2', 'winner_is_fighter1'])

In [9]:
# Hold out 30% of the rows as a test set. Stratifying on y keeps the class
# proportions of the target the same in both partitions, and the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

# Report rows, columns and share of the full feature matrix for each partition.
print(
    'X_Train: \n\tObservations: {} \tFeatures: {} \t{}% of data'.format(
        *X_train.shape, len(X_train)/len(X)*100
    )
)
print(
    'X_Test: \n\tObservations: {} \tFeatures: {} \t{}% of data'.format(
        *X_test.shape, len(X_test)/len(X)*100
    )
)

X_Train: 
	Observations: 3199 	Features: 45 	70.0% of data
X_Test: 
	Observations: 1371 	Features: 45 	30.0% of data


In [10]:
# Random-forest hyper-parameter search via a pipeline + grid search.
# Adapted from:
# https://github.com/learn-co-curriculum/dsc-pipelines-lab/tree/solution

# Construct pipeline: a single step named 'clf' holding the classifier.
# random_state pins the forest's randomness so repeated runs are comparable.
pipe_rf = Pipeline([
    ('clf', RandomForestClassifier(random_state=123, n_jobs=2)),
])

# Set grid search params.
# Keys follow the '<step name>__<parameter>' convention so each value is routed
# to the 'clf' step. The grid has 3 * 1 * 3 * 3 * 3 = 81 candidate settings.
# Float values for min_samples_leaf / min_samples_split are interpreted by
# scikit-learn as fractions of the training samples, not absolute counts.
param_grid_forest = [
    {
        'clf__n_estimators': [100, 110, 120],
        'clf__criterion': ['gini'],
        'clf__max_depth': [4, 5, 6],
        'clf__min_samples_leaf': [0.05, 0.1, 0.2],
        'clf__min_samples_split': [0.05, 0.1, 0.2],
    }
]

# Construct grid search: every candidate is scored by accuracy with 5-fold
# cross-validation; training-fold scores are recorded as well.
# NOTE(review): the saved cell output reports "3 folds ... 243 fits", which does
# not match cv=5 (5 * 81 = 405 fits) -- the output looks stale; re-run the cell.
gs_rf = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid_forest,
    scoring='accuracy',
    cv=5,
    verbose=1,
    return_train_score=True,
)

# Fit using grid search (training partition only; the test set stays untouched).
gs_rf.fit(X_train, y_train)

# Best mean cross-validated accuracy, shown to three decimals.
# The original spec '{:3f}' was missing the dot: it requested a minimum field
# width of 3 and therefore printed the default six decimals.
print('Best accuracy: {:.3f}'.format(gs_rf.best_score_))

# Best params
print('\nBest params:\n', gs_rf.best_params_)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Best accuracy: 0.705846

Best params:
 {'clf__criterion': 'gini', 'clf__max_depth': 4, 'clf__min_samples_leaf': 0.1, 'clf__min_samples_split': 0.05, 'clf__n_estimators': 100}


[Parallel(n_jobs=1)]: Done 243 out of 243 | elapsed:  1.5min finished
