In [5]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
pd.set_option('display.max_columns', None)



from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score

In [2]:
df = pd.read_csv('../Data/finalDataFrame.csv')

In [3]:
#creating features and results dataframes
X = df.drop('team1Win', axis=1)
y = df.team1Win

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1234)

In [10]:
#initialize dictionary of pipelines
pipelines = {
    #sample entry
    #'lasso':make_pipeline(SStandardScaler(), lasso(random_state=1))
    # 'svm': make_pipeline(StandardScaler(), SVC()),
    # 'knn': make_pipeline(StandardScaler(), KNeighborsClassifier()),
    'l1' : make_pipeline(StandardScaler(), LogisticRegression(penalty='l1', solver='liblinear', random_state=123)),
    'l2' : make_pipeline(StandardScaler(), LogisticRegression(random_state=123)),
    'rf' : make_pipeline(StandardScaler(), RandomForestClassifier(random_state=123)),
    'gb' : make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=123))
}

#indvidual hyperparameter dictionaries
svc_hyperparameters = {
    'svc__C':[0.1, 1, 10, 100, 1000],
    'svc__gamma':[1,0.1,0.01, 0.001, 0.0001]
}


knn_hyperparameters = {
    'kneighborsclassifier__n_neighbors': list(range(1,31)),
    'kneighborsclassifier__weights': ['uniform','distance']
}

l1_hyperparameters = {
    'logisticregression__C' : [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000]
}

l2_hyperparameters = {
    'logisticregression__C' : [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000]
}

rf_hyperparameters = {
    'randomforestclassifier__n_estimators' : [100,200],
    'randomforestclassifier__max_features' : ['auto', 'sqrt', 0.33],
    'randomforestclassifier__min_samples_leaf' : [1,3,5,10]
}

gb_hyperparameters = {
    'gradientboostingclassifier__n_estimators' : [100,200],
    'gradientboostingclassifier__learning_rate' : [0.05, 0.1, 0.2],
    'gradientboostingclassifier__max_depth' : [1,3,5]
}


#initialize dictionaries of hyperparameters
#must use same keys as pipelines
hyperparameters = {
    'svm':svc_hyperparameters,
    'knn':knn_hyperparameters,
    'l1' : l1_hyperparameters,
    'l2' : l2_hyperparameters,
    'rf' : rf_hyperparameters,
    'gb' : gb_hyperparameters
}

In [11]:
#print(pipelines['knn'].get_params())

In [12]:
#fitted models
fitted = {}
#setup for loop for training best models
for name, pipeline in pipelines.items():
    model = GridSearchCV(pipeline, hyperparameters[name], cv=10, n_jobs=-1)
    model.fit(X_train, y_train)
    fitted[name] = model
    print('{} has been fitted'.format(name))

l1 has been fitted
l2 has been fitted
rf has been fitted
gb has been fitted


In [18]:
#checking for performance
from sklearn.metrics import r2_score, mean_absolute_error

#see performances on training set and test set
for name, model in fitted.items():
    print(name, 'train:', model.best_score_)

for name, model in fitted.items():

    pred = fitted[name].predict(X_test)
    mat = confusion_matrix(y_test, pred)
    print(mat)

    pred_scores = fitted[name].predict_proba(X_test)
    pred_scores = [p[1] for p in pred_scores]
    print(name, 'roc score:', roc_auc_score(y_test,pred_scores))

    print()


l1 train: 0.564625
l2 train: 0.545625
rf train: 0.5740000000000001
gb train: 0.553875
[[632 377]
 [445 546]]
l1 roc score: 0.6157298741198037

[[554 455]
 [455 536]]
l2 roc score: 0.5578831885382716

[[627 382]
 [433 558]]
rf roc score: 0.618514099642071

[[623 386]
 [450 541]]
gb roc score: 0.6120265741525064



In [65]:
    print(name, 'test r2_score:', r2_score(y_test, pred))
    print(name, 'test MAE:', mean_absolute_error(y_test, pred))
    print()

knn test r2_score: -0.9681594209130937
knn test MAE: 0.492

