In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_auc_score, plot_roc_curve
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
!ls

'NFL Gambling Modeling.ipynb'		 nfl_teams.csv	 spreadspoke_scores.csv
'NFL Gambling Pre-Modeling Work.ipynb'	 pi.csv		 Untitled.ipynb
 nfl_stadiums.csv			 spreadspoke.R


In [3]:
!ls../

/bin/bash: line 1: ls../: No such file or directory


In [4]:
!ls ../Cooper

Untitled.ipynb


In [6]:
# Load the modeling dataset (df) and a second CSV (df_1).
# NOTE(review): the recorded run of this cell raised FileNotFoundError for
# '../Cooper/final_nfl_data.csv', and the earlier `!ls ../Cooper` output lists only
# Untitled.ipynb — confirm where final_nfl_data.csv actually lives.
df = pd.read_csv('../Cooper/final_nfl_data.csv')
# NOTE(review): df_1 is not referenced again in the visible cells — confirm it is needed.
df_1 = pd.read_csv('../Tom/pi.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../Cooper/final_nfl_data.csv'

In [None]:
# Remove the 'Unnamed: 0' column by reassignment instead of in place, and tolerate
# the column already being gone, so re-running this cell does not raise KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Inspect the frame after the 'Unnamed: 0' column has been dropped.
df

In [None]:
# Target is the 'favorite_win' column; every other column is a predictor.
y = df['favorite_win']
X = df.drop(columns=['favorite_win'])

In [None]:
# Hold out 520 rows (an absolute count, not a fraction) as the test set;
# the fixed random_state keeps the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=520,
    random_state=42,
)


In [None]:
# Column dtypes and non-null counts for the training features.
X_train.info()

In [None]:
# Summary statistics for the training features.
X_train.describe()

In [None]:
# Re-attach the target to a copy of the training features so that feature/target
# relationships can be explored without touching X_train itself.
train_df = X_train.assign(favorite_win=y_train)

In [None]:
# Heatmap of the pairwise correlations between the training features and the target.
# The trailing ';' keeps the Axes repr out of the cell output.
sns.heatmap(train_df.corr());

In [None]:
# Show the feature column names.
X_train.columns.tolist()

In [None]:
# Every feature column is handed to the numeric transformer below.
num_columns = X_train.columns.tolist()

In [None]:
# Preprocessing step reused by the pipelines below: standardize every column
# named in num_columns.
num_transformer = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[('nums', num_transformer, num_columns)],
)

# Model-less Baseline

In [None]:
# Class proportions in the training target. The larger share is the accuracy
# obtained by always predicting the majority class.
y_train.value_counts(normalize=True)

# Decision Tree

In [None]:
# Decision tree (depth capped at 15) behind the shared preprocessing step.
# Pipeline.fit returns the pipeline itself, so the fitted object is bound directly.
clf_dt = Pipeline([
    ('preprocessor', preprocessor),
    ('tree', DecisionTreeClassifier(max_depth=15, random_state=42)),
]).fit(X_train, y_train)

# Hard class predictions for each split.
train_preds, test_preds = clf_dt.predict(X_train), clf_dt.predict(X_test)

# Column 1 of predict_proba is the score for the second entry of classes_.
train_probas = clf_dt.predict_proba(X_train)[:, 1]
test_probas = clf_dt.predict_proba(X_test)[:, 1]

In [None]:
# Confusion matrix of the decision-tree pipeline on the held-out test split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed
# in 1.2 (replacement: ConfusionMatrixDisplay.from_estimator) — confirm the pinned version.
plot_confusion_matrix(clf_dt, X_test, y_test);

In [None]:
# ROC curve of the decision-tree pipeline on the held-out test split.
# NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2
# (replacement: RocCurveDisplay.from_estimator) — confirm the pinned version.
plot_roc_curve(clf_dt, X_test, y_test);

In [None]:
# Accuracy, F1 and precision are threshold metrics, so they use the hard predictions.
# ROC-AUC measures ranking quality and must be scored on the predicted probabilities
# (train_probas / test_probas); feeding it 0/1 predictions collapses the curve to a
# single operating point and misreports the AUC.
print(f"Train Accuracy: {accuracy_score(y_train, train_preds):.4f}")
print(f"Train F1: {f1_score(y_train, train_preds):.4f}")
print(f"Train Precision: {precision_score(y_train, train_preds):.4f}")
print(f"Train ROC-AUC: {roc_auc_score(y_train, train_probas):.4f}")
print("*" * 20)
print(f"Test Accuracy: {accuracy_score(y_test, test_preds):.4f}")
print(f"Test F1: {f1_score(y_test, test_preds):.4f}")
print(f"Test Precision: {precision_score(y_test, test_preds):.4f}")
print(f"Test ROC-AUC: {roc_auc_score(y_test, test_probas):.4f}")

In [None]:
def evaluate(model, Xtr, Xte, ytr, yte):
    """Plot test-set diagnostics and print train/test metrics for a fitted classifier.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    Xtr, Xte
        Training and test features.
    ytr, yte
        Training and test labels. Assumes a binary target, since column 1 of
        ``predict_proba`` is used as the positive-class score.

    Returns
    -------
    None
        Draws the confusion matrix and ROC curve for the test split and prints
        F1, precision and ROC-AUC for both splits.
    """
    train_preds = model.predict(Xtr)
    test_preds = model.predict(Xte)

    train_probas = model.predict_proba(Xtr)[:, 1]
    test_probas = model.predict_proba(Xte)[:, 1]

    plot_confusion_matrix(model, Xte, yte)

    plot_roc_curve(model, Xte, yte)

    # ROC-AUC is scored on probabilities, not hard 0/1 predictions: with labels the
    # curve has a single operating point and the reported AUC is wrong.
    print(f"Train F1: {f1_score(ytr, train_preds):.4f}")
    print(f"Train Precision: {precision_score(ytr, train_preds):.4f}")
    print(f"Train ROC-AUC: {roc_auc_score(ytr, train_probas):.4f}")
    print("*" * 20)
    print(f"Test F1: {f1_score(yte, test_preds):.4f}")
    print(f"Test Precision: {precision_score(yte, test_preds):.4f}")
    print(f"Test ROC-AUC: {roc_auc_score(yte, test_probas):.4f}")

In [None]:
# Plots and train/test metrics for the decision-tree pipeline.
evaluate(clf_dt, X_train, X_test, y_train, y_test)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV

In [None]:
# AdaBoost with 100 boosting rounds and a 0.1 learning rate, fit directly on the
# unscaled training features.
ada = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
ada.fit(X_train, y_train)

In [None]:
# Mean accuracy (the classifier's default .score) on each split; a large gap
# between the two would indicate overfitting.
print(f"Train Score: {ada.score(X_train, y_train)}")
print(f"Test Score: {ada.score(X_test, y_test)}")

In [None]:
# Base estimator for the search; only the two hyperparameters in the grid are tuned.
ada = AdaBoostClassifier(random_state=65)

# 4 learning rates x 4 ensemble sizes = 16 candidate settings.
param_grid = {
    'learning_rate': [.01, .1, .5, 1],
    'n_estimators': [10, 50, 100, 1000],
}

# Score each candidate with 5-fold cross-validation on the training data.
gs = GridSearchCV(ada, param_grid, cv=5)
gs.fit(X_train, y_train)

In [None]:
# Score the best estimator found by the grid search on each split.
print(f"Train Score: {gs.best_estimator_.score(X_train, y_train)}")
print(f"Test Score: {gs.best_estimator_.score(X_test, y_test)}")

In [None]:
# Hyperparameter combination selected by the AdaBoost grid search.
gs.best_params_

# Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB, ComplementNB, GaussianNB

In [None]:
# Gaussian Naive Bayes on the standardized features.
nb_steps = [
    ('preprocessor', preprocessor),
    ('nb', GaussianNB()),
]
clf_nb = Pipeline(nb_steps)
clf_nb.fit(X_train, y_train)

In [None]:
# Plots and train/test metrics for the Naive Bayes pipeline.
evaluate(clf_nb, X_train, X_test, y_train, y_test)

# Logistic Regression

In [None]:
# Logistic regression with default settings on the standardized features.
clf_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('logreg', LogisticRegression()),
])
clf_lr.fit(X_train, y_train)

In [None]:
# Plots and train/test metrics for the logistic-regression pipeline.
evaluate(clf_lr, X_train, X_test, y_train, y_test)

# Decision Tree Exploration

In [None]:
# Rebuild and refit the decision-tree pipeline.
# NOTE(review): this cell repeats the earlier "Decision Tree" cell with the same
# settings (random_state=42, max_depth=15) and rebinds the same names — confirm
# whether the duplicate is intentional.
clf_dt = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('tree', DecisionTreeClassifier(random_state=42, max_depth=15))
])

clf_dt.fit(X_train, y_train)

# Hard class predictions for each split.
train_preds = clf_dt.predict(X_train)
test_preds = clf_dt.predict(X_test)

# Column 1 of predict_proba for each split.
train_probas = clf_dt.predict_proba(X_train)[:,1]
test_probas = clf_dt.predict_proba(X_test)[:,1]

In [None]:
def evaluate(model, Xtr, Xte, ytr, yte):
    """Plot test-set diagnostics and print train/test metrics for a fitted classifier.

    NOTE(review): ``evaluate`` is also defined earlier in this notebook; this later
    definition shadows it once the cell runs.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    Xtr, Xte
        Training and test features.
    ytr, yte
        Training and test labels. Assumes a binary target, since column 1 of
        ``predict_proba`` is used as the positive-class score.

    Returns
    -------
    None
        Draws the confusion matrix and ROC curve for the test split and prints
        F1, precision and ROC-AUC for both splits.
    """
    train_preds = model.predict(Xtr)
    test_preds = model.predict(Xte)

    train_probas = model.predict_proba(Xtr)[:, 1]
    test_probas = model.predict_proba(Xte)[:, 1]

    plot_confusion_matrix(model, Xte, yte)

    plot_roc_curve(model, Xte, yte)

    # ROC-AUC is scored on probabilities, not hard 0/1 predictions: with labels the
    # curve has a single operating point and the reported AUC is wrong.
    print(f"Train F1: {f1_score(ytr, train_preds):.4f}")
    print(f"Train Precision: {precision_score(ytr, train_preds):.4f}")
    print(f"Train ROC-AUC: {roc_auc_score(ytr, train_probas):.4f}")
    print("*" * 20)
    print(f"Test F1: {f1_score(yte, test_preds):.4f}")
    print(f"Test Precision: {precision_score(yte, test_preds):.4f}")
    print(f"Test ROC-AUC: {roc_auc_score(yte, test_probas):.4f}")

In [None]:
# Plots and train/test metrics for the refit decision-tree pipeline.
evaluate(clf_dt, X_train, X_test, y_train, y_test)

## Boosting Methods

### AdaBoost

In [None]:
# Second AdaBoost search, this time over smaller learning rates and smaller ensembles.
ada = AdaBoostClassifier(random_state=65)

param_grid = {
    'learning_rate': [.001, .01, .05, .1],
    'n_estimators': [2, 5, 10, 50],
}

# 5-fold cross-validated search over the 16 combinations, fit on the training data.
gs = GridSearchCV(ada, param_grid, cv=5)
gs.fit(X_train, y_train)

In [None]:
# Score the best estimator from the second AdaBoost search on each split.
print(f"Train Score: {gs.best_estimator_.score(X_train, y_train)}")
print(f"Test Score: {gs.best_estimator_.score(X_test, y_test)}")

In [None]:
# Hyperparameter combination selected by the second AdaBoost grid search.
gs.best_params_

### Gradient Boost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Small gradient-boosting model: 10 boosting stages of depth-2 trees.
gb_sklearn = GradientBoostingClassifier(
    n_estimators=10,
    max_depth=2,
    random_state=42,
)
gb_sklearn.fit(X_train, y_train)

In [None]:
# Tune the gradient-boosting learning rate and stage count. max_depth is not in the
# grid, so it stays at the value set on gb_sklearn.
param_grid = {
    'learning_rate': [.001, .01, .05, .1],
    'n_estimators': [2, 5, 10, 50],
}

# 5-fold cross-validated search, fit on the training data.
gs = GridSearchCV(gb_sklearn, param_grid, cv=5)
gs.fit(X_train, y_train)

In [None]:
# Score the fitted gradient-boosting grid search on each split.
print(f"Train Score: {gs.score(X_train, y_train)}")
print(f"Test Score: {gs.score(X_test, y_test)}")

In [None]:
# Hyperparameter combination selected by the gradient-boosting grid search.
gs.best_params_