# Preprocessing and Model Selection

In [1]:
'''import needed'''
import pandas as pd
pd.set_option('mode.chained_assignment', None)

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder



# Model Evaluation
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.dummy import DummyRegressor, DummyClassifier
from sklearn.neural_network import MLPClassifier


### Read Cleaned Data into Pandas

In [2]:
df = pd.read_csv('cleaned_data/games_1.csv', index_col = 0)
df.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,team_away,score_home,score_away,winner,loser,...,score_difference,score_total,stadium,stadium_neutral,stadium_type,stadium_elevation,weather_temperature,weather_wind_mph,weather_detail,weather_humidity
170,1/15/1967,1966,22,True,Green Bay Packers,Kansas City Chiefs,35.0,10.0,Home,Away,...,-25.0,45.0,Los Angeles Memorial Coliseum,True,outdoor,29.6,54.0,7.0,No Precip,90.0
350,1/14/1968,1967,22,True,Green Bay Packers,Oakland Raiders,33.0,14.0,Home,Away,...,-19.0,47.0,Orange Bowl,True,outdoor,8.8,60.0,12.0,No Precip,74.0
538,1/12/1969,1968,22,True,Baltimore Colts,New York Jets,7.0,16.0,Away,Home,...,9.0,23.0,Orange Bowl,True,outdoor,8.8,66.0,12.0,rain,80.0
727,1/11/1970,1969,22,True,Kansas City Chiefs,Minnesota Vikings,23.0,7.0,Home,Away,...,-16.0,30.0,Tulane Stadium,True,outdoor,184.10755,55.0,14.0,rain,84.0
916,1/17/1971,1970,22,True,Baltimore Colts,Dallas Cowboys,16.0,13.0,Home,Away,...,-3.0,29.0,Orange Bowl,True,outdoor,8.8,59.0,11.0,No Precip,60.0


# Regression models to predict spread of games

## Separate the numeric and categorical features for preprocessing of Regression Models


### Create copy of dataframe and standardize the numeric features

In [3]:
df2 = df.copy()
df2 = df2[['schedule_season', 'schedule_week', 'schedule_playoff', 'team_home', 'team_away', 
        'score_home', 'score_away', 'stadium_neutral', 'stadium_type', 'weather_temperature', 
       'weather_detail', 'weather_wind_mph', 'score_difference']]

In [4]:
# Preprocessing Step
numeric_features2 = df2.select_dtypes(include=['int64','float64']).columns
categorical_features2 = df2.select_dtypes(exclude=['int64','float64']).columns



In [5]:
standardized_df2 = df2.copy()
# Standardize the numeric columns
scaler = StandardScaler()
standardized_df2[numeric_features2] = scaler.fit_transform(standardized_df2[numeric_features2])



In [6]:
standardized_df2.head()

Unnamed: 0,schedule_season,schedule_week,schedule_playoff,team_home,team_away,score_home,score_away,stadium_neutral,stadium_type,weather_temperature,weather_detail,weather_wind_mph,score_difference
170,-2.804535,2.331942,True,Green Bay Packers,Kansas City Chiefs,1.182043,-1.0024,True,outdoor,-0.412174,No Precip,-0.019619,-1.529757
350,-2.725739,2.331942,True,Green Bay Packers,Oakland Raiders,0.988887,-0.605162,True,outdoor,-0.003997,No Precip,0.89015,-1.118771
538,-2.646942,2.331942,True,Baltimore Colts,New York Jets,-1.522147,-0.406544,True,outdoor,0.40418,rain,0.89015,0.799168
727,-2.568146,2.331942,True,Kansas City Chiefs,Minnesota Vikings,0.023105,-1.300328,True,outdoor,-0.344145,rain,1.254058,-0.913277
916,-2.48935,2.331942,True,Baltimore Colts,Dallas Cowboys,-0.652943,-0.704472,True,outdoor,-0.072027,No Precip,0.708196,-0.022806


### Handle Categorical features for regression models, encode

In [7]:
encoded_df2 = pd.get_dummies(standardized_df2, columns=categorical_features2)
encoded_df2.head()


Unnamed: 0,schedule_season,schedule_week,score_home,score_away,weather_temperature,weather_wind_mph,score_difference,schedule_playoff_False,schedule_playoff_True,team_home_Arizona Cardinals,...,stadium_type_retractable,weather_detail_No Precip,weather_detail_fog,weather_detail_indoor,weather_detail_rain,weather_detail_rain | fog,weather_detail_retractable (open roof),weather_detail_snow,weather_detail_snow | Freezing rain,weather_detail_snow | fog
170,-2.804535,2.331942,1.182043,-1.0024,-0.412174,-0.019619,-1.529757,False,True,False,...,False,True,False,False,False,False,False,False,False,False
350,-2.725739,2.331942,0.988887,-0.605162,-0.003997,0.89015,-1.118771,False,True,False,...,False,True,False,False,False,False,False,False,False,False
538,-2.646942,2.331942,-1.522147,-0.406544,0.40418,0.89015,0.799168,False,True,False,...,False,False,False,False,True,False,False,False,False,False
727,-2.568146,2.331942,0.023105,-1.300328,-0.344145,1.254058,-0.913277,False,True,False,...,False,False,False,False,True,False,False,False,False,False
916,-2.48935,2.331942,-0.652943,-0.704472,-0.072027,0.708196,-0.022806,False,True,False,...,False,True,False,False,False,False,False,False,False,False


## Establish X,y/Independent and Dependent variables, regression

In [8]:
X_reg = encoded_df2.drop(['score_difference', 'score_home', 'score_away'], axis=1)

In [9]:
X_reg.head()

Unnamed: 0,schedule_season,schedule_week,weather_temperature,weather_wind_mph,schedule_playoff_False,schedule_playoff_True,team_home_Arizona Cardinals,team_home_Atlanta Falcons,team_home_Baltimore Colts,team_home_Baltimore Ravens,...,stadium_type_retractable,weather_detail_No Precip,weather_detail_fog,weather_detail_indoor,weather_detail_rain,weather_detail_rain | fog,weather_detail_retractable (open roof),weather_detail_snow,weather_detail_snow | Freezing rain,weather_detail_snow | fog
170,-2.804535,2.331942,-0.412174,-0.019619,False,True,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
350,-2.725739,2.331942,-0.003997,0.89015,False,True,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
538,-2.646942,2.331942,0.40418,0.89015,False,True,False,False,True,False,...,False,False,False,False,True,False,False,False,False,False
727,-2.568146,2.331942,-0.344145,1.254058,False,True,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
916,-2.48935,2.331942,-0.072027,0.708196,False,True,False,False,True,False,...,False,True,False,False,False,False,False,False,False,False


In [10]:
y_reg = encoded_df2['score_difference']

## Train/Test Split

In [11]:
# Split data into train/test sets
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X_reg, y_reg, test_size = 0.3, random_state = 42)

# Dummy Regressor 

In [12]:
dr = DummyRegressor(strategy="mean")
dr.fit(X_reg_train, y_reg_train)
y_dr_pred = dr.predict(X_reg_test) 


In [13]:
# Evaluate model performance 
print('Dummy Regression Model Performance')
print('MAE:', mean_absolute_error(y_reg_test, y_dr_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_reg_test, y_dr_pred)))
print('MSE:', mean_squared_error(y_reg_test, y_dr_pred))


Dummy Regression Model Performance
MAE: 0.7693004960849951
RMSE: 0.9896508068572772
MSE: 0.9794087195132598


# Random Forest Regression

In [14]:
# Create and train a random forest model
rf = RandomForestRegressor(random_state = 42)

rf.fit(X_reg_train, y_reg_train)
y_pred_rfr = rf.predict(X_reg_test)


In [15]:
print('Random Forest Regression Model Performance')
print('MAE:', mean_absolute_error(y_reg_test, y_pred_rfr))
print('RMSE:', np.sqrt(mean_squared_error(y_reg_test, y_pred_rfr)))
print('MSE:', mean_squared_error(y_reg_test, y_pred_rfr))


Random Forest Regression Model Performance
MAE: 0.7739113517547872
RMSE: 0.9862260946243877
MSE: 0.9726419097180717


## Linear Regression

In [16]:
lr = LinearRegression()

In [17]:
lr.fit(X_reg_train, y_reg_train)
y_pred_lr = lr.predict(X_reg_test)

print('Random Forest Regression Model Performance')
print('MAE:', mean_absolute_error(y_reg_test, y_pred_lr))
print('RMSE:', np.sqrt(mean_squared_error(y_reg_test, y_pred_lr)))
print('MSE:', mean_squared_error(y_reg_test, y_pred_lr))

Random Forest Regression Model Performance
MAE: 0.7649861263439318
RMSE: 0.978142428864451
MSE: 0.9567626111448477


## Decision Tree Regression

In [18]:
dt = DecisionTreeRegressor()
dt.fit(X_reg_train, y_reg_train)
y_pred_dtr = dt.predict(X_reg_test) 

print('Random Forest Regression Model Performance')
print('MAE:', mean_absolute_error(y_reg_test, y_pred_dtr))
print('RMSE:', np.sqrt(mean_squared_error(y_reg_test, y_pred_dtr)))
print('MSE:', mean_squared_error(y_reg_test, y_pred_dtr))

Random Forest Regression Model Performance
MAE: 1.0836845124731305
RMSE: 1.3703842365366423
MSE: 1.8779529557481163


There is not a large difference between the dummy regressor and the regression models, except the decision tree that performs very poorly. More features would be need to properly predict the spread. This makes sense. The outcome of a team is essentially an aggregate of the players performance 

# Classifier


In [19]:

standardized_df = df.copy()
standardized_df = df[['schedule_season', 'schedule_week', 'schedule_playoff', 'team_home', 'team_away', 
         'winner', 'stadium_neutral', 'stadium_type', 'weather_temperature', 
       'weather_detail', 'weather_wind_mph']]

numeric_features = standardized_df.select_dtypes(include=['int64','float64']).columns

categorical_features = standardized_df.select_dtypes(exclude=['int64','float64']).columns

# Standardize the numeric columns
scaler = StandardScaler()
standardized_df[numeric_features] = scaler.fit_transform(standardized_df[numeric_features])

# Now your numeric columns are standardized, excluding the specified columns
standardized_df.head()


Unnamed: 0,schedule_season,schedule_week,schedule_playoff,team_home,team_away,winner,stadium_neutral,stadium_type,weather_temperature,weather_detail,weather_wind_mph
170,-2.804535,2.331942,True,Green Bay Packers,Kansas City Chiefs,Home,True,outdoor,-0.412174,No Precip,-0.019619
350,-2.725739,2.331942,True,Green Bay Packers,Oakland Raiders,Home,True,outdoor,-0.003997,No Precip,0.89015
538,-2.646942,2.331942,True,Baltimore Colts,New York Jets,Away,True,outdoor,0.40418,rain,0.89015
727,-2.568146,2.331942,True,Kansas City Chiefs,Minnesota Vikings,Home,True,outdoor,-0.344145,rain,1.254058
916,-2.48935,2.331942,True,Baltimore Colts,Dallas Cowboys,Home,True,outdoor,-0.072027,No Precip,0.708196


In [20]:
# Creating a instance of label Encoder.
le = LabelEncoder()

for col in standardized_df:
    label = le.fit_transform(standardized_df[col])
    standardized_df[col] = label
 
standardized_df.head()

Unnamed: 0,schedule_season,schedule_week,schedule_playoff,team_home,team_away,winner,stadium_neutral,stadium_type,weather_temperature,weather_detail,weather_wind_mph
170,0,21,1,12,17,1,1,1,56,0,7
350,1,21,1,12,28,1,1,1,62,0,13
538,2,21,1,2,27,0,1,1,69,3,13
727,3,21,1,17,23,1,1,1,57,3,15
916,4,21,1,2,9,1,1,1,61,0,12


In [21]:
X_cat = standardized_df.drop(columns=['winner'])

In [22]:
X_cat.head()

Unnamed: 0,schedule_season,schedule_week,schedule_playoff,team_home,team_away,stadium_neutral,stadium_type,weather_temperature,weather_detail,weather_wind_mph
170,0,21,1,12,17,1,1,56,0,7
350,1,21,1,12,28,1,1,62,0,13
538,2,21,1,2,27,1,1,69,3,13
727,3,21,1,17,23,1,1,57,3,15
916,4,21,1,2,9,1,1,61,0,12


In [23]:
y_cat = standardized_df['winner']
y_cat.head

<bound method NDFrame.head of 170      1
350      1
538      0
727      1
916      1
        ..
13589    0
13590    0
13591    1
13592    1
13593    1
Name: winner, Length: 11098, dtype: int64>

In [24]:
X_cat_train, X_cat_test, y_cat_train, y_cat_test = train_test_split(X_cat, y_cat, test_size=0.3, random_state=42 )

# Dummy Classifier

In [25]:
#Establish Classifier Model
dr = DummyClassifier(strategy="stratified", random_state=42)
#Fit model using training sets
dr.fit(X_cat_train, y_cat_train)
#Predict
y_cat_pred = dr.predict(X_cat_test) 



In [26]:
# Evaluate model performance 
print('DUMMY Classifier Model Performance')
print('MAE:', mean_absolute_error(y_cat_test, y_cat_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_cat_test, y_cat_pred)))
print('MSE:', mean_squared_error(y_cat_test, y_cat_pred))
print('Accuracy:', accuracy_score(y_cat_test, y_cat_pred))

DUMMY Classifier Model Performance
MAE: 0.4885885885885886
RMSE: 0.7015641041217774
MSE: 0.4921921921921922
Accuracy: 0.5132132132132132


In [27]:
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_cat_train, y_cat_train)

y_pred_rf_clf = dr.predict(X_cat_test) 

In [28]:
# Evaluate model performance 
print('Random Forest Classifier Model Performance')
print('MAE:', mean_absolute_error(y_cat_test, y_pred_rf_clf))
print('RMSE:', np.sqrt(mean_squared_error(y_cat_test, y_pred_rf_clf)))
print('MSE:', mean_squared_error(y_cat_test, y_pred_rf_clf))
print('Accuracy:', accuracy_score(y_cat_test, y_pred_rf_clf))

Random Forest Classifier Model Performance
MAE: 0.4885885885885886
RMSE: 0.7015641041217774
MSE: 0.4921921921921922
Accuracy: 0.5132132132132132


Not much better than the dummy classifier. No need to do parameter tuning due to poor performance. 

In [29]:
dt_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
dt_clf.fit(X_cat_train, y_cat_train)
y_pred_dt_clf = dt_clf.predict(X_cat_test)


In [30]:
# Evaluate model performance 
print('Decision Tree Classifier Model Performance')
print('MAE:', mean_absolute_error(y_cat_test, y_pred_dt_clf))
print('RMSE:', np.sqrt(mean_squared_error(y_cat_test, y_pred_dt_clf)))
print('MSE:', mean_squared_error(y_cat_test, y_pred_dt_clf))
print('Accuracy:', accuracy_score(y_cat_test, y_pred_dt_clf))

Decision Tree Classifier Model Performance
MAE: 0.43633633633633634
RMSE: 0.6605575950182817
MSE: 0.43633633633633634
Accuracy: 0.5636636636636637


The dummy classifier has better performance. No need to do parameter tuning due to poor performance. 

In [32]:
 NN_clf = MLPClassifier(solver='adam', alpha=1e-5, 
    hidden_layer_sizes=(6, 3), random_state=1)
NN_clf.fit(X_cat_train, y_cat_train)


In [33]:
y_pred_NN = NN_clf.predict(X_cat_test)

In [34]:
# Evaluate model performance 
print('Neural Net Classifier Model Performance')
print('MAE:', mean_absolute_error(y_cat_test, y_pred_NN))
print('RMSE:', np.sqrt(mean_squared_error(y_cat_test, y_pred_NN)))
print('MSE:', mean_squared_error(y_cat_test, y_pred_NN))
print('Accuracy:', accuracy_score(y_cat_test, y_pred_NN))

Neural Net Classifier Model Performance
MAE: 0.43123123123123125
RMSE: 0.6566819863763824
MSE: 0.43123123123123125
Accuracy: 0.5687687687687688


Not much better than the dummy classifier. No need to do parameter tuning due to poor performance. 

None of the models outperform the dummy models. Additional features are needed from other data. Team performance is essentially an aggregate of play performance. Looking at the individual players performance and injury status would be a good start, espececially with the transient nature of team makeup in the NFL. Coaches and coach performance would also be a good dataset to integrate. 