In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
import os

In [15]:
# Full team name -> team abbreviation. Several names share one abbreviation
# (for example, all three Raiders entries map to 'LVR').
team_dict = {
    'Arizona Cardinals': 'ARI',
    'Atlanta Falcons': 'ATL',
    'Baltimore Colts': 'IND',
    'Baltimore Ravens': 'BAL',
    'Boston Patriots': 'NE',
    'Buffalo Bills': 'BUF',
    'Carolina Panthers': 'CAR',
    'Chicago Bears': 'CHI',
    'Cincinnati Bengals': 'CIN',
    'Cleveland Browns': 'CLE',
    'Dallas Cowboys': 'DAL',
    'Denver Broncos': 'DEN',
    'Detroit Lions': 'DET',
    'Green Bay Packers': 'GB',
    'Houston Oilers': 'TEN',
    'Houston Texans': 'HOU',
    'Indianapolis Colts': 'IND',
    'Jacksonville Jaguars': 'JAX',
    'Kansas City Chiefs': 'KC',
    'Las Vegas Raiders': 'LVR',
    'Los Angeles Chargers': 'LAC',
    'Los Angeles Raiders': 'LVR',
    'Los Angeles Rams': 'LAR',
    'Miami Dolphins': 'MIA',
    'Minnesota Vikings': 'MIN',
    'New England Patriots': 'NE',
    'New Orleans Saints': 'NO',
    'New York Giants': 'NYG',
    'New York Jets': 'NYJ',
    'Oakland Raiders': 'LVR',
    'Philadelphia Eagles': 'PHI',
    'Phoenix Cardinals': 'ARI',
    'Pittsburgh Steelers': 'PIT',
    'San Diego Chargers': 'LAC',
    'San Francisco 49ers': 'SF',
    'Seattle Seahawks': 'SEA',
    'St. Louis Cardinals': 'ARI',
    'St. Louis Rams': 'LAR',
    'Tampa Bay Buccaneers': 'TB',
    'Tennessee Oilers': 'TEN',
    'Tennessee Titans': 'TEN',
    'Washington Commanders': 'WAS',
    'Washington Football Team': 'WAS',
    'Washington Redskins': 'WAS',
}

filename = 'nfl_teams.json'

# Write the mapping to disk as JSON (overwrites any existing file).
with open(filename, 'w') as json_file:
    json_file.write(json.dumps(team_dict))

In [16]:
# Read the historical games/odds CSV and keep only seasons from 2002 onward.
directory_path = os.path.join('nfldata', 'betting_odds')
file_path = os.path.join(directory_path, 'games_1967_present.csv')
games = pd.read_csv(file_path).loc[lambda frame: frame['schedule_season'] >= 2002]

## Baseline Model I: Pick the Home Team

In [17]:
# Count the games whose favorite is recorded as 'PICK'
int(games['team_favorite_id'].eq('PICK').sum())

34

There are 34 games from 2002 to the present in which neither team is favored and the line is 0.

In [18]:
# Drop pick'em games (no favorite). The explicit copy makes `games` an
# independent frame, so the column assignments below do not write into a
# slice of the loaded data (which raises pandas' SettingWithCopyWarning).
games = games.loc[games['team_favorite_id'] != 'PICK'].copy()

# Fail loudly on any team name without an abbreviation, as a plain dictionary
# lookup would, instead of letting `map` silently produce NaN codes.
unknown_teams = set(games['team_home']).union(games['team_away']) - set(team_dict)
if unknown_teams:
    raise KeyError(f'No abbreviation in team_dict for: {sorted(map(str, unknown_teams))}')

# Translate full team names into abbreviations so they can be compared
# directly with `team_favorite_id`.
games['home'] = games['team_home'].map(team_dict)
games['away'] = games['team_away'].map(team_dict)

def determine_spread_winner(row):
    """
    Return which side covered the point spread for a single game.

    The favorite's spread (`spread_favorite`) is added to the favorite's
    score: the home score when `team_favorite_id` equals `home`, otherwise
    the away score. The two adjusted scores are then compared.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'team_favorite_id', 'home', 'away', 'score_home',
        'score_away' and 'spread_favorite'.

    Returns
    -------
    row['home'] if the home side has the higher adjusted score,
    row['away'] if the away side does, or the string 'TIE' when the
    adjusted scores are equal.
    """
    favorite = row['team_favorite_id']
    home_team, away_team = row['home'], row['away']
    home_points, away_points = row['score_home'], row['score_away']
    line = row['spread_favorite']

    # Apply the line to whichever side is the favorite.
    if favorite == home_team:
        home_points = home_points + line
    else:
        away_points = away_points + line

    # Equal adjusted scores mean nobody covered.
    if home_points == away_points:
        return 'TIE'
    return home_team if home_points > away_points else away_team
        
# Label every game with the side that covered the spread, then tally outcomes.
games['spread_cover'] = games.apply(determine_spread_winner, axis=1)
ties = games['spread_cover'].eq('TIE').sum()
home_wins = games['spread_cover'].eq(games['home']).sum()
away_wins = games['spread_cover'].eq(games['away']).sum()
favorite_wins = games['spread_cover'].eq(games['team_favorite_id']).sum()
# Ties against the spread are left out of the denominator.
num_games = len(games) - ties

In [19]:
# Cover rates over decided games only: `num_games` is the game count minus
# the ties against the spread.
home_win_percentage = home_wins / num_games
away_win_percentage = away_wins / num_games
favorite_win_percentage = favorite_wins / num_games
# The underdog rate is taken as the complement of the favorite rate.
underdog_win_percentage = 1 - favorite_win_percentage

# Format specs are applied directly in the f-strings; the printed text is
# the same as with the previous nested "{:.4%}".format(...) calls.
print('Win Percentages for Covering Spread 2002-Present:')
print(f'Home Win Percentage: {home_win_percentage:.4%}')
print(f'Away Win Percentage: {away_win_percentage:.4%}')
print(f'Favorite Win Percentage: {favorite_win_percentage:.4%}')
print(f'Underdog Win Percentage: {underdog_win_percentage:.4%}')

Win Percentages for Covering Spread 2002-Present:
Home Win Percentage: 48.8559%
Away Win Percentage: 51.1441%
Favorite Win Percentage: 48.5764%
Underdog Win Percentage: 51.4236%


## Baseline Model II: Logistic Regression

In [20]:
# Import packages for logistic regression in order to determine who will cover the spread
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


In [21]:
# Preview the first ten rows of the games dataframe
games.head(n=10)

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail,home,away,spread_cover
7872,9/5/2002,2002,1,False,New York Giants,13,16,San Francisco 49ers,SF,-4.0,39.0,Giants Stadium,False,75.0,12.0,58.0,,NYG,SF,NYG
7873,9/8/2002,2002,1,False,Buffalo Bills,31,37,New York Jets,NYJ,-3.0,43.0,Ralph Wilson Stadium,False,75.0,7.0,50.0,,BUF,NYJ,NYJ
7875,9/8/2002,2002,1,False,Chicago Bears,27,23,Minnesota Vikings,CHI,-4.5,41.0,Memorial Stadium (Champaign),False,76.0,5.0,75.0,,CHI,MIN,MIN
7876,9/8/2002,2002,1,False,Cincinnati Bengals,6,34,San Diego Chargers,CIN,-3.0,37.0,Paul Brown Stadium,False,81.0,5.0,50.0,,CIN,LAC,LAC
7877,9/8/2002,2002,1,False,Cleveland Browns,39,40,Kansas City Chiefs,CLE,-2.0,36.0,FirstEnergy Stadium,False,78.0,7.0,54.0,,CLE,KC,KC
7878,9/8/2002,2002,1,False,Denver Broncos,23,16,St. Louis Rams,LAR,-3.0,51.0,Sports Authority Field at Mile High,False,73.0,13.0,45.0,,DEN,LAR,DEN
7879,9/8/2002,2002,1,False,Green Bay Packers,37,34,Atlanta Falcons,GB,-7.0,42.5,Lambeau Field,False,72.0,6.0,78.0,,GB,ATL,ATL
7880,9/8/2002,2002,1,False,Houston Texans,19,10,Dallas Cowboys,DAL,-8.5,33.5,Reliant Stadium,False,72.0,0.0,,indoor,HOU,DAL,HOU
7881,9/8/2002,2002,1,False,Jacksonville Jaguars,25,28,Indianapolis Colts,IND,-3.5,44.0,EverBank Field,False,82.0,14.0,77.0,,JAX,IND,JAX
7882,9/8/2002,2002,1,False,Miami Dolphins,49,21,Detroit Lions,MIA,-9.5,35.5,Sun Life Stadium,False,83.0,9.0,80.0,,MIA,DET,MIA


In [22]:
def determine_binary_spread_winner(row):
    """
    Encode one game's against-the-spread result from the favorite's side.

    The favorite's spread (`spread_favorite`) is added to the favorite's
    score: the home score when `team_favorite_id` equals `home`, otherwise
    the away score. The result is compared with the other team's raw score.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'team_favorite_id', 'home', 'score_home',
        'score_away' and 'spread_favorite'.

    Returns
    -------
    int
        Despite the function name, one of three codes:
        1 -- the favorite covers the spread
        0 -- the underdog covers the spread
        2 -- the adjusted scores are equal (a push)
    """
    spread = row['spread_favorite']

    # Any favorite id that is not the home abbreviation is treated as the
    # away team being favored.
    if row['team_favorite_id'] == row['home']:
        adjusted_score_favorite = row['score_home'] + spread
        adjusted_score_underdog = row['score_away']
    else:
        adjusted_score_favorite = row['score_away'] + spread
        adjusted_score_underdog = row['score_home']

    # Determine the winner against the spread
    if adjusted_score_underdog == adjusted_score_favorite:
        return 2
    return 1 if adjusted_score_favorite > adjusted_score_underdog else 0
        

In [23]:

# Create a new dataframe with the columns we need; rows with a missing value
# in any of these columns are dropped.
spread_data = games[['team_home', 'team_away', 'team_favorite_id', 'spread_favorite', 'score_home', 'score_away', 'weather_temperature']]
spread_data = spread_data.dropna()

# Exclude pick'em games using this frame's own column: a mask built from
# `games` has a different index once rows have been dropped above. The copy
# keeps the column assignments below from writing into a slice.
spread_data = spread_data.loc[spread_data['team_favorite_id'] != 'PICK'].copy()

# Fail loudly on any team name without an abbreviation, as a plain dictionary
# lookup would, instead of letting `map` silently produce NaN codes.
unknown_teams = set(spread_data['team_home']).union(spread_data['team_away']) - set(team_dict)
if unknown_teams:
    raise KeyError(f'No abbreviation in team_dict for: {sorted(map(str, unknown_teams))}')

# Abbreviations comparable with `team_favorite_id`.
spread_data['home'] = spread_data['team_home'].map(team_dict)
spread_data['away'] = spread_data['team_away'].map(team_dict)

# Target column produced by determine_binary_spread_winner for each row.
spread_data['spread_cover'] = spread_data.apply(determine_binary_spread_winner, axis=1)
spread_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4933 entries, 7872 to 13800
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   team_home            4933 non-null   object 
 1   team_away            4933 non-null   object 
 2   team_favorite_id     4933 non-null   object 
 3   spread_favorite      4933 non-null   float64
 4   score_home           4933 non-null   int64  
 5   score_away           4933 non-null   int64  
 6   weather_temperature  4933 non-null   float64
 7   home                 4933 non-null   object 
 8   away                 4933 non-null   object 
 9   spread_cover         4933 non-null   int64  
dtypes: float64(2), int64(3), object(5)
memory usage: 423.9+ KB


In [24]:
# Drop the rows coded 2 (adjusted scores equal) so only the 0/1 outcomes remain.
spread_data_no_ties = spread_data[spread_data['spread_cover'].ne(2)]
spread_data_no_ties.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4798 entries, 7872 to 13800
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   team_home            4798 non-null   object 
 1   team_away            4798 non-null   object 
 2   team_favorite_id     4798 non-null   object 
 3   spread_favorite      4798 non-null   float64
 4   score_home           4798 non-null   int64  
 5   score_away           4798 non-null   int64  
 6   weather_temperature  4798 non-null   float64
 7   home                 4798 non-null   object 
 8   away                 4798 non-null   object 
 9   spread_cover         4798 non-null   int64  
dtypes: float64(2), int64(3), object(5)
memory usage: 412.3+ KB


In [25]:
# Calculate the win percentages for the favorite and underdog on the
# modelling subset (rows coded 2 have already been removed).
# NOTE: this rebinds `favorite_wins`, `num_games` and both percentage
# variables, which the earlier baseline cell also assigns; after this cell
# they refer to `spread_data_no_ties`, not to the full `games` frame.
favorite_wins = (spread_data_no_ties['spread_cover'] == 1).sum()
underdog_wins = (spread_data_no_ties['spread_cover'] == 0).sum()
num_games = len(spread_data_no_ties)

favorite_win_percentage = favorite_wins / num_games
underdog_win_percentage = underdog_wins / num_games

# Format specs are applied directly in the f-strings; the printed text is
# the same as with the previous nested "{:.4%}".format(...) calls.
print('Win Percentages for Covering Spread 2002-Present:')
print(f'Favorite Win Percentage: {favorite_win_percentage:.4%}')
print(f'Underdog Win Percentage: {underdog_win_percentage:.4%}')


Win Percentages for Covering Spread 2002-Present:
Favorite Win Percentage: 48.6661%
Underdog Win Percentage: 51.3339%


In [26]:
# Perform Logistic Regression on the data

# Features: every column except the target and the final scores (the target
# is computed from the scores, so keeping them would leak the answer).
# NOTE(review): both the full-name columns (`team_home`/`team_away`) and the
# abbreviation columns (`home`/`away`) are one-hot encoded below, so the same
# team identity is encoded twice. Consider dropping one pair; it is left
# unchanged here so the feature matrix keeps its existing columns.
X = spread_data_no_ties.drop(columns=['spread_cover', 'score_home', 'score_away'])
y = spread_data_no_ties['spread_cover']

# One-hot encode the string columns, dropping the first level of each.
X = pd.get_dummies(X, drop_first=True)

# Split the data into training and testing sets (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

logreg = LogisticRegression(max_iter=1000, solver='lbfgs')
logreg.fit(X_train, y_train)

# Make predictions
y_pred = logreg.predict(X_test)

# Display the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

# Display the classification report
print(classification_report(y_test, y_pred))

# Display the coefficients of the model: one row per feature. The columns are
# named explicitly; concatenating two unnamed frames labelled both columns 0,
# which made them impossible to select by label.
coefficients = pd.DataFrame({'feature': X.columns, 'coefficient': logreg.coef_[0]})
print(coefficients)

# Display the accuracy of the model
accuracy = logreg.score(X_test, y_test)
print(f'Accuracy: {accuracy:.4%}')

X.head()

[[296 196]
 [266 202]]
              precision    recall  f1-score   support

           0       0.53      0.60      0.56       492
           1       0.51      0.43      0.47       468

    accuracy                           0.52       960
   macro avg       0.52      0.52      0.51       960
weighted avg       0.52      0.52      0.52       960

                              0         0
0               spread_favorite  0.026650
1           weather_temperature -0.004853
2     team_home_Atlanta Falcons  0.023141
3    team_home_Baltimore Ravens  0.068808
4       team_home_Buffalo Bills -0.096544
..                          ...       ...
162                    away_SEA -0.053414
163                     away_SF -0.089335
164                     away_TB -0.166705
165                    away_TEN -0.023164
166                    away_WAS  0.498833

[167 rows x 2 columns]
Accuracy: 51.8750%


Unnamed: 0,spread_favorite,weather_temperature,team_home_Atlanta Falcons,team_home_Baltimore Ravens,team_home_Buffalo Bills,team_home_Carolina Panthers,team_home_Chicago Bears,team_home_Cincinnati Bengals,team_home_Cleveland Browns,team_home_Dallas Cowboys,...,away_NO,away_NYG,away_NYJ,away_PHI,away_PIT,away_SEA,away_SF,away_TB,away_TEN,away_WAS
7872,-4.0,75.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7873,-3.0,75.0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7875,-4.5,76.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7876,-3.0,81.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7877,-2.0,78.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Create a test dataframe with a hand-made point from a game not in the original data to test the model

