In [1]:
# Import dependencies
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression

In [24]:
# Read data
team_stats_2019 = pd.read_csv('https://storage.googleapis.com/big-data-bowl/2019-team-stats.csv', encoding='latin')
team_stats_2020 = pd.read_csv('https://storage.googleapis.com/big-data-bowl/2020-team-stats.csv', encoding='latin')
game_results_2020 = pd.read_csv('https://storage.googleapis.com/big-data-bowl/2020-game-results.csv', encoding='latin')
game_results_2021 = pd.read_csv('https://storage.googleapis.com/big-data-bowl/2021-game-results.csv', encoding='latin')

games = pd.read_csv('https://storage.googleapis.com/big-data-bowl/games.csv')

In [3]:
# Define team abbreviations
stats_team_abbr_dict = {
    'TB': 'Tampa Bay Buccaneers',
    'ATL': 'Atlanta Falcons',
    'BUF': 'Buffalo Bills',
    'CAR': 'Carolina Panthers',
    'CIN': 'Cincinnati Bengals',
    'DET': 'Detroit Lions',
    'HOU': 'Houston Texans',
    'IND': 'Indianapolis Colts',
    'TEN': 'Tennessee Titans',
    'WAS': 'Washington Redskins',
    'KC': 'Kansas City Chiefs',
    'NE': 'New England Patriots',
    'NO': 'New Orleans Saints',
    'NYG': 'New York Giants',
    'LA': 'Los Angeles Rams',
    'LV': 'Oakland Raiders',
    'CHI': 'Chicago Bears',
    'CLE': 'Cleveland Browns',
    'JAX': 'Jacksonville Jaguars',
    'MIA': 'Miami Dolphins',
    'NYJ': 'New York Jets',
    'PHI': 'Philadelphia Eagles',
    'PIT': 'Pittsburgh Steelers',
    'ARI': 'Arizona Cardinals',
    'LAC': 'Los Angeles Chargers',
    'SEA': 'Seattle Seahawks',
    'BAL': 'Baltimore Ravens',
    'GB': 'Green Bay Packers',
    'DEN': 'Denver Broncos',
    'MIN': 'Minnesota Vikings',
    'SF': 'San Francisco 49ers',
    'DAL': 'Dallas Cowboys'
}


In [4]:
# Clean team columns for matchup prediction DataFrame
matchups = games.copy()
games['homeTeam'] = games['homeTeamAbbr'].apply(lambda x: stats_team_abbr_dict[x])
games['visitorTeam'] = games['visitorTeamAbbr'].apply(lambda x: stats_team_abbr_dict[x])

matchups = games[['homeTeam', 'visitorTeam']]

In [5]:
# Create training data
training_data = game_results_2020.merge(
    team_stats_2019,
    left_on=['Home'],
    right_on=['Team']
)

training_data = training_data.merge(
    team_stats_2019,
    left_on=['Visitor'],
    right_on=['Team'],
    suffixes=('_home', '_visitor')
)

training_data.drop(['Visitor', 'Home', 'Team_visitor', 'Team_home'], axis=1, inplace=True)

In [13]:
# Create prediction data
prediction_data = games[['homeTeam', 'visitorTeam']].copy()

prediction_data = prediction_data.merge(
    team_stats_2020,
    left_on=['homeTeam'],
    right_on=['Team']
)

prediction_data = prediction_data.merge(
    team_stats_2019,
    left_on=['visitorTeam'],
    right_on=['Team'],
    suffixes=('_home', '_visitor')
)

prediction_data.drop(['visitorTeam', 'homeTeam'], axis=1, inplace=True)

In [7]:
# Create home model
X_home = training_data.drop(['Visitor Score', 'Home Score'], axis=1)
y_home = training_data['Home Score']

home_model = LinearRegression()
home_model.fit(X_home, y_home)

score = home_model.score(X_home, y_home)
print(f"R2 Score: {score}")

R2 Score: 0.14000259536850868


In [8]:
# Create visitor model
X_visitor = training_data.drop(['Visitor Score', 'Home Score'], axis=1)
y_visitor = training_data['Visitor Score']

visitor_model = LinearRegression()
visitor_model.fit(X_visitor, y_visitor)

score = visitor_model.score(X_visitor, y_visitor)
print(f"R2 Score: {score}")

R2 Score: 0.1341448312461433


In [25]:
# Predict 2021 scores
home_predictions = home_model.predict(prediction_data.drop(['Team_visitor', 'Team_home'], axis=1))
visitor_predictions = visitor_model.predict(prediction_data.drop(['Team_visitor', 'Team_home'], axis=1))

predicted_data = prediction_data.copy()

predicted_data['predictedHomeScore'] = home_predictions
predicted_data['predictedVisitorScore'] = visitor_predictions

predicted_data['homeWin'] = predicted_data.apply(lambda r: r['predictedHomeScore'] - r['predictedVisitorScore'] > 1, axis=1)

In [28]:
game_results_2021['homeWin'] = game_results_2021.apply(lambda r: r['Home Score'] - r['Visitor Score'] > 1, axis=1)

In [36]:
prediction_results = predicted_data.merge(
    game_results_2021,
    left_on=['Team_home', 'Team_visitor'],
    right_on=['Home', 'Visitor'],
    suffixes=('_pred', '_actual')
)

prediction_results = prediction_results[['Visitor', 'Home', 'homeWin_pred', 'homeWin_actual']]
prediction_results['correct'] = prediction_results.apply(lambda r: int(r['homeWin_pred'] == r['homeWin_actual']), axis=1)

prediction_results['correct'].mean()

0.6635514018691588