In [84]:
import pandas as pd

In [85]:
matches = pd.read_csv("Regular_season_data.csv", index_col = 0)
#Removing any cancelled matches 
matches = matches[~matches.apply(lambda row: row.astype(str).str.contains('Canceled', case=False, na=False)).any(axis=1)]

In [86]:
# Renaming unnamed columns
matches.rename(columns={'unnamed: 3_level_1' : 'time', 'unnamed: 5_level_1' : 'result', 'unnamed: 8_level_1' : 'venue'}, inplace=True)

In [87]:
# Dictionary mapping abbreviations to full team names
team_mapping = {
    "atl": "Atlanta Falcons",
    "crd": "Arizona Cardinals",
    "buf": "Buffalo Bills",
    "rav": "Baltimore Ravens",
    "car": "Carolina Panthers",
    "chi": "Chicago Bears",
    "cin": "Cincinnati Bengals",
    "cle": "Cleveland Browns",
    "dal": "Dallas Cowboys",
    "den": "Denver Broncos",
    "det": "Detroit Lions",
    "gnb": "Green Bay Packers",
    "htx": "Houston Texans",
    "clt": "Indianapolis Colts",
    "jax": "Jacksonville Jaguars",
    "kan": "Kansas City Chiefs",
    "rai": "Las Vegas Raiders",
    "sdg": "Los Angeles Chargers",
    "ram": "Los Angeles Rams",
    "mia": "Miami Dolphins",
    "min": "Minnesota Vikings",
    "nwe": "New England Patriots",
    "nor": "New Orleans Saints",
    "nyg": "New York Giants",
    "nyj": "New York Jets",
    "phi": "Philadelphia Eagles",
    "pit": "Pittsburgh Steelers",
    "sfo": "San Francisco 49ers",
    "sea": "Seattle Seahawks",
    "tam": "Tampa Bay Buccaneers",
    "oti": "Tennessee Titans",
    "was": "Washington Commanders"
}

# Mapping full_name in the 'team' column
matches['team'] = matches['team'].str.lower().replace(team_mapping)



In [88]:
#Making a day-month-year column
matches['overall_date'] = matches['date'] + '-' + matches['season'].astype(str)
#Converting to date_time object
matches["overall_date"] = pd.to_datetime(matches["overall_date"])

In [89]:
matches.head()

Unnamed: 0,week,day,date,time,unnamed: 4_level_1,result,ot,rec,venue,opp,...,totyd.1,passy.1,rushy.1,to.1,offense,defense,sp. tms,season,team,overall_date
0,1,Sun,September 8,1:00PM ET,boxscore,W,,1-0,,Arizona Cardinals,...,270.0,146.0,124.0,1.0,13.51,-3.22,-2.25,2024,Buffalo Bills,2024-09-08
1,2,Thu,September 12,8:15PM ET,boxscore,W,,2-0,@,Miami Dolphins,...,351.0,212.0,139.0,3.0,8.59,14.11,-2.53,2024,Buffalo Bills,2024-09-12
2,3,Mon,September 23,7:30PM ET,boxscore,W,,3-0,,Jacksonville Jaguars,...,239.0,147.0,92.0,2.0,28.39,12.44,-6.15,2024,Buffalo Bills,2024-09-23
3,4,Sun,September 29,8:20PM ET,boxscore,L,,3-1,@,Baltimore Ravens,...,427.0,156.0,271.0,1.0,-6.92,-19.69,2.37,2024,Buffalo Bills,2024-09-29
4,5,Sun,October 6,1:00PM ET,boxscore,L,,3-2,@,Houston Texans,...,425.0,331.0,94.0,2.0,-1.59,0.68,-2.09,2024,Buffalo Bills,2024-10-06


In [90]:
#Removing bye weeks
matches = matches[matches.opp != 'Bye Week']

In [91]:
#Making season an int object
matches['season'] = pd.to_numeric(matches['season'], errors='coerce')

In [92]:
#Adding venue code to separate home and away matches where home = 1 away = 0
matches["venue_code"] = matches["venue"].apply(lambda x: 1 if pd.isna(x) else 0)

In [93]:
#Adding opposition code
matches["opp_code"] = matches["opp"].astype("category").cat.codes

In [94]:
#Converting creating hour column from time
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")

In [95]:
#Getting an integer value for day thus creating new column called day_code, useful for predicting
# Eg. Thursday football usually tough and influences results 
day_mapping = {
    "Mon": 0, "Tue": 1, "Wed": 2, "Thu": 3,
    "Fri": 4, "Sat": 5, "Sun": 6 }

matches["day_code"] = matches["day"].map(day_mapping)

In [96]:
#Creating a target column to see if Team won or not 
#If result is W give value of 1, if L or Tie give value of 0
#astype("int") converts True or False to 1 or 0
matches["target"] = (matches.result == 'W').astype("int")

In [97]:
#Importing RandomForest classifier 
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier

In [98]:
forest = RandomForestClassifier(n_estimators = 50, min_samples_split = 10, random_state=1)

In [99]:
train = matches[matches.overall_date < '2024-01-01']

In [100]:
test = matches[matches.overall_date > '2024-01-01']

In [101]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]
result = "target"

In [102]:
forest.fit(train[predictors], train[result])

In [103]:
predictions = forest.predict(test[predictors])

In [104]:
from sklearn.metrics import accuracy_score, precision_score


In [105]:
accuracy = accuracy_score(test[result], predictions)
accuracy #Need to improve accuracy 

0.53125

In [106]:
precision = precision_score(test[result], predictions)
precision 

0.5313653136531366

In [107]:
grouped_matches = matches.groupby("team")

In [108]:
#Want to improve precision using rolling averages
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("overall_date")
    rolling_stats = group[cols].rolling(2, closed = 'left').mean() #closed = 'left' ensures we don't use future data
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols) #Removes all rows that have missing values eg. week 1 doesn't have values before
    return group

In [109]:
cols = ["tm", "opp.1", "totyd", "to", "totyd.1", "to.1"] #Will use points for, against, total yards turnovers for 
#both offence and defence in rolling averages
#Converting type into float then into int
matches[cols] = matches[cols].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)
new_cols = [f"{c}_rolling" for c in cols]

In [110]:
#rolling_averages(group, cols, new_cols)

In [111]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


In [112]:
matches_rolling = matches_rolling.droplevel('team')

In [113]:
matches_rolling.index = range(matches_rolling.shape[0]) #Getting unique index for each match 

In [114]:
def make_predictions(data, predictors): #Function does the whole fitting of model and calculating precision
    train = data[data.overall_date < '2024-01-01']
    test = data[data.overall_date > '2024-01-01']
    forest.fit(train[predictors], train[result])
    predictions = forest.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test[result], predicted=predictions), index = test.index)
    precision = precision_score(test[result], predictions)
    return combined, precision  

In [115]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols) 
#Getting predicted dataframe and precision 

In [116]:
precision #Precision has improved 

0.5579710144927537

In [117]:
combined = combined.merge(matches_rolling[["overall_date", "team", "opp", "result"]], left_index=True, right_index=True)

In [118]:
combined

Unnamed: 0,actual,predicted,overall_date,team,opp,result
32,1,0,2024-01-05,Arizona Cardinals,San Francisco 49ers,W
33,0,0,2024-09-08,Arizona Cardinals,Buffalo Bills,L
34,1,1,2024-09-15,Arizona Cardinals,Los Angeles Rams,W
35,0,1,2024-09-22,Arizona Cardinals,Detroit Lions,L
36,0,1,2024-09-29,Arizona Cardinals,Washington Commanders,L
...,...,...,...,...,...,...
1561,0,1,2024-11-24,Washington Commanders,Dallas Cowboys,L
1562,1,0,2024-12-01,Washington Commanders,Tennessee Titans,W
1563,1,1,2024-12-15,Washington Commanders,New Orleans Saints,W
1564,1,1,2024-12-22,Washington Commanders,Philadelphia Eagles,W


In [119]:
merged_predictions = combined.merge(combined, left_on = ['overall_date', 'team'], right_on = ['overall_date', 'opp'])

In [120]:
merged_predictions

Unnamed: 0,actual_x,predicted_x,overall_date,team_x,opp_x,result_x,actual_y,predicted_y,team_y,opp_y,result_y
0,1,0,2024-01-05,Arizona Cardinals,San Francisco 49ers,W,0,1,San Francisco 49ers,Arizona Cardinals,L
1,0,0,2024-09-08,Arizona Cardinals,Buffalo Bills,L,1,1,Buffalo Bills,Arizona Cardinals,W
2,1,1,2024-09-15,Arizona Cardinals,Los Angeles Rams,W,0,1,Los Angeles Rams,Arizona Cardinals,L
3,0,1,2024-09-22,Arizona Cardinals,Detroit Lions,L,1,1,Detroit Lions,Arizona Cardinals,W
4,0,1,2024-09-29,Arizona Cardinals,Washington Commanders,L,1,0,Washington Commanders,Arizona Cardinals,W
...,...,...,...,...,...,...,...,...,...,...,...
539,0,1,2024-11-24,Washington Commanders,Dallas Cowboys,L,1,1,Dallas Cowboys,Washington Commanders,W
540,1,0,2024-12-01,Washington Commanders,Tennessee Titans,W,0,1,Tennessee Titans,Washington Commanders,L
541,1,1,2024-12-15,Washington Commanders,New Orleans Saints,W,0,0,New Orleans Saints,Washington Commanders,L
542,1,1,2024-12-22,Washington Commanders,Philadelphia Eagles,W,0,1,Philadelphia Eagles,Washington Commanders,L


In [121]:
#Calculating accuracy by extracting how many times TeamX predicted to win
#and teamY predicted to lose when actual game was a win or loss 

merged_predictions[(merged_predictions["predicted_x"] == 1) & (merged_predictions["predicted_y"]==0)]["actual_x"].value_counts()

actual_x
1    84
0    52
Name: count, dtype: int64

In [122]:
52/84 #Accuracy score has improved greatly 

0.6190476190476191