## Prepare and Clean NFL Datasets

In [1]:
import os
import pandas as pd

In [3]:
# years of evaluation
start_year = 2014
end_year = 2023

# datasets files
teams_dataset_file = os.path.join("datasets", "teams.csv")
standings_dataset_file = os.path.join("datasets" ,"standings.csv")
games_dataset_file = os.path.join("datasets", "games.csv")

teams_cleaned_dataset_file = os.path.join("datasets", "teams_clean.csv")
standings_cleaned_dataset_file = os.path.join("datasets" ,"standings_clean.csv")
games_cleaned_dataset_file = os.path.join("datasets", "games_clean.csv")

# seasons to evaluate
seasons_to_evaluate = [year for year in range(start_year, end_year + 1)]
# read teams dataset
teams = pd.read_csv(teams_dataset_file)

# removing irrelevant columns
teams.drop(["nfl", "nfl_team_id", "espn", "pfr", "pff", "pfflabel", "fo", "location", "short_location", "nickname", "hyphenated", "sbr", "sbr_wins", "sbr_name", "draft_kings"]
    , axis=1
    , inplace=True)
#display(teams.sample(10))

# keep last season records and set index on team short name
teams = teams[teams["season"] == end_year]
teams.set_index("team", inplace=True)
teams.drop("season", axis=1, inplace=True)
teams.rename(columns={"full" : "team_name"}, inplace=True)

display(teams)


Unnamed: 0_level_0,team_name
team,Unnamed: 1_level_1
ARI,Arizona Cardinals
ATL,Atlanta Falcons
BAL,Baltimore Ravens
BUF,Buffalo Bills
CAR,Carolina Panthers
CHI,Chicago Bears
CIN,Cincinnati Bengals
CLE,Cleveland Browns
DAL,Dallas Cowboys
DEN,Denver Broncos


In [4]:
# dictionaries for relocated teams: {"PREV_TEAM_ID" : "NEW_TEAM_ID"}
relocated_teams = {"OAK" : "LV", "SD" : "LAC", "STL" : "LA"}

# generic function to replace relocated team ids
def replace_relocated_teams(target_data_frame, column_name, relocated_teams):
    for prev_team_id, new_team_id in relocated_teams.items():
        target_data_frame.loc[target_data_frame[column_name] == prev_team_id, column_name] = new_team_id

In [5]:
# read standings dataset
all_standings = pd.read_csv(standings_dataset_file)
display(all_standings.sample(10))

# removing irrelevant columns
all_standings.drop(["sov", "sos"]
    , axis=1
    , inplace=True)
display(all_standings.sample(10))
print(all_standings.shape)

Unnamed: 0,season,conf,division,team,wins,losses,ties,pct,div_rank,scored,allowed,net,sov,sos,seed,playoff
314,2011,NFC,NFC South,NO,13,3,0,0.8125,1,547,339,208,0.442308,0.441406,3.0,LostDV
373,2013,NFC,NFC North,DET,7,9,0,0.4375,3,395,376,19,0.401786,0.457031,,
427,2015,AFC,AFC South,TEN,3,13,0,0.1875,4,299,423,-124,0.375,0.492188,,
131,2006,AFC,AFC East,NYJ,10,6,0,0.625,2,316,295,21,0.4,0.46875,5.0,LostWC
32,2003,AFC,AFC East,BUF,6,10,0,0.375,3,243,279,-36,0.4375,0.570312,,
485,2017,AFC,AFC North,CIN,7,9,0,0.4375,3,290,349,-59,0.321429,0.464844,,
523,2018,AFC,AFC South,TEN,9,7,0,0.5625,3,310,303,7,0.465278,0.519531,,
266,2010,AFC,AFC South,JAX,8,8,0,0.5,2,353,419,-66,0.382812,0.453125,,
496,2017,NFC,NFC East,DAL,9,7,0,0.5625,2,354,332,22,0.4375,0.496094,,
129,2006,AFC,AFC East,MIA,6,10,0,0.375,4,260,283,-23,0.53125,0.542969,,


Unnamed: 0,season,conf,division,team,wins,losses,ties,pct,div_rank,scored,allowed,net,seed,playoff
173,2007,AFC,AFC West,KC,4,12,0,0.25,3,226,335,-109,,
449,2016,AFC,AFC East,MIA,10,6,0,0.625,2,363,380,-17,6.0,LostWC
48,2003,NFC,NFC East,DAL,10,6,0,0.625,2,289,260,29,6.0,LostWC
697,2023,NFC,NFC South,CAR,2,15,0,0.117647,4,236,416,-180,,
569,2019,NFC,NFC South,CAR,5,11,0,0.3125,4,340,470,-130,,
341,2012,NFC,NFC North,DET,4,12,0,0.25,4,372,437,-65,,
323,2012,AFC,AFC East,NYJ,6,10,0,0.375,3,281,375,-94,,
691,2023,NFC,NFC East,WAS,4,13,0,0.235294,4,329,518,-189,,
364,2013,AFC,AFC West,DEN,13,3,0,0.8125,1,606,399,207,1.0,LostSB
320,2012,AFC,AFC East,BUF,6,10,0,0.375,4,344,435,-91,,


(736, 14)
