In [1640]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from sklearn.metrics  import f1_score,accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB

In [1657]:
# Initialize df for team data
teams_df = pd.DataFrame(columns=['Team', 'W', 'L', 'T', 'W-L%', 'PF', 'PA', 'PD', 'MoV', 'SoS', 'SRS', 'OSRS', 'DSRS'])
# Populate Team Data with web scraping
def get_team_data():
    teams = []
    W = []
    L = []
    T = []
    W_L = []
    PF = []
    PA = []
    PD = []
    MoV = []
    SoS = []
    SRS = []
    OSRS = []
    DSRS = []
    
    html_text = requests.get('https://www.pro-football-reference.com/years/2022/index.htm').text
    soup = BeautifulSoup(html_text, 'lxml')
    tables = soup.find_all('tbody')
    
    for table in tables:
        teams_data = table.find_all('tr', class_="")
        for team_data in teams_data:
            teams.append(team_data.find('a').text)
            W.append(team_data.find_all('td')[0].text)
            L.append(team_data.find_all('td')[1].text)
            T.append(team_data.find_all('td')[2].text)
            W_L.append(team_data.find_all('td')[3].text)
            PF.append(team_data.find_all('td')[4].text)
            PA.append(team_data.find_all('td')[5].text)
            PD.append(team_data.find_all('td')[6].text)
            MoV.append(team_data.find_all('td')[7].text)
            SoS.append(team_data.find_all('td')[8].text)
            SRS.append(team_data.find_all('td')[9].text)
            OSRS.append(team_data.find_all('td')[10].text)
            DSRS.append(team_data.find_all('td')[11].text)
    teams_df['Team'] = teams
    teams_df['W'] = W
    teams_df['L'] = L
    teams_df['T'] = T
    teams_df['W-L%'] = W_L
    teams_df['PF'] = PF
    teams_df['PA'] = PA
    teams_df['PD'] = PD
    teams_df['MoV'] = MoV
    teams_df['SoS'] = SoS
    teams_df['SRS'] = SRS
    teams_df['OSRS'] = OSRS
    teams_df['DSRS'] = DSRS
get_team_data()
teams_df

Unnamed: 0,Team,W,L,T,W-L%,PF,PA,PD,MoV,SoS,SRS,OSRS,DSRS
0,Buffalo Bills,6,2,0,0.75,220,118,102,12.8,1.5,14.3,6.1,8.2
1,New York Jets,6,3,0,0.667,196,176,20,2.2,2.5,4.7,1.8,2.9
2,Miami Dolphins,6,3,0,0.667,213,224,-11,-1.2,3.4,2.2,3.2,-1.0
3,New England Patriots,5,4,0,0.556,203,166,37,4.1,-0.3,3.9,-0.1,3.9
4,Baltimore Ravens,6,3,0,0.667,235,196,39,4.3,3.4,7.7,6.4,1.3
5,Cincinnati Bengals,5,4,0,0.556,228,185,43,4.8,0.9,5.6,3.0,2.7
6,Cleveland Browns,3,5,0,0.375,200,199,1,0.1,0.2,0.3,2.9,-2.5
7,Pittsburgh Steelers,2,6,0,0.25,120,197,-77,-9.6,5.4,-4.2,-3.8,-0.4
8,Tennessee Titans,5,3,0,0.625,149,158,-9,-1.1,-0.6,-1.7,-2.8,1.1
9,Indianapolis Colts,3,5,1,0.389,132,183,-51,-5.7,-1.0,-6.7,-6.8,0.1


In [1658]:
# Initialize Schedule df for 2022
schedule_df = pd.read_csv('spreadspoke_scores.csv').groupby('schedule_season').get_group(2022)
schedule_df

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail
13232,9/8/2022,2022,1,False,Los Angeles Rams,10.0,31.0,Buffalo Bills,BUF,-2.0,52,SoFi Stadium,False,72.0,0.0,,DOME
13233,9/11/2022,2022,1,False,Arizona Cardinals,21.0,44.0,Kansas City Chiefs,KC,-6.0,54,State Farm Stadium,False,,,,
13234,9/11/2022,2022,1,False,Atlanta Falcons,26.0,27.0,New Orleans Saints,NO,-5.5,44,Mercedes-Benz Stadium,False,,,,
13235,9/11/2022,2022,1,False,Carolina Panthers,24.0,26.0,Cleveland Browns,CAR,-1.0,42,Bank of America Stadium,False,,,,
13236,9/11/2022,2022,1,False,Chicago Bears,19.0,10.0,San Francisco 49ers,SF,-6.5,38,Soldier Field,False,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13499,1/8/2023,2022,18,False,Philadelphia Eagles,,,New York Giants,,,,Lincoln Financial Field,False,,,,
13500,1/8/2023,2022,18,False,Pittsburgh Steelers,,,Cleveland Browns,,,,Acrisure Stadium,False,,,,
13501,1/8/2023,2022,18,False,San Francisco 49ers,,,Arizona Cardinals,,,,Levi's Stadium,False,,,,
13502,1/8/2023,2022,18,False,Seattle Seahawks,,,Los Angeles Rams,,,,Lumen Field,False,,,,


In [1643]:
df = schedule_df.merge(teams_df, left_on='team_home', right_on='Team', how='left').merge(teams_df, left_on='team_away', right_on='Team', how='left')
# df.drop(['schedule_date', 'schedule_season', 'schedule_week', 'schedule_playoff', 'stadium', 'weather_temperature', 'weather_wind_mph', 'weather_humidity', 'weather_detail'], axis=1, inplace=True)

In [1644]:
# Get completed games
comp_games = df[df['score_home'].notna()]
# Get uncompleted games
uncomp_games = df[df['score_home'].isna()]

In [1645]:
for index, row in comp_games.iterrows():
    if comp_games.loc[index, 'score_home'] > comp_games.loc[index, 'score_away']:
        comp_games.loc[index, 'Home_Winner'] = True
    else:
        comp_games.loc[index, 'Home_Winner'] = False
    # else:
    #     comp_games.at[index, 'Winner'] = 'Tie'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comp_games.loc[index, 'Home_Winner'] = False


In [1646]:
comp_games.drop(schedule_df.iloc[0:0], axis=1, inplace=True)
uncomp_games.drop(schedule_df.iloc[0:0], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comp_games.drop(schedule_df.iloc[0:0], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uncomp_games.drop(schedule_df.iloc[0:0], axis=1, inplace=True)


In [1647]:
y = comp_games['Home_Winner'].astype(int)
comp_games_X = comp_games.copy()
comp_games_X.drop(['Home_Winner', 'Team_x', 'Team_y'], axis=1, inplace=True)
X = comp_games_X

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2)

In [1655]:
comp_games.columns

Index(['Team_x', 'W_x', 'L_x', 'T_x', 'W-L%_x', 'PF_x', 'PA_x', 'PD_x',
       'MoV_x', 'SoS_x', 'SRS_x', 'OSRS_x', 'DSRS_x', 'Team_y', 'W_y', 'L_y',
       'T_y', 'W-L%_y', 'PF_y', 'PA_y', 'PD_y', 'MoV_y', 'SoS_y', 'SRS_y',
       'OSRS_y', 'DSRS_y', 'Home_Winner'],
      dtype='object')

In [1649]:
DT = DecisionTreeClassifier()
LSVC = LinearSVC()
GND = GaussianNB()
DT.fit(X_train,y_train)
LSVC.fit(X_train,y_train)
GND.fit(X_train,y_train)



In [1650]:
dt_pred = DT.predict(X_test)
lsvc_pred = LSVC.predict(X_test)
gnd_pred = GND.predict(X_test)

In [1651]:
print(accuracy_score(y_test,dt_pred))
print(f1_score(y_test,dt_pred))

0.46153846153846156
0.5882352941176471


In [1652]:
print(accuracy_score(y_test,lsvc_pred))
print(f1_score(y_test,lsvc_pred))

0.7692307692307693
0.8571428571428571


In [1653]:
print(accuracy_score(y_test,gnd_pred))
print(f1_score(y_test,gnd_pred))

0.7692307692307693
0.823529411764706


In [1654]:
# cross validation