In [178]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics  import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [179]:
# Initialize df for team data
teams_df = pd.DataFrame(columns=['Team', 'W', 'L', 'T', 'W-L%', 'PF', 'PA', 'PD', 'MoV', 'SoS', 'SRS', 'OSRS', 'DSRS'])
# Populate Team Data with web scraping
def get_team_data():
    teams = []
    W = []
    L = []
    T = []
    W_L = []
    PF = []
    PA = []
    PD = []
    MoV = []
    SoS = []
    SRS = []
    OSRS = []
    DSRS = []
    
    html_text = requests.get('https://www.pro-football-reference.com/years/2022/index.htm').text
    soup = BeautifulSoup(html_text, 'lxml')
    tables = soup.find_all('tbody')
    
    for table in tables:
        teams_data = table.find_all('tr', class_="")
        for team_data in teams_data:
            teams.append(team_data.find('a').text)
            W.append(team_data.find_all('td')[0].text)
            L.append(team_data.find_all('td')[1].text)
            T.append(team_data.find_all('td')[2].text)
            W_L.append(team_data.find_all('td')[3].text)
            PF.append(team_data.find_all('td')[4].text)
            PA.append(team_data.find_all('td')[5].text)
            PD.append(team_data.find_all('td')[6].text)
            MoV.append(team_data.find_all('td')[7].text)
            SoS.append(team_data.find_all('td')[8].text)
            SRS.append(team_data.find_all('td')[9].text)
            OSRS.append(team_data.find_all('td')[10].text)
            DSRS.append(team_data.find_all('td')[11].text)

    teams_df['Team'] = teams
    teams_df['W'] = W
    teams_df['L'] = L
    teams_df['T'] = T
    teams_df['W-L%'] = W_L
    teams_df['PF'] = PF
    teams_df['PA'] = PA
    teams_df['PD'] = PD
    teams_df['MoV'] = MoV
    teams_df['SoS'] = SoS
    teams_df['SRS'] = SRS
    teams_df['OSRS'] = OSRS
    teams_df['DSRS'] = DSRS
get_team_data()

In [180]:
# Display teams data
teams_df

Unnamed: 0,Team,W,L,T,W-L%,PF,PA,PD,MoV,SoS,SRS,OSRS,DSRS
0,Buffalo Bills,6,2,0,0.75,220,118,102,12.8,1.6,14.3,6.2,8.2
1,New York Jets,6,3,0,0.667,196,176,20,2.2,2.6,4.8,1.8,2.9
2,Miami Dolphins,6,3,0,0.667,213,224,-11,-1.2,3.4,2.2,3.2,-1.0
3,New England Patriots,5,4,0,0.556,203,166,37,4.1,-0.2,3.9,-0.1,4.0
4,Baltimore Ravens,6,3,0,0.667,235,196,39,4.3,3.5,7.8,6.4,1.4
5,Cincinnati Bengals,5,4,0,0.556,228,185,43,4.8,0.9,5.7,3.1,2.6
6,Cleveland Browns,3,5,0,0.375,200,199,1,0.1,0.2,0.3,3.0,-2.7
7,Pittsburgh Steelers,2,6,0,0.25,120,197,-77,-9.6,5.4,-4.2,-3.8,-0.3
8,Tennessee Titans,5,3,0,0.625,149,158,-9,-1.1,-0.5,-1.6,-2.7,1.1
9,Indianapolis Colts,3,5,1,0.389,132,183,-51,-5.7,-0.9,-6.6,-6.8,0.1


In [181]:
# Sort teams alphabetically for easy merging
teams_df_sort = teams_df.sort_values(by = 'Team').reset_index(drop=True)

In [182]:
# Initialize df for offense redzone data
off_red_df = pd.DataFrame(columns=['Team', 'off_red_per'])
# Populate offense redzone data with web scraping
def get_off_red_data():
    teams = []
    percent = []
    html_text = requests.get('https://www.teamrankings.com/nfl/stat/red-zone-scoring-pct?date=2022-11-10').text
    soup = BeautifulSoup(html_text, 'lxml')
    table = soup.find('tbody')
    for row in table.find_all('tr'):
        teams.append(row.find('a').text)
        percent.append(row.find_all('td')[2].text)
    off_red_df['Team'] = teams
    off_red_df['off_red_per'] = percent
get_off_red_data()

In [183]:
# Sort teams alphabetically for easy merging
off_red_df_sort = off_red_df.sort_values(by = 'Team').reset_index(drop=True)

In [184]:
# Create new team data df with offense redzone data
new_teams_df = pd.concat([teams_df_sort, off_red_df_sort['off_red_per']], axis=1)
new_teams_df['off_red_per'] = new_teams_df['off_red_per'].str.rstrip('%').astype('float') / 100.0

In [185]:
# Initialize Schedule df for 2022
schedule_df = pd.read_csv('spreadspoke_scores.csv').groupby('schedule_season').get_group(2022)

In [186]:
# Merge team data with schedule data
df = schedule_df.join(new_teams_df.set_index('Team'), on='team_home').join(new_teams_df.set_index('Team'), on='team_away', rsuffix='_away')
pd.set_option('display.max_columns', None)

In [187]:
# Visualize data
df.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail,W,L,T,W-L%,PF,PA,PD,MoV,SoS,SRS,OSRS,DSRS,off_red_per,W_away,L_away,T_away,W-L%_away,PF_away,PA_away,PD_away,MoV_away,SoS_away,SRS_away,OSRS_away,DSRS_away,off_red_per_away
13232,9/8/2022,2022,1,False,Los Angeles Rams,10.0,31.0,Buffalo Bills,BUF,-2.0,52,SoFi Stadium,False,72.0,0.0,,DOME,3,5,0,0.375,131,173,-42,-5.3,1.0,-4.3,-4.0,-0.3,0.5,6,2,0,0.75,220,118,102,12.8,1.6,14.3,6.2,8.2,0.5357
13233,9/11/2022,2022,1,False,Arizona Cardinals,21.0,44.0,Kansas City Chiefs,KC,-6.0,54,State Farm Stadium,False,,,,,3,6,0,0.333,203,241,-38,-4.2,0.1,-4.1,-1.0,-3.1,0.5769,6,2,0,0.75,243,189,54,6.8,-0.6,6.1,9.3,-3.2,0.7353
13234,9/11/2022,2022,1,False,Atlanta Falcons,26.0,27.0,New Orleans Saints,NO,-5.5,44,Mercedes-Benz Stadium,False,,,,,4,6,0,0.4,232,250,-18,-1.8,-1.4,-3.2,0.4,-3.6,0.6207,3,6,0,0.333,212,227,-15,-1.7,-0.6,-2.3,0.7,-3.0,0.5714
13235,9/11/2022,2022,1,False,Carolina Panthers,24.0,26.0,Cleveland Browns,CAR,-1.0,42,Bank of America Stadium,False,,,,,3,7,0,0.3,204,243,-39,-3.9,-1.2,-5.1,-1.9,-3.2,0.5455,3,5,0,0.375,200,199,1,0.1,0.2,0.3,3.0,-2.7,0.6129
13236,9/11/2022,2022,1,False,Chicago Bears,19.0,10.0,San Francisco 49ers,SF,-6.5,38,Soldier Field,False,,,,,3,6,0,0.333,187,216,-29,-3.2,0.7,-2.5,0.7,-3.2,0.5517,4,4,0,0.5,176,147,29,3.6,-1.7,2.0,-1.2,3.2,0.5769


In [188]:
# Get completed games
comp_games = df[df['score_home'].notna()]
# Get uncompleted games
uncomp_games = df[df['score_home'].isna()]

In [189]:
# Determine winner and loser for completed games
for index, row in comp_games.iterrows():
    if comp_games.loc[index, 'score_home'] > comp_games.loc[index, 'score_away']:
        comp_games.loc[index, 'Home_Winner'] = 1
    else:
        comp_games.loc[index, 'Home_Winner'] = 0
    # else:
    #     comp_games.at[index, 'Winner'] = "T"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comp_games.loc[index, 'Home_Winner'] = 0


In [190]:
# Schedule data
schedule_df

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail
13232,9/8/2022,2022,1,False,Los Angeles Rams,10.0,31.0,Buffalo Bills,BUF,-2.0,52,SoFi Stadium,False,72.0,0.0,,DOME
13233,9/11/2022,2022,1,False,Arizona Cardinals,21.0,44.0,Kansas City Chiefs,KC,-6.0,54,State Farm Stadium,False,,,,
13234,9/11/2022,2022,1,False,Atlanta Falcons,26.0,27.0,New Orleans Saints,NO,-5.5,44,Mercedes-Benz Stadium,False,,,,
13235,9/11/2022,2022,1,False,Carolina Panthers,24.0,26.0,Cleveland Browns,CAR,-1.0,42,Bank of America Stadium,False,,,,
13236,9/11/2022,2022,1,False,Chicago Bears,19.0,10.0,San Francisco 49ers,SF,-6.5,38,Soldier Field,False,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13499,1/8/2023,2022,18,False,Philadelphia Eagles,,,New York Giants,,,,Lincoln Financial Field,False,,,,
13500,1/8/2023,2022,18,False,Pittsburgh Steelers,,,Cleveland Browns,,,,Acrisure Stadium,False,,,,
13501,1/8/2023,2022,18,False,San Francisco 49ers,,,Arizona Cardinals,,,,Levi's Stadium,False,,,,
13502,1/8/2023,2022,18,False,Seattle Seahawks,,,Los Angeles Rams,,,,Lumen Field,False,,,,


In [191]:
# Remove unnecessary columns
comp_games.drop(schedule_df.iloc[0:0, 0:4] + schedule_df.iloc[0:0, 5:7] + schedule_df.iloc[0:0, 8:], axis=1, inplace=True)
uncomp_games.drop(schedule_df.iloc[0:0, 0:4] + schedule_df.iloc[0:0, 5:7] + schedule_df.iloc[0:0, 8:], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comp_games.drop(schedule_df.iloc[0:0, 0:4] + schedule_df.iloc[0:0, 5:7] + schedule_df.iloc[0:0, 8:], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uncomp_games.drop(schedule_df.iloc[0:0, 0:4] + schedule_df.iloc[0:0, 5:7] + schedule_df.iloc[0:0, 8:], axis=1, inplace=True)


In [192]:
# Display training data
comp_games

Unnamed: 0,team_home,team_away,W,L,T,W-L%,PF,PA,PD,MoV,SoS,SRS,OSRS,DSRS,off_red_per,W_away,L_away,T_away,W-L%_away,PF_away,PA_away,PD_away,MoV_away,SoS_away,SRS_away,OSRS_away,DSRS_away,off_red_per_away,Home_Winner
13232,Los Angeles Rams,Buffalo Bills,3,5,0,.375,131,173,-42,-5.3,1.0,-4.3,-4.0,-0.3,0.5000,6,2,0,.750,220,118,102,12.8,1.6,14.3,6.2,8.2,0.5357,0.0
13233,Arizona Cardinals,Kansas City Chiefs,3,6,0,.333,203,241,-38,-4.2,0.1,-4.1,-1.0,-3.1,0.5769,6,2,0,.750,243,189,54,6.8,-0.6,6.1,9.3,-3.2,0.7353,0.0
13234,Atlanta Falcons,New Orleans Saints,4,6,0,.400,232,250,-18,-1.8,-1.4,-3.2,0.4,-3.6,0.6207,3,6,0,.333,212,227,-15,-1.7,-0.6,-2.3,0.7,-3.0,0.5714,0.0
13235,Carolina Panthers,Cleveland Browns,3,7,0,.300,204,243,-39,-3.9,-1.2,-5.1,-1.9,-3.2,0.5455,3,5,0,.375,200,199,1,0.1,0.2,0.3,3.0,-2.7,0.6129,0.0
13236,Chicago Bears,San Francisco 49ers,3,6,0,.333,187,216,-29,-3.2,0.7,-2.5,0.7,-3.2,0.5517,4,4,0,.500,176,147,29,3.6,-1.7,2.0,-1.2,3.2,0.5769,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13350,New Orleans Saints,Las Vegas Raiders,3,6,0,.333,212,227,-15,-1.7,-0.6,-2.3,0.7,-3.0,0.5714,2,6,0,.250,183,201,-18,-2.3,-1.9,-4.2,-0.1,-4.0,0.5313,1.0
13351,New York Jets,New England Patriots,6,3,0,.667,196,176,20,2.2,2.6,4.8,1.8,2.9,0.6000,5,4,0,.556,203,166,37,4.1,-0.2,3.9,-0.1,4.0,0.5000,0.0
13352,Philadelphia Eagles,Pittsburgh Steelers,8,0,0,1.000,225,135,90,11.3,-2.0,9.3,5.5,3.8,0.7000,2,6,0,.250,120,197,-77,-9.6,5.4,-4.2,-3.8,-0.3,0.4762,1.0
13353,Seattle Seahawks,New York Giants,6,3,0,.667,241,220,21,2.3,-2.3,0.0,3.6,-3.6,0.4828,6,2,0,.750,163,157,6,0.8,0.2,1.0,-1.8,2.8,0.4615,1.0


In [193]:
# Set X and Y
X = comp_games.drop(['Home_Winner'], 1)
X_pred = uncomp_games
y = comp_games['Home_Winner']

# Scale data for more accurate predictions
from sklearn.preprocessing import scale
cols = [comp_games.columns[2:-1]]
for col in cols:
    X[col] = scale(X[col])
cols2 = [uncomp_games.columns[2:]]
for col in cols2:
    X_pred[col] = scale(X_pred[col])

  X = comp_games.drop(['Home_Winner'], 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_pred[col] = scale(X_pred[col])


In [194]:
# Columnize any strings in dataset
def preprocess_features(X):
    output = pd.DataFrame(index = X.index)
    for col, col_data in X.items():
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix = col)
        output = output.join(col_data)
    return output

X = preprocess_features(X)
X_pred = preprocess_features(X_pred)

In [195]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

In [196]:
# Initialize algorithms
LR_clf = LogisticRegression()
SVC_clf = SVC()
xgb_clf = xgb.XGBClassifier(seed = 1)

# Fit algorithms to training data
LR_clf.fit(X_train, y_train)
SVC_clf.fit(X_train, y_train)
xgb_clf.fit(X_train, y_train)

# Predict using algorithms
LR_pred = LR_clf.predict(X_test)
SVC_pred = SVC_clf.predict(X_test)
XGB_pred = xgb_clf.predict(X_test)

# Cross validate algorithms
LR_score = cross_val_score(LR_clf, X, y)
SVC_score = cross_val_score(SVC_clf, X, y)
XGB_score = cross_val_score(xgb_clf, X, y)

In [197]:
# Print accuracy and score of algorithms
print(f'Linear Regression:', '\nAccuracy:', accuracy_score(y_test, LR_pred), '\nF1 Score: ', f1_score(y_test, LR_pred), '\nCross Validation Score:', LR_score.mean())
print()
print(f'SVC:', '\nAccuracy:', accuracy_score(y_test, SVC_pred), '\nF1 Score: ', f1_score(y_test, SVC_pred), '\nCross Validation Score: ', SVC_score.mean())
print()
print(f'XGBoost:', '\nAccuracy:', accuracy_score(y_test, XGB_pred), '\nF1 Score: ', f1_score(y_test, XGB_pred), '\nCross Validation Score: ', XGB_score.mean())

Linear Regression: 
Accuracy: 0.6923076923076923 
F1 Score:  0.7142857142857143 
Cross Validation Score: 0.5373333333333334

SVC: 
Accuracy: 0.8461538461538461 
F1 Score:  0.8571428571428571 
Cross Validation Score:  0.676

XGBoost: 
Accuracy: 0.7692307692307693 
F1 Score:  0.7999999999999999 
Cross Validation Score:  0.5686666666666667


In [198]:
predictions = SVC_clf.predict(X_pred)

uncomp_games['Home_Winner'] = predictions

cols = uncomp_games.columns.tolist()
cols = cols[-1:] + cols[:-1]
uncomp_games = uncomp_games[cols]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uncomp_games['Home_Winner'] = predictions


In [203]:
# Show predictions for week 9
pd.set_option('display.max_columns', None)
uncomp_games[13:27]

Unnamed: 0,Home_Winner,team_home,team_away,W,L,T,W-L%,PF,PA,PD,MoV,SoS,SRS,OSRS,DSRS,off_red_per,W_away,L_away,T_away,W-L%_away,PF_away,PA_away,PD_away,MoV_away,SoS_away,SRS_away,OSRS_away,DSRS_away,off_red_per_away
13368,1.0,Carolina Panthers,Atlanta Falcons,-0.74197,1.544414,-0.253546,-0.990786,0.493641,1.66247,-0.938783,-0.78544,-0.627658,-1.029238,-0.496903,-0.96224,-0.238138,-0.153225,1.006367,-0.268221,-0.515727,1.285262,1.886618,-0.438005,-0.370469,-0.727165,-0.633223,0.08079,-1.057628,0.522485
13369,1.0,Buffalo Bills,Minnesota Vikings,1.031616,-1.255072,-0.253546,1.190752,0.949251,-2.00055,2.421163,2.489925,0.860177,2.811604,1.60293,2.376119,-0.338137,1.602973,-1.791184,-0.268221,1.776158,0.18434,-0.747457,0.76433,0.776234,-0.40974,0.610017,0.206707,0.683497,0.769264
13370,1.0,Chicago Bears,Detroit Lions,-0.74197,0.984517,-0.253546,-0.830807,0.009556,0.871258,-0.700489,-0.648149,0.381944,-0.514486,0.177117,-0.96224,-0.174873,-1.324024,1.006367,-0.268221,-1.239479,0.043196,1.413076,-1.111312,-1.161299,1.01867,-0.749777,0.710377,-2.001968,1.465282
13371,0.0,Green Bay Packers,Dallas Cowboys,-0.74197,0.984517,-0.253546,-0.830807,-0.93014,0.050741,-0.819636,-0.765827,1.072724,-0.356101,-0.782066,0.32625,-0.702422,1.017574,-1.231674,-0.268221,1.17303,-0.097948,-1.576154,1.19717,1.230961,-0.14522,1.134509,0.307441,1.391752,0.680625
13372,1.0,Kansas City Chiefs,Jacksonville Jaguars,1.031616,-1.255072,-0.253546,1.190752,1.604191,0.080046,1.277352,1.313147,-0.308836,1.188155,2.40657,-0.96224,1.69859,-0.738625,1.006367,-0.268221,-0.839003,0.353712,-0.244319,0.499816,0.440131,-1.679438,-0.167008,0.08079,-0.349374,-0.397145
13373,1.0,Las Vegas Raiders,Indianapolis Colts,-1.333165,0.984517,-0.253546,-1.233179,-0.104347,0.431696,-0.438365,-0.471633,-0.999616,-0.851055,-0.030274,-1.196511,-0.383035,-0.738625,0.446857,3.72827,-0.568802,-1.537615,-0.096337,-1.231545,-1.141528,-0.462644,-1.293694,-1.732422,0.034264,-1.297637
13374,1.0,Los Angeles Rams,Arizona Cardinals,-0.74197,0.42462,-0.253546,-0.627196,-1.58508,-0.388821,-1.010271,-1.060021,0.541355,-0.870853,-1.041304,-0.113008,-0.702422,-0.738625,1.006367,-0.268221,-0.839003,0.466627,1.620251,-0.918938,-0.844967,0.066396,-0.808054,-0.271779,-0.910075,0.081304
13375,1.0,Miami Dolphins,Cleveland Browns,1.031616,-0.695174,-0.253546,0.78838,0.749922,1.105691,-0.271559,-0.25589,1.816641,0.416027,0.825214,-0.317995,1.233285,-0.738625,0.446857,-0.268221,-0.636352,0.381941,0.377204,0.018882,0.005175,0.1193,0.046674,0.735561,-0.792033,0.443919
13376,1.0,New York Giants,Houston Texans,1.031616,-1.255072,-0.253546,1.190752,-0.673859,-0.857688,0.133541,0.136369,0.116259,0.178449,-0.470979,0.794791,-1.095277,-1.909424,1.006367,3.72827,-1.538631,-1.509387,-0.096337,-1.207499,-1.260153,-0.938781,-1.585079,-1.405036,-0.733012,-0.42837
13377,1.0,Pittsburgh Steelers,New Orleans Saints,-1.333165,0.984517,-0.253546,-1.233179,-1.898311,0.314479,-1.8443,-1.903379,2.87938,-0.851055,-0.989456,-0.113008,-0.945278,-0.738625,1.006367,-0.268221,-0.839003,0.720686,1.205902,-0.365865,-0.350699,-0.303932,-0.458393,0.15634,-0.880565,0.025905
