In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import requests
from bs4 import BeautifulSoup
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from tabulate import tabulate
import warnings
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder

In [2]:
def takeSecond(elem):
    return elem[3]

In [3]:
warnings.filterwarnings('ignore')
le = preprocessing.LabelEncoder()

In [4]:
symbols = ['CRD', 'ATL', 'RAV', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET', 'GNB', 'HTX', 'CLT', 'JAX', 'KAN', 'RAI', 'SDG', 'RAM', 'MIA', 'MIN', 'NWE', 'NOR', 'NYG', 'NYJ', 'PHI', 'PIT', 'SFO', 'SEA', 'TAM', 'OTI', 'WAS']
team_names = ['Arizona Cardinals', 'Atlanta Falcons', 'Baltimore Ravens', 'Buffalo Bills', 'Carolina Panthers', 'Chicago Bears', 'Cincinnati Bengals', 'Cleveland Browns', 'Dallas Cowboys', 'Denver Broncos', 'Detroit Lions', 'Green Bay Packers', 'Houston Texans', 'Indianapolis Colts', 'Jacksonville Jaguars', 'Kansas City Chiefs', 'Las Vegas Raiders', 'Los Angeles Chargers', 'Los Angeles Rams', 'Miami Dolphins', 'Minnesota Vikings', 'New England Patriots', 'New Orleans Saints', 'New York Giants', 'New York Jets', 'Philadelphia Eagles', 'Pittsburgh Steelers', 'San Francisco 49ers', 'Seattle Seahawks', 'Tampa Bay Buccaneers', 'Tennessee Titans', 'Washington Football Team']

In [5]:
def get_new_data(team, year):
    sym = symbols[team_names.index(team)].lower()
    url = f'https://www.pro-football-reference.com/teams/{sym}/{year}.htm'
    html = requests.get(url).text
    soup = BeautifulSoup(html,'html.parser')
    table = soup.find('table', id='games')
    tablerows = table.find_all('tr')[2:]
    data = []

    for tablerow in tablerows:
        data.append([tabledata.get_text(strip=True) for tabledata in tablerow.find_all('td')])

    df = pd.DataFrame(data)
    index = [0,1,4,8,9,10, 7] + list(range(11,21))
    new_data = df.iloc[:,index].copy()
    col_names = ['day', 'date', 'result', 'opponent', 'tm_score', 'opp_score','location', '1stD_offense', 'TotYd_offense', 'PassY_offense', 'RushY_offense', 'TO_offense', '1stD_defense', 'TotYd_defense', 'PassY_defense', 'RushY_defense', 'TO_defense']
    new_data.columns = col_names
    result_encoder = {'result': {'L': 0, 'T': 0,'W': 1,'' : pd.NA},
                     'TO_offense' : {'' : 0},
                     'TO_defense' : {'' : 0}},
    location_encoder = {'location': {'@' : 0, '': 1}}
    new_data.replace(location_encoder, inplace=True)
    new_data.replace(result_encoder, inplace=True)
    new_data = new_data[new_data.result.notnull()]
    week = list(range(1,len(new_data)+1))
    new_data.insert(0, 'week', week)
    tn_col = pd.Series([f'{team}']).repeat(len(new_data)).reset_index(drop=True)
    new_data.insert(0, 'team_name', tn_col)

    if type(new_data) == pd.Series:
        new_data = new_data.to_frame().T
        return new_data.reset_index(drop=True)
    else:
        return new_data.reset_index(drop=True)

In [53]:
def home_or_away(team, week):
    sym = symbols[team_names.index(team)].lower()
    url = f'https://www.pro-football-reference.com/teams/{sym}/2022.htm'
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', id='games')
    tablerows = table.find_all('tr')[2:]
    data = []

    for tablerow in tablerows:
            data.append([tabledata.get_text(strip=True) for tabledata in tablerow.find_all('td')])

    df = pd.DataFrame(data)
    index = [0,1,4,8,9,10, 7] + list(range(11,21))
    new_data = df.iloc[:,index].copy()
    col_names = ['day', 'date', 'result', 'opponent', 'tm_score', 'opp_score','location', '1stD_offense', 'TotYd_offense', 'PassY_offense', 'RushY_offense', 'TO_offense', '1stD_defense', 'TotYd_defense', 'PassY_defense', 'RushY_defense', 'TO_defense']
    new_data.columns = col_names
    if df.iloc[week - 1][7] == '@':
        return 0
    else: 
        return 1

In [6]:
nfl_dataframe_list = []
years = [*range(2015, 2022)]
for year in years:
    for teamname in team_names:
        nfl_dataframe_list.append(get_new_data(teamname, year))

nfldata = pd.concat(nfl_dataframe_list)
nfldata = nfldata[nfldata.opponent != 'Bye Week']
nfldata = nfldata[nfldata.date != 'Playoffs']
nfldata = nfldata[nfldata.location != 'N']
to_offence_encoder = {'TO_offense': {'' : 0}}
to_defence_encoder = {'TO_defense': {'' : 0}}
result_encoder = {'result': {'L': 0, 'T': 0,'W': 1,'' : pd.NA}}
nfldata.replace(to_offence_encoder, inplace=True)
nfldata.replace(to_defence_encoder, inplace=True)
nfldata.replace(result_encoder, inplace=True)

In [8]:
features = nfldata.iloc[:,8:]
scaler = StandardScaler()
scaler.fit(features)
scaler.transform(features)

array([[ 1.        ,  0.93076638,  0.93062701, ...,  1.52121329,
        -1.15775205, -0.29205689],
       [-1.        ,  0.13163211, -0.57712166, ..., -0.13056387,
        -0.06971193,  0.52648528],
       [ 1.        ,  1.53011708,  1.15619571, ..., -2.36304394,
        -0.18840722,  2.16356961],
       ...,
       [-1.        , -1.26685287, -1.08761924, ...,  1.97287111,
        -0.08949448, -1.11059905],
       [ 1.        ,  0.73098281, -0.43465722, ..., -0.311227  ,
         0.108331  , -1.11059905],
       [-1.        , -0.86728573, -0.28032074, ..., -1.97590867,
        -0.36645015,  1.34502744]])

In [9]:
y = nfldata['result']
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.5, random_state=42)

In [10]:
lrc = LogisticRegression(max_iter=1000)
lrc.fit(X_train, y_train)

In [11]:
y_pred = lrc.predict(X_test)
accuracy_score(y_test, y_pred)

0.8177470775770457

In [60]:
penalties = ['l1', 'l2']
C = [0.001, 0.01, 0.1, 1.0, 10.0, 1000.0, 10000.0]

max_accuracy = float(0)
optimal_penalty = ''
optimal_c = ''
for penalty in penalties:
    for c in C:
        lrc_tuned = LogisticRegression(penalty=penalty, C=c, solver='liblinear', max_iter=1000)
        lrc_tuned.fit(X_train, y_train)
        y_pred = lrc_tuned.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_rd = round(accuracy*100,1)
        if accuracy_rd > max_accuracy:
            max_accuracy = accuracy_rd
            optimal_c = c
            optimal_penalty = penalty
        
print(f'\nHighest Accuracy: {max_accuracy} | penalty = {optimal_penalty}, C = {optimal_c}')


Highest Accuracy: 81.7 | penalty = l1, C = 1.0


In [59]:
penalty = 'l1'
C = 1.0

test_sizes = [val/100 for val in range(20,36)]
max_accuracy = 0
optimal_test_size = 0

for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=test_size, random_state=42)
    lrc_tts = LogisticRegression(penalty = penalty, C = C, solver='liblinear', max_iter=1000)
    lrc_tts.fit(X_train, y_train)
    y_pred = lrc_tts.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_rd = round(accuracy*100,1)
    if accuracy_rd > max_accuracy:
        max_accuracy = accuracy_rd
        optimal_test_size = test_size
    
print(f'\nHighest Accuracy: {max_accuracy}% | test size = {optimal_test_size}')


Highest Accuracy: 82.1% | test size = 0.29


In [16]:
test_size = 0.29
penalty = 'l1'
C = 1.0

X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=test_size, random_state=42)
optLr = LogisticRegression(penalty = penalty, C = C, solver='liblinear', max_iter=1000)
optLr.fit(X_train, y_train)

In [50]:
def predict(team, week):
    
    data = get_new_data(team, 2022)
    columns = list(data.columns[8:])
    for column in columns:
        data[column] = pd.to_numeric(data[column])
    home = home_or_away(team, week)
    stats = []
    for column in data.columns[8:]:
        stats.append(int(data[column].mean().round()))    

    new_data = pd.DataFrame([stats], columns=columns)
    new_data['location'].iloc[0] = home
    new_X = new_data.loc[:,features.columns]
    new_X_sc = scaler.transform(new_X)
    prediction = optLr.predict(new_X_sc)
    prob = optLr.predict_proba(new_X_sc)

    return prediction, prob

In [58]:
predictions = []
week = input("Week: ")
for team in team_names:
    output = []
    prediction = predict(team, int(week))
    output.append(team)
    output.append(prediction[0])
    output.append(prediction[1][0][1])
    output.append(prediction[1][0][0])
    predictions.append(output)   
predictions.sort(key=takeSecond)
headers = ['Team', 'Prediction', 'Prob of Win', 'Prob of Loss']
print(tabulate(predictions, headers, tablefmt='simple', floatfmt=".6f"))

Week:  7


Team                        Prediction    Prob of Win    Prob of Loss
------------------------  ------------  -------------  --------------
Buffalo Bills                 1.000000       0.768514        0.231486
Philadelphia Eagles           1.000000       0.767308        0.232692
Denver Broncos                1.000000       0.764841        0.235159
Arizona Cardinals             1.000000       0.763457        0.236543
Los Angeles Chargers          1.000000       0.763150        0.236850
Los Angeles Rams              1.000000       0.762861        0.237139
Seattle Seahawks              1.000000       0.611807        0.388193
San Francisco 49ers           1.000000       0.591717        0.408283
Jacksonville Jaguars          1.000000       0.588992        0.411008
New England Patriots          1.000000       0.587663        0.412337
Baltimore Ravens              1.000000       0.587456        0.412544
Dallas Cowboys                1.000000       0.586178        0.413822
Miami Dolphins      