In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics  import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
df = pd.read_csv('data.csv')

In [3]:
# Get completed games
comp_games = df[df['score_home'].notna()]
# Get uncompleted games
uncomp_games = df[df['score_home'].isna()]

In [4]:
# Drop scores for uncompleted games (na values)
uncomp_games = uncomp_games.drop(['score_home', 'score_away'], axis=1)

In [5]:
# Display training data
comp_games

Unnamed: 0,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,Season,W,L,T,...,T_away,W-L%_away,PF_away,PA_away,PD_away,MoV_away,SoS_away,SRS_away,OSRS_away,DSRS_away
0,1,False,Los Angeles Rams,10.0,31.0,Buffalo Bills,2022,3,5,0,...,0,0.750,220,118,102,12.8,1.6,14.3,6.2,8.2
1,1,False,Arizona Cardinals,21.0,44.0,Kansas City Chiefs,2022,3,6,0,...,0,0.750,243,189,54,6.8,-0.6,6.1,9.3,-3.2
2,1,False,Atlanta Falcons,26.0,27.0,New Orleans Saints,2022,4,6,0,...,0,0.333,212,227,-15,-1.7,-0.6,-2.3,0.7,-3.0
3,1,False,Carolina Panthers,24.0,26.0,Cleveland Browns,2022,3,7,0,...,0,0.375,200,199,1,0.1,0.2,0.3,3.0,-2.7
4,1,False,Chicago Bears,19.0,10.0,San Francisco 49ers,2022,3,6,0,...,0,0.500,176,147,29,3.6,-1.7,2.0,-1.2,3.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1051,17,False,Minnesota Vikings,19.0,21.0,Chicago Bears,2019,10,6,0,...,0,0.500,280,298,-18,-1.1,0.2,-0.9,-5.4,4.5
1052,17,False,New England Patriots,24.0,27.0,Miami Dolphins,2019,12,4,0,...,0,0.313,306,494,-188,-11.8,0.2,-11.6,-2.4,-9.1
1053,17,False,New York Giants,17.0,34.0,Philadelphia Eagles,2019,4,12,0,...,0,0.563,385,354,31,1.9,-1.7,0.3,0.7,-0.4
1054,17,False,Seattle Seahawks,21.0,26.0,San Francisco 49ers,2019,11,5,0,...,0,0.813,479,310,169,10.6,0.4,11.0,6.7,4.3


In [6]:
# Set X and Y
X = comp_games.drop(['score_home', 'score_away'], 1)
X_pred = uncomp_games
y = comp_games[comp_games.columns[3:5]]

# Scale data for more accurate predictions
from sklearn.preprocessing import scale
cols = [comp_games.columns[6:]]
for col in cols:
    X[col] = scale(X[col])

# Standardize data for predictions    
cols2 = [uncomp_games.columns[4:]]
for col in cols2:
    X_pred[col] = scale(X_pred[col])

  X = comp_games.drop(['score_home', 'score_away'], 1)


In [7]:
y.shape

(907, 2)

In [8]:
# Columnize any strings in dataset
def preprocess_features(X):
    output = pd.DataFrame(index = X.index)
    for col, col_data in X.items():
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix = col)
        output = output.join(col_data)
    return output

X = preprocess_features(X)
X_pred = preprocess_features(X_pred)

In [9]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

In [10]:
# Initialize algorithms
RFR = RandomForestRegressor(max_depth=2, random_state=0)
LR = LinearRegression()
KNR = KNeighborsRegressor(n_neighbors=2)
MO_RFR_clf = MultiOutputRegressor(estimator=RFR)
MO_LR_clf = MultiOutputRegressor(estimator=LR)
MO_KNR_clf = MultiOutputRegressor(estimator=KNR)

# Fit algorithms to training data
MO_RFR_clf.fit(X_train, y_train)
MO_LR_clf.fit(X_train, y_train)
MO_KNR_clf.fit(X_train, y_train)

# Predict using algorithms
MO_RFR_pred = MO_RFR_clf.predict(X_test)
MO_LR_pred = MO_LR_clf.predict(X_test)
MO_KNR_pred = MO_KNR_clf.predict(X_test)

# Score predictions
MO_RFR_score = MO_RFR_clf.score(X_train, y_train)
MO_LR_score = MO_LR_clf.score(X_train, y_train)
MO_KNR_score = MO_KNR_clf.score(X_train, y_train)

In [11]:
# Print accuracy and score of algorithms
print(f'RFR Score: ', MO_RFR_score)
print()
print(f'LR Score: ', MO_LR_score)
print()
print(f'KNR Score: ', MO_KNR_score)

RFR Score:  0.27617926908451157

LR Score:  0.3446174051495607

KNR Score:  0.5981366101688861


In [12]:
predictions = MO_RFR_clf.predict(X_pred).round()

uncomp_games[['score_home','score_away']] = predictions

In [13]:
pd.set_option('display.max_columns', None)
uncomp_games.groupby("schedule_week").get_group(10)[['team_home', 'team_away', 'score_home', 'score_away']]

Unnamed: 0,team_home,team_away,score_home,score_away
136,Carolina Panthers,Atlanta Falcons,27.0,29.0
137,Buffalo Bills,Minnesota Vikings,25.0,23.0
138,Chicago Bears,Detroit Lions,25.0,27.0
139,Green Bay Packers,Dallas Cowboys,18.0,23.0
140,Kansas City Chiefs,Jacksonville Jaguars,30.0,27.0
141,Las Vegas Raiders,Indianapolis Colts,20.0,20.0
142,Los Angeles Rams,Arizona Cardinals,24.0,22.0
143,Miami Dolphins,Cleveland Browns,30.0,26.0
144,New York Giants,Houston Texans,24.0,16.0
145,Pittsburgh Steelers,New Orleans Saints,24.0,24.0
