In [353]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics  import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [354]:
df = pd.read_csv('data.csv')

In [355]:
# Get completed games
comp_games = df[df['score_home'].notna()]
# Get uncompleted games
uncomp_games = df[df['score_home'].isna()]

In [356]:
# Drop scores for uncompleted games (na values)
uncomp_games = uncomp_games.drop(['score_home', 'score_away'], axis=1)

In [357]:
# Display training data
comp_games

Unnamed: 0,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,W,L,T,W-L%,PF,PA,PD,MoV,SoS,SRS,OSRS,DSRS,off_red_per,W_away,L_away,T_away,W-L%_away,PF_away,PA_away,PD_away,MoV_away,SoS_away,SRS_away,OSRS_away,DSRS_away,off_red_per_away
0,1,False,Los Angeles Rams,10.0,31.0,Buffalo Bills,3,5,0,0.375,131,173,-42,-5.3,1.0,-4.3,-4.0,-0.3,0.5000,6,2,0,0.750,220,118,102,12.8,1.6,14.3,6.2,8.2,0.5357
1,1,False,Arizona Cardinals,21.0,44.0,Kansas City Chiefs,3,6,0,0.333,203,241,-38,-4.2,0.1,-4.1,-1.0,-3.1,0.5769,6,2,0,0.750,243,189,54,6.8,-0.6,6.1,9.3,-3.2,0.7353
2,1,False,Atlanta Falcons,26.0,27.0,New Orleans Saints,4,6,0,0.400,232,250,-18,-1.8,-1.4,-3.2,0.4,-3.6,0.6207,3,6,0,0.333,212,227,-15,-1.7,-0.6,-2.3,0.7,-3.0,0.5714
3,1,False,Carolina Panthers,24.0,26.0,Cleveland Browns,3,7,0,0.300,204,243,-39,-3.9,-1.2,-5.1,-1.9,-3.2,0.5455,3,5,0,0.375,200,199,1,0.1,0.2,0.3,3.0,-2.7,0.6129
4,1,False,Chicago Bears,19.0,10.0,San Francisco 49ers,3,6,0,0.333,187,216,-29,-3.2,0.7,-2.5,0.7,-3.2,0.5517,4,4,0,0.500,176,147,29,3.6,-1.7,2.0,-1.2,3.2,0.5769
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,8,False,New Orleans Saints,24.0,0.0,Las Vegas Raiders,3,6,0,0.333,212,227,-15,-1.7,-0.6,-2.3,0.7,-3.0,0.5714,2,6,0,0.250,183,201,-18,-2.3,-1.9,-4.2,-0.1,-4.0,0.5313
119,8,False,New York Jets,17.0,22.0,New England Patriots,6,3,0,0.667,196,176,20,2.2,2.6,4.8,1.8,2.9,0.6000,5,4,0,0.556,203,166,37,4.1,-0.2,3.9,-0.1,4.0,0.5000
120,8,False,Philadelphia Eagles,35.0,13.0,Pittsburgh Steelers,8,0,0,1.000,225,135,90,11.3,-2.0,9.3,5.5,3.8,0.7000,2,6,0,0.250,120,197,-77,-9.6,5.4,-4.2,-3.8,-0.3,0.4762
121,8,False,Seattle Seahawks,27.0,13.0,New York Giants,6,3,0,0.667,241,220,21,2.3,-2.3,0.0,3.6,-3.6,0.4828,6,2,0,0.750,163,157,6,0.8,0.2,1.0,-1.8,2.8,0.4615


In [358]:
# Set X and Y
X = comp_games.drop(['score_home', 'score_away'], 1)
X_pred = uncomp_games
y = comp_games[comp_games.columns[3:5]]

# Scale data for more accurate predictions
from sklearn.preprocessing import scale
cols = [comp_games.columns[6:]]
for col in cols:
    X[col] = scale(X[col])

# Standardize data for predictions    
cols2 = [uncomp_games.columns[4:]]
for col in cols2:
    X_pred[col] = scale(X_pred[col])

  X = comp_games.drop(['score_home', 'score_away'], 1)


In [359]:
y.shape

(123, 2)

In [360]:
# Columnize any strings in dataset
def preprocess_features(X):
    output = pd.DataFrame(index = X.index)
    for col, col_data in X.items():
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix = col)
        output = output.join(col_data)
    return output

X = preprocess_features(X)
X_pred = preprocess_features(X_pred)

In [361]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

In [362]:
# Initialize algorithms
RFR = RandomForestRegressor(max_depth=2, random_state=0)
LR = LinearRegression()
KNR = KNeighborsRegressor(n_neighbors=2)
MO_RFR_clf = MultiOutputRegressor(estimator=RFR)
MO_LR_clf = MultiOutputRegressor(estimator=LR)
MO_KNR_clf = MultiOutputRegressor(estimator=KNR)

# Fit algorithms to training data
MO_RFR_clf.fit(X_train, y_train)
MO_LR_clf.fit(X_train, y_train)
MO_KNR_clf.fit(X_train, y_train)

# Predict using algorithms
MO_RFR_pred = MO_RFR_clf.predict(X_test)
MO_LR_pred = MO_LR_clf.predict(X_test)
MO_KNR_pred = MO_KNR_clf.predict(X_test)

# Score predictions
MO_RFR_score = MO_RFR_clf.score(X_train, y_train)
MO_LR_score = MO_LR_clf.score(X_train, y_train)
MO_KNR_score = MO_KNR_clf.score(X_train, y_train)

In [363]:
# Print accuracy and score of algorithms
print(f'RFR Score: ', MO_RFR_score)
print()
print(f'LR Score: ', MO_LR_score)
print()
print(f'KNR Score: ', MO_KNR_score)

RFR Score:  0.3856197734275504

LR Score:  0.5971299682291861

KNR Score:  0.5135263507552206


In [364]:
predictions = MO_RFR_clf.predict(X_pred).round()

uncomp_games[['score_home','score_away']] = predictions

In [365]:
pd.set_option('display.max_columns', None)
uncomp_games.groupby("schedule_week").get_group(10)[['team_home', 'team_away', 'score_home', 'score_away']]

Unnamed: 0,team_home,team_away,score_home,score_away
136,Carolina Panthers,Atlanta Falcons,26.0,25.0
137,Buffalo Bills,Minnesota Vikings,22.0,22.0
138,Chicago Bears,Detroit Lions,26.0,18.0
139,Green Bay Packers,Dallas Cowboys,19.0,22.0
140,Kansas City Chiefs,Jacksonville Jaguars,25.0,23.0
141,Las Vegas Raiders,Indianapolis Colts,25.0,17.0
142,Los Angeles Rams,Arizona Cardinals,25.0,22.0
143,Miami Dolphins,Cleveland Browns,28.0,22.0
144,New York Giants,Houston Texans,21.0,15.0
145,Pittsburgh Steelers,New Orleans Saints,25.0,24.0
