In [15]:
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics  import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [16]:
df = pd.read_csv('data.csv')

In [17]:
# Get completed games
comp_games = df[df['score_home'].notna()]
# Get uncompleted games
uncomp_games = df[df['score_home'].isna()]

In [18]:
# Determine winner and loser for completed games
for index, row in comp_games.iterrows():
    if comp_games.loc[index, 'score_home'] > comp_games.loc[index, 'score_away']:
        comp_games.loc[index, 'Home_Winner'] = 1
    else:
        comp_games.loc[index, 'Home_Winner'] = 0
    # else:
    #     comp_games.at[index, 'Winner'] = "T"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comp_games.loc[index, 'Home_Winner'] = 0


In [19]:
# Remove scores (no cheating)
comp_games = comp_games.drop(['score_home', 'score_away'], axis=1)
uncomp_games = uncomp_games.drop(['score_home', 'score_away'], axis=1)

In [20]:
# Display training data
comp_games.groupby('Season').get_group(2022)

Unnamed: 0,schedule_week,schedule_playoff,team_home,team_away,Season,W,L,T,W-L%,PF,PA,PD,MoV,SoS,SRS,OSRS,DSRS,Season_away,W_away,L_away,T_away,W-L%_away,PF_away,PA_away,PD_away,MoV_away,SoS_away,SRS_away,OSRS_away,DSRS_away,Home_Winner
0,1,False,Los Angeles Rams,Buffalo Bills,2022,3,5,0,0.375,131,173,-42,-5.3,1.0,-4.3,-4.0,-0.3,2022,6,2,0,0.750,220,118,102,12.8,1.6,14.3,6.2,8.2,0.0
1,1,False,Arizona Cardinals,Kansas City Chiefs,2022,3,6,0,0.333,203,241,-38,-4.2,0.1,-4.1,-1.0,-3.1,2022,6,2,0,0.750,243,189,54,6.8,-0.6,6.1,9.3,-3.2,0.0
2,1,False,Atlanta Falcons,New Orleans Saints,2022,4,6,0,0.400,232,250,-18,-1.8,-1.4,-3.2,0.4,-3.6,2022,3,6,0,0.333,212,227,-15,-1.7,-0.6,-2.3,0.7,-3.0,0.0
3,1,False,Carolina Panthers,Cleveland Browns,2022,3,7,0,0.300,204,243,-39,-3.9,-1.2,-5.1,-1.9,-3.2,2022,3,5,0,0.375,200,199,1,0.1,0.2,0.3,3.0,-2.7,0.0
4,1,False,Chicago Bears,San Francisco 49ers,2022,3,6,0,0.333,187,216,-29,-3.2,0.7,-2.5,0.7,-3.2,2022,4,4,0,0.500,176,147,29,3.6,-1.7,2.0,-1.2,3.2,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,9,False,New England Patriots,Indianapolis Colts,2022,5,4,0,0.556,203,166,37,4.1,-0.2,3.9,-0.1,4.0,2022,3,5,1,0.389,132,183,-51,-5.7,-0.9,-6.6,-6.8,0.1,1.0
132,9,False,New York Jets,Buffalo Bills,2022,6,3,0,0.667,196,176,20,2.2,2.6,4.8,1.8,2.9,2022,6,2,0,0.750,220,118,102,12.8,1.6,14.3,6.2,8.2,1.0
133,9,False,Tampa Bay Buccaneers,Los Angeles Rams,2022,4,5,0,0.444,162,164,-2,-0.2,-0.3,-0.5,-4.6,4.1,2022,3,5,0,0.375,131,173,-42,-5.3,1.0,-4.3,-4.0,-0.3,1.0
134,9,False,Washington Commanders,Minnesota Vikings,2022,4,5,0,0.444,159,192,-33,-3.7,-0.1,-3.7,-4.0,0.3,2022,7,1,0,0.875,193,161,32,4.0,-0.8,3.2,0.9,2.3,0.0


In [21]:
# Set X and Y
X = comp_games.drop(['Home_Winner'], 1)
X_pred = uncomp_games
y = comp_games['Home_Winner']

# Scale data for more accurate predictions
from sklearn.preprocessing import scale
cols = [comp_games.columns[4:-1]]
for col in cols:
    X[col] = scale(X[col])
cols2 = [uncomp_games.columns[4:]]
for col in cols2:
    X_pred[col] = scale(X_pred[col])

  X = comp_games.drop(['Home_Winner'], 1)


In [22]:
# Columnize any strings in dataset
def preprocess_features(X):
    output = pd.DataFrame(index = X.index)
    for col, col_data in X.items():
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix = col)
        output = output.join(col_data)
    return output

X = preprocess_features(X)
X_pred = preprocess_features(X_pred)

In [23]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

In [24]:
# Initialize algorithms
LR_clf = LogisticRegression()
SVC_clf = SVC()
xgb_clf = xgb.XGBClassifier(seed = 1)

# Fit algorithms to training data
LR_clf.fit(X_train, y_train)
SVC_clf.fit(X_train, y_train)
xgb_clf.fit(X_train, y_train)

# Predict using algorithms
LR_pred = LR_clf.predict(X_test)
SVC_pred = SVC_clf.predict(X_test)
XGB_pred = xgb_clf.predict(X_test)

# Cross validate algorithms
LR_score = cross_val_score(LR_clf, X, y)
SVC_score = cross_val_score(SVC_clf, X, y)
XGB_score = cross_val_score(xgb_clf, X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [25]:
# Print accuracy and score of algorithms
print(f'Logistic Regression:', '\nAccuracy:', accuracy_score(y_test, LR_pred), '\nF1 Score: ', f1_score(y_test, LR_pred), '\nCross Validation Score:', LR_score.mean())
print()
print(f'SVC:', '\nAccuracy:', accuracy_score(y_test, SVC_pred), '\nF1 Score: ', f1_score(y_test, SVC_pred), '\nCross Validation Score: ', SVC_score.mean())
print()
print(f'XGBoost:', '\nAccuracy:', accuracy_score(y_test, XGB_pred), '\nF1 Score: ', f1_score(y_test, XGB_pred), '\nCross Validation Score: ', XGB_score.mean())

Logistic Regression: 
Accuracy: 0.6630434782608695 
F1 Score:  0.6666666666666666 
Cross Validation Score: 0.6902173913043479

SVC: 
Accuracy: 0.6956521739130435 
F1 Score:  0.6956521739130435 
Cross Validation Score:  0.717391304347826

XGBoost: 
Accuracy: 0.6739130434782609 
F1 Score:  0.6808510638297872 
Cross Validation Score:  0.6684782608695652


In [26]:
SVC_clf.fit(X, y)
predictions = SVC_clf.predict(X_pred)

uncomp_games['Home_Winner'] = predictions

In [27]:
# Show predictions for week 9
pd.set_option('display.max_columns', None)
uncomp_games.groupby("schedule_week").get_group(10)[['team_home', 'team_away', 'Home_Winner']]

Unnamed: 0,team_home,team_away,Home_Winner
150,Green Bay Packers,Tennessee Titans,0.0
151,Atlanta Falcons,Chicago Bears,0.0
152,Baltimore Ravens,Carolina Panthers,1.0
153,Buffalo Bills,Cleveland Browns,1.0
154,Denver Broncos,Las Vegas Raiders,1.0
155,Houston Texans,Washington Commanders,0.0
156,Indianapolis Colts,Philadelphia Eagles,0.0
157,Los Angeles Chargers,Kansas City Chiefs,0.0
158,Minnesota Vikings,Dallas Cowboys,1.0
159,New England Patriots,New York Jets,0.0
