In [205]:
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics  import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [206]:
df = pd.read_csv('data.csv')

In [207]:
# Get completed games
comp_games = df[df['score_home'].notna()]
# Get uncompleted games
uncomp_games = df[df['score_home'].isna()]

In [208]:
# Determine winner and loser for completed games
for index, row in comp_games.iterrows():
    if comp_games.loc[index, 'score_home'] > comp_games.loc[index, 'score_away']:
        comp_games.loc[index, 'Home_Winner'] = 1
    else:
        comp_games.loc[index, 'Home_Winner'] = 0
    # else:
    #     comp_games.at[index, 'Winner'] = "T"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comp_games.loc[index, 'Home_Winner'] = 0


In [209]:
# Remove scores (no cheating)
comp_games = comp_games.drop(['score_home', 'score_away'], axis=1)
uncomp_games = uncomp_games.drop(['score_home', 'score_away'], axis=1)

In [210]:
# Display training data
comp_games.groupby('Season').get_group(2022)

Unnamed: 0,schedule_week,schedule_playoff,team_home,team_away,Season,W,L,T,W-L%,PF,PA,PD,MoV,SoS,SRS,OSRS,DSRS,Season_away,W_away,L_away,T_away,W-L%_away,PF_away,PA_away,PD_away,MoV_away,SoS_away,SRS_away,OSRS_away,DSRS_away,Home_Winner
0,1,False,Los Angeles Rams,Buffalo Bills,2022,5,12,0,0.294,307,384,-77,-4.5,0.5,-4.0,-4.1,0.0,2022,13,3,0,0.813,455,286,169,10.6,0.4,10.9,7.1,3.8,0.0
1,1,False,Arizona Cardinals,Kansas City Chiefs,2022,4,13,0,0.235,340,449,-109,-6.4,0.2,-6.2,-1.9,-4.3,2022,14,3,0,0.824,496,369,127,7.5,-1.2,6.2,6.8,-0.6,0.0
2,1,False,Atlanta Falcons,New Orleans Saints,2022,7,10,0,0.412,365,386,-21,-1.2,-0.9,-2.1,-0.1,-2.0,2022,7,10,0,0.412,330,345,-15,-0.9,-0.3,-1.2,-2.8,1.7,0.0
3,1,False,Carolina Panthers,Cleveland Browns,2022,7,10,0,0.412,347,374,-27,-1.6,-0.6,-2.2,-1.3,-0.9,2022,7,10,0,0.412,361,381,-20,-1.2,1.1,-0.1,0.7,-0.9,0.0
4,1,False,Chicago Bears,San Francisco 49ers,2022,3,14,0,0.176,326,463,-137,-8.1,1.6,-6.4,-2.5,-4.0,2022,13,4,0,0.765,450,277,173,10.2,-2.3,7.9,3.3,4.6,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,18,False,Philadelphia Eagles,New York Giants,2022,14,3,0,0.824,477,344,133,7.8,-1.3,6.5,5.8,0.7,2022,9,7,1,0.559,365,371,-6,-0.4,0.0,-0.4,-0.8,0.4,1.0
267,18,False,Pittsburgh Steelers,Cleveland Browns,2022,9,8,0,0.529,308,346,-38,-2.2,1.5,-0.8,-3.0,2.3,2022,7,10,0,0.412,361,381,-20,-1.2,1.1,-0.1,0.7,-0.9,1.0
268,18,False,San Francisco 49ers,Arizona Cardinals,2022,13,4,0,0.765,450,277,173,10.2,-2.3,7.9,3.3,4.6,2022,4,13,0,0.235,340,449,-109,-6.4,0.2,-6.2,-1.9,-4.3,1.0
269,18,False,Seattle Seahawks,Los Angeles Rams,2022,9,8,0,0.529,407,401,6,0.4,-0.8,-0.5,1.9,-2.4,2022,5,12,0,0.294,307,384,-77,-4.5,0.5,-4.0,-4.1,0.0,1.0


In [211]:
# Set X and Y
X = comp_games.drop(['Home_Winner'], 1)
X_pred = uncomp_games
y = comp_games['Home_Winner']

# Scale data for more accurate predictions
from sklearn.preprocessing import scale
cols = [comp_games.columns[4:-1]]
for col in cols:
    X[col] = scale(X[col])
cols2 = [uncomp_games.columns[4:]]
for col in cols2:
    X_pred[col] = scale(X_pred[col])

  X = comp_games.drop(['Home_Winner'], 1)


In [212]:
# Columnize any strings in dataset
def preprocess_features(X):
    output = pd.DataFrame(index = X.index)
    for col, col_data in X.items():
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix = col)
        output = output.join(col_data)
    return output

X = preprocess_features(X)
X_pred = preprocess_features(X_pred)

In [213]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

In [214]:
# Initialize algorithms
LR_clf = LogisticRegression()
SVC_clf = SVC()
xgb_clf = xgb.XGBClassifier(seed = 1)

# Fit algorithms to training data
LR_clf.fit(X_train, y_train)
SVC_clf.fit(X_train, y_train)
xgb_clf.fit(X_train, y_train)

# Predict using algorithms
LR_pred = LR_clf.predict(X_test)
SVC_pred = SVC_clf.predict(X_test)
XGB_pred = xgb_clf.predict(X_test)

# Cross validate algorithms
LR_score = cross_val_score(LR_clf, X, y)
SVC_score = cross_val_score(SVC_clf, X, y)
XGB_score = cross_val_score(xgb_clf, X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [215]:
# Print accuracy and score of algorithms
print(f'Logistic Regression:', '\nAccuracy:', accuracy_score(y_test, LR_pred), '\nF1 Score: ', f1_score(y_test, LR_pred), '\nCross Validation Score:', LR_score.mean())
print()
print(f'SVC:', '\nAccuracy:', accuracy_score(y_test, SVC_pred), '\nF1 Score: ', f1_score(y_test, SVC_pred), '\nCross Validation Score: ', SVC_score.mean())
print()
print(f'XGBoost:', '\nAccuracy:', accuracy_score(y_test, XGB_pred), '\nF1 Score: ', f1_score(y_test, XGB_pred), '\nCross Validation Score: ', XGB_score.mean())

Logistic Regression: 
Accuracy: 0.7075471698113207 
F1 Score:  0.6990291262135923 
Cross Validation Score: 0.7061611374407584

SVC: 
Accuracy: 0.6886792452830188 
F1 Score:  0.6597938144329897 
Cross Validation Score:  0.7118483412322275

XGBoost: 
Accuracy: 0.660377358490566 
F1 Score:  0.625 
Cross Validation Score:  0.6464454976303318


In [216]:
SVC_clf.fit(X, y)
predictions = SVC_clf.predict(X_pred)

uncomp_games['Home_Winner'] = predictions

In [217]:
# Show predictions for week 9
pd.set_option('display.max_columns', None)
uncomp_games.groupby("schedule_week").get_group(19)[['team_home', 'team_away', 'Home_Winner']]

Unnamed: 0,team_home,team_away,Home_Winner
289,San Francisco 49ers,Seattle Seahawks,1.0
290,Jacksonville Jaguars,Los Angeles Chargers,1.0
291,Buffalo Bills,Miami Dolphins,1.0
292,New York Giants,Minnesota Vikings,0.0
293,Cincinnati Bengals,Baltimore Ravens,1.0
294,Tampa Bay Buccaneers,Dallas Cowboys,0.0
