In [32]:
#Running all files through a randomforest model multiple times to find the mean accuracy
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import os

file_names = [
    'FeildGoalStatsSpecialTeams.csv', 'PassingStatsDefense.csv', 'RushingStatsDefense.csv', 
    'RecievingStatsDefense.csv', 'PassingStatsOffense.csv', 'InterceptionStatsDefense.csv',
    'KickoffReturnsStatsSpecialTeams.csv', 'PuntReturnsStatsSpecialTeams.csv',
    'RushingStatsOffense.csv', 'RecievingStatsOffense.csv',
    'PuntingStatsSpecialTeams.csv', 'KickoffsSpecialTeams.csv', 
    'ScoringStatsSpecialTeams.csv', 'DownsStatsOffense.csv', 'ScoringStatsOffense.csv',
    'FumbleStatsDefense.csv', 'DownsStatsDefense.csv', 'TackleStatsDefense.csv',
    'ScoringStatsDefense.csv', 'offense_merged_stats.csv', 'defense_merged_stats.csv', 
    'specialteams_merged_stats.csv', 'combined_all_stats.csv']

input_base_path = '/Users/charlesmorgan/Desktop/Merged Data/'

for file in file_names:
    stats_filepath = os.path.join(input_base_path, file)
    merged_data = pd.read_csv(stats_filepath)

    columns_to_drop = ['Week', 'Away Team', 'Home Team', 'Winning Team']
    merged_data.drop(columns_to_drop, axis=1, inplace=True)

    X = merged_data.drop('Home_Win', axis=1)
    y = merged_data['Home_Win']

    accuracies = []

    for _ in range(40):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=None)

        model = RandomForestClassifier(n_estimators=100)
        model.fit(X_train, y_train)

        predictions = model.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        accuracies.append(accuracy)

    average_accuracy = np.mean(accuracies)
    std_deviation = np.std(accuracies)

    print(f'Average Accuracy for {file}: {average_accuracy:.2f} Standard Deviation:  {std_deviation:.2f}')

Average Accuracy for FeildGoalStatsSpecialTeams.csv: 0.57 Standard Deviation:  0.05
Average Accuracy for PassingStatsDefense.csv: 0.56 Standard Deviation:  0.05
Average Accuracy for RushingStatsDefense.csv: 0.59 Standard Deviation:  0.05
Average Accuracy for RecievingStatsDefense.csv: 0.59 Standard Deviation:  0.05
Average Accuracy for PassingStatsOffense.csv: 0.60 Standard Deviation:  0.04
Average Accuracy for InterceptionStatsDefense.csv: 0.56 Standard Deviation:  0.05
Average Accuracy for KickoffReturnsStatsSpecialTeams.csv: 0.51 Standard Deviation:  0.05
Average Accuracy for PuntReturnsStatsSpecialTeams.csv: 0.54 Standard Deviation:  0.05
Average Accuracy for RushingStatsOffense.csv: 0.56 Standard Deviation:  0.05
Average Accuracy for RecievingStatsOffense.csv: 0.60 Standard Deviation:  0.05
Average Accuracy for PuntingStatsSpecialTeams.csv: 0.56 Standard Deviation:  0.05
Average Accuracy for KickoffsSpecialTeams.csv: 0.59 Standard Deviation:  0.05
Average Accuracy for ScoringStats

In [33]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="keras")

In [36]:
#Running combined files through multiple different ml algs to find the optimal one. Finding the optimal features
#used in each model.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from keras.models import Sequential
from keras.layers import Dense

def analyze_dataset(data_path):
    data = pd.read_csv(data_path)
    columns_to_drop = ['Week', 'Away Team', 'Home Team', 'Winning Team']
    X = data.drop(columns_to_drop + ['Home_Win'], axis=1)
    y = data['Home_Win']
    
    #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)

    #scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    #SVM
    svm_model = SVC(kernel='linear')
    svm_model.fit(X_train_scaled, y_train)
    svm_predictions = svm_model.predict(X_test_scaled)
    
    #Log
    logistic_model = LogisticRegression()
    logistic_model.fit(X_train_scaled, y_train)
    logistic_predictions = logistic_model.predict(X_test_scaled)
    
    #Random Forest
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    rf_predictions = rf_model.predict(X_test)
    
    #Gradient Boosting
    gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
    gb_model.fit(X_train, y_train)
    gb_predictions = gb_model.predict(X_test)
    
    #S Neural Network
    nn_model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    nn_model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, verbose=0)
    nn_probabilities = nn_model.predict(X_test_scaled)
    nn_predictions = (nn_probabilities > 0.5).astype(int)
    
    print(f"Results for {data_path.split('/')[-1]}:")
    print('')
    print("SVM Accuracy:", accuracy_score(y_test, svm_predictions))
    print("Logistic Regression Accuracy:", accuracy_score(y_test, logistic_predictions))
    print("Random Forest Accuracy:", accuracy_score(y_test, rf_predictions))
    print("Gradient Boosting Accuracy:", accuracy_score(y_test, gb_predictions))
    print("Neural Network Accuracy:", accuracy_score(y_test, nn_predictions))
    print('')
    #print("\nClassification Reports:")
    #print("Random Forest Classification Report:")
    #print(classification_report(y_test, rf_predictions))
    #print("Gradient Boosting Classification Report:")
    #print(classification_report(y_test, gb_predictions))
    
    if data_path == '/Users/charlesmorgan/Desktop/Merged Data/combined_all_stats.csv':
        print("\nTop 3 Features:")

        if hasattr(svm_model, 'coef_'):
            svm_importances = pd.Series(abs(svm_model.coef_[0]), index=X.columns).sort_values(ascending=False)
            print("SVM:")
            print(svm_importances.head(3))

        if hasattr(logistic_model, 'coef_'):
            logistic_importances = pd.Series(abs(logistic_model.coef_[0]), index=X.columns).sort_values(ascending=False)
            print("Logistic Regression:")
            print(logistic_importances.head(3))

        rf_feature_importances = pd.Series(rf_model.feature_importances_, index=X.columns).sort_values(ascending=False)
        print("Random Forest:")
        print(rf_feature_importances.head(3))

        gb_feature_importances = pd.Series(gb_model.feature_importances_, index=X.columns).sort_values(ascending=False)
        print("Gradient Boosting:")
        print(gb_feature_importances.head(3))


datasets = [
    '/Users/charlesmorgan/Desktop/Merged Data/offense_merged_stats.csv',
    '/Users/charlesmorgan/Desktop/Merged Data/defense_merged_stats.csv',
    '/Users/charlesmorgan/Desktop/Merged Data/specialteams_merged_stats.csv',
    '/Users/charlesmorgan/Desktop/Merged Data/combined_all_stats.csv'
]

for dataset in datasets:
    analyze_dataset(dataset)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Results for offense_merged_stats.csv:

SVM Accuracy: 0.6617647058823529
Logistic Regression Accuracy: 0.6617647058823529
Random Forest Accuracy: 0.6911764705882353
Gradient Boosting Accuracy: 0.6911764705882353
Neural Network Accuracy: 0.6617647058823529

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
Results for defense_merged_stats.csv:

SVM Accuracy: 0.6764705882352942
Logistic Regression Accuracy: 0.6764705882352942
Random Forest Accuracy: 0.6764705882352942
Gradient Boosting Accuracy: 0.6617647058823529
Neural Network Accuracy: 0.7205882352941176

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Results for specialteams_merged_stats.csv:

SVM Accuracy: 0.6617647058823529
Logistic Regression Accuracy: 0.6176470588235294
Random Forest Accuracy: 0.6617647058823529
Gradient Boosting Accuracy: 0.6323529411764706
Neural Network Accuracy: 0.6029411764705882

[1m3/3[0m

In [26]:
#Attempting a grid search to find optimal parameters
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

data_path = '/Users/charlesmorgan/Desktop/Merged Data/combined_all_stats.csv'
data = pd.read_csv(data_path)

columns_to_drop = ['Week', 'Away Team', 'Home Team', 'Winning Team']

X = data.drop(columns_to_drop + ['Home_Win'], axis=1)
y = data['Home_Win']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)

#GridSearch
rf = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [50, 75, 100, 125],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt']
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=7, verbose=1, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_rf = grid_search.best_estimator_

feature_importances = best_rf.feature_importances_
features = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)
    
print("\nTop 10 Features by Importance:")
print(features.head(10))
print()

predictions = best_rf.predict(X_test)

print("Best model parameters:", grid_search.best_params_)
print("Test Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))

Fitting 7 folds for each of 216 candidates, totalling 1512 fits

Top 10 Features by Importance:
                              Feature  Importance
155           specialteams_ki_KO_home    0.017782
219               defense_pa_Att_away    0.015135
114  specialteams_fe_40-49 > A-M_home    0.012335
130        specialteams_pu_Avg_x_home    0.010375
272               defense_ta_Sck_away    0.009844
87                 defense_fu_FF_home    0.009748
37            offense_do_4th Att_home    0.009676
28                 offense_re_TD_home    0.009488
200                offense_re_20_away    0.009299
314        specialteams_pu_Avg_y_away    0.008974

Best model parameters: {'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}
Test Accuracy: 0.6911764705882353
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.60      0.63        30
           1       0.71      0.76      0.73        

In [45]:
#Eliminate unimportant features
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

def analyze_dataset(data_path):
    data = pd.read_csv(data_path)
    
    columns_to_drop = ['Week', 'Away Team', 'Home Team', 'Winning Team']
    
    X = data.drop(columns_to_drop + ['Home_Win'], axis=1)
    y = data['Home_Win']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)
    
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    
    '''
    rf_model = RandomForestClassifier(
        n_estimators=50,         
        max_depth=None,           
        max_features='auto',      
        min_samples_leaf=4,       
        min_samples_split=10,      
        random_state=42           
    )
    '''
    
    rf_model.fit(X_train, y_train)
    
    initial_predictions = rf_model.predict(X_test)
    initial_accuracy = accuracy_score(y_test, initial_predictions)
    print(f"Initial Random Forest Accuracy: {initial_accuracy:.2f}")

    feature_importances = rf_model.feature_importances_
    features = pd.DataFrame({
        'Feature': X.columns,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)

    print("\nTop 10 Features:")
    print(features.head(10))
    print('')
    
    important_accuracies = []

    for x in range(20):
        n = 190 
        important_features = features.head(n)['Feature']

        X_train_important = X_train[important_features]
        X_test_important = X_test[important_features]

        rf_model.fit(X_train_important, y_train)
        important_predictions = rf_model.predict(X_test_important)
        important_accuracy = accuracy_score(y_test, important_predictions)
        important_accuracies.append(important_accuracy)

    average_accuracy = np.mean(important_accuracies)
    std_deviation = np.std(important_accuracies)

    print(f"Improved Random Forest Accuracy with top {n} features: {average_accuracy:.2f} Std Dev: {std_deviation:.2f}")
    print('')
    print("Random Forest Classification Report with Important Features:")
    print(classification_report(y_test, important_predictions))
    
datasets = [
    '/Users/charlesmorgan/Desktop/Merged Data/combined_all_stats.csv'
]

for dataset in datasets:
    analyze_dataset(dataset)

Initial Random Forest Accuracy: 0.71

Top 10 Features:
                        Feature  Importance
155     specialteams_ki_KO_home    0.010610
62          defense_ru_Att_home    0.009325
156  specialteams_ki_Yds_y_home    0.008343
219         defense_pa_Att_away    0.008120
13          offense_pa_Sck_home    0.007388
37      offense_do_4th Att_home    0.007249
36       offense_do_3rd Md_home    0.006739
46       offense_sc_Tot TD_home    0.006732
9          offense_pa_1st%_home    0.006721
39      offense_do_Rec 1st_home    0.006434

Improved Random Forest Accuracy with top 190 features: 0.75 Std Dev: 0.00

Random Forest Classification Report with Important Features:
              precision    recall  f1-score   support

           0       0.72      0.70      0.71        30
           1       0.77      0.79      0.78        38

    accuracy                           0.75        68
   macro avg       0.75      0.74      0.75        68
weighted avg       0.75      0.75      0.75        6

In [46]:
#Cross validation to show how well it would actually perform on outside data.
from sklearn.model_selection import StratifiedKFold

def analyze_dataset(data_path):
    data = pd.read_csv(data_path)
    
    columns_to_drop = ['Week', 'Away Team', 'Home Team', 'Winning Team']
    X = data.drop(columns_to_drop + ['Home_Win'], axis=1)
    y = data['Home_Win']
    
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cross_val_scores = cross_val_score(rf_model, X, y, cv=cv, scoring='accuracy')
    
    print(f"Cross-Validation Accuracy: {np.mean(cross_val_scores):.2f} ± {np.std(cross_val_scores):.2f}")
    
    rf_model.fit(X, y)
    feature_importances = rf_model.feature_importances_
    features = pd.DataFrame({
        'Feature': X.columns,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)
    
    print("\nTop 10 Features:")
    print(features.head(10))
    print('')
    
    n = 100 
    important_features = features.head(n)['Feature']
    
    X_important = X[important_features]
    cross_val_scores_important = cross_val_score(rf_model, X_important, y, cv=cv, scoring='accuracy')
    
    print(f"Cross-Validation Accuracy with Top {n} Features: {np.mean(cross_val_scores_important):.2f} ± {np.std(cross_val_scores_important):.2f}")
    print('')

datasets = [
    '/Users/charlesmorgan/Desktop/Merged Data/combined_all_stats.csv'
]

for dataset in datasets:
    analyze_dataset(dataset)


Cross-Validation Accuracy: 0.61 ± 0.04

Top 10 Features:
                        Feature  Importance
233         defense_ru_Att_away    0.012323
178        offense_pa_Rate_away    0.008906
155     specialteams_ki_KO_home    0.008879
327  specialteams_ki_Yds_y_away    0.008686
37      offense_do_4th Att_home    0.007656
45       offense_sc_Rec TD_home    0.007420
326     specialteams_ki_KO_away    0.007368
40     offense_do_Rec 1st%_home    0.006902
219         defense_pa_Att_away    0.006835
197         offense_re_Yds_away    0.006669

Cross-Validation Accuracy with Top 100 Features: 0.64 ± 0.06

