In [None]:
import os
import time

import eli5
import graphviz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from eli5.sklearn import PermutationImportance
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz


In [None]:
# Load the combined dataset that the rest of the notebook works on.
df = pd.read_csv('combinedData.csv')

# Team Matching

Teams that changed their name

In [None]:
# Renamed teams, as pairs of team abbreviations (tmID values).
team_match = dict(CON='ORL', TUL='DET', SAS='UTA')

# The same three pairs as integer codes, compared against `tmID_encoded` later on.
# NOTE(review): presumably these are the label-encoded values of the
# abbreviations above - confirm against the encoder output.
team_match_encoded = dict(zip((4, 20, 16), (12, 5, 18)))

# Feature Elimination

In [None]:
# Remove the columns that are not kept as model inputs.
df = df.drop(
    labels=[
        'results', 'seed', 'college', 'collegeOther', 'birthDate', 'win_ratio', 'stint',
        'GS', 'GP', 'rebounds', 'fgAttempted', 'ftAttempted', 'threeAttempted',
        'minutes', 'pointsFromFieldGoal',
        'pos', 'age',
        'award', 'coachAward',
        'ftMade', 'topg', 'percentage_pointsFromThree',
    ],
    axis='columns',
)

# Encoding

Encoding categorical features and id features

In [None]:
# Categorical columns are one-hot encoded here; the identifier columns are
# label-encoded in a later cell.
categorical_features = ['confID']
id_features = ['tmID', 'playerID', 'coachID']

df = pd.get_dummies(data=df, columns=categorical_features)

df

In [None]:
# Fit one encoder per identifier column. A single shared encoder only keeps the
# classes of the last column it was fitted on, so the team-code mapping could
# not be read back from it.
label_encoders = {}
for feature in id_features:
    label_encoders[feature] = LabelEncoder()
    df[f'{feature}_encoded'] = label_encoders[feature].fit_transform(df[feature])

# Kept for backward compatibility: this name used to end up fitted on the last
# identifier column.
label_encoder = label_encoders[id_features[-1]]

# Turn the playoff flag into 0/1. Any other value is passed through unchanged,
# exactly as `replace` did, without relying on replace's deprecated downcasting.
playoff_codes = {'N': 0, 'Y': 1}
df['playoff'] = df['playoff'].map(lambda flag: playoff_codes.get(flag, flag))


new_id_features = id_features.copy()
new_id_features.append('year')

# distinct tmIDs in order of first appearance (kept for inspection)

id_df = df['tmID'].copy()

id_df = id_df.reset_index()
id_df = id_df.drop(columns=['index'])
id_df = id_df.drop_duplicates()
id_df = id_df.reset_index()

# array of tmIDs, taken from the fitted encoder so that tmIDs[code] is always
# the team whose encoded value is `code`. Building it from order of appearance
# only agreed with the encoder when the file happened to be sorted by tmID.
tmIDs = label_encoders['tmID'].classes_
print(tmIDs)

tmIDS_inverted = {tmid: n for n, tmid in enumerate(tmIDs)}
print(tmIDS_inverted)


In [None]:
# Display the frame after encoding.
df

In [None]:
# One row per distinct (playerID, playerID_encoded) pair; the final reset_index
# keeps the pre-deduplication row position in an 'index' column.
player_enc_df = (
    df[['playerID', 'playerID_encoded']]
    .reset_index(drop=True)
    .drop_duplicates()
    .reset_index()
)

# encoded value -> playerID
player_enc_dict = dict(zip(player_enc_df['playerID_encoded'], player_enc_df['playerID']))
print(player_enc_dict)

# playerID -> encoded value
player_enc_dict_inverted = dict(zip(player_enc_dict.values(), player_enc_dict.keys()))
print(player_enc_dict_inverted)

In [None]:
# One row per distinct (coachID, coachID_encoded) pair; the final reset_index
# keeps the pre-deduplication row position in an 'index' column.
coach_enc_df = (
    df[['coachID', 'coachID_encoded']]
    .reset_index(drop=True)
    .drop_duplicates()
    .reset_index()
)

# encoded value -> coachID
coach_enc_dict = dict(zip(coach_enc_df['coachID_encoded'], coach_enc_df['coachID']))
print(coach_enc_dict)

# coachID -> encoded value
coach_enc_dict_inverted = dict(zip(coach_enc_dict.values(), coach_enc_dict.keys()))
print(coach_enc_dict_inverted)


In [None]:
# Drop the original, un-encoded id columns; their `*_encoded` counterparts stay.
# Note: this reassigns `df`, so the encoding cells above cannot be re-run
# afterwards without reloading the data.
df = df.drop(columns=id_features)

In [None]:
# Every column except the target is a model input.
feature_cols = list(filter(lambda name: name not in ['playoff'], df.columns))
print(feature_cols)

# Data Leakage

Features that are not replaced with last year's values:
 - tmID
 - year
 - playerID
 - stint
 - pos
 - height
 - weight
 - college
 - collegeOther
 - birthDate
 - age
 - coachID
 - coachStint
 - confID

In [None]:
# Feature columns whose values are overwritten with an earlier season's record
# at prediction time. Identifiers, year, stints and conference columns are
# excluded and therefore keep their own values.
features_to_replace = list(filter(
    lambda name: name not in (
        'tmID', 'year', 'playerID', 'stint', 'coachID', 'coachStint', 'confID',
        'tmID_encoded', 'playerID_encoded', 'coachID_encoded', 'confID_EA', 'confID_WE',
    ),
    feature_cols,
))

print(features_to_replace)

In [None]:
#debug
# df.to_csv('featureRemoval.csv')
# df

# Format Final Year Results

So that it matches the training data format

In [None]:
def create_predictions_df(data, playoffs, certainties):
    """Aggregate per-row predictions into one row per team.

    Parameters:
        data: feature frame with `tmID_encoded` and `confID_EA` columns; it is
            not modified (a re-indexed copy is used).
        playoffs: predicted class for each row of `data`.
        certainties: 2-D array of class probabilities; column 1 is used.

    Returns:
        DataFrame with one row per `tmID` holding the mean `confID_EA`, the
        mean `certainty` (column-1 probability in percent) and `playoff`
        (1 when the mean certainty is at least 50, else 0), sorted by
        `confID_EA` in descending order.
    """
    frame = data.reset_index(drop=True)

    # decode the team code back to its tmID through the module-level tmIDs array
    frame['tmID'] = frame['tmID_encoded'].apply(lambda code: tmIDs[code])

    per_row = pd.concat(
        [
            frame[['tmID', 'confID_EA']],
            pd.DataFrame({
                'playoff': playoffs,
                'certainty': certainties[:, 1] * 100
            }),
        ],
        axis=1,
    )

    # average every column per team, then derive the team-level decision from
    # the averaged certainty (this overwrites the averaged per-row predictions)
    per_team = per_row.groupby(['tmID']).mean().reset_index()
    per_team['playoff'] = per_team['certainty'].ge(50).astype('int64')

    return per_team.sort_values(by=['confID_EA'], ascending=False)

# Year 11 Prep

Encoding of year 11 features, matching previous encoding

In [None]:
year11 = pd.read_csv('dataset11/edited/all.csv')

# columns in the file: tmID,year,coachID,coachStint,confID,playerID,stint,lgID
year11 = year11.drop(columns=['stint', 'lgID'])

year11 = pd.get_dummies(year11, columns=categorical_features)

# Encode tmID with the training mapping. Teams missing from it get code 20,
# which is one of the keys of team_match_encoded.
year11['tmID_encoded'] = year11['tmID'].map(lambda team: tmIDS_inverted.get(team, 20))

# Players missing from the training mapping get -1, a code no training row can
# have. The previous default of 0 is a real player's code, so an unknown player
# could be matched to that player's history.
UNKNOWN_PLAYER_CODE = -1
year11['playerID_encoded'] = year11['playerID'].map(
    lambda player: player_enc_dict_inverted.get(player, UNKNOWN_PLAYER_CODE)
)

# NOTE(review): unknown coaches still fall back to 0, which is also a real
# coach's code, and this column is fed to the model as a feature - confirm
# that this is intended.
year11['coachID_encoded'] = year11['coachID'].map(lambda coach: coach_enc_dict_inverted.get(coach, 0))

year11 = year11.drop(columns=['tmID', 'playerID', 'coachID'])

year11

Function that predicts the year 11 playoffs, based on a model.
Also calculates the certainty of the prediction, formats the dataframe and saves it to a csv

In [None]:
def predict_playoffs(model, model_name):
    """Predict the year-11 playoffs with a fitted model and save the result.

    Each row of the module-level `year11` frame gets its `features_to_replace`
    columns filled from the most recent record in `df` with the same team and
    player codes; rows without such a record are dropped. One hand-built
    record is appended, the model is applied, and the per-team summary from
    `create_predictions_df` is written to
    `predictions/<model_name>_predictions.csv`.

    Parameters:
        model: fitted classifier exposing `predict` and `predict_proba`.
        model_name: prefix of the output file name.

    Returns:
        The per-team prediction DataFrame.
    """
    data = year11.copy()

    # newest seasons first, so `.iloc[0]` below is the most recent record
    replace_data = df.sort_values(by=['year'], ascending=False)
    key_columns = ['tmID_encoded', 'playerID_encoded']

    rows_without_history = []
    for index, current_row in data.iterrows():
        team_code = current_row[key_columns[0]]

        # replace tmIDs that are in the team_match_encoded map
        if team_code in team_match_encoded:
            data.at[index, key_columns[0]] = team_match_encoded[team_code]

        # NOTE(review): the lookup uses the team code as read from `year11`,
        # not the remapped one written just above. Rows carrying the
        # unknown-team code 20 therefore never match and are dropped;
        # presumably the hand-built record below compensates - confirm.
        matching_entry = replace_data[
            (replace_data[key_columns[0]] == team_code)
            & (replace_data[key_columns[1]] == current_row[key_columns[1]])
        ]

        if matching_entry.empty:
            # no history for this team/player pair: remove the row
            print(f'Removing row {index}, row: {current_row}')
            rows_without_history.append(index)
            continue

        # copy the historical values into the year-11 row
        latest_record = matching_entry.iloc[0]
        for column in features_to_replace:
            data.at[index, column] = latest_record[column]

    # drop after the loop instead of mutating the frame while iterating over it
    data = data.drop(index=rows_without_history)

    # add entry to the data
    # DET,11,lattaiv01w,0,31,1,221,93,4,15,19,20,8,2,16,24,87,34,7,3,49,22,0,7.129032258064516,3.0,0.6451612903225806,0.6129032258064516,0.2580645161290322,0.064516129032258,0.5161290322580645,0.7741935483870968,39.08045977011494,44.89795918367347,42.85714285714285,0.1290322580645161,0.4838709677419355,12.0,12.903225806451612,3.225806451612903,23.655913978494624,G,66.0,143.0,North Carolina,,1984-09-24,39.04,,laimbbi01w,0,24,10,2.4,EA,1,label3,Y,2261,785,442,380,931,1311,563,702,227,564,117,2697,895,2262,547,718,203,610,309,780,1089,546,767,276,529,117,2540,0.706,0.706,0.706,6850,42.99,75.8,35.75,39.57,76.18,33.28,0

    new_row = {
        'year': 11,'points':93,'oRebounds':4,'dRebounds':15,'assists':20,'steals':8,'blocks':2,'turnovers':16,'PF':24,'fgMade':34,'threeMade':22,'dq':0,'mpg':7.13,'ppg':3,'apg':0.65,'rpg':0.61,'spg':0.26,'bpg':0.06,'pfpg':0.77,'fg%':39.08,'3p%':44.9,'ft%':42.86,'orpg':0.13,'drpg':0.48,'percentage_pointsFromFieldGoal':12.9,'percentage_pointsFromFreeThrow':3.29,'height':66,'weight':143,'coachStint':0,'coachWon':24,'coachLost':10,'coach W/L Ratio':2.4,'o_fga':2261,'o_fta':785,'o_3pa':442,'o_oreb':380,'o_dreb':931,'o_reb':1311,'o_asts':563,'o_pf':702,'o_stl':227,'o_to':564,'o_blk':117,'o_pts':2697,'d_fgm':895,'d_fga':2262,'d_ftm':547,'d_fta':718,'d_3pm':203,'d_3pa':610,'d_oreb':309,'d_dreb':780,'d_reb':1089,'d_asts':546,'d_pf':767,'d_stl':276,'d_to':529,'d_blk':117,'d_pts':2540,'homeW_ratio':0.71,'awayW_ratio':0.71,'min':6850,'o_fg%':42.99,'o_ft%':75.8,'o_3p%':35.75,'d_fg%':39.57,'d_ft%':76.18,'d_3p%':33.28,'confID_EA':False,'confID_WE':True,'tmID_encoded':5,'playerID_encoded':268,'coachID_encoded':134
    }

    # sort the features so that they match df
    data = data[feature_cols]

    data = pd.concat([data, pd.DataFrame([new_row])], ignore_index=True)

    # Predict the classes
    predictions = model.predict(data)

    # Predict the probabilities
    certainties = model.predict_proba(data)

    result = create_predictions_df(data, predictions, certainties)

    # to_csv does not create missing directories
    os.makedirs('predictions', exist_ok=True)
    result.to_csv(f'predictions/{model_name}_predictions.csv')
    return result

# Feature Importance Graph

In [None]:
def importance_graph(importances, X_train):
    """Draw a bar chart of feature importances, largest first.

    Parameters:
        importances: NumPy array with one score per column of `X_train`.
        X_train: DataFrame whose column names label the bars.
    """
    # positions of the scores from highest to lowest
    order = np.argsort(importances)[::-1]
    bar_positions = range(X_train.shape[1])

    # column names arranged in the same order as the sorted scores
    labels = list(X_train.columns[order])

    plt.figure(figsize=(18, 6))
    plt.title("Feature Importance")
    plt.bar(bar_positions, importances[order])
    plt.xticks(bar_positions, labels, rotation=90)
    plt.show()

# Decision Tree

In [None]:
# start window with the first 5 years of training data:
# the first test year is `yearsInit` and training uses the years 1..yearsInit-1
yearsInit = 6

In [None]:
max_depth = [i for i in range(1, 20)]
max_features = [None, 'sqrt', 'log2']

best_accuracies = []
best_precisions = []
best_recalls = []
best_f1s = []
best_accuracy = 0
best_model = None

start = time.time()

# Build the walk-forward folds once. They do not depend on the hyperparameters,
# so rebuilding them for every combination only repeated identical work.
key_columns = ['tmID_encoded', 'playerID_encoded']
folds = []
for test_year in range(yearsInit, 11):
    train_years = range(1, test_year)

    train_data = df[df['year'].isin(train_years)]
    replace_data = df[df['year'] == test_year - 1]
    # explicit copy: the rows are edited below and must never alias df
    test_data = df[df['year'] == test_year].copy()

    # replace data with last years data
    rows_to_drop = []
    for index, current_row in test_data.iterrows():
        team_code = current_row[key_columns[0]]
        is_renamed_team = team_code in team_match_encoded

        # replace tmIDs that are in the team_match_encoded dictionary
        # NOTE(review): only this section remaps the team code of rows that also
        # have a previous-season match - confirm which behaviour is intended.
        if is_renamed_team:
            test_data.at[index, key_columns[0]] = team_match_encoded[team_code]

        # previous-season record, looked up with the team code as stored in df
        matching_entry = replace_data[
            (replace_data[key_columns[0]] == team_code)
            & (replace_data[key_columns[1]] == current_row[key_columns[1]])
        ]

        if not matching_entry.empty:
            # Replace values in features_to_replace with data from the matching entry
            previous_season = matching_entry.iloc[0]
            for column in features_to_replace:
                test_data.at[index, column] = previous_season[column]
        elif not is_renamed_team:
            # Remove the row if there is no matching entry. Rows of renamed
            # teams are kept with the remapped code.
            # NOTE(review): those kept rows retain their own-season statistics -
            # confirm this is intended.
            rows_to_drop.append(index)

    test_data = test_data.drop(index=rows_to_drop)

    folds.append((
        test_year,
        train_data[feature_cols], train_data['playoff'],
        test_data[feature_cols], test_data['playoff'],
    ))

# NOTE(review): no random_state is set, so results can differ between runs
# when max_features is 'sqrt' or 'log2'.
for depth in max_depth:
    for feature in max_features:
        accuracies = []
        precisions = []
        recalls = []
        f1s = []
        model = DecisionTreeClassifier(max_depth=depth, max_features=feature)

        for i, X_train, y_train, X_test, y_test in folds:
            model.fit(X_train, y_train)

            # Make predictions for individual players
            player_predictions = model.predict(X_test)
            prediction_probabilities = np.around(model.predict_proba(X_test), decimals=4)

            # Evaluate the model (the F1 entry used to be a second recall_score call)
            accuracies.append(accuracy_score(y_test, player_predictions))
            precisions.append(precision_score(y_test, player_predictions))
            recalls.append(recall_score(y_test, player_predictions))
            f1s.append(f1_score(y_test, player_predictions))

        accuracy_average = round(sum(accuracies)/len(accuracies), 3)
        if accuracy_average > best_accuracy:
            best_accuracy = accuracy_average
            best_model = model
            best_accuracies = accuracies
            best_precisions = precisions
            best_recalls = recalls
            best_f1s = f1s

duration = time.time() - start

In [None]:
# Show the last fold's team-level predictions of the selected model.
# `player_predictions` / `prediction_probabilities` belong to whichever
# hyperparameter combination ran last, not to `best_model`.
best_predictions = best_model.predict(X_test)
best_probabilities = np.around(best_model.predict_proba(X_test), decimals=4)
pred_df = create_predictions_df(X_test, best_predictions, best_probabilities)
pred_df

In [None]:
# Per-fold metrics of the selected configuration, one column per metric.
data = pd.DataFrame({
    'Accuracy': best_accuracies,
    'Precision': best_precisions,
    'Recall': best_recalls,
    'F1 Score': best_f1s,
})

# Box plot of the four metrics on a single axes
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=data, palette="Set3", ax=ax)
ax.set_title("Decision Tree Metrics Box Plot")

plt.show()


In [None]:
# Mean of each metric over the folds of the selected configuration.
accuracy_average = round(sum(best_accuracies)/len(best_accuracies), 3)
precision_average = round(sum(best_precisions)/len(best_precisions), 3)
recall_average = round(sum(best_recalls)/len(best_recalls), 3)
f1s_average = round(sum(best_f1s)/len(best_f1s), 3)

# Use the percent format: multiplying the rounded value by 100 can print float
# noise (e.g. 0.57 * 100 -> 56.99999999999999).
print(f"Accuracy Average: {accuracy_average:.1%}")
print(f"Precision Average: {precision_average:.1%}")
print(f"Recall Average: {recall_average:.1%}")
print(f"F1 Average: {f1s_average:.1%}")

In [None]:
# Wall-clock time of the decision-tree hyperparameter search.
print(f"Duration: {duration} seconds")

In [None]:
#debug
# dot_data = export_graphviz(best_model, out_file=None,
#                           feature_names=X_train.columns, # Specify your feature names
#                           class_names=['No Playoff', 'Playoff'], # Specify your class names
#                           filled=True, rounded=True, special_characters=True)

# graph = graphviz.Source(dot_data)
# graph.render("decision_tree") # This will save the tree as 'decision_tree.pdf'
#graph.view("decision_tree")   # This will open the tree in your default PDF viewer


In [None]:
# Most important attributes: importances stored on the selected tree, plotted
# with the column names of X_train.
importances = best_model.feature_importances_

importance_graph(importances, X_train)

## Year 11

In [None]:
# Year-11 prediction with the selected model; predict_playoffs also writes the
# result to predictions/decision_tree_predictions.csv.
pred = predict_playoffs(best_model, 'decision_tree')
pred

# Random Forest

In [None]:
max_depth = [i for i in range(1, 20)]
max_features =  [None, 'sqrt', 'log2']

best_accuracies = []
best_precisions = []
best_recalls = []
best_f1s = []
best_accuracy = 0
best_model = None

start = time.time()

# Build the walk-forward folds once. They do not depend on the hyperparameters,
# so rebuilding them for every combination only repeated identical work.
key_columns = ['tmID_encoded', 'playerID_encoded']
folds = []
for test_year in range(yearsInit, 11):
    train_years = range(1, test_year)

    train_data = df[df['year'].isin(train_years)]
    replace_data = df[df['year'] == test_year - 1]
    # explicit copy: the rows are edited below and must never alias df
    test_data = df[df['year'] == test_year].copy()

    # replace data with last years data
    rows_to_drop = []
    for index, current_row in test_data.iterrows():
        team_code = current_row[key_columns[0]]

        # Check if there's a matching entry in the previous year's dataframe
        matching_entry = replace_data[
            (replace_data[key_columns[0]] == team_code)
            & (replace_data[key_columns[1]] == current_row[key_columns[1]])
        ]

        if not matching_entry.empty:
            # Replace values in features_to_replace with data from the matching entry
            previous_season = matching_entry.iloc[0]
            for column in features_to_replace:
                test_data.at[index, column] = previous_season[column]
        elif team_code in team_match_encoded:
            # no previous-season match, but the team is in team_match_encoded:
            # keep the row and switch to the mapped team code
            # NOTE(review): such rows retain their own-season statistics -
            # confirm this is intended.
            test_data.at[index, key_columns[0]] = team_match_encoded[team_code]
        else:
            # Remove the row if there is no matching entry
            rows_to_drop.append(index)

    test_data = test_data.drop(index=rows_to_drop)

    folds.append((
        test_year,
        train_data[feature_cols], train_data['playoff'],
        test_data[feature_cols], test_data['playoff'],
    ))

# NOTE(review): no random_state is set, so results can differ between runs.
for depth in max_depth:
    for feature in max_features:
        accuracies = []
        precisions = []
        recalls = []
        f1s = []
        model = RandomForestClassifier(max_depth=depth, max_features=feature)

        for i, X_train, y_train, X_test, y_test in folds:
            model.fit(X_train, y_train)

            # Make predictions for individual players
            player_predictions = model.predict(X_test)
            prediction_probabilities = np.around(model.predict_proba(X_test), decimals=4)

            # Evaluate the model (the F1 entry used to be a second recall_score call)
            accuracies.append(accuracy_score(y_test, player_predictions))
            precisions.append(precision_score(y_test, player_predictions))
            recalls.append(recall_score(y_test, player_predictions))
            f1s.append(f1_score(y_test, player_predictions))

        accuracy_average = round(sum(accuracies)/len(accuracies), 3)
        if accuracy_average > best_accuracy:
            best_accuracy = accuracy_average
            best_model = model
            best_accuracies = accuracies
            best_precisions = precisions
            best_recalls = recalls
            best_f1s = f1s

duration = time.time() - start

In [None]:
# Show the last fold's team-level predictions of the selected model.
# `player_predictions` / `prediction_probabilities` belong to whichever
# hyperparameter combination ran last, not to `best_model`.
best_predictions = best_model.predict(X_test)
best_probabilities = np.around(best_model.predict_proba(X_test), decimals=4)
pred_df = create_predictions_df(X_test, best_predictions, best_probabilities)
pred_df

In [None]:
# Per-fold metrics of the selected configuration, one column per metric.
data = pd.DataFrame({
    'Accuracy': best_accuracies,
    'Precision': best_precisions,
    'Recall': best_recalls,
    'F1 Score': best_f1s,
})

# Box plot of the four metrics on a single axes
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=data, palette="Set3", ax=ax)
ax.set_title("Random Forest Metrics Box Plot")

plt.show()

In [None]:
# Mean of each metric over the folds of the selected configuration.
accuracy_average = round(sum(best_accuracies)/len(best_accuracies), 3)
precision_average = round(sum(best_precisions)/len(best_precisions), 3)
recall_average = round(sum(best_recalls)/len(best_recalls), 3)
f1s_average = round(sum(best_f1s)/len(best_f1s), 3)

# Use the percent format: multiplying the rounded value by 100 can print float
# noise (e.g. 0.57 * 100 -> 56.99999999999999).
print(f"Accuracy Average: {accuracy_average:.1%}")
print(f"Precision Average: {precision_average:.1%}")
print(f"Recall Average: {recall_average:.1%}")
print(f"F1 Average: {f1s_average:.1%}")

In [None]:
# Wall-clock time of the random-forest hyperparameter search.
print(f"Duration: {duration} seconds")

In [None]:
# Feature importance: importances stored on the selected forest, plotted with
# the column names of X_train.
importances = best_model.feature_importances_
importance_graph(importances, X_train)

## Year 11

In [None]:
# Year-11 prediction with the selected model; predict_playoffs also writes the
# result to predictions/random_forest_predictions.csv.
pred = predict_playoffs(best_model, 'random_forest')
pred

# SVM

In [None]:
C = [0.1, 1, 10, 100, 1000]
gamma = ['scale', 'auto']  # Kernel coefficient
kernel = ['linear', 'rbf', 'poly', 'sigmoid']  # Type of SVM

best_accuracies = []
best_precisions = []
best_recalls = []
best_f1s = []
best_accuracy = 0
best_model = None

start = time.time()

# Build the walk-forward folds once. They do not depend on the hyperparameters,
# so rebuilding them for every combination only repeated identical work.
key_columns = ['tmID_encoded', 'playerID_encoded']
folds = []
for test_year in range(yearsInit, 11):
    train_years = range(1, test_year)

    train_data = df[df['year'].isin(train_years)]
    replace_data = df[df['year'] == test_year - 1]
    # explicit copy: the rows are edited below and must never alias df
    test_data = df[df['year'] == test_year].copy()

    # replace data with last years data
    rows_to_drop = []
    for index, current_row in test_data.iterrows():
        team_code = current_row[key_columns[0]]

        # Check if there's a matching entry in the previous year's dataframe
        matching_entry = replace_data[
            (replace_data[key_columns[0]] == team_code)
            & (replace_data[key_columns[1]] == current_row[key_columns[1]])
        ]

        if not matching_entry.empty:
            # Replace values in features_to_replace with data from the matching entry
            previous_season = matching_entry.iloc[0]
            for column in features_to_replace:
                test_data.at[index, column] = previous_season[column]
        elif team_code in team_match_encoded:
            # no previous-season match, but the team is in team_match_encoded:
            # keep the row and switch to the mapped team code
            # NOTE(review): such rows retain their own-season statistics -
            # confirm this is intended.
            test_data.at[index, key_columns[0]] = team_match_encoded[team_code]
        else:
            # Remove the row if there is no matching entry
            rows_to_drop.append(index)

    test_data = test_data.drop(index=rows_to_drop)

    folds.append((
        test_year,
        train_data[feature_cols], train_data['playoff'],
        test_data[feature_cols], test_data['playoff'],
    ))

for c in C:
    for g in gamma:
        for k in kernel:
            accuracies = []
            precisions = []
            recalls = []
            f1s = []
            model = SVC(kernel=k, C=c, gamma=g, probability=True)

            for i, X_train, y_train, X_test, y_test in folds:
                model.fit(X_train, y_train)

                # Make predictions for individual players
                player_predictions = model.predict(X_test)

                # probabilities rounded to 4 decimal places
                prediction_probabilities = np.around(model.predict_proba(X_test), decimals=4)

                # Evaluate the model (the F1 entry used to be a second recall_score call)
                accuracies.append(accuracy_score(y_test, player_predictions))
                precisions.append(precision_score(y_test, player_predictions))
                recalls.append(recall_score(y_test, player_predictions))
                f1s.append(f1_score(y_test, player_predictions))
                print(f"year {i} done")

            accuracy_average = round(sum(accuracies)/len(accuracies), 3)
            if accuracy_average > best_accuracy:
                best_accuracy = accuracy_average
                best_model = model
                best_accuracies = accuracies
                best_precisions = precisions
                best_recalls = recalls
                best_f1s = f1s

duration = time.time() - start

In [None]:
# Show the last fold's team-level predictions of the selected model.
# `player_predictions` / `prediction_probabilities` belong to whichever
# hyperparameter combination ran last, not to `best_model`.
best_predictions = best_model.predict(X_test)
best_probabilities = np.around(best_model.predict_proba(X_test), decimals=4)
pred_df = create_predictions_df(X_test, best_predictions, best_probabilities)
pred_df

In [None]:
# Per-fold metrics of the selected configuration, one column per metric.
data = pd.DataFrame({
    'Accuracy': best_accuracies,
    'Precision': best_precisions,
    'Recall': best_recalls,
    'F1 Score': best_f1s,
})

# Box plot of the four metrics on a single axes
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=data, palette="Set3", ax=ax)
ax.set_title("SVC Metrics Box Plot")

plt.show()

In [None]:
# Mean of each metric over the folds of the selected configuration.
accuracy_average = round(sum(best_accuracies)/len(best_accuracies), 3)
precision_average = round(sum(best_precisions)/len(best_precisions), 3)
recall_average = round(sum(best_recalls)/len(best_recalls), 3)
f1s_average = round(sum(best_f1s)/len(best_f1s), 3)

# Use the percent format: multiplying the rounded value by 100 can print float
# noise (e.g. 0.57 * 100 -> 56.99999999999999).
print(f"Accuracy Average: {accuracy_average:.1%}")
print(f"Precision Average: {precision_average:.1%}")
print(f"Recall Average: {recall_average:.1%}")
print(f"F1 Average: {f1s_average:.1%}")

In [None]:
# Wall-clock time of the SVM hyperparameter search.
print(f"Duration: {duration} seconds")

In [None]:
# Feature importance: eli5 permutation importance of the selected model,
# computed on the last test fold left behind by the search (X_test, y_test).
perm = PermutationImportance(best_model, random_state=1).fit(X_test, y_test)
eli5.show_weights(perm, feature_names=X_test.columns.tolist())


## Year 11

In [None]:
# Year-11 prediction with the selected model; predict_playoffs also writes the
# result to predictions/svm_predictions.csv.
pred = predict_playoffs(best_model, 'svm')
pred

# KNN

In [None]:
n_neighbors = [i for i in range(1, 20)]
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']

best_accuracies = []
best_precisions = []
best_recalls = []
best_f1s = []
best_accuracy = 0
best_model = None

start = time.time()

# Build the walk-forward folds once. They do not depend on the hyperparameters,
# so rebuilding them for every combination only repeated identical work.
key_columns = ['tmID_encoded', 'playerID_encoded']
folds = []
for test_year in range(yearsInit, 11):
    train_years = range(1, test_year)

    train_data = df[df['year'].isin(train_years)]
    replace_data = df[df['year'] == test_year - 1]
    # explicit copy: the rows are edited below and must never alias df
    test_data = df[df['year'] == test_year].copy()

    # replace data with last years data
    rows_to_drop = []
    for index, current_row in test_data.iterrows():
        team_code = current_row[key_columns[0]]

        # Check if there's a matching entry in the previous year's dataframe
        matching_entry = replace_data[
            (replace_data[key_columns[0]] == team_code)
            & (replace_data[key_columns[1]] == current_row[key_columns[1]])
        ]

        if not matching_entry.empty:
            # Replace values in features_to_replace with data from the matching entry
            previous_season = matching_entry.iloc[0]
            for column in features_to_replace:
                test_data.at[index, column] = previous_season[column]
        elif team_code in team_match_encoded:
            # no previous-season match, but the team is in team_match_encoded:
            # keep the row and switch to the mapped team code
            # NOTE(review): such rows retain their own-season statistics -
            # confirm this is intended.
            test_data.at[index, key_columns[0]] = team_match_encoded[team_code]
        else:
            # Remove the row if there is no matching entry
            rows_to_drop.append(index)

    test_data = test_data.drop(index=rows_to_drop)

    folds.append((
        test_year,
        train_data[feature_cols], train_data['playoff'],
        test_data[feature_cols], test_data['playoff'],
    ))

for n in n_neighbors:
    for w in weights:
        for m in metric:
            accuracies = []
            precisions = []
            recalls = []
            f1s = []
            model = KNeighborsClassifier(n_neighbors=n, weights=w, metric=m)

            for i, X_train, y_train, X_test, y_test in folds:
                model.fit(X_train, y_train)

                # Make predictions for individual players
                player_predictions = model.predict(X_test)
                prediction_probabilities = np.around(model.predict_proba(X_test), decimals=4)

                # Evaluate the model (the F1 entry used to be a second recall_score call)
                accuracies.append(accuracy_score(y_test, player_predictions))
                precisions.append(precision_score(y_test, player_predictions))
                recalls.append(recall_score(y_test, player_predictions))
                f1s.append(f1_score(y_test, player_predictions))
                print(f"year {i} done")

            accuracy_average = round(sum(accuracies)/len(accuracies), 3)
            if accuracy_average > best_accuracy:
                best_accuracy = accuracy_average
                best_model = model
                best_accuracies = accuracies
                best_precisions = precisions
                best_recalls = recalls
                best_f1s = f1s

duration = time.time() - start

In [None]:
# Show the last fold's team-level predictions of the selected model.
# `player_predictions` / `prediction_probabilities` belong to whichever
# hyperparameter combination ran last, not to `best_model`.
best_predictions = best_model.predict(X_test)
best_probabilities = np.around(best_model.predict_proba(X_test), decimals=4)
pred_df = create_predictions_df(X_test, best_predictions, best_probabilities)
pred_df

In [None]:
# Per-fold metrics of the selected configuration, one column per metric.
data = pd.DataFrame({
    'Accuracy': best_accuracies,
    'Precision': best_precisions,
    'Recall': best_recalls,
    'F1 Score': best_f1s,
})

# Box plot of the four metrics on a single axes
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=data, palette="Set3", ax=ax)
ax.set_title("KNN Metrics Box Plot")

plt.show()

In [None]:
# Mean of each metric over the folds of the selected configuration.
accuracy_average = round(sum(best_accuracies)/len(best_accuracies), 3)
precision_average = round(sum(best_precisions)/len(best_precisions), 3)
recall_average = round(sum(best_recalls)/len(best_recalls), 3)
f1s_average = round(sum(best_f1s)/len(best_f1s), 3)

# Use the percent format: multiplying the rounded value by 100 can print float
# noise (e.g. 0.57 * 100 -> 56.99999999999999).
print(f"Accuracy Average: {accuracy_average:.1%}")
print(f"Precision Average: {precision_average:.1%}")
print(f"Recall Average: {recall_average:.1%}")
print(f"F1 Average: {f1s_average:.1%}")

In [None]:
# Wall-clock time of the KNN hyperparameter search.
print(f"Duration: {duration} seconds")

In [None]:
# Importance graph: permutation importance (accuracy) of the selected model on
# the last test fold left behind by the search.
# NOTE(review): no random_state is passed, so the scores can vary between runs.
results = permutation_importance(best_model, X_test, y_test, scoring='accuracy')
importances = results.importances_mean
importance_graph(importances, X_train)

## Year 11

In [None]:
# Year-11 prediction with the selected model; predict_playoffs also writes the
# result to predictions/nearest_neighbors_predictions.csv.
pred = predict_playoffs(best_model, 'nearest_neighbors')
pred

# Naive Bayes

In [None]:
var_smoothing = [1e-09, 1e-08, 1e-07, 1e-06, 1e-05]

best_accuracies = []
best_precisions = []
best_recalls = []
best_f1s = []
best_accuracy = 0
best_model = None

start = time.time()

# Build the walk-forward folds once. They do not depend on the hyperparameters,
# so rebuilding them for every value only repeated identical work.
key_columns = ['tmID_encoded', 'playerID_encoded']
folds = []
for test_year in range(yearsInit, 11):
    train_years = range(1, test_year)

    train_data = df[df['year'].isin(train_years)]
    replace_data = df[df['year'] == test_year - 1]
    # explicit copy: the rows are edited below and must never alias df
    test_data = df[df['year'] == test_year].copy()

    # replace data with last years data
    rows_to_drop = []
    for index, current_row in test_data.iterrows():
        team_code = current_row[key_columns[0]]

        # Check if there's a matching entry in the previous year's dataframe
        matching_entry = replace_data[
            (replace_data[key_columns[0]] == team_code)
            & (replace_data[key_columns[1]] == current_row[key_columns[1]])
        ]

        if not matching_entry.empty:
            # Replace values in features_to_replace with data from the matching entry
            previous_season = matching_entry.iloc[0]
            for column in features_to_replace:
                test_data.at[index, column] = previous_season[column]
        elif team_code in team_match_encoded:
            # no previous-season match, but the team is in team_match_encoded:
            # keep the row and switch to the mapped team code
            # NOTE(review): such rows retain their own-season statistics -
            # confirm this is intended.
            test_data.at[index, key_columns[0]] = team_match_encoded[team_code]
        else:
            # Remove the row if there is no matching entry
            rows_to_drop.append(index)

    test_data = test_data.drop(index=rows_to_drop)

    folds.append((
        test_year,
        train_data[feature_cols], train_data['playoff'],
        test_data[feature_cols], test_data['playoff'],
    ))

for v in var_smoothing:
    accuracies = []
    precisions = []
    recalls = []
    f1s = []
    model = GaussianNB(var_smoothing=v)

    for i, X_train, y_train, X_test, y_test in folds:
        model.fit(X_train, y_train)

        # Make predictions for individual players
        player_predictions = model.predict(X_test)
        prediction_probabilities = np.around(model.predict_proba(X_test), decimals=4)

        # Evaluate the model (the F1 entry used to be a second recall_score call)
        accuracies.append(accuracy_score(y_test, player_predictions))
        precisions.append(precision_score(y_test, player_predictions))
        recalls.append(recall_score(y_test, player_predictions))
        f1s.append(f1_score(y_test, player_predictions))
        print(f"year {i} done")

    accuracy_average = round(sum(accuracies)/len(accuracies), 3)
    if accuracy_average > best_accuracy:
        best_accuracy = accuracy_average
        best_model = model
        best_accuracies = accuracies
        best_precisions = precisions
        best_recalls = recalls
        best_f1s = f1s

duration = time.time() - start

In [None]:
# Show the last fold's team-level predictions of the selected model.
# `player_predictions` / `prediction_probabilities` belong to whichever
# hyperparameter value ran last, not to `best_model`.
best_predictions = best_model.predict(X_test)
best_probabilities = np.around(best_model.predict_proba(X_test), decimals=4)
pred_df = create_predictions_df(X_test, best_predictions, best_probabilities)
pred_df

In [None]:
# Per-fold metrics of the selected configuration, one column per metric.
data = pd.DataFrame({
    'Accuracy': best_accuracies,
    'Precision': best_precisions,
    'Recall': best_recalls,
    'F1 Score': best_f1s,
})

# Box plot of the four metrics on a single axes
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=data, palette="Set3", ax=ax)
ax.set_title("Naive Bayes Metrics Box Plot")

plt.show()

In [None]:
# Mean of each metric over the folds of the selected configuration.
accuracy_average = round(sum(best_accuracies)/len(best_accuracies), 3)
precision_average = round(sum(best_precisions)/len(best_precisions), 3)
recall_average = round(sum(best_recalls)/len(best_recalls), 3)
f1s_average = round(sum(best_f1s)/len(best_f1s), 3)

# Use the percent format: multiplying the rounded value by 100 can print float
# noise (e.g. 0.57 * 100 -> 56.99999999999999).
print(f"Accuracy Average: {accuracy_average:.1%}")
print(f"Precision Average: {precision_average:.1%}")
print(f"Recall Average: {recall_average:.1%}")
print(f"F1 Average: {f1s_average:.1%}")

In [None]:
# Wall-clock time of the Naive Bayes hyperparameter search.
print(f"Duration: {duration} seconds")

In [None]:
#feature importance
# Takes the first row of the selected model's theta_ attribute and passes it to
# importance_graph together with X_train.
# NOTE(review): if best_model is a GaussianNB, theta_ holds the per-class mean of each
# feature, so row 0 would be the class-0 feature means rather than an importance score —
# confirm this is the intended measure.
importances = best_model.theta_[0]
importance_graph(importances, X_train)

## Year 11

In [None]:
# Call predict_playoffs with the selected model and the string 'naive_bayes', then display
# the result. Presumably this yields the year-11 predictions named in the section header —
# verify against predict_playoffs.
pred = predict_playoffs(best_model, 'naive_bayes')
pred

# Neural Network

In [None]:

# f1_score is not among the sklearn.metrics names imported in the notebook's first
# cell, so it is imported here to keep this cell runnable.
from sklearn.metrics import f1_score

# Per-year metrics and fitted model of the best hyper-parameter combination so far.
best_accuracies = []
best_precisions = []
best_recalls = []
best_f1s = []
best_accuracy = 0
best_model = None

# Search space: a single hidden layer of 25..175 units (step 25), plus every
# two-layer combination of those sizes.
layers = [(x,) for x in range(25, 200, 25)]
layers += [(x, y) for x in range(25, 200, 25) for y in range(25, 200, 25)]

hidden_layer_sizes = layers
activation = ['relu', 'logistic', 'tanh']
alpha = [0.0001, 0.001, 0.01]

start = time.time()

# The train/test split of a given year does not depend on the hyper-parameters,
# so every split is built once here instead of once per combination.
key_columns = ['tmID_encoded', 'playerID_encoded']
year_splits = {}

for i in range(yearsInit, 11):
    train_years = range(1, i)
    test_year = i

    train_data = df[df['year'].isin(train_years)]
    replace_data = df[df['year'] == test_year - 1]
    # Explicit copy: the test rows are edited below and must not be a view/slice of df.
    test_data = df[df['year'] == test_year].copy()

    # Replace the test-year values with the previous year's values for the same
    # (team, player) pair; rows without any usable match are removed.
    rows_to_drop = []
    for index, current_row in test_data.iterrows():
        same_team = replace_data[key_columns[0]] == current_row[key_columns[0]]
        same_player = replace_data[key_columns[1]] == current_row[key_columns[1]]
        matching_entry = replace_data[same_team & same_player]

        if not matching_entry.empty:
            # Use the first matching row of the previous year as the feature source.
            previous_row = matching_entry.iloc[0]
            for column in features_to_replace:
                test_data.at[index, column] = previous_row[column]
        elif current_row[key_columns[0]] in team_match_encoded:
            # Team changed its name: map the encoded team id through team_match_encoded.
            # NOTE(review): this branch only rewrites the team id; the columns in
            # features_to_replace keep their test-year values — confirm this is intended.
            test_data.at[index, key_columns[0]] = team_match_encoded[current_row[key_columns[0]]]
        else:
            # No matching entry in the previous year: remove the row.
            rows_to_drop.append(index)

    # Drop all unmatched rows at once instead of mutating the frame while iterating it.
    test_data = test_data.drop(rows_to_drop)

    year_splits[i] = (
        train_data[feature_cols],
        train_data['playoff'],
        test_data[feature_cols],
        test_data['playoff'],
    )

for h in hidden_layer_sizes:
    for a in activation:
        for alp in alpha:
            accuracies = []
            precisions = []
            recalls = []
            f1s = []
            # random_state makes the weight initialisation and batch shuffling
            # repeatable, so a Restart & Run All selects the same best model.
            model = MLPClassifier(max_iter=500, learning_rate_init=0.001, batch_size=64,
                                  hidden_layer_sizes=h, activation=a, alpha=alp,
                                  random_state=1)

            # Years are visited in ascending order, so after the loop the model (and the
            # X_train / X_test / y_test globals used by later cells) belong to the last year.
            for i, (X_train, y_train, X_test, y_test) in year_splits.items():
                model.fit(X_train, y_train)

                # Make predictions for individual players
                player_predictions = model.predict(X_test)
                prediction_probabilities = model.predict_proba(X_test)
                prediction_probabilities = np.around(prediction_probabilities, decimals=4)

                # Evaluate the model
                accuracies.append(accuracy_score(y_test, player_predictions))
                precisions.append(precision_score(y_test, player_predictions))
                recalls.append(recall_score(y_test, player_predictions))
                # Previously recall_score was stored here, so "F1" duplicated recall.
                f1s.append(f1_score(y_test, player_predictions))
                print(f"year {i} done")

            # Keep the combination with the highest mean accuracy (rounded to 3 decimals).
            accuracy_average = round(sum(accuracies)/len(accuracies), 3)
            if accuracy_average > best_accuracy:
                best_accuracy = accuracy_average
                best_model = model
                best_accuracies = accuracies
                best_precisions = precisions
                best_recalls = recalls
                best_f1s = f1s

duration = time.time() - start

In [None]:
# Show the predictions of the *selected* model. The loop variables player_predictions /
# prediction_probabilities hold the output of the last hyper-parameter combination tried,
# which is not necessarily best_model. X_test is still the test split of the final
# evaluated year, and best_model was last fitted on that year's training data.
best_predictions = best_model.predict(X_test)
best_probabilities = np.around(best_model.predict_proba(X_test), decimals=4)
pred_df = create_predictions_df(X_test, best_predictions, best_probabilities)
pred_df

In [None]:
# One column per metric, one row per evaluated year of the best configuration.
data = pd.DataFrame(
    {
        'Accuracy': best_accuracies,
        'Precision': best_precisions,
        'Recall': best_recalls,
        'F1 Score': best_f1s,
    }
)

# Single 12x6 axes holding one box per metric column.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=data, palette="Set3", ax=ax)
ax.set_title("Neural Network Metrics Box Plot")

plt.show()

In [None]:
# Mean of each per-year metric of the best configuration, rounded to 3 decimals.
accuracy_average = round(sum(best_accuracies) / len(best_accuracies), 3)
precision_average = round(sum(best_precisions) / len(best_precisions), 3)
recall_average = round(sum(best_recalls) / len(best_recalls), 3)
f1s_average = round(sum(best_f1s) / len(best_f1s), 3)

# Format with one decimal place: multiplying the rounded ratio by 100 in binary
# floating point can otherwise print artifacts such as 56.99999999999999%.
print(f"Accuracy Average: {accuracy_average * 100:.1f}%")
print(f"Precision Average: {precision_average * 100:.1f}%")
print(f"Recall Average: {recall_average * 100:.1f}%")
print(f"F1 Average: {f1s_average * 100:.1f}%")

In [None]:
# Report the elapsed time of the grid search; duration is time.time() - start, where
# start is taken just before the hyper-parameter loops.
print(f"Duration: {duration} seconds")

In [None]:
#Feature importance graph
# Fit eli5's PermutationImportance wrapper around best_model on X_test / y_test and
# render the resulting weights labelled with the X_test column names.
# NOTE(review): X_test and y_test are the values left by the last iteration of the search
# loop (its final test year) — confirm that is the intended evaluation set.
perm = PermutationImportance(best_model, random_state=1).fit(X_test, y_test)
eli5.show_weights(perm, feature_names = X_test.columns.tolist())

## Year 11

In [None]:
# Call predict_playoffs with the selected model and the string 'neural_network', then
# display the result. Presumably this yields the year-11 predictions named in the section
# header — verify against predict_playoffs.
pred = predict_playoffs(best_model, 'neural_network')
pred