In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit, GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from datetime import datetime
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import graphviz
from sklearn.tree import export_graphviz

In [None]:
# Load the combined dataset; the path is relative to the notebook's working directory.
csv_path = 'combinedData.csv'
df = pd.read_csv(csv_path)

Encoding

In [None]:
# Identifier-like columns; they are not one-hot encoded in this cell.
id_features = ['tmID', 'playerID', 'college', 'collegeOther', 'coachID']

# Columns expanded into one indicator column per distinct value.
categorical_features = ['pos', 'award', 'confID', 'results']

# get_dummies removes the listed columns and appends '<column>_<value>' indicators.
df = pd.get_dummies(data=df, columns=categorical_features)


In [None]:
# Integer-encode every identifier column into a new '<name>_encoded' column.
# A fresh encoder is fitted per column, so codes are only comparable within a column.
for feature in id_features:
    label_encoder = LabelEncoder()
    df[f'{feature}_encoded'] = label_encoder.fit_transform(df[feature])

# Turn the 'N'/'Y' target into 0/1.
# `.map` produces a numeric column directly, whereas `.replace` on an object
# column relies on implicit downcasting that recent pandas versions deprecate.
# Any value other than 'N'/'Y' becomes NaN instead of being passed through.
df['playoff'] = df['playoff'].map({'N': 0, 'Y': 1})

# The raw identifier columns are no longer needed once encoded.
df = df.drop(columns=id_features)

In [None]:
# Convert 'birthDate' (YYYY-MM-DD) into whole seconds since the Unix epoch.
# The vectorised Timestamp/Timedelta arithmetic replaces the row-by-row
# `.apply(lambda x: int(x.timestamp()))` and gives the same integers for
# timezone-naive dates (assumes the column holds plain dates — TODO confirm).
birth_dates = pd.to_datetime(df['birthDate'])
df['birthDataUnix'] = (birth_dates - pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')
df = df.drop(columns=['birthDate'])

# Keep an independent snapshot of the encoded frame; a plain `df_original = df`
# would only create a second name for the same object.
df_original = df.copy()

# Every column except the target is a model input.
feature_cols = [col for col in df.columns if col != 'playoff']

# Data Leakage

Features that are **not** replaced with the previous year's values:
 - tmID
 - year
 - playerID
 - stint
 - pos
 - height
 - weight
 - college
 - collegeOther
 - birthDate
 - age
 - coachID
 - coachStint
 - confID

In [None]:
# Columns that keep their own value and are excluded from the previous-season
# substitution: raw names, their encoded counterparts and the one-hot
# indicators for 'confID' and 'pos'.
columns_kept_as_is = {
    'tmID', 'year', 'playerID', 'stint', 'pos', 'height', 'weight', 'college',
    'collegeOther', 'birthDate', 'age', 'coachID', 'coachStint', 'confID',
    'tmID_encoded', 'playerID_encoded', 'college_encoded', 'collegeOther_encoded',
    'coachID_encoded', 'birthDataUnix', 'confID_EA', 'confID_WE', 'pos_C', 'pos_C-F',
    'pos_F', 'pos_F-C', 'pos_F-G', 'pos_G', 'pos_G-F',
}

# Everything else in feature_cols gets replaced; feature_cols order is preserved.
features_to_replace = [name for name in feature_cols if name not in columns_kept_as_is]

print(features_to_replace)

In [None]:
# Walk-forward evaluation: for every season from 2 to 10, train on all earlier
# seasons and test on that season. To avoid leaking same-season statistics into
# the test set, the columns in `features_to_replace` of each test row are
# overwritten with that player's values from the previous season (same team and
# player). Test rows without a previous-season match are discarded.
accuracies = []
# Fixed seed so repeated runs of the notebook give the same trees and scores.
model = DecisionTreeClassifier(random_state=42)

# A (team, player) pair identifies the row to look up in the previous season.
key_columns = ['tmID_encoded', 'playerID_encoded']

for test_year in range(2, 11):
    train_years = range(1, test_year)

    train_data = df[df['year'].isin(train_years)]
    replace_data = df[df['year'] == test_year - 1]
    test_data = df[df['year'] == test_year]

    # One previous-season row per key; the first occurrence wins when a pair
    # appears more than once.
    previous_stats = replace_data.drop_duplicates(subset=key_columns, keep='first')
    previous_stats = previous_stats[key_columns + features_to_replace]

    # Keep the columns that stay as-is (target included) from the test season and
    # pull the remaining ones from the previous season. The inner join drops test
    # rows with no match and keeps the order of the test rows.
    kept_columns = [col for col in test_data.columns if col not in features_to_replace]
    test_data = test_data[kept_columns].merge(
        previous_stats, on=key_columns, how='inner', validate='many_to_one'
    )
    test_data = test_data[list(df.columns)]  # restore the original column order

    y_train = train_data['playoff']
    y_test = test_data['playoff']
    X_train, X_test = train_data[feature_cols], test_data[feature_cols]

    # Refit the decision tree from scratch on this fold's training seasons.
    model.fit(X_train, y_train)

    # One prediction per player row of the test season.
    player_predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, player_predictions)
    accuracies.append(accuracy)


In [None]:
# Per-fold accuracies followed by their mean. The mean divides by the number of
# folds actually recorded instead of a hard-coded 9, and is NaN when there are none.
print(accuracies, ' ')
mean_accuracy = sum(accuracies) / len(accuracies) if accuracies else float('nan')
print(mean_accuracy)

In [None]:
# Export the current `model` (the tree from its most recent fit) as DOT source.
dot_data = export_graphviz(
    model,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=list(X_train.columns),  # plain list of column names
    class_names=['No Playoff', 'Playoff'],  # labels for classes 0 and 1
    filled=True,
    rounded=True,
    special_characters=True,
)

graph = graphviz.Source(dot_data)
# Render once to 'decision_tree.pdf' and open it in the default viewer.
# `view=True` replaces the separate render() + view() calls, which rendered the
# file twice.
graph.render("decision_tree", view=True)
