In [4]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [5]:
# Load the datasets
awards_players_df = pd.read_csv('../../dataset/awards_players.csv')
coaches_df = pd.read_csv('../../dataset/coaches.csv')
players_df = pd.read_csv('../../dataset/players.csv')
players_teams_df = pd.read_csv('../../dataset/players_teams.csv')
series_post_df = pd.read_csv('../../dataset/series_post.csv')
teams_df = pd.read_csv('../../dataset/teams.csv')
teams_post_df = pd.read_csv('../../dataset/teams_post.csv')

# Remove useless columns from the datasets
awards_players_df = awards_players_df.drop(columns=['lgID'])
players_df = players_df.drop(columns=['firstseason', 'lastseason', 'deathDate'])
coaches_df = coaches_df.drop(columns=['lgID'])
series_post_df = series_post_df.drop(columns=['lgIDLoser', 'lgIDWinner'])
teams_post_df = teams_post_df.drop(columns=['lgID'])
teams_df = teams_df.drop(columns=['lgID', 'divID', 'seeded', 'tmORB', 'tmDRB', 'tmTRB', 'opptmORB', 'opptmDRB', 'opptmTRB'])
players_teams_df = players_teams_df.drop(columns=['lgID'])

# Display the first few rows of each cleaned DataFrame (optional)
#print(awards_players_df.head())
#print(players_df.head())
#print(coaches_df.head())
#print(series_post_df.head())
#print(teams_post_df.head())
#print(teams_df.head())
#print(players_teams_df.head())

# Merge datasets
players_teams_merged = pd.merge(players_df, players_teams_df, left_on='bioID', right_on='playerID')

team_player_stats = players_teams_merged.groupby('tmID').agg({
    'points': 'mean',
    'assists': 'mean',
    'rebounds': 'mean'
}).reset_index()


teams_df = teams_df.sort_values(['tmID', 'year'])
teams_df['playoffNextYear'] = teams_df['playoff'].shift(-1)
teams_df.loc[teams_df['franchID'] != teams_df['franchID'].shift(-1), 'playoffNextYear'] = None
teams_df['playoffNextYear'] = teams_df['playoffNextYear'].map({'Y': 1, 'N': 0})
teams_df['playoffNextYear'] = teams_df['playoffNextYear'].fillna(0)

teams_full = pd.merge(teams_df, team_player_stats, on='tmID')

features = ['points', 'assists', 'rebounds']
target = 'playoffNextYear'

train_data = teams_full[teams_full['year'] < 5]
test_data = teams_full[teams_full['year'].isin([6,7])]

X_train = train_data[features]
y_train = train_data[target]
X_test = test_data[features]
y_test = test_data[target]

model = DecisionTreeClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:\n', report)

Accuracy: 0.4444444444444444
Classification Report:
               precision    recall  f1-score   support

         0.0       0.36      0.45      0.40        11
         1.0       0.54      0.44      0.48        16

    accuracy                           0.44        27
   macro avg       0.45      0.45      0.44        27
weighted avg       0.46      0.44      0.45        27

