In [50]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

In [51]:
# Load the datasets
awards_players_df = pd.read_csv('../dataset/awards_players.csv')
coaches_df = pd.read_csv('../dataset/coaches.csv')
players_df = pd.read_csv('../dataset/players.csv')
players_teams_df = pd.read_csv('../dataset/players_teams.csv')
series_post_df = pd.read_csv('../dataset/series_post.csv')
teams_df = pd.read_csv('../dataset/teams.csv')
teams_post_df = pd.read_csv('../dataset/teams_post.csv')

In [52]:
# Remove useless columns from the datasets
awards_players_df = awards_players_df.drop(columns=['lgID'])
players_df = players_df.drop(columns=['firstseason', 'lastseason', 'deathDate'])
coaches_df = coaches_df.drop(columns=['lgID'])
series_post_df = series_post_df.drop(columns=['lgIDLoser', 'lgIDWinner'])
teams_post_df = teams_post_df.drop(columns=['lgID'])
teams_df = teams_df.drop(columns=['lgID', 'divID', 'seeded', 'tmORB', 'tmDRB', 'tmTRB', 'opptmORB', 'opptmDRB', 'opptmTRB'])
players_teams_df = players_teams_df.drop(columns=['lgID'])

In [53]:
# Display the first few rows of each cleaned DataFrame (optional)
awards_players_df.head()

Unnamed: 0,playerID,award,year
0,thompti01w,All-Star Game Most Valuable Player,1
1,leslili01w,All-Star Game Most Valuable Player,2
2,leslili01w,All-Star Game Most Valuable Player,3
3,teaslni01w,All-Star Game Most Valuable Player,4
4,swoopsh01w,All-Star Game Most Valuable Player,6


In [54]:
players_df.head()

Unnamed: 0,bioID,pos,height,weight,college,collegeOther,birthDate
0,abrahta01w,C,74.0,190,George Washington,,1975-09-27
1,abrossv01w,F,74.0,169,Connecticut,,1980-07-09
2,adairje01w,C,76.0,197,George Washington,,1986-12-19
3,adamsda01w,F-C,73.0,239,Texas A&M,Jefferson College (JC),1989-02-19
4,adamsjo01w,C,75.0,180,New Mexico,,1981-05-24


In [55]:
coaches_df.head()

Unnamed: 0,coachID,year,tmID,stint,won,lost,post_wins,post_losses
0,adamsmi01w,5,WAS,0,17,17,1,2
1,adubari99w,1,NYL,0,20,12,4,3
2,adubari99w,2,NYL,0,21,11,3,3
3,adubari99w,3,NYL,0,18,14,4,4
4,adubari99w,4,NYL,0,16,18,0,0


In [56]:
series_post_df.head()

Unnamed: 0,year,round,series,tmIDWinner,tmIDLoser,W,L
0,1,FR,A,CLE,ORL,2,1
1,1,FR,B,NYL,WAS,2,0
2,1,FR,C,LAS,PHO,2,0
3,1,FR,D,HOU,SAC,2,0
4,1,CF,E,HOU,LAS,2,0


In [57]:
teams_post_df.head()

Unnamed: 0,year,tmID,W,L
0,1,HOU,6,0
1,1,ORL,1,2
2,1,CLE,3,3
3,1,WAS,0,2
4,1,NYL,4,3


In [58]:
teams_df.head()

Unnamed: 0,year,tmID,franchID,confID,rank,playoff,firstRound,semis,finals,name,...,GP,homeW,homeL,awayW,awayL,confW,confL,min,attend,arena
0,9,ATL,ATL,EA,7,N,,,,Atlanta Dream,...,34,1,16,3,14,2,18,6825,141379,Philips Arena
1,10,ATL,ATL,EA,2,Y,L,,,Atlanta Dream,...,34,12,5,6,11,10,12,6950,120737,Philips Arena
2,1,CHA,CHA,EA,8,N,,,,Charlotte Sting,...,32,5,11,3,13,5,16,6475,90963,Charlotte Coliseum
3,2,CHA,CHA,EA,4,Y,W,W,L,Charlotte Sting,...,32,11,5,7,9,15,6,6500,105525,Charlotte Coliseum
4,3,CHA,CHA,EA,2,Y,L,,,Charlotte Sting,...,32,11,5,7,9,12,9,6450,106670,Charlotte Coliseum


In [59]:
players_teams_df.head()

Unnamed: 0,playerID,year,stint,tmID,GP,GS,minutes,points,oRebounds,dRebounds,...,PostBlocks,PostTurnovers,PostPF,PostfgAttempted,PostfgMade,PostftAttempted,PostftMade,PostthreeAttempted,PostthreeMade,PostDQ
0,abrossv01w,2,0,MIN,26,23,846,343,43,131,...,0,0,0,0,0,0,0,0,0,0
1,abrossv01w,3,0,MIN,27,27,805,314,45,101,...,0,0,0,0,0,0,0,0,0,0
2,abrossv01w,4,0,MIN,30,25,792,318,44,97,...,1,8,8,22,6,8,8,7,3,0
3,abrossv01w,5,0,MIN,22,11,462,146,17,57,...,2,3,7,23,8,4,2,8,2,0
4,abrossv01w,6,0,MIN,31,31,777,304,29,78,...,0,0,0,0,0,0,0,0,0,0


In [60]:
# Merge datasets
players_teams_merged = pd.merge(players_df, players_teams_df, left_on='bioID', right_on='playerID')
players_teams_awards = pd.merge(players_teams_merged, awards_players_df, on=['year', 'playerID'], how='left')

teams_plus_post = pd.merge(teams_df, teams_post_df, on=['year', 'tmID'], how='left')

teams_coaches = pd.merge(teams_plus_post, coaches_df, on=['year', 'tmID'], how='left')
teams_coaches.rename(columns={
    'won_x': 'won_team',
    'lost_x': 'lost_team',
    'won_y': 'won_coach',
    'lost_y': 'lost_coach'
}, inplace=True)

final = pd.merge(teams_coaches, players_teams_awards, on=['year', 'tmID'], how='left')
final.to_csv('final.csv', index=False)

player_points = final.groupby(['year', 'tmID', 'bioID']).agg(
    total_points=pd.NamedAgg(column='points', aggfunc='sum')
).reset_index()

# Columns to aggregate (already present in the final dataframe)
columns_to_aggregate = [
    'o_fgm', 'o_fga', 'o_ftm', 'o_fta', 'o_3pm', 'o_3pa', 'o_oreb', 'o_dreb', 'o_reb',
    'o_asts', 'o_pf', 'o_stl', 'o_to', 'o_blk', 'o_pts', 'd_fgm', 'd_fga', 'd_ftm', 'd_fta',
    'd_3pm', 'd_3pa', 'd_oreb', 'd_dreb', 'd_reb', 'd_asts', 'd_pf', 'd_stl', 'd_to', 'd_blk',
    'd_pts', 'won_team', 'lost_team', 'GP_x', 'homeW', 'homeL', 'awayW', 'awayL', 'confW', 'confL'
]

# Grouping the data and aggregating using the columns from 'final'
grouped_data = final.groupby(['year', 'tmID']).agg(
    {col: 'first' for col in columns_to_aggregate}
).reset_index()

# Function to retrieve and sort players by points
def sorted_players(year, tmID):
    players = player_points[(player_points['year'] == year) & (player_points['tmID'] == tmID)]
    return list(players.sort_values(by='total_points', ascending=False)['bioID'])

# Add ranked player columns dynamically
def expand_players_list(row):
    players = sorted_players(row['year'], row['tmID'])
    player_dict = {f'ranked_player_{i+1}': players[i] if i < len(players) else None for i in range(13)}
    return pd.Series(player_dict)

# Apply the expansion of player lists into columns
expanded_players = grouped_data.apply(expand_players_list, axis=1)


In [61]:
team_player_stats = players_teams_awards.groupby('tmID').agg({
    'points': 'mean',
    'assists': 'mean',
    'rebounds': 'mean',
    'turnovers': 'mean',
    'blocks': 'mean',
    'steals': 'mean'
}).reset_index()



print(team_player_stats.head())

teams_df = teams_df.sort_values(['tmID', 'year'])
teams_df['playoffNextYear'] = teams_df['playoff'].shift(-1)
teams_df.loc[teams_df['franchID'] != teams_df['franchID'].shift(-1), 'playoffNextYear'] = None
teams_df['playoffNextYear'] = teams_df['playoffNextYear'].map({'Y': 1, 'N': 0})
teams_df['playoffNextYear'] = teams_df['playoffNextYear'].fillna(0)

teams_full = pd.merge(teams_df, team_player_stats, on='tmID')

features = ['points', 'assists', 'rebounds', 'won']
target = 'playoffNextYear'

teams_full.to_csv('grouped_data_with_players.csv', index=False)

  tmID      points    assists   rebounds  turnovers    blocks     steals
0  ATL  191.666667  38.111111  81.703704  39.481481  8.740741  22.629630
1  CHA  167.450549  37.780220  72.582418  36.120879  8.912088  19.285714
2  CHI  182.074074  40.425926  81.092593  36.185185  9.629630  19.481481
3  CLE  163.250000  39.211538  75.596154  35.576923  6.596154  18.711538
4  CON  202.363636  46.772727  91.636364  36.318182  9.909091  20.227273


In [62]:
teams_full.head()

Unnamed: 0,year,tmID,franchID,confID,rank,playoff,firstRound,semis,finals,name,...,min,attend,arena,playoffNextYear,points,assists,rebounds,turnovers,blocks,steals
0,9,ATL,ATL,EA,7,N,,,,Atlanta Dream,...,6825,141379,Philips Arena,1.0,191.666667,38.111111,81.703704,39.481481,8.740741,22.62963
1,10,ATL,ATL,EA,2,Y,L,,,Atlanta Dream,...,6950,120737,Philips Arena,0.0,191.666667,38.111111,81.703704,39.481481,8.740741,22.62963
2,1,CHA,CHA,EA,8,N,,,,Charlotte Sting,...,6475,90963,Charlotte Coliseum,1.0,167.450549,37.78022,72.582418,36.120879,8.912088,19.285714
3,2,CHA,CHA,EA,4,Y,W,W,L,Charlotte Sting,...,6500,105525,Charlotte Coliseum,1.0,167.450549,37.78022,72.582418,36.120879,8.912088,19.285714
4,3,CHA,CHA,EA,2,Y,L,,,Charlotte Sting,...,6450,106670,Charlotte Coliseum,1.0,167.450549,37.78022,72.582418,36.120879,8.912088,19.285714


In [63]:
train_data = teams_full[teams_full['year'] < 5]
test_data = teams_full[teams_full['year'].isin([6,7])]

X_train = train_data[features]
y_train = train_data[target]
X_test = test_data[features]
y_test = test_data[target]

In [64]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:\n', report)

Accuracy: 0.6666666666666666
Classification Report:
               precision    recall  f1-score   support

         0.0       0.60      0.55      0.57        11
         1.0       0.71      0.75      0.73        16

    accuracy                           0.67        27
   macro avg       0.65      0.65      0.65        27
weighted avg       0.66      0.67      0.66        27

