In [451]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

In [452]:
# Load the datasets
awards_players_df = pd.read_csv('../dataset/awards_players.csv')
coaches_df = pd.read_csv('../dataset/coaches.csv')
players_df = pd.read_csv('../dataset/players.csv')
players_teams_df = pd.read_csv('../dataset/players_teams.csv')
series_post_df = pd.read_csv('../dataset/series_post.csv')
teams_df = pd.read_csv('../dataset/teams.csv')
teams_post_df = pd.read_csv('../dataset/teams_post.csv')

In [453]:
# Remove useless columns from the datasets
awards_players_df = awards_players_df.drop(columns=['lgID'])
players_df = players_df.drop(columns=['firstseason', 'lastseason', 'deathDate'])
coaches_df = coaches_df.drop(columns=['lgID'])
series_post_df = series_post_df.drop(columns=['lgIDLoser', 'lgIDWinner'])
teams_post_df = teams_post_df.drop(columns=['lgID'])
teams_df = teams_df.drop(columns=['lgID', 'divID', 'seeded', 'tmORB', 'tmDRB', 'tmTRB', 'opptmORB', 'opptmDRB', 'opptmTRB'])
players_teams_df = players_teams_df.drop(columns=['lgID'])

In [454]:
# Display the first few rows of each cleaned DataFrame (optional)
awards_players_df.head()

Unnamed: 0,playerID,award,year
0,thompti01w,All-Star Game Most Valuable Player,1
1,leslili01w,All-Star Game Most Valuable Player,2
2,leslili01w,All-Star Game Most Valuable Player,3
3,teaslni01w,All-Star Game Most Valuable Player,4
4,swoopsh01w,All-Star Game Most Valuable Player,6


In [455]:
players_df.head()

Unnamed: 0,bioID,pos,height,weight,college,collegeOther,birthDate
0,abrahta01w,C,74.0,190,George Washington,,1975-09-27
1,abrossv01w,F,74.0,169,Connecticut,,1980-07-09
2,adairje01w,C,76.0,197,George Washington,,1986-12-19
3,adamsda01w,F-C,73.0,239,Texas A&M,Jefferson College (JC),1989-02-19
4,adamsjo01w,C,75.0,180,New Mexico,,1981-05-24


In [456]:
coaches_df.head()

Unnamed: 0,coachID,year,tmID,stint,won,lost,post_wins,post_losses
0,adamsmi01w,5,WAS,0,17,17,1,2
1,adubari99w,1,NYL,0,20,12,4,3
2,adubari99w,2,NYL,0,21,11,3,3
3,adubari99w,3,NYL,0,18,14,4,4
4,adubari99w,4,NYL,0,16,18,0,0


In [457]:
series_post_df.head()

Unnamed: 0,year,round,series,tmIDWinner,tmIDLoser,W,L
0,1,FR,A,CLE,ORL,2,1
1,1,FR,B,NYL,WAS,2,0
2,1,FR,C,LAS,PHO,2,0
3,1,FR,D,HOU,SAC,2,0
4,1,CF,E,HOU,LAS,2,0


In [458]:
teams_post_df.head()

Unnamed: 0,year,tmID,W,L
0,1,HOU,6,0
1,1,ORL,1,2
2,1,CLE,3,3
3,1,WAS,0,2
4,1,NYL,4,3


In [459]:
teams_df.head()

Unnamed: 0,year,tmID,franchID,confID,rank,playoff,firstRound,semis,finals,name,...,GP,homeW,homeL,awayW,awayL,confW,confL,min,attend,arena
0,9,ATL,ATL,EA,7,N,,,,Atlanta Dream,...,34,1,16,3,14,2,18,6825,141379,Philips Arena
1,10,ATL,ATL,EA,2,Y,L,,,Atlanta Dream,...,34,12,5,6,11,10,12,6950,120737,Philips Arena
2,1,CHA,CHA,EA,8,N,,,,Charlotte Sting,...,32,5,11,3,13,5,16,6475,90963,Charlotte Coliseum
3,2,CHA,CHA,EA,4,Y,W,W,L,Charlotte Sting,...,32,11,5,7,9,15,6,6500,105525,Charlotte Coliseum
4,3,CHA,CHA,EA,2,Y,L,,,Charlotte Sting,...,32,11,5,7,9,12,9,6450,106670,Charlotte Coliseum


In [460]:
players_teams_df.head()

Unnamed: 0,playerID,year,stint,tmID,GP,GS,minutes,points,oRebounds,dRebounds,...,PostBlocks,PostTurnovers,PostPF,PostfgAttempted,PostfgMade,PostftAttempted,PostftMade,PostthreeAttempted,PostthreeMade,PostDQ
0,abrossv01w,2,0,MIN,26,23,846,343,43,131,...,0,0,0,0,0,0,0,0,0,0
1,abrossv01w,3,0,MIN,27,27,805,314,45,101,...,0,0,0,0,0,0,0,0,0,0
2,abrossv01w,4,0,MIN,30,25,792,318,44,97,...,1,8,8,22,6,8,8,7,3,0
3,abrossv01w,5,0,MIN,22,11,462,146,17,57,...,2,3,7,23,8,4,2,8,2,0
4,abrossv01w,6,0,MIN,31,31,777,304,29,78,...,0,0,0,0,0,0,0,0,0,0


In [461]:
# Merge players, teams, and awards data
players_teams_merged = pd.merge(players_df, players_teams_df, left_on='bioID', right_on='playerID')
players_teams_awards = pd.merge(players_teams_merged, awards_players_df, on=['year', 'playerID'], how='left')

# Merge teams and post-season data
teams_plus_post = pd.merge(teams_df, teams_post_df, on=['year', 'tmID'], how='left')

# Merge teams, coaches and rename columns
teams_coaches = pd.merge(teams_plus_post, coaches_df, on=['year', 'tmID'], how='left')
teams_coaches.rename(columns={
    'won_x': 'won_team',
    'lost_x': 'lost_team',
    'won_y': 'won_coach',
    'lost_y': 'lost_coach'
}, inplace=True)

# Merge final data
final = pd.merge(teams_coaches, players_teams_awards, on=['year', 'tmID'], how='left')


In [462]:
# Drop unnecessary columns from the final dataframe
final = final.drop(columns=[
    'name',
    'o_fgm', 'o_fga', 'o_ftm', 'o_fta', 'o_3pm', 'o_3pa', 'o_oreb', 'o_dreb', 'o_reb',
    'o_asts', 'o_pf', 'o_stl', 'o_to', 'o_blk', 'o_pts',
    'd_fgm', 'd_fga', 'd_ftm', 'd_fta', 'd_3pm', 'd_3pa', 'd_oreb', 'd_dreb', 'd_reb',
    'd_asts', 'd_pf', 'd_stl', 'd_to', 'd_blk', 'd_pts', 'min', 'attend', 'arena',
    'height', 'weight', 'college', 'collegeOther', 'birthDate', 'playerID', 'pos'
])

# Save the intermediate final dataframe
final.to_csv('final.csv', index=False)

In [463]:
# Define award scores
award_scores = {
    'All-Star Game Most Valuable Player': 7,
    'Coach of the Year': 10,
    'Defensive Player of the Year': 7,
    'Kim Perrot Sportsmanship Award': 0,
    'Most Improved Player': 5,
    'Most Valuable Player': 10,
    'Rookie of the Year': 5,
    'Sixth Woman of the Year': 6,
    'WNBA Finals Most Valuable Player': 8,
    'WNBA All-Decade Team': 6,
    'WNBA All Decade Team Honorable Mention': 4
}

# Map the award scores to the dataframe
final['award_score'] = final['award'].map(award_scores).fillna(0)


In [464]:
# List of columns to group by (excluding 'award' and 'award_score')
columns_to_group_by = ['bioID', 'year', 'stint_y']

# Define aggregation functions for each column
aggregation_functions = { 'award_score': 'sum', 'tmID': 'first',  'franchID': 'first', 'confID': 'first', 'rank': 'first', 'playoff': 'first', 'firstRound': 'first', 'semis': 'first', 'finals': 'first', 'won_team': 'first', 'lost_team': 'first', 'GP_x': 'first', 'homeW': 'first', 'homeL': 'first', 'awayW': 'first', 'awayL': 'first', 'confW': 'first', 'confL': 'first', 'W': 'first', 'L': 'first', 'coachID': 'first', 'stint_x': 'first', 'won_coach': 'first', 'lost_coach': 'first', 'post_wins': 'first', 'post_losses': 'first', 'GP_y': 'first', 'GS': 'first', 'minutes': 'first', 'points': 'first', 'oRebounds': 'first', 'dRebounds': 'first', 'rebounds': 'first', 'assists': 'first', 'steals': 'first', 'blocks': 'first', 'turnovers': 'first', 'PF': 'first', 'fgAttempted': 'first', 'fgMade': 'first', 'ftAttempted': 'first', 'ftMade': 'first', 'threeAttempted': 'first', 'threeMade': 'first', 'dq': 'first', 'PostGP': 'first', 'PostGS': 'first', 'PostMinutes': 'first', 'PostPoints': 'first', 'PostoRebounds': 'first', 'PostdRebounds': 'first', 'PostRebounds': 'first', 'PostAssists': 'first', 'PostSteals': 'first', 'PostBlocks': 'first', 'PostTurnovers': 'first', 'PostPF': 'first', 'PostfgAttempted': 'first', 'PostfgMade': 'first', 'PostftAttempted': 'first', 'PostftMade': 'first', 'PostthreeAttempted': 'first', 'PostthreeMade': 'first', 'PostDQ': 'first'}

# Aggregate by summing the award scores
aggregated = final.groupby(columns_to_group_by, as_index=False).agg(aggregation_functions)

# Rename 'award_score' to 'total_award_score'
aggregated.rename(columns={'award_score': 'total_award_score'}, inplace=True)

# Save the cleaned and aggregated data
aggregated.to_csv('final_aggregated.csv', index=False)

In [465]:
# Define coefficients for player statistics
coefficients = {
    'minutes': 0.2,
    'points': 0.45,
    'oRebounds': 0.1,
    'dRebounds': 0.1,
    'rebounds': 0.15,
    'assists': 0.25,
    'steals': 0.2,
    'blocks': 0.35,
    'turnovers': -0.3,  # Negative coefficient for turnovers
    'PF': 0.1,
    'fgAttempted': 0.05,
    'fgMade': 0.1,
    'ftAttempted': 0.1,
    'ftMade': 0.2,
    'threeAttempted': 0.15,
    'threeMade': 0.25,
    'dq': -0.4  # Negative coefficient for dq
}

# List of columns to be used in the weighted sum calculation
columns_to_use = list(coefficients.keys())

# Function to calculate the weighted sum based on coefficients
def calculate_weighted_sum(row):
    total = 0
    for col in columns_to_use:
        total += row[col] * coefficients[col]
    return total

# Apply the function to calculate the weighted sum and store it in a new column
aggregated['weighted_score'] = aggregated.apply(calculate_weighted_sum, axis=1)


In [466]:
# Define coefficients for post-season statistics
post_coefficients = {
    'PostMinutes': 0.2,
    'PostPoints': 0.45,
    'PostoRebounds': 0.1,
    'PostdRebounds': 0.1,
    'PostRebounds': 0.15,
    'PostAssists': 0.25,
    'PostSteals': 0.2,
    'PostBlocks': 0.35,
    'PostTurnovers': -0.3,  # Negative coefficient for turnovers
    'PostPF': 0.1,
    'PostfgAttempted': 0.05,
    'PostfgMade': 0.1,
    'PostftAttempted': 0.1,
    'PostftMade': 0.2,
    'PostthreeAttempted': 0.15,
    'PostthreeMade': 0.25,
    'PostDQ': -0.4  # Negative coefficient for dq
}

# List of 'Post' columns to be used in the weighted sum calculation
post_columns_to_use = list(post_coefficients.keys())

# Function to calculate the weighted sum based on 'Post' coefficients
def calculate_post_weighted_sum(row):
    total = 0
    for col in post_columns_to_use:
        total += row[col] * post_coefficients[col]
    return total

# Apply the function to calculate the post-season weighted sum and store it in a new column
aggregated['post_weighted_score'] = aggregated.apply(calculate_post_weighted_sum, axis=1)


In [467]:
# Remove the individual columns used in the calculation
aggregated.drop(columns=columns_to_use, inplace=True)

# Remove the individual 'Post' columns used in the calculation
aggregated.drop(columns=post_columns_to_use, inplace=True)

aggregated['playoff'] = aggregated['playoff'].map({'Y': 1, 'N': 0})
aggregated = aggregated.sort_values(['tmID', 'year'])

# Save the updated dataframe to CSV
aggregated.to_csv('aggregated_with_weighted_score.csv', index=False)


In [468]:
# Generate 'playoffNextYear' label by shifting the 'playoff' column

aggregated['playoffNextYear'] = aggregated.groupby('tmID')['playoff'].shift(-1)
aggregated.loc[aggregated['tmID'] != aggregated['tmID'].shift(-1), 'playoffNextYear'] = None
aggregated['playoffNextYear'] = aggregated['playoffNextYear'].fillna(0)

# Save the updated dataframe to CSV
aggregated.to_csv('aggregated_with_playoffNextYear.csv', index=False)

# Example: Aggregated features based on previous 3 years
aggregated['avg_total_award_score_last3'] = (
    aggregated.groupby('tmID')['total_award_score']
    .apply(lambda x: x.shift(1).rolling(window=3).mean())
)

aggregated['sum_weighted_score_last3'] = (
    aggregated.groupby('tmID')['weighted_score']
    .apply(lambda x: x.shift(1).rolling(window=3).sum())
)

# Fill NaN values appropriately for the first few years
aggregated.fillna(0, inplace=True)

# Define new feature set
features = ['avg_total_award_score_last3', 'sum_weighted_score_last3']

# Define train-test split (using year as a threshold)
train_data = aggregated[aggregated['year'] <= 5]  # Adjust year threshold as needed
test_data = aggregated[aggregated['year'].isin([6, 7])]

# Separate features and labels for training and testing
x_train, y_train = train_data[features], train_data[label]
x_test, y_test = test_data[features], test_data[label]

# Train the model
model = DecisionTreeClassifier()
model.fit(x_train, y_train)

# Make predictions
# Make sure to create a copy of test_data to avoid the SettingWithCopyWarning
test_data = test_data.copy()

# Now you can safely modify test_data without the warning
test_data.loc[:, 'proba_class_1'] = model.predict_proba(x_test)[:, 1]
test_data.loc[:, 'pred'] = model.predict(x_test)


# Evaluate model accuracy
print("Accuracy:", accuracy_score(y_test, test_data['pred']))
print("Classification Report:\n", classification_report(y_test, test_data['pred']))

# Display the first few rows of test_data to see probabilities and predictions
print(test_data[['proba_class_1', 'pred']])

TypeError: incompatible index of inserted column with frame index