# WNBA All-Decade Team Award

In [103]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

awards_data = pd.read_csv("../../initial_data/awards_players.csv")
teams_data = pd.read_csv("../../initial_data/teams.csv")
players_teams_clean = pd.read_csv("../awards_data/players_teams_clean.csv")
players_stats = pd.read_csv("../../initial_data/players_teams.csv")[['playerID', 'year', 'tmID', 'minutes', 'points', 'rebounds', 
                                                                     'assists', 'steals', 'blocks', 'turnovers', 'PF', 'fgAttempted', 
                                                                     'fgMade', 'ftAttempted', 'ftMade']]
players_info = pd.read_csv("../../initial_data/players_clean.csv")

## Create All-Decade Team Data

In [104]:
# Join dataset players_teams_clean data with teams_data (team_wins, team_losses, rank, playoff_made)
# Note: players_teams_clean has 'team' column, teams_data has 'tmID' column
alldec_data = players_stats.merge(
    teams_data[['year', 'tmID', 'won', 'lost', 'rank', 'playoff', 'confID']], 
    left_on=['year', 'tmID'], 
    right_on=['year', 'tmID'], 
    how='left'
)

# Rename columns for clarity
alldec_data.rename(columns={
    'won': 'team_wins', 
    'lost': 'team_losses', 
    'rank': 'conference_rank', 
    'playoff': 'playoff_made'
}, inplace=True)

# Drop the duplicate tmID column (we already have 'team')
alldec_data.drop(columns=['tmID'], inplace=True)

# Change Playoff from N / Y to 0 / 1
alldec_data['playoff_made'] = alldec_data['playoff_made'].map({'Y': 1, 'N': 0})
# Change ConfID to numeric  EA / WE to 0 / 1
alldec_data['confID'] = alldec_data['confID'].astype('category').cat.codes

print(f"All-Decade data shape: {alldec_data.shape}")
print(f"\nColumns: {alldec_data.columns.tolist()}")
print(f"\nSample data:")
print(alldec_data.head())

# Save csv (relative path from current location)
alldec_data.to_csv("alldec_data.csv", index=False)
print(f"\n✅ Saved to alldec_data.csv")


All-Decade data shape: (1876, 19)

Columns: ['playerID', 'year', 'minutes', 'points', 'rebounds', 'assists', 'steals', 'blocks', 'turnovers', 'PF', 'fgAttempted', 'fgMade', 'ftAttempted', 'ftMade', 'team_wins', 'team_losses', 'conference_rank', 'playoff_made', 'confID']

Sample data:
     playerID  year  minutes  points  rebounds  assists  steals  blocks  \
0  abrossv01w     2      846     343       174       53      42       9   
1  abrossv01w     3      805     314       146       60      42      10   
2  abrossv01w     4      792     318       141       82      44      11   
3  abrossv01w     5      462     146        74       45      30       2   
4  abrossv01w     6      777     304       107       60      48       6   

   turnovers  PF  fgAttempted  fgMade  ftAttempted  ftMade  team_wins  \
0         85  70          293     114          132      96         12   
1         92  73          316     119          116      56         10   
2         90  79          285     112        

## Function to Calculate Player Average Rating for Decade (year is chosen)

In [105]:
# Calculates Avg Rating for a player 10-year before the given year
def calculate_avg_rating(playerID, year):
    start_year = year - 10
    player_data = alldec_data[(alldec_data['playerID'] == playerID) & 
                              (alldec_data['year'] >= start_year) & 
                              (alldec_data['year'] < year)]
    if not player_data.empty:
        # Calculate EFF for each year
        player_data['EFF'] = (
            player_data['points'] + 
            player_data['rebounds'] + 
            player_data['assists'] + 
            player_data['steals'] + 
            player_data['blocks']
        ) - (
            (player_data['fgAttempted'] - player_data['fgMade']) + 
            (player_data['ftAttempted'] - player_data['ftMade']) + 
            player_data['turnovers']
        )
        return player_data['EFF'].mean()
    else:
        return np.nan
    
# Calculates other player stats over the last 10 years (For means!! Use for Confederence Rank)
def calculate_mean_stats(playerID, year, stat):
    start_year = year - 10
    player_data = alldec_data[(alldec_data['playerID'] == playerID) & 
                              (alldec_data['year'] >= start_year) & 
                              (alldec_data['year'] < year)]
    if not player_data.empty:
        return player_data[stat].mean()
    else:
        return np.nan
    
# Calculates sum of player stats over the last 10 years (For totals!! Use for Wins, Losses, Minutes Played, Playoff Made)
def calculate_sum_stats(playerID, year, stat):
    start_year = year - 10
    player_data = alldec_data[(alldec_data['playerID'] == playerID) & 
                              (alldec_data['year'] >= start_year) & 
                              (alldec_data['year'] < year)]
    if not player_data.empty:
        return player_data[stat].sum()
    else:
        return np.nan

## Function that Creates New CSV Data for Year Provided Automatically

In [106]:
# Function that Creates New CSV Data for Year Provided Automatically using calculate_avg_rating, calculate_mean_stats and calculate_sum_stats (doesnt include stats from year provided, only decade before)
def create_alldecade_data_for_year(target_year):
    start_year = target_year - 10
    # Get unique players who played in the decade before target_year
    past_players = alldec_data[(alldec_data['year'] >= start_year) & (alldec_data['year'] < target_year)]['playerID'].unique()
    
    records = []
    for playerID in past_players:
        record = {
            'playerID': playerID,
            'year': target_year,
            'avg_rating': calculate_avg_rating(playerID, target_year),
            'avg_conference_rank': calculate_mean_stats(playerID, target_year, 'conference_rank'),
            'team_wins': calculate_sum_stats(playerID, target_year, 'team_wins'),
            'team_losses': calculate_sum_stats(playerID, target_year, 'team_losses'),
            'minutes': calculate_sum_stats(playerID, target_year, 'minutes'),
            'playoffs_made': calculate_sum_stats(playerID, target_year, 'playoff_made')
        }
        records.append(record)
    
    new_data = pd.DataFrame(records)
    new_data.to_csv(f"yearly_data/alldecade_data_{target_year}.csv", index=False)
    print(f"\n✅ Saved to alldecade_data_{target_year}.csv")



## Define features to use in the models

In [107]:
# Define feature columns for the model
# Using year N-1 stats to predict year N
feature_columns = [
    'avg_rating', 'avg_conference_rank', 'team_wins', 'team_losses', 'minutes', 'playoffs_made'
]

print("Features for All-Decade Team prediction:")
for i, col in enumerate(feature_columns, 1):
    print(f"{i}. {col}")


Features for All-Decade Team prediction:
1. avg_rating
2. avg_conference_rank
3. team_wins
4. team_losses
5. minutes
6. playoffs_made


## Create Dataset for Models

In [108]:
# Choose year for training
train_year = 7

# Create dataset for chosen year (in this case year 7)
create_alldecade_data_for_year(train_year)

# Add feature column to indicate if player made 'WNBA All-Decade Team' or 'WNBA All Decade Team Honorable Mention' in the year for training
alldecade_awards = awards_data[awards_data['award'].isin(['WNBA All-Decade Team', 'WNBA All Decade Team Honorable Mention'])]

# Add that feature to the dataset
alldecade_data_final = pd.read_csv(f"yearly_data/alldecade_data_{train_year}.csv")
alldecade_data_final['all_decade_team'] = alldecade_data_final.apply(
    lambda row: 1 if ((alldecade_awards['playerID'] == row['playerID']) & (alldecade_awards['year'] == row['year'])).any() else 0,
    axis=1
)

# Make all_decade_final into csv
alldecade_data_final.to_csv(f"yearly_data/alldecade_data_final_{train_year}.csv", index=False)
print(f"\n✅ Saved to alldecade_data_final_{train_year}.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_data['EFF'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_data['EFF'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_data['EFF'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cave


✅ Saved to alldecade_data_7.csv

✅ Saved to alldecade_data_final_7.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_data['EFF'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_data['EFF'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_data['EFF'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cave

## Prediction Models

### 1. Logistic Regression

#### Training the model

In [109]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# Prepare data for modeling
X = alldecade_data_final[feature_columns]
y = alldecade_data_final['all_decade_team']
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Train Logistic Regression model
model = LogisticRegression()
model.fit(X_scaled, y)
# Make predictions
y_pred = model.predict(X_scaled)
# Evaluate model

print(f"\n✅ Model trained!")
print(f"Train accuracy: {model.score(X_scaled, y):.3f}")
print("\nClassification Report:")
print(classification_report(y, y_pred, target_names=['Not All-Decade', 'All-Decade']))
print("\nConfusion Matrix:")
print(confusion_matrix(y, y_pred))


✅ Model trained!
Train accuracy: 0.978

Classification Report:
                precision    recall  f1-score   support

Not All-Decade       0.98      1.00      0.99       397
    All-Decade       1.00      0.40      0.57        15

      accuracy                           0.98       412
     macro avg       0.99      0.70      0.78       412
  weighted avg       0.98      0.98      0.97       412


Confusion Matrix:
[[397   0]
 [  9   6]]


#### Feature Importance

In [110]:
# Feature Importance
print("\n🔍 Feature Importance:")
feature_importance = pd.DataFrame({
    'feature': feature_columns,
    'coefficient': model.coef_[0]
}).sort_values('coefficient', ascending=False)

print(feature_importance)

# Show top predictions
alldecade_data_final['all_decade_probability'] = model.predict_proba(X_scaled)[:, 1]
results = alldecade_data_final.sort_values('all_decade_probability', ascending=False)

print(f"\n🏆 Top 20 All-Decade Team Predictions:")
print(f"{'Rank':<6}{'Player':<25}{'Probability':<15}{'Actual'}")
print("-" * 70)
for idx, (_, row) in enumerate(results.head(20).iterrows(), 1):
    is_all_decade = "✅" if row['all_decade_team'] == 1 else ""
    print(f"{idx:<6}{row['playerID']:<25}{row['all_decade_probability']:.4f}{'':10}{is_all_decade}")

    


🔍 Feature Importance:
               feature  coefficient
0           avg_rating     1.427708
4              minutes     0.469829
5        playoffs_made     0.469295
1  avg_conference_rank     0.131514
3          team_losses    -0.094498
2            team_wins    -0.171178

🏆 Top 20 All-Decade Team Predictions:
Rank  Player                   Probability    Actual
----------------------------------------------------------------------
1     leslili01w               0.9059          ✅
2     catchta01w               0.8345          ✅
3     griffyo01w               0.7964          ✅
4     jacksla01w               0.7675          ✅
5     swoopsh01w               0.7213          ✅
6     holdsch01w               0.6050          ✅
7     willina01w               0.4472          
8     mcwilta01w               0.4303          
9     salesny01w               0.3778          
10    thompti01w               0.3525          ✅
11    birdsu01w                0.3114          ✅
12    miltode01w          

#### Predictions: Top 10 All-Decade Team + 11-15 Honorable Mention

In [111]:
# Predict All-Decade Team (top 10) and Honorable Mention (11-15)
print("\n🏆 WNBA All-Decade Team Predictions:\n")

# Top 10: All-Decade Team
print("=" * 70)
print("ALL-DECADE TEAM (Top 10)")
print("=" * 70)
top_10 = results.head(10)
for idx, (_, row) in enumerate(top_10.iterrows(), 1):
    is_all_decade = "✅ CORRECT" if row['all_decade_team'] == 1 else ""
    print(f"{idx:<3}. {row['playerID']:<25} Probability: {row['all_decade_probability']:.4f}  {is_all_decade}")

# 11-15: Honorable Mention
print("\n" + "=" * 70)
print("HONORABLE MENTION (Ranks 11-15)")
print("=" * 70)
honorable_mention = results.iloc[10:15]
for idx, (_, row) in enumerate(honorable_mention.iterrows(), 11):
    is_all_decade = "✅ CORRECT" if row['all_decade_team'] == 1 else ""
    print(f"{idx:<3}. {row['playerID']:<25} Probability: {row['all_decade_probability']:.4f}  {is_all_decade}")

# Summary
actual_winners = results[results['all_decade_team'] == 1]
top_10_winners = top_10[top_10['all_decade_team'] == 1]
honorable_winners = honorable_mention[honorable_mention['all_decade_team'] == 1]

print("\n" + "=" * 70)
print("SUMMARY")
print("=" * 70)
print(f"Total All-Decade winners in dataset: {len(actual_winners)}")
print(f"Correctly predicted in Top 10: {len(top_10_winners)}")
print(f"Correctly predicted in Honorable Mention (11-15): {len(honorable_winners)}")
print(f"Total correct in Top 15: {len(top_10_winners) + len(honorable_winners)}")
print(f"Coverage rate: {((len(top_10_winners) + len(honorable_winners)) / len(actual_winners) * 100):.1f}%")


🏆 WNBA All-Decade Team Predictions:

ALL-DECADE TEAM (Top 10)
1  . leslili01w                Probability: 0.9059  ✅ CORRECT
2  . catchta01w                Probability: 0.8345  ✅ CORRECT
3  . griffyo01w                Probability: 0.7964  ✅ CORRECT
4  . jacksla01w                Probability: 0.7675  ✅ CORRECT
5  . swoopsh01w                Probability: 0.7213  ✅ CORRECT
6  . holdsch01w                Probability: 0.6050  ✅ CORRECT
7  . willina01w                Probability: 0.4472  
8  . mcwilta01w                Probability: 0.4303  
9  . salesny01w                Probability: 0.3778  
10 . thompti01w                Probability: 0.3525  ✅ CORRECT

HONORABLE MENTION (Ranks 11-15)
11 . birdsu01w                 Probability: 0.3114  ✅ CORRECT
12 . miltode01w                Probability: 0.3104  
13 . dydekma01w                Probability: 0.3021  
14 . smithta01w                Probability: 0.2993  
15 . baranel01w                Probability: 0.2986  

SUMMARY
Total All-Decade winners in 