In [70]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np

# Read every raw CSV and, in the same step, drop the columns this notebook treats as unneeded
awards_players_df = pd.read_csv('../dataset/awards_players.csv').drop(columns=['lgID'])
coaches_df = pd.read_csv('../dataset/coaches.csv').drop(columns=['lgID'])
players_df = pd.read_csv('../dataset/players.csv').drop(
    columns=['firstseason', 'lastseason', 'deathDate'])
players_teams_df = pd.read_csv('../dataset/players_teams.csv').drop(columns=['lgID'])
series_post_df = pd.read_csv('../dataset/series_post.csv').drop(
    columns=['lgIDLoser', 'lgIDWinner'])
teams_post_df = pd.read_csv('../dataset/teams_post.csv').drop(columns=['lgID'])

# The teams table loses the league/division identifiers, the seeding flag and the team rebound totals
teams_df = pd.read_csv('../dataset/teams.csv').drop(
    columns=['lgID', 'divID', 'seeded', 'tmORB', 'tmDRB', 'tmTRB', 'opptmORB', 'opptmDRB', 'opptmTRB'])

In [71]:
# Attach each player's biography to their per-team season rows (inner join on the player id)
players_teams_merged = players_df.merge(players_teams_df, left_on='bioID', right_on='playerID')

# Left-join the awards so player-seasons without any award are kept (their 'award' is NaN)
players_teams_awards = players_teams_merged.merge(
    awards_players_df, on=['year', 'playerID'], how='left')

# Drop the biography fields, the duplicate player id, and the regular/post-season
# games-played and games-started counters
players_teams_awards = players_teams_awards.drop(columns=[
    'pos', 'height', 'weight', 'college', 'collegeOther', 'birthDate',
    'playerID', 'GP', 'GS', 'PostGP', 'PostGS',
])


In [72]:
# Hand-picked score assigned to each award name
award_scores = {
    'All-Star Game Most Valuable Player': 7,
    'Coach of the Year': 10,
    'Defensive Player of the Year': 7,
    'Kim Perrot Sportsmanship Award': 0,
    'Most Improved Player': 5,
    'Most Valuable Player': 10,
    'Rookie of the Year': 5,
    'Sixth Woman of the Year': 6,
    'WNBA Finals Most Valuable Player': 8,
    'WNBA All-Decade Team': 6,
    'WNBA All Decade Team Honorable Mention': 4
}

# Translate award names into scores; a missing award or a name absent from the table scores 0.
# The raw 'award' column is no longer needed once the score exists.
award_score_values = players_teams_awards['award'].map(award_scores).fillna(0)
players_teams_awards = (
    players_teams_awards
    .assign(award_score=award_score_values)
    .drop(columns=['award'])
)

# Keys identifying one row of the aggregated frame (one player, one year, one stint)
columns_to_group_by = ['bioID', 'year', 'stint']

# Every remaining column except the score is carried through with its first value per group
passthrough_columns = [
    name for name in players_teams_awards.columns
    if name not in columns_to_group_by and name != 'award_score'
]
aggregations = {'award_score': 'sum'}  # several awards in one group add up
aggregations.update({name: 'first' for name in passthrough_columns})

players_teams_awards = (
    players_teams_awards
    .groupby(columns_to_group_by)
    .agg(aggregations)
    .reset_index()
)

In [73]:
# Weight applied to each regular-season statistic when scoring a player row.
# Turnovers and disqualifications are the only statistics that count against a player.
coefficients = {
    'minutes': 0.2,
    'points': 0.45,
    'oRebounds': 0.1,
    'dRebounds': 0.1,
    'rebounds': 0.15,
    'assists': 0.25,
    'steals': 0.2,
    'blocks': 0.35,
    'turnovers': -0.3,  # penalty
    'PF': 0.1,
    'fgAttempted': 0.05,
    'fgMade': 0.1,
    'ftAttempted': 0.1,
    'ftMade': 0.2,
    'threeAttempted': 0.15,
    'threeMade': 0.25,
    'dq': -0.4  # penalty
}

# Statistic names in the order they are accumulated (same order as the dict above)
columns_to_use = list(coefficients)


def calculate_weighted_sum(row):
    """Return the coefficient-weighted sum of the regular-season statistics in ``row``.

    Args:
        row: Any mapping-like object (e.g. a DataFrame row) that can be indexed
            by every name in ``columns_to_use``.

    Returns:
        The sum over ``columns_to_use`` of ``row[name] * coefficients[name]``.
    """
    weighted_total = 0
    for stat_name in columns_to_use:
        weight = coefficients[stat_name]
        weighted_total = weighted_total + row[stat_name] * weight
    return weighted_total


# Score every row with the regular-season weights and keep the result as 'weighted_score'
players_teams_awards['weighted_score'] = players_teams_awards.apply(
    calculate_weighted_sum,
    axis='columns',  # call the function once per row
)

# Weight applied to each post-season statistic; the values mirror the regular-season weights.
# Turnovers and disqualifications are the only statistics that count against a player.
post_coefficients = {
    'PostMinutes': 0.2,
    'PostPoints': 0.45,
    'PostoRebounds': 0.1,
    'PostdRebounds': 0.1,
    'PostRebounds': 0.15,
    'PostAssists': 0.25,
    'PostSteals': 0.2,
    'PostBlocks': 0.35,
    'PostTurnovers': -0.3,  # penalty
    'PostPF': 0.1,
    'PostfgAttempted': 0.05,
    'PostfgMade': 0.1,
    'PostftAttempted': 0.1,
    'PostftMade': 0.2,
    'PostthreeAttempted': 0.15,
    'PostthreeMade': 0.25,
    'PostDQ': -0.4  # penalty
}

# 'Post' statistic names in the order they are accumulated (same order as the dict above)
post_columns_to_use = list(post_coefficients)


def calculate_post_weighted_sum(row):
    """Return the coefficient-weighted sum of the post-season statistics in ``row``.

    Args:
        row: Any mapping-like object (e.g. a DataFrame row) that can be indexed
            by every name in ``post_columns_to_use``.

    Returns:
        The sum over ``post_columns_to_use`` of ``row[name] * post_coefficients[name]``.
    """
    weighted_total = 0
    for stat_name in post_columns_to_use:
        weight = post_coefficients[stat_name]
        weighted_total = weighted_total + row[stat_name] * weight
    return weighted_total


# Score every row with the post-season weights and keep the result as 'post_weighted_score'
players_teams_awards['post_weighted_score'] = players_teams_awards.apply(
    calculate_post_weighted_sum,
    axis='columns',  # call the function once per row
)

# Both scores now exist, so the raw regular-season and 'Post' statistic columns
# that fed them are removed from the frame in a single in-place drop
players_teams_awards.drop(columns=columns_to_use + post_columns_to_use, inplace=True)

In [74]:
# Reduce the teams table to its identifiers plus the playoff flag,
# re-encoding the flag from 'Y'/'N' to 1/0
clean_teams = teams_df[['year', 'tmID', 'franchID', 'playoff']].assign(
    playoff=lambda frame: frame['playoff'].map({'Y': 1, 'N': 0})
)

In [75]:
# tmID is the key used to attach player rows to a team-season; franchID is carried along
# because the rolling features computed later are grouped by franchise.

# Left-join from the team side so every team-season in clean_teams is kept
merged_df = clean_teams.merge(players_teams_awards, on=['year', 'tmID'], how='left')

# One row per (year, tmID): team-level fields are constant within the group, so the first
# value is kept; the three player scores are added up across the roster
team_aggregations = {
    'franchID': 'first',
    'playoff': 'first',
    'award_score': 'sum',
    'weighted_score': 'sum',
    'post_weighted_score': 'sum',
}
teams_with_scores = (
    merged_df
    .groupby(['year', 'tmID'], as_index=False)
    .agg(team_aggregations)
)

In [76]:
# Add a sliding window for past 3 years' aggregated statistics
def calculate_rolling_features(df, columns, window=3):
    """
    Add, per franchise, the rolling sum of each column over the preceding rows.

    The frame is sorted by ``franchID`` and ``year``; for every requested column a
    new column ``<col>_rolling_<window>`` holds the sum of up to ``window`` rows
    that come *before* the current row within the same franchise. The current row
    is excluded, so a franchise's first row has no history and gets NaN.

    The window counts rows, not calendar years: if a franchise has a missing
    season, the window simply reaches further back in time.

    Args:
        df (pd.DataFrame): Input dataframe; must contain 'franchID', 'year' and
            every name in ``columns``. It is not modified.
        columns (list of str): List of column names to calculate rolling features for.
        window (int): Rolling window size. Default is 3.

    Returns:
        pd.DataFrame: Sorted copy of ``df`` with the rolling feature columns added.
    """
    rolling_df = df.sort_values(['franchID', 'year']).copy()

    def _sum_of_previous_rows(values):
        # shift(1) removes the current row from its own window (no target leakage);
        # min_periods=1 yields a partial sum while fewer than `window` rows exist.
        return values.shift(1).rolling(window=window, min_periods=1).sum()

    for col in columns:
        # transform() returns a result indexed exactly like rolling_df, so the
        # assignment aligns row-for-row regardless of how groupby.apply would
        # have shaped the index in the installed pandas version.
        rolling_df[f'{col}_rolling_{window}'] = (
            rolling_df.groupby('franchID')[col].transform(_sum_of_previous_rows)
        )
    return rolling_df

# Team-level scores whose history is summarised by the rolling window
rolling_columns = [
    'award_score',
    'weighted_score',
    'post_weighted_score',
]

# Append one '<column>_rolling_3' feature per score (default window of 3)
teams_with_rolling_features = calculate_rolling_features(
    teams_with_scores,
    rolling_columns,
)

In [77]:
# Years used for fitting and for the held-out evaluation
TRAIN_FIRST_YEAR = 4
TRAIN_LAST_YEAR = 8
TEST_YEAR = 9

# Filter data for training and testing (both bounds of the training range are inclusive)
train_data = teams_with_rolling_features[
    teams_with_rolling_features['year'].between(TRAIN_FIRST_YEAR, TRAIN_LAST_YEAR)
]
test_data = teams_with_rolling_features[teams_with_rolling_features['year'] == TEST_YEAR]

# Define features and target variable
rolling_features = [f'{col}_rolling_3' for col in rolling_columns]

# Fill missing rolling features (a franchise with no earlier rows has no history)
# BEFORE scaling. StandardScaler ignores NaN while fitting and passes it through on
# transform, so filling afterwards would leave NaN inside the scaled arrays and make
# them inconsistent with X_train / X_test.
X_train = train_data[rolling_features].fillna(0)
y_train = train_data['playoff']

X_test = test_data[rolling_features].fillna(0)
y_test = test_data['playoff']

# Standardize the features; the scaler is fitted on the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [78]:
# Number of teams per year that the top-N probability cutoff labels as making the playoffs
PLAYOFF_SPOTS = 8

In [79]:
# Fit a gradient-boosting model on the (unscaled) rolling features; fit() returns the estimator
clf = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Keep the probability of class 1 (making playoffs) rather than the hard class labels
y_proba = clf.predict_proba(X_test)[:, 1]

# Assemble one row per test team: identifiers, the features the model saw,
# the predicted probability and the true outcome; then rank teams within each
# year from most to least likely to qualify
predictions = (
    test_data[['year', 'tmID', 'franchID']]
    .copy()
    .assign(
        **{feature: X_test[feature] for feature in rolling_features},
        proba=y_proba,
        true_label=y_test.values,
    )
    .sort_values(by=['year', 'proba'], ascending=[True, False])
)

# Apply playoff cutoff per year.
# `predictions` is already ordered by year and descending probability, so cumcount()
# gives each team its 0-based rank within its year; the first PLAYOFF_SPOTS ranks are
# labelled 1 and everyone else 0. Doing this in one vectorised step avoids mutating the
# frames yielded by groupby iteration (a SettingWithCopyWarning hazard) and keeps
# `final_predictions` a DataFrame throughout instead of a list that later becomes one.
rank_within_year = predictions.groupby('year').cumcount()
final_predictions = predictions.assign(
    playoff_pred=(rank_within_year < PLAYOFF_SPOTS).astype('int64')
)

# Ensure output is ordered by year and franchise ID
final_predictions = final_predictions.sort_values(by=['year', 'franchID'])

# Collapse the per-team predictions into one list of 0/1 flags per year
# (teams appear in the franchise-ID order established above)
playoff_output = final_predictions.groupby('year')['playoff_pred'].apply(list)

# Show the flags for year 9, the evaluation year
print(playoff_output[9])

# Predicted and true labels, row-aligned
y_pred_final = final_predictions['playoff_pred']
y_true_final = final_predictions['true_label']

# Report the headline metrics, then the full per-class breakdown
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("Accuracy:", accuracy_score),
):
    print(metric_label, metric_fn(y_true_final, y_pred_final))
print(classification_report(y_true_final, y_pred_final))

[0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0]
Precision: 0.75
Recall: 0.75
Accuracy: 0.7142857142857143
              precision    recall  f1-score   support

           0       0.67      0.67      0.67         6
           1       0.75      0.75      0.75         8

    accuracy                           0.71        14
   macro avg       0.71      0.71      0.71        14
weighted avg       0.71      0.71      0.71        14

