In [32]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

# Load the datasets
awards_players_df = pd.read_csv('../dataset/awards_players.csv')
coaches_df = pd.read_csv('../dataset/coaches.csv')
players_df = pd.read_csv('../dataset/players.csv')
players_teams_df = pd.read_csv('../dataset/players_teams.csv')
series_post_df = pd.read_csv('../dataset/series_post.csv')
teams_df = pd.read_csv('../dataset/teams.csv')
teams_post_df = pd.read_csv('../dataset/teams_post.csv')

# Remove useless columns from the datasets
awards_players_df = awards_players_df.drop(columns=['lgID'])
players_df = players_df.drop(columns=['firstseason', 'lastseason', 'deathDate'])
coaches_df = coaches_df.drop(columns=['lgID'])
series_post_df = series_post_df.drop(columns=['lgIDLoser', 'lgIDWinner'])
teams_post_df = teams_post_df.drop(columns=['lgID'])
teams_df = teams_df.drop(
    columns=['lgID', 'divID', 'seeded', 'tmORB', 'tmDRB', 'tmTRB', 'opptmORB', 'opptmDRB', 'opptmTRB'])
players_teams_df = players_teams_df.drop(columns=['lgID'])

In [33]:
# Merge players, teams, and awards data
players_teams_merged = pd.merge(players_df, players_teams_df, left_on='bioID', right_on='playerID')
players_teams_awards = pd.merge(players_teams_merged, awards_players_df, on=['year', 'playerID'], how='left')

#remove pos, height, weight, college, collegeOther, birthDate, playerID, GP and GS
players_teams_awards = players_teams_awards.drop(
    columns=['pos', 'height', 'weight', 'college', 'collegeOther', 'birthDate', 'playerID', 'GP', 'GS', 'PostGP', 'PostGS'])


In [34]:
# Define award scores
award_scores = {
    'All-Star Game Most Valuable Player': 7,
    'Coach of the Year': 10,
    'Defensive Player of the Year': 7,
    'Kim Perrot Sportsmanship Award': 0,
    'Most Improved Player': 5,
    'Most Valuable Player': 10,
    'Rookie of the Year': 5,
    'Sixth Woman of the Year': 6,
    'WNBA Finals Most Valuable Player': 8,
    'WNBA All-Decade Team': 6,
    'WNBA All Decade Team Honorable Mention': 4
}

# Map the award scores to the dataframe
players_teams_awards['award_score'] = players_teams_awards['award'].map(award_scores).fillna(0)

# drop award column
players_teams_awards = players_teams_awards.drop(columns=['award'])

# List of columns to group by (excluding 'award_score')
columns_to_group_by = ['bioID', 'year', 'stint']

# Group by the columns and aggregate
players_teams_awards = players_teams_awards.groupby(columns_to_group_by).agg({
    'award_score': 'sum',  # Sum award scores
    **{col: 'first' for col in players_teams_awards.columns if col not in columns_to_group_by + ['award_score']}
}).reset_index()

In [35]:
# Define coefficients for player statistics
coefficients = {
    'minutes': 0.2,
    'points': 0.45,
    'oRebounds': 0.1,
    'dRebounds': 0.1,
    'rebounds': 0.15,
    'assists': 0.25,
    'steals': 0.2,
    'blocks': 0.35,
    'turnovers': -0.3,  # Negative coefficient for turnovers
    'PF': 0.1,
    'fgAttempted': 0.05,
    'fgMade': 0.1,
    'ftAttempted': 0.1,
    'ftMade': 0.2,
    'threeAttempted': 0.15,
    'threeMade': 0.25,
    'dq': -0.4  # Negative coefficient for dq
}

# List of columns to be used in the weighted sum calculation
columns_to_use = list(coefficients.keys())


# Function to calculate the weighted sum based on coefficients
def calculate_weighted_sum(row):
    total = 0
    for col in columns_to_use:
        total += row[col] * coefficients[col]
    return total


# Apply the function to calculate the weighted sum and store it in a new column
players_teams_awards['weighted_score'] = players_teams_awards.apply(calculate_weighted_sum, axis=1)

# Define coefficients for post-season statistics
post_coefficients = {
    'PostMinutes': 0.2,
    'PostPoints': 0.45,
    'PostoRebounds': 0.1,
    'PostdRebounds': 0.1,
    'PostRebounds': 0.15,
    'PostAssists': 0.25,
    'PostSteals': 0.2,
    'PostBlocks': 0.35,
    'PostTurnovers': -0.3,  # Negative coefficient for turnovers
    'PostPF': 0.1,
    'PostfgAttempted': 0.05,
    'PostfgMade': 0.1,
    'PostftAttempted': 0.1,
    'PostftMade': 0.2,
    'PostthreeAttempted': 0.15,
    'PostthreeMade': 0.25,
    'PostDQ': -0.4  # Negative coefficient for dq
}

# List of 'Post' columns to be used in the weighted sum calculation
post_columns_to_use = list(post_coefficients.keys())


# Function to calculate the weighted sum based on 'Post' coefficients
def calculate_post_weighted_sum(row):
    total = 0
    for col in post_columns_to_use:
        total += row[col] * post_coefficients[col]
    return total


# Apply the function to calculate the post-season weighted sum and store it in a new column
players_teams_awards['post_weighted_score'] = players_teams_awards.apply(calculate_post_weighted_sum, axis=1)

# Remove the individual columns used in the calculation
players_teams_awards.drop(columns=columns_to_use, inplace=True)

# Remove the individual 'Post' columns used in the calculation
players_teams_awards.drop(columns=post_columns_to_use, inplace=True)

# add the franchID column to the players_teams_awards (check the tmID and year and add it)
players_teams_awards = pd.merge(players_teams_awards, teams_df[['year', 'tmID', 'franchID']], on=['year', 'tmID'], how='left')

In [36]:
# create a clean_teams df that only has year, tmID, franchID, and playoff, and map Y to 1 and N to 0 in playoff
clean_teams = teams_df[['year', 'tmID', 'confID', 'playoff']].copy()
clean_teams['playoff'] = clean_teams['playoff'].map({'Y': 1, 'N': 0})

In [37]:
# look at tmID to calculate the columns of the team statistics when merging with players_teams_awards, but look at the franchID to compare the rolling average of the team statistics

def calculate_rolling_features(df, columns, window=3):
    """
    Calculates rolling average features for the given columns in the dataframe,
    handling duplicates and considering 'stint' as part of the aggregation.

    Args:
        df (pd.DataFrame): Input dataframe.
        columns (list of str): List of column names to calculate rolling features for.
        window (int): Rolling window size. Default is 3.

    Returns:
        pd.DataFrame: Dataframe with added rolling features, including tmID and franchID.
    """
    # Keep tmID and franchID for merging later
    id_columns = ['bioID', 'year', 'tmID', 'franchID']

    # Aggregate duplicate rows for the same bioID, year pair
    aggregated_df = (
        df.groupby(['bioID', 'year'])[columns]
        .sum()  # Sum scores across stints
        .reset_index()
    )

    # Add tmID and franchID back after aggregating
    unique_id_data = df[id_columns].drop_duplicates(subset=['bioID', 'year'])
    aggregated_df = aggregated_df.merge(unique_id_data, on=['bioID', 'year'], how='left')

    # Sort by bioID and year
    aggregated_df = aggregated_df.sort_values(['bioID', 'year']).copy()

    for col in columns:
        rolling_col_name = f'{col}_rolling_{window}'

        def rolling_avg(group):
            values = group.shift(1)  # Exclude current season by shifting
            filtered = values.replace(0, np.nan)  # Replace zeros with NaN
            return (
                filtered.rolling(window=window, min_periods=1)
                .mean()  # Calculate rolling mean, ignoring NaN
            )

        aggregated_df[rolling_col_name] = (
            aggregated_df.groupby('bioID')[col]
            .apply(rolling_avg)
            .reset_index(level=0, drop=True)  # Align index with aggregated_df
        )

    return aggregated_df

# Columns to calculate rolling features for
rolling_columns = ['award_score', 'weighted_score', 'post_weighted_score']

player_rolling_features = calculate_rolling_features(
    players_teams_awards,
    columns=rolling_columns
)

# replace NaN values with 0
player_rolling_features = player_rolling_features.fillna(0)

# Merge with clean_teams on year and tmID
merged_df = pd.merge(clean_teams, player_rolling_features, on=['year', 'tmID'], how='left')

# Aggregate team-level statistics by year and tmID
teams_with_rolling_aggregated = merged_df.groupby(['year', 'tmID'], as_index=False).agg({
    'franchID': 'first',  # Keep the first franchID
    'playoff': 'first',   # Keep the first playoff value
    'confID': 'first',
    'award_score_rolling_3': 'sum',
    'weighted_score_rolling_3': 'sum',
    'post_weighted_score_rolling_3': 'sum'
})


In [38]:
# Define the rolling columns for features
rolling_columns_aggregated = [
    'award_score_rolling_3',
    'weighted_score_rolling_3',
    'post_weighted_score_rolling_3'
]

# Filter data for training and testing
train_data = teams_with_rolling_aggregated[
    (teams_with_rolling_aggregated['year'] >= 4) & (teams_with_rolling_aggregated['year'] <= 8)
    ]
test_data = teams_with_rolling_aggregated[teams_with_rolling_aggregated['year'] >= 9]

# Prepare features and labels
X_train = train_data[rolling_columns_aggregated].fillna(0)
y_train = train_data['playoff']

X_test = test_data[rolling_columns_aggregated].fillna(0)
y_test = test_data['playoff']

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Handle any NaN values (if necessary)
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

In [39]:
PLAYOFF_SPOTS = 8

In [40]:
# Train a Logistic Regression model
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)

# Predict probabilities for class 1 (making playoffs)
y_proba = clf.predict_proba(X_test)[:, 1]

# Create a copy of the test dataset including identifiers
predictions = test_data[['year', 'tmID', 'franchID', 'confID']].copy()  # Include 'year', 'tmID', and 'franchID'
predictions[rolling_columns_aggregated] = X_test  # Add the rolling features
predictions['proba'] = y_proba  # Predicted probabilities
predictions['true_label'] = y_test.values  # True labels

# Sort by year, confID, and probability, descending
predictions = predictions.sort_values(by=['year', 'confID', 'proba'], ascending=[True, True, False])

# Apply playoff cutoff per conference and year using probabilities
final_predictions = []
PLAYOFF_SPOTS_PER_CONF = 4

for (year, confID), group in predictions.groupby(['year', 'confID']):
    group['playoff_pred'] = 0  # Default to not making playoffs
    group.loc[group.head(PLAYOFF_SPOTS_PER_CONF).index, 'playoff_pred'] = 1  # Top 4 in each conference
    final_predictions.append(group)

# Combine results
final_predictions = pd.concat(final_predictions)

# Ensure output is ordered by year and franchise ID
final_predictions = final_predictions.sort_values(by=['year', 'franchID'])

# Generate probabilities for playoffs per team, ordered by year and franchise
playoff_probs = final_predictions[['year', 'tmID', 'franchID', 'proba', 'true_label']]

# Example output: Top probabilities for teams in year 9
example_year = 9
example_probs = playoff_probs[playoff_probs['year'] == example_year].sort_values(by='proba', ascending=False)
print(example_probs)

# Extract final predictions and true labels
y_pred_final = final_predictions['playoff_pred']
y_true_final = final_predictions['true_label']

# Evaluate performance metrics
print("Precision:", precision_score(y_true_final, y_pred_final))
print("Recall:", recall_score(y_true_final, y_pred_final))
print("Accuracy:", accuracy_score(y_true_final, y_pred_final))
print("F1 Score:", f1_score(y_true_final, y_pred_final))
print(classification_report(y_true_final, y_pred_final))

# Full probabilities for a specific year (e.g., year 9)
playoff_probs_year_9 = playoff_probs[playoff_probs['year'] == 9]
print(playoff_probs_year_9)

     year tmID franchID     proba  true_label
118     9  DET      DET  0.973711           1
127     9  SEA      SEA  0.962562           1
124     9  PHO      PHO  0.940952           0
121     9  LAS      LAS  0.870150           1
120     9  IND      IND  0.777344           1
125     9  SAC      SAC  0.735016           1
119     9  HOU      HOU  0.598628           0
126     9  SAS      SAS  0.508239           1
123     9  NYL      NYL  0.498080           1
117     9  CON      CON  0.445871           1
116     9  CHI      CHI  0.369860           0
115     9  ATL      ATL  0.362886           0
128     9  WAS      WAS  0.354687           0
122     9  MIN      MIN  0.328893           0
Precision: 0.8125
Recall: 0.8125
Accuracy: 0.7777777777777778
F1 Score: 0.8125
              precision    recall  f1-score   support

           0       0.73      0.73      0.73        11
           1       0.81      0.81      0.81        16

    accuracy                           0.78        27
   macro avg 

In [41]:
# Import the season 11 data
teams_df_11 = pd.read_csv('../dataset/Season_11/teams.csv')
players_df_11 = pd.read_csv('../dataset/Season_11/players_teams.csv')
coaches_df_11 = pd.read_csv('../dataset/Season_11/coaches.csv')

#drop columns
teams_df_11 = teams_df_11.drop(columns=['lgID', 'name', 'arena', 'franchID'])
coaches_df_11 = coaches_df_11.drop(columns=['lgID', 'stint'])
players_df_11 = players_df_11.drop(columns=['lgID', 'stint'])

# for each player, look for their bioID in player_rolling_features and add the weighted_score, award_score, and post_weighted_score for years 8, 9, and 10. If the player is not found, add 0 for all three columns, and if the player is found but the year is not found, add 0 for the missing year(s).

rolling_years = [8, 9, 10]

# Initialize new columns in players_df_11 with float64 data type
for year in rolling_years:
    players_df_11[f'weighted_score_y{year}'] = 0.0
    players_df_11[f'award_score_y{year}'] = 0.0
    players_df_11[f'post_weighted_score_y{year}'] = 0.0

# Iterate over each row in players_df_11
for index, row in players_df_11.iterrows():
    bioID = row['playerID']

    # Filter player_rolling_features for the current player
    player_data = player_rolling_features[player_rolling_features['bioID'] == bioID]

    # For each rolling year, fetch scores or assign 0 if not available
    for year in rolling_years:
        year_data = player_data[player_data['year'] == year]
        if not year_data.empty:
            players_df_11.at[index, f'weighted_score_y{year}'] = float(year_data['weighted_score'].iloc[0])
            players_df_11.at[index, f'award_score_y{year}'] = float(year_data['award_score'].iloc[0])
            players_df_11.at[index, f'post_weighted_score_y{year}'] = float(year_data['post_weighted_score'].iloc[0])

# Print the updated DataFrame for verification
print(players_df_11.head())

def calculate_player_rolling_features11(df):
    """
    Calculates the 3-year average of weighted_score, award_score, and post_weighted_score
    for each player across years 8, 9, and 10.

    Args:
        df (pd.DataFrame): Input DataFrame with columns:
            - playerID
            - year
            - tmID
            - weighted_score_y8, award_score_y8, post_weighted_score_y8
            - weighted_score_y9, award_score_y9, post_weighted_score_y9
            - weighted_score_y10, award_score_y10, post_weighted_score_y10

    Returns:
        pd.DataFrame: DataFrame with added rolling average columns:
            - weighted_score_rolling_3
            - award_score_rolling_3
            - post_weighted_score_rolling_3
    """
    # List of the years to include in the rolling calculation
    rolling_years = [8, 9, 10]

    # Initialize rolling columns
    df['weighted_score_rolling_3'] = 0.0
    df['award_score_rolling_3'] = 0.0
    df['post_weighted_score_rolling_3'] = 0.0

    # Calculate the rolling averages
    for index, row in df.iterrows():
        scores = {
            'weighted_score': [],
            'award_score': [],
            'post_weighted_score': []
        }

        # Collect scores for years 8, 9, and 10
        for year in rolling_years:
            for key in scores.keys():
                column_name = f'{key}_y{year}'
                if column_name in df.columns:
                    scores[key].append(row[column_name])

        # Calculate the averages ignoring zeros
        for key, values in scores.items():
            rolling_avg = (
                sum(value for value in values if value != 0) / len(values)
                if any(value != 0 for value in values)
                else 0
            )
            df.at[index, f'{key}_rolling_3'] = rolling_avg

    return df

# Calculate rolling features for players_df_11
players_df_11 = calculate_player_rolling_features11(players_df_11)

#drop old columns
players_df_11 = players_df_11.drop(columns=[
    'weighted_score_y8', 'award_score_y8', 'post_weighted_score_y8',
    'weighted_score_y9', 'award_score_y9', 'post_weighted_score_y9',
    'weighted_score_y10', 'award_score_y10', 'post_weighted_score_y10'
])

# Print the updated DataFrame for verification
print(players_df_11.head())

# Aggregate player scores by team
team_scores = players_df_11.groupby(['year', 'tmID']).agg({
    'weighted_score_rolling_3': 'sum',
    'award_score_rolling_3': 'sum',
    'post_weighted_score_rolling_3': 'sum'
}).reset_index()
##

     playerID  year tmID  weighted_score_y8  award_score_y8  \
0  adairje01w    11  MIN                0.0             0.0   
1  adamsda01w    11  SAS                0.0             0.0   
2  ajavoma01w    11  WAS                0.0             0.0   
3  anosini01w    11  WAS                0.0             0.0   
4  appelja01w    11  SAS                0.0             0.0   

   post_weighted_score_y8  weighted_score_y9  award_score_y9  \
0                     0.0               0.00             0.0   
1                     0.0               0.00             0.0   
2                     0.0             333.85             0.0   
3                     0.0             484.60             0.0   
4                     0.0               0.00             0.0   

   post_weighted_score_y9  weighted_score_y10  award_score_y10  \
0                     0.0                0.00              0.0   
1                     0.0                0.00              0.0   
2                     0.0             