<a href="https://colab.research.google.com/github/BlackPhosphorus/determiningclutchNBA/blob/main/Determining_Clutch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Dataset source (Kaggle): https://www.kaggle.com/datasets/szymonjwiak/nba-play-by-play-data-1997-2023

In [None]:
import pandas as pd
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files
# Opens the Colab file-upload prompt. The result is used as a mapping keyed by
# uploaded file name (the next cell reads its first key as the CSV path).
uploaded = files.upload()

In [None]:
# Only the first uploaded file is used; its name doubles as the path for read_csv.
file_name = list(uploaded)[0]
data = pd.read_csv(file_name)
# Print a random five-row sample as a quick look at the loaded columns.
print(data.sample(n=5))

In [None]:
# Rewrite the raw clock strings into "minutes:seconds" text:
# drop the 'PT' prefix, turn the 'M' separator into ':' and drop the trailing 'S'.
data['clock'] = (
    data['clock']
    .str.replace('PT', '')
    .str.replace('M', ':')
    .str.replace('S', '')
)
def convert_time(time_str):
    """Parse a cleaned clock string into a ``datetime.time``.

    Formats are tried in order:

    1. ``MM:SS.fraction`` (``'%M:%S.%f'``)
    2. ``MM:SS`` (``'%M:%S'``)
    3. the input with ``':00'`` appended, parsed as ``MM:SS`` (covers a bare
       minutes value such as ``'7'``)

    Args:
        time_str: Clock text such as ``'11:42.00'``.

    Returns:
        The ``datetime.time`` taken from the parsed timestamp.

    Raises:
        ValueError: Propagated from the final attempt when no format matches.
    """
    for fmt in ('%M:%S.%f', '%M:%S'):
        try:
            return pd.to_datetime(time_str, format=fmt).time()
        except ValueError:
            # This layout did not match; fall through to the next one.
            continue
    # Last resort: treat the value as whole minutes with zero seconds.
    return pd.to_datetime(time_str + ':00', format='%M:%S').time()
# Convert each cleaned clock string individually so the whole column holds
# datetime.time values in one standard form.
data['clock'] = data['clock'].map(convert_time)

In [None]:
# Show the first five converted clock values as a spot check.
print(data[['clock']].head(n=5))

In [None]:
def is_clutch(row, period=4, max_clock=pd.to_datetime('05:00', format='%M:%S').time(), max_margin=5):
    """Return whether a play-by-play row falls inside the "clutch" window.

    A row qualifies when all three conditions hold:

    * ``row['period']`` equals ``period``,
    * ``row['clock']`` is at or below ``max_clock``
      (presumably time remaining in the period; confirm against the dataset),
    * the absolute difference between ``row['h_pts']`` and ``row['a_pts']``
      is at most ``max_margin``.

    The conditions are evaluated left to right and short-circuit, so the clock
    and score are only inspected for rows in the requested period.

    Args:
        row: Mapping-like row (e.g. a ``pandas.Series``) with the keys
            ``'period'``, ``'clock'``, ``'h_pts'`` and ``'a_pts'``.
        period: Period number that counts as clutch. Defaults to 4.
        max_clock: Largest clock reading (``datetime.time``) that still counts.
            Defaults to 05:00. It is parsed once when the function is defined
            rather than on every row.
        max_margin: Largest score gap that still counts. Defaults to 5.

    Returns:
        A truthy value when the row is a clutch row, a falsy value otherwise.
    """
    return (
        (row['period'] == period)
        and (row['clock'] <= max_clock)
        and (abs(row['h_pts'] - row['a_pts']) <= max_margin)
    )
# Sanity check on the element types stored in the clock column.
print(data['clock'].map(type).unique())  # theoretically this should return <class 'datetime.time'>
# Evaluate the clutch rule once per row and store the flag alongside the play.
data['IS_CLUTCH'] = data.apply(is_clutch, axis='columns')
# simple definition of "clutch", will be changed later

In [None]:
# Minimum sample sizes a player needs to be kept in the analysis.
min_clutch_events = 10
min_total_plays = 100


def _player_event_counts(plays, prefix):
    """Count rows per (playerid, player) as `<prefix>_events` and `<prefix>_plays`.

    `<prefix>_events` counts non-null `gameid` values and `<prefix>_plays`
    counts non-null `play` values within each player group.
    """
    return (
        plays.groupby(['playerid', 'player'])
        .agg({'gameid': 'count', 'play': 'count'})
        .rename(columns={'gameid': f'{prefix}_events', 'play': f'{prefix}_plays'})
        .reset_index()
    )


clutch_data = _player_event_counts(data[data['IS_CLUTCH']], 'clutch')
non_clutch_data = _player_event_counts(data[~data['IS_CLUTCH']], 'non_clutch')

# Combine both per-player tables. This step was missing, so `merged_data` was
# undefined on a fresh kernel. A left join from the clutch table is enough
# because the filter below requires clutch events anyway; players without any
# non-clutch rows get explicit zeros instead of NaN.
merged_data = (
    clutch_data
    .merge(non_clutch_data, on=['playerid', 'player'], how='left')
    .fillna({'non_clutch_events': 0, 'non_clutch_plays': 0})
    .astype({'non_clutch_events': int, 'non_clutch_plays': int})
)
merged_data['total_plays'] = merged_data['clutch_plays'] + merged_data['non_clutch_plays']

# .copy() so the column assignments that follow act on an independent frame
# rather than a slice of merged_data (avoids SettingWithCopyWarning).
filtered_data = merged_data[
    (merged_data['clutch_events'] >= min_clutch_events)
    & (merged_data['total_plays'] >= min_total_plays)
].copy()
filtered_data['clutch_ratio'] = filtered_data['clutch_events'] / (
    filtered_data['clutch_events'] + filtered_data['non_clutch_events']
)
# setting up difference and filtering for data cleanup

In [None]:
# Feature matrix: one row per player, using clutch volume and clutch share.
X = filtered_data[['clutch_events', 'clutch_ratio']]
# Fixed random_state keeps the fitted forest repeatable between runs.
model = IsolationForest(random_state=42, contamination=0.05)
# Fit on X and label the same rows; the plotting cell treats -1 as the flagged group.
filtered_data['anomaly'] = model.fit(X).predict(X)
# testing very basic iso forest, this will be scrapped or changed completely lol

In [None]:
# Players labelled -1 by the model are the ones highlighted as "Clutch".
clutch_players = filtered_data[filtered_data['anomaly'] == -1]

plt.figure(figsize=(14, 10))
sns.set(style="whitegrid")

# Every qualifying player first (blue, translucent), then the flagged ones on top (red).
plt.scatter(
    filtered_data['clutch_events'],
    filtered_data['clutch_ratio'],
    alpha=0.3,
    s=80,
    label='Non-clutch',
    c='blue',
)
plt.scatter(
    clutch_players['clutch_events'],
    clutch_players['clutch_ratio'],
    color='red',
    s=100,
    label='Clutch',
)

# Shared look for the name tags placed beside each flagged player.
label_box = dict(facecolor='white', alpha=0.6, edgecolor='none', boxstyle='round,pad=0.3')
for player_name, n_events, ratio in zip(
    clutch_players['player'],
    clutch_players['clutch_events'],
    clutch_players['clutch_ratio'],
):
    plt.annotate(
        player_name,
        (n_events, ratio),
        textcoords="offset points",
        xytext=(5, 5),
        ha='center',
        fontsize=10,
        color='darkred',
        bbox=label_box,
    )

plt.title('NBA Player Clutch Performance Detection (2001)', fontsize=16)
plt.xlabel('Number of Clutch Events', fontsize=14)
plt.ylabel('Clutch Event Ratio', fontsize=14)

plt.legend(title='Player Type', fontsize=12, title_fontsize='13', loc='upper right')