In [1]:
"""Identifying Outliers

    Task: Identify players who had unusually high or low performance in any season.
    Steps:
        Use statistical methods to detect outliers in performance metrics (e.g., points per game, assists).
        Investigate the context around these outliers (e.g., injuries, trades) to understand the reasons behind the anomalies."""

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import RobustScaler
from scipy.stats import zscore

# Apply seaborn's default theme to every matplotlib figure drawn below.
# NOTE: `sns.set` is an alias of `sns.set_theme` in seaborn >= 0.11.
sns.set()

## Load the data

In [2]:
# Read the season-level player stats and bring the two differently named
# columns in line with the upper-case names used by the rest of the table.
raw_data = (
    pd.read_csv("../NBA/nba.csv")
    .rename(columns={"year": "SEASON", "Season_type": "SEASON_TYPE"})
)

# Summary of every column (numeric and categorical) as a first sanity check.
raw_data.describe(include="all")

Unnamed: 0,SEASON,SEASON_TYPE,PLAYER_ID,RANK,PLAYER,TEAM_ID,TEAM,GP,MIN,FGM,...,REB,AST,STL,BLK,TOV,PF,PTS,EFF,AST_TOV,STL_TOV
count,8835,8835,8835.0,8835.0,8835,8835.0,8835,8835.0,8835.0,8835.0,...,8835.0,8835.0,8835.0,8835.0,8835.0,8835.0,8835.0,8835.0,8835.0,8835.0
unique,12,2,,,1568,,31,,,,...,,,,,,,,,,
top,2021-22,Regular%20Season,,,James Harden,,BOS,,,,...,,,,,,,,,,
freq,822,6259,,,24,,359,,,,...,,,,,,,,,,
mean,,,758252.8,217.041087,,1610613000.0,,36.999434,844.860441,138.528693,...,152.578834,82.459989,26.598755,16.975778,47.139898,70.188342,374.351896,424.678778,1.582568,0.628661
std,,,720185.5,150.467367,,8.609213,,28.227778,823.156717,159.765333,...,177.290498,115.60653,29.896816,25.662265,55.854539,66.013537,437.467232,478.94944,1.156207,0.556496
min,,,255.0,1.0,,1610613000.0,,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-8.0,0.0,0.0
25%,,,201950.0,92.0,,1610613000.0,,8.0,129.0,16.0,...,20.0,7.0,3.0,1.0,5.0,12.0,42.0,51.0,0.9,0.33
50%,,,203501.0,183.0,,1610613000.0,,34.0,526.0,72.0,...,86.0,35.0,15.0,7.0,25.0,48.0,191.0,227.0,1.46,0.53
75%,,,1628412.0,337.5,,1610613000.0,,65.0,1477.5,217.0,...,231.0,109.0,42.0,22.0,71.0,120.0,580.0,678.0,2.11,0.81


### Identify performance metrics

In [17]:
# Counting stats used as performance metrics for the outlier analysis.
# `.copy()` makes this an independent frame: later cells add z-score columns
# to `p_metrics`, and writing into a bare subset of `raw_data` is the pattern
# pandas reports with SettingWithCopyWarning.
p_metrics = raw_data[["PTS", "AST", "REB", "STL", "BLK"]].copy()
p_metrics

Unnamed: 0,PTS,AST,REB,STL,BLK
0,2280,374,640,116,105
1,2133,469,433,106,25
2,2036,551,610,129,67
3,2023,455,379,142,38
4,1920,171,460,52,32
...,...,...,...,...,...
8830,0,1,3,0,0
8831,0,1,1,0,0
8832,0,1,3,0,0
8833,0,0,0,0,0


#### Calculate basic stats

In [18]:
# Column-wise mean of each performance metric.
p_metrics.mean()

PTS    374.351896
AST     82.459989
REB    152.578834
STL     26.598755
BLK     16.975778
dtype: float64

In [20]:
# Metric columns to standardise, and the names of their z-score counterparts.
# Columns that already end in "_Z" are skipped: the outlier cell adds them to
# `p_metrics`, so re-running this cell afterwards would otherwise treat the
# z-scores themselves as metrics (and produce names such as "PTS_Z_Z").
cols_to_scale = [col for col in p_metrics.columns if not str(col).endswith("_Z")]
scaled_cols = [f"{col}_Z" for col in cols_to_scale]

### Identify outliers

In [27]:
# Z-score outlier detection on the performance metrics.
#
# NOTE(review): the z-scores are computed over every row of `p_metrics` at once,
# i.e. on the raw totals pooled across all seasons and season types. Confirm that
# this is intended, given that the task statement talks about per-game metrics.

# How many standard deviations away from the mean a data point must be to be
# considered an outlier.
threshold = 3

# Only rows with a complete set of metrics can be standardised.
metrics_complete = p_metrics.dropna(subset=cols_to_scale)
metric_values = metrics_complete[cols_to_scale]

# Population standard deviation (ddof=0), the same convention that
# scipy.stats.zscore uses by default.
metric_std = metric_values.std(ddof=0)

# A constant column has zero spread, so none of its values can be an outlier:
# give it a z-score of 0 rather than dividing by zero or passing the raw
# values through to the threshold comparison.
scaled_values = (
    (metric_values - metric_values.mean())
    .div(metric_std.replace(0, np.nan))
    .fillna(0.0)
)

# Rename the columns of the scaled_values DataFrame
scaled_values.columns = scaled_cols

# Build a fresh frame instead of writing into a possibly sliced one. Dropping
# any previous z-score columns first keeps this cell safe to re-run.
p_metrics = metrics_complete.drop(columns=scaled_cols, errors="ignore").join(scaled_values)

# A row is an outlier when any of its z-scores exceeds the threshold in
# absolute value.
outliers = p_metrics[p_metrics[scaled_cols].abs().gt(threshold).any(axis=1)]
outliers

Unnamed: 0,PTS,AST,REB,STL,BLK,PTS_Z,AST_Z,REB_Z,STL_Z,BLK_Z
0,2280,374,640,116,105,4.356340,2.521973,2.749435,2.990496,3.430298
1,2133,469,433,106,25,4.020296,3.343772,1.581794,2.655993,0.312703
2,2036,551,610,129,67,3.798552,4.053115,2.580212,3.425349,1.949440
3,2023,455,379,142,38,3.768834,3.222665,1.277192,3.860203,0.819312
4,1920,171,460,52,32,3.533375,0.765917,1.734095,0.849679,0.585493
...,...,...,...,...,...,...,...,...,...,...
8190,790,485,179,71,18,0.950178,3.482180,0.149036,1.485234,0.039914
8196,771,478,289,78,9,0.906743,3.421627,0.769522,1.719386,-0.310815
8207,715,247,273,120,70,0.778727,1.423357,0.679269,3.124297,2.066350
8215,663,37,708,73,48,0.659854,-0.393253,3.133008,1.552134,1.209012


In [25]:
# Outlier detection with the 1.5 * IQR rule on the raw metric columns.
metric_frame = p_metrics[cols_to_scale]

# First and third quartile of every metric, and the spread between them.
Q1 = metric_frame.quantile(0.25)
Q3 = metric_frame.quantile(0.75)
IQR = Q3 - Q1

# Fences beyond which a value counts as an outlier.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Keep every row in which at least one metric falls outside its fences.
outside_fences = metric_frame.lt(lower_fence) | metric_frame.gt(upper_fence)
iqr_outliers = p_metrics[outside_fences.any(axis=1)]

iqr_outliers

Unnamed: 0,PTS,AST,REB,STL,BLK,PTS_Z,AST_Z,REB_Z,STL_Z,BLK_Z
0,2280,374,640,116,105,4.356340,2.521973,2.749435,2.990496,3.430298
1,2133,469,433,106,25,4.020296,3.343772,1.581794,2.655993,0.312703
2,2036,551,610,129,67,3.798552,4.053115,2.580212,3.425349,1.949440
3,2023,455,379,142,38,3.768834,3.222665,1.277192,3.860203,0.819312
4,1920,171,460,52,32,3.533375,0.765917,1.734095,0.849679,0.585493
...,...,...,...,...,...,...,...,...,...,...
8304,379,63,322,27,61,0.010626,-0.168339,0.955667,0.013422,1.715621
8316,354,90,136,113,49,-0.046525,0.065225,-0.093518,2.890145,1.247982
8323,334,67,261,23,61,-0.092245,-0.133737,0.611580,-0.120379,1.715621
8333,311,79,286,32,72,-0.144823,-0.029931,0.752599,0.180673,2.144290
