# NBA Player Statistical Analysis

This notebook filters NBA regular season data, analyzes the player with the most seasons played, evaluates three-point accuracy, and performs various statistical analyses.

In [1]:

import zipfile
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from scipy.integrate import quad
import os
import glob

# Locate the dataset archive in the current directory. glob returns matches in
# arbitrary (filesystem) order, so sort them to make the choice reproducible
# when more than one archive is present.
zip_files = sorted(glob.glob("*.zip"))

if not zip_files:
    raise FileNotFoundError("No zip file found in the current directory. Please place the dataset zip file here.")

zip_path = zip_files[0]  # Use the first detected zip file (alphabetical order)
extract_path = "extracted_data"

# Extract the zip file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# Search the whole extracted tree, not just its top level: an archive may keep
# its files inside a sub-folder, which a non-recursive "*.csv" search would
# miss. With recursive=True the "**" component also matches zero directories,
# so CSV files sitting directly in extract_path are still found.
csv_files = sorted(
    glob.glob(os.path.join(extract_path, "**", "*.csv"), recursive=True)
)

if not csv_files:
    raise FileNotFoundError("No CSV file found in the extracted dataset folder.")

file_path = csv_files[0]  # Use the first detected CSV file (alphabetical order)
df = pd.read_csv(file_path)

# Filter for NBA regular season data
nba_df = df[(df['League'] == 'NBA') & (df['Stage'] == 'Regular_Season')]

if nba_df.empty:
    # Fail with a clear message instead of the opaque error idxmax() raises
    # on an empty Series.
    raise ValueError("No rows with League == 'NBA' and Stage == 'Regular_Season' found in the dataset.")

# Identify the player who played the most seasons.
# NOTE(review): value_counts() counts rows, so this assumes one row per player
# per season -- confirm against the dataset.
player_counts = nba_df['Player'].value_counts()
most_seasons_player = player_counts.idxmax()

# Take an explicit copy: the columns assigned below must be written to an
# independent frame, not to a slice of nba_df (which triggers pandas'
# SettingWithCopyWarning).
player_df = nba_df[nba_df['Player'] == most_seasons_player].copy()

# Convert season to numerical values (extracting the first year as integer)
player_df['Season'] = player_df['Season'].str.split(" - ").str[0].astype(int)

# Calculate three-point accuracy per season. Zero attempts are mapped to NaN
# first so the division can never produce inf (which would break the
# regression fit); such seasons are then dropped together with any other
# rows whose accuracy is undefined.
player_df['3P Accuracy'] = player_df['3PM'] / player_df['3PA'].replace(0, np.nan)
player_df = player_df.dropna(subset=['3P Accuracy'])

# Fit a least-squares line of three-point accuracy against season start year.
X = player_df[['Season']]  # double brackets keep a 2-D frame for scikit-learn
y = player_df['3P Accuracy']
regressor = LinearRegression().fit(X, y)  # fit() returns the fitted estimator
y_pred = regressor.predict(X)

# Plot the observed accuracies together with the fitted line.
plt.title(f"Three-Point Accuracy Over Time for {most_seasons_player}")
plt.scatter(X['Season'], y, color="orange", label="Actual Data")
plt.plot(X['Season'], y_pred, label="Best Fit Line", color='red')
plt.ylabel("Three-Point Accuracy")
plt.xlabel("Season")
plt.legend()
plt.show()

# Calculate the average three-point accuracy using integration
def regression_line(x):
    """Return the fitted regression line's value at season *x*."""
    return regressor.coef_[0] * x + regressor.intercept_

# Mean value of the fitted line over [first season, last season]:
#   (1 / (b - a)) * integral_a^b line(x) dx
season_range = (player_df['Season'].min(), player_df['Season'].max())
season_span = season_range[1] - season_range[0]

if season_span > 0:
    integrated_value, _ = quad(regression_line, *season_range)
    average_integrated_accuracy = integrated_value / season_span
else:
    # Only one distinct season: the interval has zero width, so dividing the
    # integral by the span would be 0/0. The mean of the line over a single
    # point is simply its value at that point.
    integrated_value = 0.0
    average_integrated_accuracy = regression_line(season_range[0])

# Compare to actual average three-point accuracy
actual_average_accuracy = player_df['3P Accuracy'].mean()

# Interpolate missing values (2002-03 and 2015-16) from the fitted line.
missing_seasons = [2002, 2015]
# The model was fitted on a DataFrame with a 'Season' column, so predict on a
# frame with the same column name; passing a bare ndarray makes scikit-learn
# warn that X "does not have valid feature names".
interpolated_values = regressor.predict(pd.DataFrame({'Season': missing_seasons}))

# Summarise the FGM and FGA columns with the same four descriptive statistics.
# Variance comes from pandas' Series.var(); skew and kurtosis come from
# scipy.stats with their default settings.
fgm_stats, fga_stats = (
    {
        "mean": column.mean(),
        "variance": column.var(),
        "skew": stats.skew(column),
        "kurtosis": stats.kurtosis(column),
    }
    for column in (player_df['FGM'], player_df['FGA'])
)

# One-sample t-tests of each column against a population mean of 0.
t_individual_fgm = stats.ttest_1samp(a=player_df['FGM'], popmean=0)
t_individual_fga = stats.ttest_1samp(a=player_df['FGA'], popmean=0)

# Related-samples t-test between FGM and FGA, paired row by row.
t_relational = stats.ttest_rel(a=player_df['FGM'], b=player_df['FGA'])

# Collect every computed quantity under a human-readable label.
results = {}
results["Most Seasons Player"] = most_seasons_player
results["Integrated Avg 3P Accuracy"] = average_integrated_accuracy
results["Actual Avg 3P Accuracy"] = actual_average_accuracy
# Map each missing season's start year to its predicted accuracy.
results["Missing Season Interpolations"] = {
    season: predicted
    for season, predicted in zip(missing_seasons, interpolated_values)
}
results["FGM Stats"] = fgm_stats
results["FGA Stats"] = fga_stats
results["T-test Related (FGM vs FGA)"] = t_relational
results["T-test Individual (FGM)"] = t_individual_fgm
results["T-test Individual (FGA)"] = t_individual_fga

# Final cell expression: the notebook displays the first rows of the player's
# data alongside the results dictionary.
(player_df.head(), results)


FileNotFoundError: No CSV file found in the extracted dataset folder.