In [156]:
import pandas as pd
pd.options.display.max_columns = 100

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import PercentFormatter, FuncFormatter

plt.style.use('ggplot')

import os
import warnings

warnings.filterwarnings('ignore')

In [157]:

path = "/Users/navyfish/Downloads/shots"

# Initialize an empty list to store dataframes
dataframes = []

# Iterate over files in the directory
for file in os.listdir(path):
    file_path = os.path.join(path, file)
    try:
        # Read the CSV file and append the dataframe to the list
        data = pd.read_csv(file_path)
        dataframes.append(data)
        print(f"Successfully read {file}")
    except Exception as e:
        print(f"Error reading {file}: {e}")

# Concatenate all dataframes in the list
df = pd.concat(dataframes, ignore_index=True)

Error reading .DS_Store: 'utf-8' codec can't decode byte 0xff in position 328: invalid start byte
Successfully read NBA_2023_Shots.csv
Successfully read NBA_2022_Shots.csv


In [158]:
# Remove duplicates
df = df.drop_duplicates()

# Group by GAME_ID, HOME_TEAM, and AWAY_TEAM, and count unique GAME_IDs
g = df.groupby(['GAME_ID', 'HOME_TEAM', 'AWAY_TEAM'])['GAME_ID'].nunique().reset_index(name='Count')

# Filter out groups where HOME_TEAM count is not equal to 1
filtered_g = g.groupby('GAME_ID').filter(lambda x: x['HOME_TEAM'].count() != 1)

# Remove rows with GAME_ID 22000000
df = df[df['GAME_ID'] != 22000000]


In [159]:
df

Unnamed: 0,SEASON_1,SEASON_2,TEAM_ID,TEAM_NAME,PLAYER_ID,PLAYER_NAME,POSITION_GROUP,POSITION,GAME_DATE,GAME_ID,HOME_TEAM,AWAY_TEAM,EVENT_TYPE,SHOT_MADE,ACTION_TYPE,SHOT_TYPE,BASIC_ZONE,ZONE_NAME,ZONE_ABB,ZONE_RANGE,LOC_X,LOC_Y,SHOT_DISTANCE,QUARTER,MINS_LEFT,SECS_LEFT
0,2023,2022-23,1610612764,Washington Wizards,203078,Bradley Beal,G,SG,10-19-2022,22200004,IND,WAS,Made Shot,True,Jump Shot,3PT Field Goal,Left Corner 3,Left Side,L,24+ ft.,23.40,12.950,24,1,11,20
1,2023,2022-23,1610612764,Washington Wizards,204001,Kristaps Porzingis,C,C,10-19-2022,22200004,IND,WAS,Missed Shot,False,Step Back Jump shot,3PT Field Goal,Above the Break 3,Center,C,24+ ft.,-0.40,31.450,26,1,10,52
2,2023,2022-23,1610612764,Washington Wizards,1628420,Monte Morris,G,PG,10-19-2022,22200004,IND,WAS,Made Shot,True,Running Finger Roll Layup Shot,2PT Field Goal,Restricted Area,Center,C,Less Than 8 ft.,-1.30,5.250,1,1,10,39
3,2023,2022-23,1610612764,Washington Wizards,204001,Kristaps Porzingis,C,C,10-19-2022,22200004,IND,WAS,Made Shot,True,Cutting Dunk Shot,2PT Field Goal,Restricted Area,Center,C,Less Than 8 ft.,-0.10,7.250,2,1,10,31
4,2023,2022-23,1610612764,Washington Wizards,1630166,Deni Avdija,F,SF,10-19-2022,22200004,IND,WAS,Made Shot,True,Cutting Dunk Shot,2PT Field Goal,Restricted Area,Center,C,Less Than 8 ft.,1.80,7.150,2,1,10,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433937,2022,2021-22,1610612749,Milwaukee Bucks,203507,Giannis Antetokounmpo,F,PF,10-19-2021,22100001,MIL,BKN,Missed Shot,False,Pullup Jump shot,3PT Field Goal,Above the Break 3,Center,C,24+ ft.,-0.00,8.405,26,1,11,13
433938,2022,2021-22,1610612744,Golden State Warriors,1629673,Jordan Poole,G,SG,10-19-2021,22100002,LAL,GSW,Made Shot,True,Pullup Jump shot,2PT Field Goal,Mid-Range,Center,C,16-24 ft.,-0.26,7.735,19,1,11,18
433939,2022,2021-22,1610612744,Golden State Warriors,1626172,Kevon Looney,C,C,10-19-2021,22100002,LAL,GSW,Made Shot,True,Cutting Layup Shot,2PT Field Goal,Restricted Area,Center,C,Less Than 8 ft.,-0.12,5.925,1,1,11,37
433940,2022,2021-22,1610612749,Milwaukee Bucks,1628960,Grayson Allen,G,SG,10-19-2021,22100001,MIL,BKN,Missed Shot,False,Jump Shot,3PT Field Goal,Above the Break 3,Left Side Center,LC,24+ ft.,-1.64,7.875,26,1,11,42


In [160]:
df.isna().sum()[df.isna().sum()>0]

POSITION_GROUP    2518
POSITION          2518
dtype: int64

In [161]:
df.nunique()

SEASON_1             2
SEASON_2             2
TEAM_ID             30
TEAM_NAME           31
PLAYER_ID          700
PLAYER_NAME        706
POSITION_GROUP       3
POSITION            13
GAME_DATE          329
GAME_ID           2460
HOME_TEAM           30
AWAY_TEAM           30
EVENT_TYPE           2
SHOT_MADE            2
ACTION_TYPE         48
SHOT_TYPE            2
BASIC_ZONE           7
ZONE_NAME            6
ZONE_ABB             6
ZONE_RANGE           5
LOC_X              933
LOC_Y             1355
SHOT_DISTANCE       85
QUARTER              7
MINS_LEFT           13
SECS_LEFT           60
dtype: int64

In [162]:
df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'], format='%m-%d-%Y', errors='coerce')

In [163]:
print(df.dtypes)

SEASON_1                   int64
SEASON_2                  object
TEAM_ID                    int64
TEAM_NAME                 object
PLAYER_ID                  int64
PLAYER_NAME               object
POSITION_GROUP            object
POSITION                  object
GAME_DATE         datetime64[ns]
GAME_ID                    int64
HOME_TEAM                 object
AWAY_TEAM                 object
EVENT_TYPE                object
SHOT_MADE                   bool
ACTION_TYPE               object
SHOT_TYPE                 object
BASIC_ZONE                object
ZONE_NAME                 object
ZONE_ABB                  object
ZONE_RANGE                object
LOC_X                    float64
LOC_Y                    float64
SHOT_DISTANCE              int64
QUARTER                    int64
MINS_LEFT                  int64
SECS_LEFT                  int64
dtype: object


In [164]:
a = df.copy()

In [165]:
output_path = '/Users/navyfish/Downloads/cleaned_shots.csv'
a.to_csv(output_path, index=False)