In [1]:
import pandas as pd

In [2]:
columns = [
    "team_name",
    "player_name",
    "position_group",
    "position",
    "game_date",
    "home_team",
    "away_team",
    "event_type",
    "shot_made",
    "action_type",
    "shot_type",
    "basic_zone",
    "zone_name",
    "zone_abb",
    "zone_range",
    "loc_x",
    "loc_y",
    "shot_distance",
    "quarter",
    "mins_left",
    "secs_left"
]

shots_df = pd.read_csv("shots.csv", names=columns)

# Display the first few rows to verify
shots_df.head()

Unnamed: 0,team_name,player_name,position_group,position,game_date,home_team,away_team,event_type,shot_made,action_type,...,basic_zone,zone_name,zone_abb,zone_range,loc_x,loc_y,shot_distance,quarter,mins_left,secs_left
0,Los Angeles Lakers,Anthony Davis,C,C,2021-12-09,MEM,LAL,Made Shot,True,Dunk Shot,...,Restricted Area,Center,C,Less Than 8 ft.,0.21,5.925,2,4,2,21
1,Memphis Grizzlies,Desmond Bane,G,SG,2021-12-09,MEM,LAL,Missed Shot,False,Step Back Jump shot,...,Above the Break 3,Center,C,24+ ft.,0.11,8.395,26,4,2,27
2,Denver Nuggets,Will Barton,G,SG,2021-12-09,SAS,DEN,Missed Shot,False,Step Back Jump shot,...,Left Corner 3,Left Side,L,24+ ft.,-2.33,6.185,23,4,2,3
3,Utah Jazz,Jared Butler,G,SG,2021-12-09,PHI,UTA,Made Shot,True,Driving Finger Roll Layup Shot,...,Restricted Area,Center,C,Less Than 8 ft.,0.1,5.855,1,4,0,30
4,San Antonio Spurs,Dejounte Murray,G,PG,2021-12-09,SAS,DEN,Made Shot,True,Driving Floating Jump Shot,...,In The Paint (Non-RA),Left Side,L,8-16 ft.,-0.72,6.605,10,4,2,24


In [3]:
shots_df.shape

(433585, 21)

In [4]:
shots_df.columns

Index(['team_name', 'player_name', 'position_group', 'position', 'game_date',
       'home_team', 'away_team', 'event_type', 'shot_made', 'action_type',
       'shot_type', 'basic_zone', 'zone_name', 'zone_abb', 'zone_range',
       'loc_x', 'loc_y', 'shot_distance', 'quarter', 'mins_left', 'secs_left'],
      dtype='object')

In [5]:
# Check if game_date and player_name could uniquely identify rows
unique_rows = shots_df[['game_date', 'player_name']].drop_duplicates()
print(len(unique_rows) == len(shots_df))

False


In [6]:
# Group by player_name and check if team_name is unique within each player_name
unique_team_names = shots_df.groupby('player_name')['team_name'].nunique()
print(unique_team_names[unique_team_names > 1])

player_name
A.J. Lawson       2
Aaron Holiday     3
Aaron Nesmith     2
Alec Burks        2
Alize Johnson     4
                 ..
Wenyen Gabriel    2
Will Barton       3
Xavier Moon       2
Xavier Sneed      3
Yuta Watanabe     2
Name: team_name, Length: 240, dtype: int64


In [7]:
shots_df_grouped = shots_df.groupby(["game_date", "player_name", "quarter", "mins_left", "secs_left"])
counts = shots_df_grouped.size()
(counts == 1).all()

True

In [8]:
game_data = pd.read_csv("game_info.csv")

In [9]:
game_data

Unnamed: 0,game_id,season,date,away_team,away_score,home_team,home_score,result
0,131410290001,1314,2013-10-29,ORL,87,IND,97,1
1,131410290002,1314,2013-10-29,CHI,95,MIA,107,1
2,131410290003,1314,2013-10-29,LAC,103,LAL,116,1
3,131410300004,1314,2013-10-30,BRK,94,CLE,98,1
4,131410300005,1314,2013-10-30,BOS,87,TOR,93,1
...,...,...,...,...,...,...,...,...
11974,222304091226,2223,2023-04-09,UTA,117,LAL,128,1
11975,222304091227,2223,2023-04-09,NOP,108,MIN,113,1
11976,222304091228,2223,2023-04-09,MEM,100,OKC,115,1
11977,222304091229,2223,2023-04-09,LAC,119,PHO,114,0


In [10]:
#check matches can be done on home_team and away_team
game_home = game_data["home_team"].unique()
shots_home = shots_df["home_team"].unique()
home_teams = set(game_home.tolist() + shots_home.tolist())
home_teams


{'ATL',
 'BKN',
 'BOS',
 'BRK',
 'CHA',
 'CHI',
 'CHO',
 'CLE',
 'DAL',
 'DEN',
 'DET',
 'GSW',
 'HOU',
 'IND',
 'LAC',
 'LAL',
 'MEM',
 'MIA',
 'MIL',
 'MIN',
 'NOP',
 'NYK',
 'OKC',
 'ORL',
 'PHI',
 'PHO',
 'PHX',
 'POR',
 'SAC',
 'SAS',
 'TOR',
 'UTA',
 'WAS'}

In [11]:
#do the same for away teams
game_away = game_data["away_team"].unique()
shots_away = shots_df["away_team"].unique()
away_teams = set(game_away.tolist() + shots_away.tolist())
away_teams

{'ATL',
 'BKN',
 'BOS',
 'BRK',
 'CHA',
 'CHI',
 'CHO',
 'CLE',
 'DAL',
 'DEN',
 'DET',
 'GSW',
 'HOU',
 'IND',
 'LAC',
 'LAL',
 'MEM',
 'MIA',
 'MIL',
 'MIN',
 'NOP',
 'NYK',
 'OKC',
 'ORL',
 'PHI',
 'PHO',
 'PHX',
 'POR',
 'SAC',
 'SAS',
 'TOR',
 'UTA',
 'WAS'}

In [12]:
#do the same for game dates
game_dates = game_data["date"].unique()
shots_dates = shots_df["game_date"].unique()
dates = set(game_dates.tolist() + shots_dates.tolist())
dates

{'2014-12-28',
 '2015-12-23',
 '2018-04-05',
 '2019-10-30',
 '2020-01-22',
 '2019-12-14',
 '2022-02-01',
 '2019-01-22',
 '2015-12-20',
 '2018-03-01',
 '2019-01-06',
 '2013-12-08',
 '2014-12-09',
 '2021-05-16',
 '2022-10-25',
 '2022-12-04',
 '2016-04-09',
 '2022-11-15',
 '2018-02-25',
 '2022-11-02',
 '2016-12-09',
 '2017-03-25',
 '2021-04-04',
 '2022-11-17',
 '2022-03-26',
 '2018-11-21',
 '2018-04-04',
 '2023-03-24',
 '2017-12-05',
 '2015-01-26',
 '2016-11-14',
 '2021-11-15',
 '2017-03-07',
 '2017-11-17',
 '2019-12-08',
 '2017-04-05',
 '2019-01-20',
 '2017-11-25',
 '2022-01-09',
 '2013-12-13',
 '2022-02-10',
 '2018-10-28',
 '2013-11-21',
 '2016-12-07',
 '2020-02-24',
 '2018-10-31',
 '2018-11-26',
 '2017-12-22',
 '2019-11-13',
 '2020-03-08',
 '2017-11-30',
 '2014-03-21',
 '2018-10-22',
 '2019-01-17',
 '2015-03-04',
 '2018-04-11',
 '2022-12-13',
 '2015-11-24',
 '2013-12-30',
 '2021-04-28',
 '2022-11-13',
 '2017-11-09',
 '2023-02-06',
 '2018-11-04',
 '2017-11-28',
 '2015-12-17',
 '2020-02-

In [13]:
shots_df_updated = shots_df[["game_date", "player_name", "team_name",
    "shot_made",
    "action_type",
    "shot_type",
    "basic_zone",
    "zone_name",
    "zone_abb",
    "zone_range",
    "loc_x",
    "loc_y",
    "shot_distance",
    "quarter",
    "mins_left",
    "secs_left"]]
shots_df_updated

Unnamed: 0,game_date,player_name,team_name,shot_made,action_type,shot_type,basic_zone,zone_name,zone_abb,zone_range,loc_x,loc_y,shot_distance,quarter,mins_left,secs_left
0,2021-12-09,Anthony Davis,Los Angeles Lakers,True,Dunk Shot,2PT Field,Restricted Area,Center,C,Less Than 8 ft.,0.21,5.925,2,4,2,21
1,2021-12-09,Desmond Bane,Memphis Grizzlies,False,Step Back Jump shot,3PT Field,Above the Break 3,Center,C,24+ ft.,0.11,8.395,26,4,2,27
2,2021-12-09,Will Barton,Denver Nuggets,False,Step Back Jump shot,3PT Field,Left Corner 3,Left Side,L,24+ ft.,-2.33,6.185,23,4,2,3
3,2021-12-09,Jared Butler,Utah Jazz,True,Driving Finger Roll Layup Shot,2PT Field,Restricted Area,Center,C,Less Than 8 ft.,0.10,5.855,1,4,0,30
4,2021-12-09,Dejounte Murray,San Antonio Spurs,True,Driving Floating Jump Shot,2PT Field,In The Paint (Non-RA),Left Side,L,8-16 ft.,-0.72,6.605,10,4,2,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433580,2021-12-09,Russell Westbrook,Los Angeles Lakers,False,Pullup Jump shot,3PT Field,Above the Break 3,Right Side Center,RC,24+ ft.,1.68,7.655,25,4,1,45
433581,2021-12-09,De'Anthony Melton,Memphis Grizzlies,False,Jump Shot,3PT Field,Above the Break 3,Left Side Center,LC,24+ ft.,-1.34,7.945,25,4,1,55
433582,2021-12-09,Jaren Jackson Jr.,Memphis Grizzlies,False,Jump Shot,3PT Field,Right Corner 3,Right Side,R,24+ ft.,2.27,6.445,23,4,1,59
433583,2021-12-09,Will Barton,Denver Nuggets,False,Jump Shot,3PT Field,Above the Break 3,Left Side Center,LC,24+ ft.,-1.63,7.835,26,4,1,37


In [14]:
shots_df_grouped = shots_df_updated.groupby(["game_date", "player_name", "team_name"])
counts = shots_df_grouped.size()
(counts == 1).all()

False

In [15]:
shots_df_normalized = shots_df[['game_date', 'player_name', 'event_type', 'shot_made', 'action_type', 'shot_type', 
                                'basic_zone', 'zone_name', 'zone_abb', 'zone_range', 
                                'loc_x', 'loc_y', 'shot_distance', 'quarter', 'mins_left', 'secs_left']]

In [16]:
shots_df_normalized

Unnamed: 0,game_date,player_name,event_type,shot_made,action_type,shot_type,basic_zone,zone_name,zone_abb,zone_range,loc_x,loc_y,shot_distance,quarter,mins_left,secs_left
0,2021-12-09,Anthony Davis,Made Shot,True,Dunk Shot,2PT Field,Restricted Area,Center,C,Less Than 8 ft.,0.21,5.925,2,4,2,21
1,2021-12-09,Desmond Bane,Missed Shot,False,Step Back Jump shot,3PT Field,Above the Break 3,Center,C,24+ ft.,0.11,8.395,26,4,2,27
2,2021-12-09,Will Barton,Missed Shot,False,Step Back Jump shot,3PT Field,Left Corner 3,Left Side,L,24+ ft.,-2.33,6.185,23,4,2,3
3,2021-12-09,Jared Butler,Made Shot,True,Driving Finger Roll Layup Shot,2PT Field,Restricted Area,Center,C,Less Than 8 ft.,0.10,5.855,1,4,0,30
4,2021-12-09,Dejounte Murray,Made Shot,True,Driving Floating Jump Shot,2PT Field,In The Paint (Non-RA),Left Side,L,8-16 ft.,-0.72,6.605,10,4,2,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433580,2021-12-09,Russell Westbrook,Missed Shot,False,Pullup Jump shot,3PT Field,Above the Break 3,Right Side Center,RC,24+ ft.,1.68,7.655,25,4,1,45
433581,2021-12-09,De'Anthony Melton,Missed Shot,False,Jump Shot,3PT Field,Above the Break 3,Left Side Center,LC,24+ ft.,-1.34,7.945,25,4,1,55
433582,2021-12-09,Jaren Jackson Jr.,Missed Shot,False,Jump Shot,3PT Field,Right Corner 3,Right Side,R,24+ ft.,2.27,6.445,23,4,1,59
433583,2021-12-09,Will Barton,Missed Shot,False,Jump Shot,3PT Field,Above the Break 3,Left Side Center,LC,24+ ft.,-1.63,7.835,26,4,1,37


In [17]:
#a player has taken multiple shots in the same game
shots_df_normalized_grouped = shots_df_normalized.groupby(["game_date", "player_name"])
counts = shots_df_normalized_grouped.size()
(counts == 1).all()

False

In [18]:
#add shots_id
shots_df_normalized.loc[:, "shot_id"] = range(1, len(shots_df_normalized) + 1)
shots_df_normalized

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shots_df_normalized.loc[:, "shot_id"] = range(1, len(shots_df_normalized) + 1)


Unnamed: 0,game_date,player_name,event_type,shot_made,action_type,shot_type,basic_zone,zone_name,zone_abb,zone_range,loc_x,loc_y,shot_distance,quarter,mins_left,secs_left,shot_id
0,2021-12-09,Anthony Davis,Made Shot,True,Dunk Shot,2PT Field,Restricted Area,Center,C,Less Than 8 ft.,0.21,5.925,2,4,2,21,1
1,2021-12-09,Desmond Bane,Missed Shot,False,Step Back Jump shot,3PT Field,Above the Break 3,Center,C,24+ ft.,0.11,8.395,26,4,2,27,2
2,2021-12-09,Will Barton,Missed Shot,False,Step Back Jump shot,3PT Field,Left Corner 3,Left Side,L,24+ ft.,-2.33,6.185,23,4,2,3,3
3,2021-12-09,Jared Butler,Made Shot,True,Driving Finger Roll Layup Shot,2PT Field,Restricted Area,Center,C,Less Than 8 ft.,0.10,5.855,1,4,0,30,4
4,2021-12-09,Dejounte Murray,Made Shot,True,Driving Floating Jump Shot,2PT Field,In The Paint (Non-RA),Left Side,L,8-16 ft.,-0.72,6.605,10,4,2,24,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433580,2021-12-09,Russell Westbrook,Missed Shot,False,Pullup Jump shot,3PT Field,Above the Break 3,Right Side Center,RC,24+ ft.,1.68,7.655,25,4,1,45,433581
433581,2021-12-09,De'Anthony Melton,Missed Shot,False,Jump Shot,3PT Field,Above the Break 3,Left Side Center,LC,24+ ft.,-1.34,7.945,25,4,1,55,433582
433582,2021-12-09,Jaren Jackson Jr.,Missed Shot,False,Jump Shot,3PT Field,Right Corner 3,Right Side,R,24+ ft.,2.27,6.445,23,4,1,59,433583
433583,2021-12-09,Will Barton,Missed Shot,False,Jump Shot,3PT Field,Above the Break 3,Left Side Center,LC,24+ ft.,-1.63,7.835,26,4,1,37,433584


In [19]:
shots_df_normalized = shots_df_normalized[['shot_id', 'game_date', 'player_name', 'event_type', 'shot_made', 'action_type', 'shot_type', 
                                'basic_zone', 'zone_name', 'zone_abb', 'zone_range', 
                                'loc_x', 'loc_y', 'shot_distance', 'quarter', 'mins_left', 'secs_left']]
shots_df_normalized

Unnamed: 0,shot_id,game_date,player_name,event_type,shot_made,action_type,shot_type,basic_zone,zone_name,zone_abb,zone_range,loc_x,loc_y,shot_distance,quarter,mins_left,secs_left
0,1,2021-12-09,Anthony Davis,Made Shot,True,Dunk Shot,2PT Field,Restricted Area,Center,C,Less Than 8 ft.,0.21,5.925,2,4,2,21
1,2,2021-12-09,Desmond Bane,Missed Shot,False,Step Back Jump shot,3PT Field,Above the Break 3,Center,C,24+ ft.,0.11,8.395,26,4,2,27
2,3,2021-12-09,Will Barton,Missed Shot,False,Step Back Jump shot,3PT Field,Left Corner 3,Left Side,L,24+ ft.,-2.33,6.185,23,4,2,3
3,4,2021-12-09,Jared Butler,Made Shot,True,Driving Finger Roll Layup Shot,2PT Field,Restricted Area,Center,C,Less Than 8 ft.,0.10,5.855,1,4,0,30
4,5,2021-12-09,Dejounte Murray,Made Shot,True,Driving Floating Jump Shot,2PT Field,In The Paint (Non-RA),Left Side,L,8-16 ft.,-0.72,6.605,10,4,2,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433580,433581,2021-12-09,Russell Westbrook,Missed Shot,False,Pullup Jump shot,3PT Field,Above the Break 3,Right Side Center,RC,24+ ft.,1.68,7.655,25,4,1,45
433581,433582,2021-12-09,De'Anthony Melton,Missed Shot,False,Jump Shot,3PT Field,Above the Break 3,Left Side Center,LC,24+ ft.,-1.34,7.945,25,4,1,55
433582,433583,2021-12-09,Jaren Jackson Jr.,Missed Shot,False,Jump Shot,3PT Field,Right Corner 3,Right Side,R,24+ ft.,2.27,6.445,23,4,1,59
433583,433584,2021-12-09,Will Barton,Missed Shot,False,Jump Shot,3PT Field,Above the Break 3,Left Side Center,LC,24+ ft.,-1.63,7.835,26,4,1,37


In [20]:
player_stats = pd.read_csv("player_stats.csv")
player_stats

Unnamed: 0,game_id,player,team,FG,FGA,3P,3PA,FT,FTA,ORB,DRB,AST,STL,BLK,PTS
0,131410290001,Arron Afflalo,ORL,3.0,14.0,1.0,5.0,2.0,3.0,1.0,2.0,1.0,0.0,0.0,9.0
1,131410290001,Nikola Vučević,ORL,4.0,11.0,0.0,0.0,0.0,0.0,5.0,5.0,3.0,2.0,1.0,8.0
2,131410290001,Jameer Nelson,ORL,4.0,13.0,3.0,7.0,1.0,1.0,1.0,4.0,7.0,2.0,0.0,12.0
3,131410290001,Jason Maxiell,ORL,0.0,5.0,0.0,0.0,0.0,0.0,1.0,4.0,0.0,1.0,2.0,0.0
4,131410290001,Maurice Harkless,ORL,6.0,13.0,2.0,2.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305727,222304091230,John Butler,POR,1.0,5.0,1.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0
305728,222304091230,Chance Comanche,POR,3.0,5.0,0.0,0.0,1.0,4.0,2.0,1.0,0.0,0.0,1.0,7.0
305729,222304091230,Jabari Walker,POR,4.0,6.0,0.0,2.0,1.0,2.0,0.0,3.0,0.0,0.0,0.0,9.0
305730,222304091230,Drew Eubanks,POR,,,,,,,,,,,,


In [21]:
#check if player_name matches
player_stats_names = player_stats["player"].unique()
shots_names = shots_df_normalized["player_name"].unique()
names = set(player_stats_names.tolist() + shots_names.tolist())
names

{'Kenneth Faried',
 'Julyan Stone',
 'Chauncey Billups',
 'Gian Clavell',
 'RJ Barrett',
 'Willy Hernangómez',
 'Danilo Gallinari',
 'Andre Dawkins',
 'Viacheslav Kravtsov',
 'Maalik Wayns',
 'Corey Kispert',
 'Ryan Anderson',
 'Quenton Jackson',
 'Andre Drummond',
 'Brandon Jennings',
 'Andre Ingram',
 'Donte Grantham',
 'Stanley Johnson',
 'Marshall Plumlee',
 'Mario Chalmers',
 'Malik Monk',
 'Shake Milton',
 'Wayne Ellington',
 'Jaylen Brown',
 'Talen Horton-Tucker',
 'Nate Hinton',
 'Timothe Luwawu-Cabarrot',
 'Vic Law',
 'Cole Anthony',
 'Jaden McDaniels',
 'Jaron Blossomgame',
 'Dairis Bertāns',
 "De'Andre Hunter",
 'Darren Collison',
 'Kendall Brown',
 'A.J. Green',
 'Charlie Villanueva',
 'Thomas Robinson',
 'Avery Bradley',
 'Patty Mills',
 'Brad Wanamaker',
 "Jermaine O'Neal",
 'Facundo Campazzo',
 'Paul Reed',
 'Ray Allen',
 'Zoran Dragić',
 'John Collins',
 'Trevelin Queen',
 'Hollis Thompson',
 'Abdel Nader',
 'Mo Bamba',
 'Bradley Beal',
 'Terrence Ross',
 'Kyle Guy',
 '

In [22]:
print(shots_df_normalized["player_name"])

0             Anthony Davis
1              Desmond Bane
2               Will Barton
3              Jared Butler
4           Dejounte Murray
                ...        
433580    Russell Westbrook
433581    De'Anthony Melton
433582    Jaren Jackson Jr.
433583          Will Barton
433584      Dejounte Murray
Name: player_name, Length: 433585, dtype: object


In [None]:
#shots_df_normalized.to_csv("shots_normalized.csv", index=False)