## This notebook cleans and transforms the dataset

In [1]:
import pandas as pd
from unidecode import unidecode
from tqdm import tqdm
import numpy as np
pd.options.display.max_colwidth = 150
pd.set_option('display.max_columns', None)

# Players

In [9]:
players = pd.read_csv("players.csv")
players

Unnamed: 0,Name,Link,From,To,Position,Height,Weight,Birth
0,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,2021,2024,F,6-8,225,"September 19, 1999"
1,Steven Adams,https://www.basketball-reference.com/players/a/adamsst01.html,2014,2023,C,6-11,265,"July 20, 1993"
2,Bam Adebayo,https://www.basketball-reference.com/players/a/adebaba01.html,2018,2024,C-F,6-9,255,"July 18, 1997"
3,Ochai Agbaji,https://www.basketball-reference.com/players/a/agbajoc01.html,2023,2024,G,6-5,215,"April 20, 2000"
4,Santi Aldama,https://www.basketball-reference.com/players/a/aldamsa01.html,2022,2024,F,6-11,224,"January 10, 2001"
...,...,...,...,...,...,...,...,...
782,Thaddeus Young,https://www.basketball-reference.com/players/y/youngth01.html,2008,2024,F,6-8,235,"June 21, 1988"
783,Trae Young,https://www.basketball-reference.com/players/y/youngtr01.html,2019,2024,G,6-1,164,"September 19, 1998"
784,Omer Yurtseven,https://www.basketball-reference.com/players/y/yurtsom01.html,2022,2024,C,7-0,264,"June 19, 1998"
785,Cody Zeller,https://www.basketball-reference.com/players/z/zelleco01.html,2014,2024,C-F,6-11,240,"October 5, 1992"


In [10]:
players.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 787 entries, 0 to 786
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Name      787 non-null    object
 1   Link      787 non-null    object
 2   From      787 non-null    int64 
 3   To        787 non-null    int64 
 4   Position  787 non-null    object
 5   Height    787 non-null    object
 6   Weight    787 non-null    int64 
 7   Birth     787 non-null    object
dtypes: int64(3), object(5)
memory usage: 49.3+ KB


### Change names with national characters to unicode

In [11]:
pd.unique(players["Name"])

array(['Precious Achiuwa', 'Steven Adams', 'Bam Adebayo', 'Ochai Agbaji',
       'Santi Aldama', 'Ty-Shon Alexander', 'Nickeil Alexander-Walker',
       'Rawle Alkins', 'Grayson Allen', 'Jarrett Allen', 'Timmy Allen',
       'Jose Alvarado', 'Justin Anderson', 'Kyle Anderson',
       'Giannis Antetokounmpo', 'Kostas Antetokounmpo',
       'Thanasis Antetokounmpo', 'Cole Anthony', 'OG Anunoby',
       'Ryan Arcidiacono', 'Deni Avdija', 'Joel Ayayi', 'Deandre Ayton',
       'Udoka Azubuike', 'Ibou Badji', 'Marvin Bagley III',
       'Amari Bailey', 'Patrick Baldwin Jr.', 'LaMelo Ball', 'Lonzo Ball',
       'Mo Bamba', 'Paolo Banchero', 'Desmond Bane', 'Dalano Banton',
       'Cat Barber', 'Dominick Barlow', 'Harrison Barnes',
       'Scottie Barnes', 'RJ Barrett', 'Paris Bass', 'Charles Bassey',
       'Emoni Bates', 'Keita Bates-Diop', 'Nicolas Batum',
       'Kent Bazemore', 'Darius Bazley', 'Bradley Beal', 'Malik Beasley',
       'MarJon Beauchamp', 'Jordan Bell', "DeAndre' Bembry",
 

In [12]:
names = []
for ind in tqdm(players.index):
    names.append(unidecode(players["Name"][ind]))
players["Name"] = names
pd.unique(players["Name"])

100%|█████████████████████████████████████████████████████████████████████████████| 787/787 [00:00<00:00, 68368.87it/s]


array(['Precious Achiuwa', 'Steven Adams', 'Bam Adebayo', 'Ochai Agbaji',
       'Santi Aldama', 'Ty-Shon Alexander', 'Nickeil Alexander-Walker',
       'Rawle Alkins', 'Grayson Allen', 'Jarrett Allen', 'Timmy Allen',
       'Jose Alvarado', 'Justin Anderson', 'Kyle Anderson',
       'Giannis Antetokounmpo', 'Kostas Antetokounmpo',
       'Thanasis Antetokounmpo', 'Cole Anthony', 'OG Anunoby',
       'Ryan Arcidiacono', 'Deni Avdija', 'Joel Ayayi', 'Deandre Ayton',
       'Udoka Azubuike', 'Ibou Badji', 'Marvin Bagley III',
       'Amari Bailey', 'Patrick Baldwin Jr.', 'LaMelo Ball', 'Lonzo Ball',
       'Mo Bamba', 'Paolo Banchero', 'Desmond Bane', 'Dalano Banton',
       'Cat Barber', 'Dominick Barlow', 'Harrison Barnes',
       'Scottie Barnes', 'RJ Barrett', 'Paris Bass', 'Charles Bassey',
       'Emoni Bates', 'Keita Bates-Diop', 'Nicolas Batum',
       'Kent Bazemore', 'Darius Bazley', 'Bradley Beal', 'Malik Beasley',
       'MarJon Beauchamp', 'Jordan Bell', "DeAndre' Bembry",
 

In [13]:
players.to_csv("players_trans.csv", index=False)

# Gamelogs

In [14]:
gamelogs = pd.read_csv("gamelogs.csv")
gamelogs

Unnamed: 0,Name,Link,Game,Date,Match Link,Age,Team,Location,Opponent,Result,GS,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,1,2023-10-25,https://www.basketball-reference.com/boxscores/202310250TOR.html,24-036,TOR,,MIN,W (+3),0,24:17,4,9,0.444,0,1,0.000,0,0,,3,5,8,0,0,0,2,1,8,4.5,-5.0
1,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,2,2023-10-27,https://www.basketball-reference.com/boxscores/202310270CHI.html,24-038,TOR,@,CHI,L (-1),0,22:38,2,5,0.400,1,3,0.333,3,3,1.000,2,7,9,1,0,0,3,6,8,4.1,-22.0
2,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,3,2023-10-28,https://www.basketball-reference.com/boxscores/202310280TOR.html,24-039,TOR,,PHI,L (-7),0,18:15,3,10,0.300,0,1,0.000,0,0,,3,4,7,3,0,1,1,1,6,4.9,5.0
3,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,4,2023-11-11,https://www.basketball-reference.com/boxscores/202311110BOS.html,24-053,TOR,@,BOS,L (-23),0,18:31,1,5,0.200,0,2,0.000,0,0,,0,5,5,5,0,0,0,1,2,3.5,-12.0
4,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,5,2023-11-13,https://www.basketball-reference.com/boxscores/202311130TOR.html,24-055,TOR,,WAS,W (+4),0,19:16,5,8,0.625,0,1,0.000,0,0,,1,3,4,1,1,0,2,1,10,7.3,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86528,Ivica Zubac,https://www.basketball-reference.com/players/z/zubaciv01.html,1,2023-04-16,https://www.basketball-reference.com/boxscores/202304160PHO.html,26-029,LAC,@,PHO,W (+5),1,29:43,5,8,0.625,0,0,,2,4,0.500,4,11,15,0,0,0,3,3,12,9.5,-11.0
86529,Ivica Zubac,https://www.basketball-reference.com/players/z/zubaciv01.html,2,2023-04-18,https://www.basketball-reference.com/boxscores/202304180PHO.html,26-031,LAC,@,PHO,L (-14),1,29:29,2,7,0.286,0,0,,4,6,0.667,4,2,6,0,2,0,3,1,8,5.1,-14.0
86530,Ivica Zubac,https://www.basketball-reference.com/players/z/zubaciv01.html,3,2023-04-20,https://www.basketball-reference.com/boxscores/202304200LAC.html,26-033,LAC,,PHO,L (-5),1,22:20,3,4,0.750,0,0,,0,0,,1,7,8,0,0,0,2,1,6,4.8,-5.0
86531,Ivica Zubac,https://www.basketball-reference.com/players/z/zubaciv01.html,4,2023-04-22,https://www.basketball-reference.com/boxscores/202304220LAC.html,26-035,LAC,,PHO,L (-12),1,24:25,2,5,0.400,0,0,,0,0,,3,6,9,1,1,0,1,1,4,5.5,-8.0


In [15]:
gamelogs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86533 entries, 0 to 86532
Data columns (total 32 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        86533 non-null  object 
 1   Link        86533 non-null  object 
 2   Game        86533 non-null  int64  
 3   Date        86533 non-null  object 
 4   Match Link  86533 non-null  object 
 5   Age         86533 non-null  object 
 6   Team        86533 non-null  object 
 7   Location    43182 non-null  object 
 8   Opponent    86533 non-null  object 
 9   Result      86533 non-null  object 
 10  GS          86533 non-null  int64  
 11  MP          86533 non-null  object 
 12  FG          86533 non-null  int64  
 13  FGA         86533 non-null  int64  
 14  FG%         81719 non-null  float64
 15  3P          86533 non-null  int64  
 16  3PA         86533 non-null  int64  
 17  3P%         67146 non-null  float64
 18  FT          86533 non-null  int64  
 19  FTA         86533 non-nul

### Change names with national characters to unicode

In [16]:
pd.unique(gamelogs["Name"])

array(['Precious Achiuwa', 'Steven Adams', 'Bam Adebayo', 'Ochai Agbaji',
       'Santi Aldama', 'Ty-Shon Alexander', 'Nickeil Alexander-Walker',
       'Rawle Alkins', 'Grayson Allen', 'Jarrett Allen', 'Timmy Allen',
       'Jose Alvarado', 'Justin Anderson', 'Kyle Anderson',
       'Giannis Antetokounmpo', 'Kostas Antetokounmpo',
       'Thanasis Antetokounmpo', 'Cole Anthony', 'OG Anunoby',
       'Ryan Arcidiacono', 'Deni Avdija', 'Joel Ayayi', 'Deandre Ayton',
       'Udoka Azubuike', 'Ibou Badji', 'Marvin Bagley III',
       'Amari Bailey', 'Patrick Baldwin Jr.', 'LaMelo Ball', 'Lonzo Ball',
       'Mo Bamba', 'Paolo Banchero', 'Desmond Bane', 'Dalano Banton',
       'Cat Barber', 'Dominick Barlow', 'Harrison Barnes',
       'Scottie Barnes', 'RJ Barrett', 'Paris Bass', 'Charles Bassey',
       'Emoni Bates', 'Keita Bates-Diop', 'Nicolas Batum',
       'Kent Bazemore', 'Darius Bazley', 'Bradley Beal', 'Malik Beasley',
       'MarJon Beauchamp', 'Jordan Bell', "DeAndre' Bembry",
 

In [17]:
names = []
for ind in tqdm(gamelogs.index):
    names.append(unidecode(gamelogs["Name"][ind]))
gamelogs["Name"] = names
pd.unique(gamelogs["Name"])

100%|████████████████████████████████████████████████████████████████████████| 86533/86533 [00:00<00:00, 100411.70it/s]


array(['Precious Achiuwa', 'Steven Adams', 'Bam Adebayo', 'Ochai Agbaji',
       'Santi Aldama', 'Ty-Shon Alexander', 'Nickeil Alexander-Walker',
       'Rawle Alkins', 'Grayson Allen', 'Jarrett Allen', 'Timmy Allen',
       'Jose Alvarado', 'Justin Anderson', 'Kyle Anderson',
       'Giannis Antetokounmpo', 'Kostas Antetokounmpo',
       'Thanasis Antetokounmpo', 'Cole Anthony', 'OG Anunoby',
       'Ryan Arcidiacono', 'Deni Avdija', 'Joel Ayayi', 'Deandre Ayton',
       'Udoka Azubuike', 'Ibou Badji', 'Marvin Bagley III',
       'Amari Bailey', 'Patrick Baldwin Jr.', 'LaMelo Ball', 'Lonzo Ball',
       'Mo Bamba', 'Paolo Banchero', 'Desmond Bane', 'Dalano Banton',
       'Cat Barber', 'Dominick Barlow', 'Harrison Barnes',
       'Scottie Barnes', 'RJ Barrett', 'Paris Bass', 'Charles Bassey',
       'Emoni Bates', 'Keita Bates-Diop', 'Nicolas Batum',
       'Kent Bazemore', 'Darius Bazley', 'Bradley Beal', 'Malik Beasley',
       'MarJon Beauchamp', 'Jordan Bell', "DeAndre' Bembry",
 

### Replace values in Location column

In [18]:
gamelogs["Location"] = gamelogs["Location"].replace(np.NaN, "Road")
gamelogs["Location"] = gamelogs["Location"].replace("@", "Home")
gamelogs["Location"].value_counts()

Location
Road    43351
Home    43182
Name: count, dtype: int64

In [19]:
gamelogs

Unnamed: 0,Name,Link,Game,Date,Match Link,Age,Team,Location,Opponent,Result,GS,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,1,2023-10-25,https://www.basketball-reference.com/boxscores/202310250TOR.html,24-036,TOR,Road,MIN,W (+3),0,24:17,4,9,0.444,0,1,0.000,0,0,,3,5,8,0,0,0,2,1,8,4.5,-5.0
1,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,2,2023-10-27,https://www.basketball-reference.com/boxscores/202310270CHI.html,24-038,TOR,Home,CHI,L (-1),0,22:38,2,5,0.400,1,3,0.333,3,3,1.000,2,7,9,1,0,0,3,6,8,4.1,-22.0
2,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,3,2023-10-28,https://www.basketball-reference.com/boxscores/202310280TOR.html,24-039,TOR,Road,PHI,L (-7),0,18:15,3,10,0.300,0,1,0.000,0,0,,3,4,7,3,0,1,1,1,6,4.9,5.0
3,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,4,2023-11-11,https://www.basketball-reference.com/boxscores/202311110BOS.html,24-053,TOR,Home,BOS,L (-23),0,18:31,1,5,0.200,0,2,0.000,0,0,,0,5,5,5,0,0,0,1,2,3.5,-12.0
4,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,5,2023-11-13,https://www.basketball-reference.com/boxscores/202311130TOR.html,24-055,TOR,Road,WAS,W (+4),0,19:16,5,8,0.625,0,1,0.000,0,0,,1,3,4,1,1,0,2,1,10,7.3,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86528,Ivica Zubac,https://www.basketball-reference.com/players/z/zubaciv01.html,1,2023-04-16,https://www.basketball-reference.com/boxscores/202304160PHO.html,26-029,LAC,Home,PHO,W (+5),1,29:43,5,8,0.625,0,0,,2,4,0.500,4,11,15,0,0,0,3,3,12,9.5,-11.0
86529,Ivica Zubac,https://www.basketball-reference.com/players/z/zubaciv01.html,2,2023-04-18,https://www.basketball-reference.com/boxscores/202304180PHO.html,26-031,LAC,Home,PHO,L (-14),1,29:29,2,7,0.286,0,0,,4,6,0.667,4,2,6,0,2,0,3,1,8,5.1,-14.0
86530,Ivica Zubac,https://www.basketball-reference.com/players/z/zubaciv01.html,3,2023-04-20,https://www.basketball-reference.com/boxscores/202304200LAC.html,26-033,LAC,Road,PHO,L (-5),1,22:20,3,4,0.750,0,0,,0,0,,1,7,8,0,0,0,2,1,6,4.8,-5.0
86531,Ivica Zubac,https://www.basketball-reference.com/players/z/zubaciv01.html,4,2023-04-22,https://www.basketball-reference.com/boxscores/202304220LAC.html,26-035,LAC,Road,PHO,L (-12),1,24:25,2,5,0.400,0,0,,0,0,,3,6,9,1,1,0,1,1,4,5.5,-8.0


### Extract score Difference from Result column

In [20]:
gamelogs['Diff'] = gamelogs['Result'].str.extract(r'\((.*?)\)', expand=False)
gamelogs['Result'] = gamelogs['Result'].replace(r'\((.*?)\)', "", regex=True)
gamelogs['Result'] = gamelogs['Result'].str.strip()

In [21]:
gamelogs

Unnamed: 0,Name,Link,Game,Date,Match Link,Age,Team,Location,Opponent,Result,GS,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,Diff
0,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,1,2023-10-25,https://www.basketball-reference.com/boxscores/202310250TOR.html,24-036,TOR,Road,MIN,W,0,24:17,4,9,0.444,0,1,0.000,0,0,,3,5,8,0,0,0,2,1,8,4.5,-5.0,+3
1,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,2,2023-10-27,https://www.basketball-reference.com/boxscores/202310270CHI.html,24-038,TOR,Home,CHI,L,0,22:38,2,5,0.400,1,3,0.333,3,3,1.000,2,7,9,1,0,0,3,6,8,4.1,-22.0,-1
2,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,3,2023-10-28,https://www.basketball-reference.com/boxscores/202310280TOR.html,24-039,TOR,Road,PHI,L,0,18:15,3,10,0.300,0,1,0.000,0,0,,3,4,7,3,0,1,1,1,6,4.9,5.0,-7
3,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,4,2023-11-11,https://www.basketball-reference.com/boxscores/202311110BOS.html,24-053,TOR,Home,BOS,L,0,18:31,1,5,0.200,0,2,0.000,0,0,,0,5,5,5,0,0,0,1,2,3.5,-12.0,-23
4,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,5,2023-11-13,https://www.basketball-reference.com/boxscores/202311130TOR.html,24-055,TOR,Road,WAS,W,0,19:16,5,8,0.625,0,1,0.000,0,0,,1,3,4,1,1,0,2,1,10,7.3,8.0,+4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86528,Ivica Zubac,https://www.basketball-reference.com/players/z/zubaciv01.html,1,2023-04-16,https://www.basketball-reference.com/boxscores/202304160PHO.html,26-029,LAC,Home,PHO,W,1,29:43,5,8,0.625,0,0,,2,4,0.500,4,11,15,0,0,0,3,3,12,9.5,-11.0,+5
86529,Ivica Zubac,https://www.basketball-reference.com/players/z/zubaciv01.html,2,2023-04-18,https://www.basketball-reference.com/boxscores/202304180PHO.html,26-031,LAC,Home,PHO,L,1,29:29,2,7,0.286,0,0,,4,6,0.667,4,2,6,0,2,0,3,1,8,5.1,-14.0,-14
86530,Ivica Zubac,https://www.basketball-reference.com/players/z/zubaciv01.html,3,2023-04-20,https://www.basketball-reference.com/boxscores/202304200LAC.html,26-033,LAC,Road,PHO,L,1,22:20,3,4,0.750,0,0,,0,0,,1,7,8,0,0,0,2,1,6,4.8,-5.0,-5
86531,Ivica Zubac,https://www.basketball-reference.com/players/z/zubaciv01.html,4,2023-04-22,https://www.basketball-reference.com/boxscores/202304220LAC.html,26-035,LAC,Road,PHO,L,1,24:25,2,5,0.400,0,0,,0,0,,3,6,9,1,1,0,1,1,4,5.5,-8.0,-12


In [22]:
gamelogs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86533 entries, 0 to 86532
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        86533 non-null  object 
 1   Link        86533 non-null  object 
 2   Game        86533 non-null  int64  
 3   Date        86533 non-null  object 
 4   Match Link  86533 non-null  object 
 5   Age         86533 non-null  object 
 6   Team        86533 non-null  object 
 7   Location    86533 non-null  object 
 8   Opponent    86533 non-null  object 
 9   Result      86533 non-null  object 
 10  GS          86533 non-null  int64  
 11  MP          86533 non-null  object 
 12  FG          86533 non-null  int64  
 13  FGA         86533 non-null  int64  
 14  FG%         81719 non-null  float64
 15  3P          86533 non-null  int64  
 16  3PA         86533 non-null  int64  
 17  3P%         67146 non-null  float64
 18  FT          86533 non-null  int64  
 19  FTA         86533 non-nul

### Replace missing values for percentage columns with 0

In [23]:
gamelogs[["FG%", "3P%", "FT%"]] = gamelogs[["FG%", "3P%", "FT%"]].fillna(0)

In [24]:
gamelogs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86533 entries, 0 to 86532
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        86533 non-null  object 
 1   Link        86533 non-null  object 
 2   Game        86533 non-null  int64  
 3   Date        86533 non-null  object 
 4   Match Link  86533 non-null  object 
 5   Age         86533 non-null  object 
 6   Team        86533 non-null  object 
 7   Location    86533 non-null  object 
 8   Opponent    86533 non-null  object 
 9   Result      86533 non-null  object 
 10  GS          86533 non-null  int64  
 11  MP          86533 non-null  object 
 12  FG          86533 non-null  int64  
 13  FGA         86533 non-null  int64  
 14  FG%         86533 non-null  float64
 15  3P          86533 non-null  int64  
 16  3PA         86533 non-null  int64  
 17  3P%         86533 non-null  float64
 18  FT          86533 non-null  int64  
 19  FTA         86533 non-nul

In [25]:
gamelogs.to_csv("gamelogs_trans.csv", index=False)

# Splits

In [26]:
splits = pd.read_csv("splits.csv")
splits

Unnamed: 0,Name,Link,Year,Split,Value,G,GS,MP,FG,FGA,3P,3PA,FT,FTA,ORB,TRB,AST,STL,BLK,TOV,PF,PTS,FG%,3P%,FT%,TS%,USG%,ORtg,DRtg,+/-
0,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,2024,,Total,74,18,1624,235,469,26,97,69,112,191,487,97,46,68,83,143,565,0.501,0.268,0.616,0.545,16.2,113,112,-1.5
1,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,2024,Place,Home,44,13,1038,156,314,14,63,44,68,121,310,57,29,50,49,80,370,0.497,0.222,0.647,0.538,16.5,113,111,-0.9
2,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,2024,,Road,30,5,585,79,155,12,34,25,44,70,177,40,17,18,34,63,195,0.510,0.353,0.568,0.559,15.7,113,112,-2.4
3,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,2024,All-Star,Pre,48,9,1020,158,322,19,71,41,64,124,303,65,29,36,50,88,376,0.491,0.268,0.641,0.537,16.9,113,114,-0.2
4,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,2024,,Post,26,9,604,77,147,7,26,28,48,67,184,32,17,32,33,55,189,0.524,0.269,0.583,0.562,15.0,112,108,-3.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104835,Ivica Zubac,https://www.basketball-reference.com/players/z/zubaciv01.html,2022,,Sacramento,4,4,104,23,31,0,0,8,10,11,36,5,2,3,3,10,54,0.742,,0.800,0.763,15.7,149,103,4.3
104836,Ivica Zubac,https://www.basketball-reference.com/players/z/zubaciv01.html,2022,,San Antonio,3,3,79,11,21,0,0,4,6,14,34,4,2,3,3,7,26,0.524,,0.667,0.550,14.3,120,98,-13.1
104837,Ivica Zubac,https://www.basketball-reference.com/players/z/zubaciv01.html,2022,,Toronto,1,1,17,1,4,0,0,0,0,1,7,2,1,0,3,1,2,0.250,,,0.250,20.0,53,105,-40.6
104838,Ivica Zubac,https://www.basketball-reference.com/players/z/zubaciv01.html,2022,,Utah,3,3,56,7,17,0,0,7,7,9,18,1,3,2,2,9,21,0.412,,1.000,0.523,17.7,117,121,-44.7


In [27]:
splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104840 entries, 0 to 104839
Data columns (total 30 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Name    104840 non-null  object 
 1   Link    104840 non-null  object 
 2   Year    104840 non-null  int64  
 3   Split   20438 non-null   object 
 4   Value   104840 non-null  object 
 5   G       104840 non-null  int64  
 6   GS      104840 non-null  int64  
 7   MP      104840 non-null  int64  
 8   FG      104840 non-null  int64  
 9   FGA     104840 non-null  int64  
 10  3P      104840 non-null  int64  
 11  3PA     104840 non-null  int64  
 12  FT      104840 non-null  int64  
 13  FTA     104840 non-null  int64  
 14  ORB     104840 non-null  int64  
 15  TRB     104840 non-null  int64  
 16  AST     104840 non-null  int64  
 17  STL     104840 non-null  int64  
 18  BLK     104840 non-null  int64  
 19  TOV     104840 non-null  int64  
 20  PF      104840 non-null  int64  
 21  PTS     10

### Change names with national characters to unicode

In [28]:
pd.unique(splits["Name"])

array(['Precious Achiuwa', 'Steven Adams', 'Bam Adebayo', 'Ochai Agbaji',
       'Santi Aldama', 'Ty-Shon Alexander', 'Nickeil Alexander-Walker',
       'Rawle Alkins', 'Grayson Allen', 'Jarrett Allen', 'Timmy Allen',
       'Jose Alvarado', 'Justin Anderson', 'Kyle Anderson',
       'Giannis Antetokounmpo', 'Kostas Antetokounmpo',
       'Thanasis Antetokounmpo', 'Cole Anthony', 'OG Anunoby',
       'Ryan Arcidiacono', 'Deni Avdija', 'Joel Ayayi', 'Deandre Ayton',
       'Udoka Azubuike', 'Ibou Badji', 'Marvin Bagley III',
       'Amari Bailey', 'Patrick Baldwin Jr.', 'LaMelo Ball', 'Lonzo Ball',
       'Mo Bamba', 'Paolo Banchero', 'Desmond Bane', 'Dalano Banton',
       'Cat Barber', 'Dominick Barlow', 'Harrison Barnes',
       'Scottie Barnes', 'RJ Barrett', 'Paris Bass', 'Charles Bassey',
       'Emoni Bates', 'Keita Bates-Diop', 'Nicolas Batum',
       'Kent Bazemore', 'Darius Bazley', 'Bradley Beal', 'Malik Beasley',
       'MarJon Beauchamp', 'Jordan Bell', "DeAndre' Bembry",
 

In [29]:
names = []
for ind in tqdm(splits.index):
    names.append(unidecode(splits["Name"][ind]))
splits["Name"] = names
pd.unique(splits["Name"])

100%|███████████████████████████████████████████████████████████████████████| 104840/104840 [00:01<00:00, 90182.68it/s]


array(['Precious Achiuwa', 'Steven Adams', 'Bam Adebayo', 'Ochai Agbaji',
       'Santi Aldama', 'Ty-Shon Alexander', 'Nickeil Alexander-Walker',
       'Rawle Alkins', 'Grayson Allen', 'Jarrett Allen', 'Timmy Allen',
       'Jose Alvarado', 'Justin Anderson', 'Kyle Anderson',
       'Giannis Antetokounmpo', 'Kostas Antetokounmpo',
       'Thanasis Antetokounmpo', 'Cole Anthony', 'OG Anunoby',
       'Ryan Arcidiacono', 'Deni Avdija', 'Joel Ayayi', 'Deandre Ayton',
       'Udoka Azubuike', 'Ibou Badji', 'Marvin Bagley III',
       'Amari Bailey', 'Patrick Baldwin Jr.', 'LaMelo Ball', 'Lonzo Ball',
       'Mo Bamba', 'Paolo Banchero', 'Desmond Bane', 'Dalano Banton',
       'Cat Barber', 'Dominick Barlow', 'Harrison Barnes',
       'Scottie Barnes', 'RJ Barrett', 'Paris Bass', 'Charles Bassey',
       'Emoni Bates', 'Keita Bates-Diop', 'Nicolas Batum',
       'Kent Bazemore', 'Darius Bazley', 'Bradley Beal', 'Malik Beasley',
       'MarJon Beauchamp', 'Jordan Bell', "DeAndre' Bembry",
 

### Fill missing values for Split

In [30]:
# Fill rows with Total data
splits.loc[splits["Value"] == "Total", "Split"] = "Total"

In [31]:
# Fill all other rows with forward fill
splits["Split"] = splits["Split"].fillna(method='ffill')

  splits["Split"] = splits["Split"].fillna(method='ffill')


In [32]:
splits

Unnamed: 0,Name,Link,Year,Split,Value,G,GS,MP,FG,FGA,3P,3PA,FT,FTA,ORB,TRB,AST,STL,BLK,TOV,PF,PTS,FG%,3P%,FT%,TS%,USG%,ORtg,DRtg,+/-
0,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,2024,Total,Total,74,18,1624,235,469,26,97,69,112,191,487,97,46,68,83,143,565,0.501,0.268,0.616,0.545,16.2,113,112,-1.5
1,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,2024,Place,Home,44,13,1038,156,314,14,63,44,68,121,310,57,29,50,49,80,370,0.497,0.222,0.647,0.538,16.5,113,111,-0.9
2,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,2024,Place,Road,30,5,585,79,155,12,34,25,44,70,177,40,17,18,34,63,195,0.510,0.353,0.568,0.559,15.7,113,112,-2.4
3,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,2024,All-Star,Pre,48,9,1020,158,322,19,71,41,64,124,303,65,29,36,50,88,376,0.491,0.268,0.641,0.537,16.9,113,114,-0.2
4,Precious Achiuwa,https://www.basketball-reference.com/players/a/achiupr01.html,2024,All-Star,Post,26,9,604,77,147,7,26,28,48,67,184,32,17,32,33,55,189,0.524,0.269,0.583,0.562,15.0,112,108,-3.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104835,Ivica Zubac,https://www.basketball-reference.com/players/z/zubaciv01.html,2022,Opponent,Sacramento,4,4,104,23,31,0,0,8,10,11,36,5,2,3,3,10,54,0.742,,0.800,0.763,15.7,149,103,4.3
104836,Ivica Zubac,https://www.basketball-reference.com/players/z/zubaciv01.html,2022,Opponent,San Antonio,3,3,79,11,21,0,0,4,6,14,34,4,2,3,3,7,26,0.524,,0.667,0.550,14.3,120,98,-13.1
104837,Ivica Zubac,https://www.basketball-reference.com/players/z/zubaciv01.html,2022,Opponent,Toronto,1,1,17,1,4,0,0,0,0,1,7,2,1,0,3,1,2,0.250,,,0.250,20.0,53,105,-40.6
104838,Ivica Zubac,https://www.basketball-reference.com/players/z/zubaciv01.html,2022,Opponent,Utah,3,3,56,7,17,0,0,7,7,9,18,1,3,2,2,9,21,0.412,,1.000,0.523,17.7,117,121,-44.7


### Replace missing values in some percentage columns

In [33]:
splits[["FG%", "3P%", "FT%", "TS%"]] = splits[["FG%", "3P%", "FT%", "TS%"]].fillna(0)

In [34]:
splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104840 entries, 0 to 104839
Data columns (total 30 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Name    104840 non-null  object 
 1   Link    104840 non-null  object 
 2   Year    104840 non-null  int64  
 3   Split   104840 non-null  object 
 4   Value   104840 non-null  object 
 5   G       104840 non-null  int64  
 6   GS      104840 non-null  int64  
 7   MP      104840 non-null  int64  
 8   FG      104840 non-null  int64  
 9   FGA     104840 non-null  int64  
 10  3P      104840 non-null  int64  
 11  3PA     104840 non-null  int64  
 12  FT      104840 non-null  int64  
 13  FTA     104840 non-null  int64  
 14  ORB     104840 non-null  int64  
 15  TRB     104840 non-null  int64  
 16  AST     104840 non-null  int64  
 17  STL     104840 non-null  int64  
 18  BLK     104840 non-null  int64  
 19  TOV     104840 non-null  int64  
 20  PF      104840 non-null  int64  
 21  PTS     10

In [35]:
splits.to_csv("splits_trans.csv", index=False)

# Schedule

In [36]:
schedule = pd.read_csv("schedule.csv")
schedule

Unnamed: 0,Link,Date,Start,Visitor,Visitor Link,Visitor PTS,Home,Home Link,Home PTS,Match Link,Overtimes,Attendance,Game Length,Arena,Notes
0,https://www.basketball-reference.com/leagues/NBA_2024_games-october.html,"Tue, Oct 24, 2023",7:30p,Los Angeles Lakers,https://www.basketball-reference.com/teams/LAL/2024.html,107,Denver Nuggets,https://www.basketball-reference.com/teams/DEN/2024.html,119,https://www.basketball-reference.com/boxscores/202310240DEN.html,,19842,2:17,Ball Arena,
1,https://www.basketball-reference.com/leagues/NBA_2024_games-october.html,"Tue, Oct 24, 2023",10:00p,Phoenix Suns,https://www.basketball-reference.com/teams/PHO/2024.html,108,Golden State Warriors,https://www.basketball-reference.com/teams/GSW/2024.html,104,https://www.basketball-reference.com/boxscores/202310240GSW.html,,18064,2:33,Chase Center,
2,https://www.basketball-reference.com/leagues/NBA_2024_games-october.html,"Wed, Oct 25, 2023",7:00p,Houston Rockets,https://www.basketball-reference.com/teams/HOU/2024.html,86,Orlando Magic,https://www.basketball-reference.com/teams/ORL/2024.html,116,https://www.basketball-reference.com/boxscores/202310250ORL.html,,18846,2:14,Kia Center,
3,https://www.basketball-reference.com/leagues/NBA_2024_games-october.html,"Wed, Oct 25, 2023",7:00p,Boston Celtics,https://www.basketball-reference.com/teams/BOS/2024.html,108,New York Knicks,https://www.basketball-reference.com/teams/NYK/2024.html,104,https://www.basketball-reference.com/boxscores/202310250NYK.html,,19812,2:39,Madison Square Garden (IV),
4,https://www.basketball-reference.com/leagues/NBA_2024_games-october.html,"Wed, Oct 25, 2023",7:00p,Washington Wizards,https://www.basketball-reference.com/teams/WAS/2024.html,120,Indiana Pacers,https://www.basketball-reference.com/teams/IND/2024.html,143,https://www.basketball-reference.com/boxscores/202310250IND.html,,16004,2:07,Gainbridge Fieldhouse,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1314,https://www.basketball-reference.com/leagues/NBA_2024_games-june.html,"Thu, Jun 6, 2024",8:30p,Dallas Mavericks,https://www.basketball-reference.com/teams/DAL/2024.html,89,Boston Celtics,https://www.basketball-reference.com/teams/BOS/2024.html,107,https://www.basketball-reference.com/boxscores/202406060BOS.html,,19156,2:12,TD Garden,
1315,https://www.basketball-reference.com/leagues/NBA_2024_games-june.html,"Sun, Jun 9, 2024",8:00p,Dallas Mavericks,https://www.basketball-reference.com/teams/DAL/2024.html,98,Boston Celtics,https://www.basketball-reference.com/teams/BOS/2024.html,105,https://www.basketball-reference.com/boxscores/202406090BOS.html,,19156,2:18,TD Garden,
1316,https://www.basketball-reference.com/leagues/NBA_2024_games-june.html,"Wed, Jun 12, 2024",8:30p,Boston Celtics,https://www.basketball-reference.com/teams/BOS/2024.html,106,Dallas Mavericks,https://www.basketball-reference.com/teams/DAL/2024.html,99,https://www.basketball-reference.com/boxscores/202406120DAL.html,,20311,2:21,American Airlines Center,
1317,https://www.basketball-reference.com/leagues/NBA_2024_games-june.html,"Fri, Jun 14, 2024",8:30p,Boston Celtics,https://www.basketball-reference.com/teams/BOS/2024.html,84,Dallas Mavericks,https://www.basketball-reference.com/teams/DAL/2024.html,122,https://www.basketball-reference.com/boxscores/202406140DAL.html,,20277,2:22,American Airlines Center,


In [37]:
schedule.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Link          1319 non-null   object
 1   Date          1319 non-null   object
 2   Start         1319 non-null   object
 3   Visitor       1319 non-null   object
 4   Visitor Link  1319 non-null   object
 5   Visitor PTS   1319 non-null   int64 
 6   Home          1319 non-null   object
 7   Home Link     1319 non-null   object
 8   Home PTS      1319 non-null   int64 
 9   Match Link    1319 non-null   object
 10  Overtimes     62 non-null     object
 11  Attendance    1318 non-null   object
 12  Game Length   1314 non-null   object
 13  Arena         1319 non-null   object
 14  Notes         75 non-null     object
dtypes: int64(2), object(13)
memory usage: 154.7+ KB


In [38]:
schedule["Match Link"].nunique()

1319

In [39]:
schedule.to_csv("schedule_trans.csv", index=False)

# Teams

Already clean

In [40]:
teams = pd.read_csv("teams.csv")
teams.to_csv("teams_trans.csv", index=False)

# Rosters

In [41]:
rosters = pd.read_csv("rosters.csv")
rosters

Unnamed: 0,Team,Team Link,Number,Player,Player Link,Position
0,Indiana Pacers,https://www.basketball-reference.com/teams/IND/2024.html,11,Bruce Brown,https://www.basketball-reference.com/players/b/brownbr01.html,SG
1,Indiana Pacers,https://www.basketball-reference.com/teams/IND/2024.html,10,Kendall Brown,https://www.basketball-reference.com/players/b/brownke03.html,SG
2,Indiana Pacers,https://www.basketball-reference.com/teams/IND/2024.html,0,Tyrese Haliburton,https://www.basketball-reference.com/players/h/halibty01.html,PG
3,Indiana Pacers,https://www.basketball-reference.com/teams/IND/2024.html,7,Buddy Hield,https://www.basketball-reference.com/players/h/hieldbu01.html,SF
4,Indiana Pacers,https://www.basketball-reference.com/teams/IND/2024.html,22,Isaiah Jackson,https://www.basketball-reference.com/players/j/jacksis01.html,C
...,...,...,...,...,...,...
652,Memphis Grizzlies,https://www.basketball-reference.com/teams/MEM/2024.html,2,Xavier Tillman Sr.,https://www.basketball-reference.com/players/t/tillmxa01.html,C
653,Memphis Grizzlies,https://www.basketball-reference.com/teams/MEM/2024.html,18,Yuta Watanabe,https://www.basketball-reference.com/players/w/watanyu01.html,SF
654,Memphis Grizzlies,https://www.basketball-reference.com/teams/MEM/2024.html,14,Jack White,https://www.basketball-reference.com/players/w/whiteja03.html,SF
655,Memphis Grizzlies,https://www.basketball-reference.com/teams/MEM/2024.html,5,Vince Williams Jr.,https://www.basketball-reference.com/players/w/willivi01.html,SG


### Change names with national characters to unicode

In [42]:
pd.unique(rosters["Player"])

array(['Bruce Brown', 'Kendall Brown', 'Tyrese Haliburton', 'Buddy Hield',
       'Isaiah Jackson', 'Quenton Jackson', 'James Johnson',
       'Bennedict Mathurin', 'T.J. McConnell', 'Doug McDermott',
       'Andrew Nembhard', 'Aaron Nesmith', 'Jordan Nwora', 'Ben Sheppard',
       'Pascal Siakam', 'Jalen Smith', 'Daniel Theis', 'Obi Toppin',
       'Oscar Tshiebwe', 'Myles Turner', 'Jarace Walker', 'Isaiah Wong',
       'Dalano Banton', 'Oshae Brissett', 'Jaylen Brown', 'JD Davison',
       'Sam Hauser', 'Jrue Holiday', 'Al Horford', 'Luke Kornet',
       'Svi Mykhailiuk', 'Drew Peterson', 'Kristaps PorziÅ\x86Ä£is',
       'Payton Pritchard', 'Neemias Queta', 'Jaden Springer',
       'Lamar Stevens', 'Jayson Tatum', 'Xavier Tillman Sr.',
       'Jordan Walsh', 'Derrick White', 'DÄ\x81vis BertÄ\x81ns',
       'Bismack Biyombo', 'Ousmane Dieng', 'Luguentz Dort',
       'Adam Flagler', 'Josh Giddey', 'Shai Gilgeous-Alexander',
       'Gordon Hayward', 'Chet Holmgren', 'Isaiah Joe',
     

In [43]:
names = []
for ind in tqdm(rosters.index):
    names.append(unidecode(rosters["Player"][ind]))
rosters["Player"] = names
pd.unique(rosters["Player"])

100%|█████████████████████████████████████████████████████████████████████████████| 657/657 [00:00<00:00, 68967.31it/s]


array(['Bruce Brown', 'Kendall Brown', 'Tyrese Haliburton', 'Buddy Hield',
       'Isaiah Jackson', 'Quenton Jackson', 'James Johnson',
       'Bennedict Mathurin', 'T.J. McConnell', 'Doug McDermott',
       'Andrew Nembhard', 'Aaron Nesmith', 'Jordan Nwora', 'Ben Sheppard',
       'Pascal Siakam', 'Jalen Smith', 'Daniel Theis', 'Obi Toppin',
       'Oscar Tshiebwe', 'Myles Turner', 'Jarace Walker', 'Isaiah Wong',
       'Dalano Banton', 'Oshae Brissett', 'Jaylen Brown', 'JD Davison',
       'Sam Hauser', 'Jrue Holiday', 'Al Horford', 'Luke Kornet',
       'Svi Mykhailiuk', 'Drew Peterson', 'Kristaps PorziAAPSis',
       'Payton Pritchard', 'Neemias Queta', 'Jaden Springer',
       'Lamar Stevens', 'Jayson Tatum', 'Xavier Tillman Sr.',
       'Jordan Walsh', 'Derrick White', 'DAvis BertAns',
       'Bismack Biyombo', 'Ousmane Dieng', 'Luguentz Dort',
       'Adam Flagler', 'Josh Giddey', 'Shai Gilgeous-Alexander',
       'Gordon Hayward', 'Chet Holmgren', 'Isaiah Joe',
       'Keyontae

#### Some players are present in multiple teams, it is not an issue for our use case

In [44]:
rosters["Player Link"].nunique()

572

#### Below we can see that we scraped data for all players on roster

In [45]:
array1 = pd.unique(rosters["Player Link"])
array2 = pd.unique(players["Link"])

# Find values in array1 that are not in array2
diff1 = np.setdiff1d(array1, array2)

print("Values in array1 but not in array2:")
print(diff1)

Values in array1 but not in array2:
[]


In [46]:
rosters.to_csv("rosters_trans.csv", index=False)