### Obtaining player data per season

In [20]:
import pandas as pd
import os
# Show up to 100 characters per cell so long string values are not truncated.
pd.set_option("display.max_colwidth", 100)

In [266]:
# Source: player tables exported from the Basketball Reference website, from
# pages such as https://www.basketball-reference.com/leagues/NBA_2018_totals.html
# Each file holds the stats of every player, per team, for one season.
# NOTE(review): the sample rows displayed further down look like per-game
# values rather than season totals -- confirm which table was exported.

# One CSV per season, 2017-18 through 2021-22: season_<start>_<end>.csv
file_list = ['season_{}_{}.csv'.format(start, start + 1) for start in range(17, 22)]

In [267]:
# Maps a season label (file name without extension) to that season's DataFrame.
df_dict = dict()

In [268]:
# Load every season file and derive helper columns from the raw 'Player'
# field, which has the form "<display name>\<player id>".
for file in file_list:
    season = file.split('.')[0]  # e.g. 'season_17_18.csv' -> 'season_17_18'
    actual_df = pd.read_csv(file)

    # Split once (instead of three times): column 0 is the display name,
    # column 1 is the player id used in the site's URLs.
    name_and_id = actual_df['Player'].str.split(pat="\\", expand=True)
    player_id = name_and_id[1]

    actual_df['Player Name'] = name_and_id[0]
    # Player pages live under /players/<first letter of id>/<id>.html
    actual_df['Player Link'] = ('https://www.basketball-reference.com/players/'
                                + player_id.str[0] + '/' + player_id + '.html')
    actual_df['Season'] = season

    df_dict[season] = actual_df

In [269]:
# Stack the per-season frames (in insertion order) into one table and
# replace the per-file row labels with a fresh 0..n-1 index.
df = pd.concat(list(df_dict.values()), ignore_index=True)

In [270]:
# Sanity check: (rows, columns) of the combined table.
df.shape

(2744, 33)

In [271]:
# Column dtypes and non-null counts; the percentage columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2744 entries, 0 to 2743
Data columns (total 33 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Rk           2744 non-null   int64  
 1   Player       2744 non-null   object 
 2   Pos          2744 non-null   object 
 3   Age          2744 non-null   int64  
 4   Tm           2744 non-null   object 
 5   G            2744 non-null   int64  
 6   GS           2744 non-null   int64  
 7   MP           2744 non-null   float64
 8   FG           2744 non-null   float64
 9   FGA          2744 non-null   float64
 10  FG%          2725 non-null   float64
 11  3P           2744 non-null   float64
 12  3PA          2744 non-null   float64
 13  3P%          2569 non-null   float64
 14  2P           2744 non-null   float64
 15  2PA          2744 non-null   float64
 16  2P%          2697 non-null   float64
 17  eFG%         2725 non-null   float64
 18  FT           2744 non-null   float64
 19  FTA   

In [272]:
# Peek at the last rows to check the derived 'Player Name', 'Player Link' and 'Season' columns.
df.tail()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,TRB,AST,STL,BLK,TOV,PF,PTS,Player Name,Player Link,Season
2739,601,Thaddeus Young\youngth01,PF,33,TOT,52,1,16.3,2.7,5.2,...,4.0,2.0,1.0,0.3,1.0,1.6,6.2,Thaddeus Young,https://www.basketball-reference.com/players/y/youngth01.html,season_21_22
2740,602,Trae Young\youngtr01,PG,23,ATL,76,76,34.9,9.4,20.3,...,3.7,9.7,0.9,0.1,4.0,1.7,28.4,Trae Young,https://www.basketball-reference.com/players/y/youngtr01.html,season_21_22
2741,603,Omer Yurtseven\yurtsom01,C,23,MIA,56,12,12.6,2.3,4.4,...,5.3,0.9,0.3,0.4,0.7,1.5,5.3,Omer Yurtseven,https://www.basketball-reference.com/players/y/yurtsom01.html,season_21_22
2742,604,Cody Zeller\zelleco01,C,29,POR,27,0,13.1,1.9,3.3,...,4.6,0.8,0.3,0.2,0.7,2.1,5.2,Cody Zeller,https://www.basketball-reference.com/players/z/zelleco01.html,season_21_22
2743,605,Ivica Zubac\zubaciv01,C,24,LAC,76,76,24.4,4.1,6.5,...,8.5,1.6,0.5,1.0,1.5,2.7,10.3,Ivica Zubac,https://www.basketball-reference.com/players/z/zubaciv01.html,season_21_22


In [273]:
# Number of rows contributed by each season file.
df['Season'].value_counts()

season_21_22    605
season_20_21    540
season_17_18    540
season_18_19    530
season_19_20    529
Name: Season, dtype: int64

In [274]:
# Persist the combined per-season table; the row index is not written.
df.to_csv(path_or_buf='players_stats_by_season.csv', index=False)

### Obtaining more player info
Here we use a web-scraping library (BeautifulSoup) to gather additional details (height, weight, experience and birth date) from each player's page.

In [29]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [250]:
# Parallel accumulators filled by the scraping loop: position i in every
# list refers to the same player page.
height_list, weight_list, experience_list, birth_date_list, player_link_list = (
    [] for _ in range(5)
)

In [None]:
# Scrape height, weight, experience and birth date from every distinct player page.
# NOTE(review): this issues one request per player with no delay, retry or
# caching -- check the site's rate-limit policy before running it in full.
for player_link in df['Player Link'].unique():
    # Use a context manager so the HTTP response is closed once the page is parsed.
    with urlopen(player_link) as html:
        bs = BeautifulSoup(html, 'html.parser')

    # Parent element of the itemprop="height" tag; looked up once and reused.
    height_parent = bs.find(attrs={"itemprop": "height"}).find_parent()
    # Expected to look like "(<cm>cm,\xa0<kg>kg)", as implied by the splits below.
    metric_text = height_parent.contents[3].string

    height = int(metric_text.split('(')[1].split('c')[0])
    weight = int(metric_text.split(',\xa0')[1].split('k')[0])
    # Last <p> sibling after the height paragraph; its trailing text holds the experience value.
    experience = height_parent.find_next_siblings("p")[-1].contents[-1].split('\xa0')[1].split('\n')[0]
    birth_date = bs.find('span', attrs={"itemprop": "birthDate"})['data-birth']

    # Append only after every field parsed successfully, so a failure on one
    # page cannot leave the parallel lists with different lengths.
    height_list.append(height)
    weight_list.append(weight)
    experience_list.append(experience)
    birth_date_list.append(birth_date)
    player_link_list.append(player_link)

In [275]:
# Assemble the scraped lists into one table, one row per scraped player link.
df_player_info = pd.DataFrame(
    dict(zip(
        ['Player Link', 'Weight in kg', 'Height in cm', 'Birth Date', 'Experience'],
        [player_link_list, weight_list, height_list, birth_date_list, experience_list],
    ))
)

In [276]:
# Attach each player's display name to the scraped attributes (inner join on the link).
# NOTE(review): duplicates are dropped on the (name, link) pair, so a link that
# appears with two different names would yield two rows -- confirm this is intended.
# NOTE(review): this cell overwrites its own input; re-running it merges again.
df_player_info = pd.merge(
    df_player_info,
    df.loc[:, ['Player Name', 'Player Link']].drop_duplicates(),
    on='Player Link',
)

In [277]:
# Persist the per-player information table; the row index is not written.
df_player_info.to_csv(path_or_buf='player_info.csv', index=False)