In [1]:
import pandas as pd

player_data = pd.read_csv("datasets/player_data.csv")
players = pd.read_csv("datasets/Players.csv")
season_stats = pd.read_csv("datasets/Seasons_Stats.csv")

Looked at `player_data`. All the information can be found in `players` or `season_stats`.

In `players`, the only relevant column is college. The other columns contain information in `season_stats` or could possibly cause data leakage.

I renamed `collage` to `college` within the `players` database.

In [2]:
players = pd.DataFrame.rename(players, columns = {"collage":"college"})

In [4]:
players = players.drop(labels = ["height", "born", "birth_city", "birth_state"],axis = 1)

I merged `players` data into `season_stats` based on player name.

In [3]:
season_stats = pd.merge(season_stats, players, on = "Player", how = "left")

In [5]:
list(season_stats)

['Unnamed: 0_x',
 'Year',
 'Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP',
 'PER',
 'TS%',
 '3PAr',
 'FTr',
 'ORB%',
 'DRB%',
 'TRB%',
 'AST%',
 'STL%',
 'BLK%',
 'TOV%',
 'USG%',
 'blanl',
 'OWS',
 'DWS',
 'WS',
 'WS/48',
 'blank2',
 'OBPM',
 'DBPM',
 'BPM',
 'VORP',
 'FG',
 'FGA',
 'FG%',
 '3P',
 '3PA',
 '3P%',
 '2P',
 '2PA',
 '2P%',
 'eFG%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS',
 'Unnamed: 0_y',
 'height',
 'weight',
 'college',
 'born',
 'birth_city',
 'birth_state']

Drop useless indices.

In [6]:
season_stats = season_stats.drop(labels = ["Unnamed: 0_x", "Unnamed: 0_y"], axis = 1)

Check for high cardinality variables

In [7]:
high_cardinal_vars = [categ_vars for categ_vars in season_stats.columns.values if season_stats[categ_vars].dtype == "object"]
high_cardinal_vars

['Player', 'Pos', 'Tm', 'college', 'birth_city', 'birth_state']

In [8]:
season_stats = season_stats.drop(labels = ["college"], axis = 1)

We want to keep the player name to see who it is!

In [9]:
pd.set_option('display.max_columns', None)  
season_stats.head()

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,blanl,OWS,DWS,WS,WS/48,blank2,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,height,weight,born,birth_city,birth_state
0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,0.368,,0.467,,,,,,,,,,-0.1,3.6,3.5,,,,,,,144.0,516.0,0.279,,,,144.0,516.0,0.279,0.279,170.0,241.0,0.705,,,,176.0,,,,217.0,458.0,180.0,77.0,1918.0,,
1,1950.0,Cliff Barker,SG,29.0,INO,49.0,,,,0.435,,0.387,,,,,,,,,,1.6,0.6,2.2,,,,,,,102.0,274.0,0.372,,,,102.0,274.0,0.372,0.372,75.0,106.0,0.708,,,,109.0,,,,99.0,279.0,188.0,83.0,1921.0,Yorktown,Indiana
2,1950.0,Leo Barnhorst,SF,25.0,CHS,67.0,,,,0.394,,0.259,,,,,,,,,,0.9,2.8,3.6,,,,,,,174.0,499.0,0.349,,,,174.0,499.0,0.349,0.349,90.0,129.0,0.698,,,,140.0,,,,192.0,438.0,193.0,86.0,1924.0,,
3,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,0.312,,0.395,,,,,,,,,,-0.5,-0.1,-0.6,,,,,,,22.0,86.0,0.256,,,,22.0,86.0,0.256,0.256,19.0,34.0,0.559,,,,20.0,,,,29.0,63.0,196.0,88.0,1925.0,,
4,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,0.308,,0.378,,,,,,,,,,-0.5,-0.1,-0.6,,,,,,,21.0,82.0,0.256,,,,21.0,82.0,0.256,0.256,17.0,31.0,0.548,,,,20.0,,,,27.0,59.0,196.0,88.0,1925.0,,


In [13]:
season_stats.to_csv("season_stats_merged.csv", columns=["Year","Pos","Age","height","weight","TS%"])