# Basketball Players EDA

## Michael Jordan vs Kobe Bryant vs Lebron James

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# 1. Data Collection

In [2]:
# Read the per-game stats CSV into a DataFrame, then preview the first rows.
df_games = pd.read_csv(filepath_or_buffer="data/allgames_stats.csv")
df_games.head(n=5)

Unnamed: 0,G,Date,Tm,X,Opp,Result,GS,MP,FG,FGA,...,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,Player,RSorPO
0,1,1996-11-03,LAL,,MIN,W,0,6:00,0.0,1.0,...,1,0,0.0,1.0,1,1,0.0,-1.1,Kobe Bryant,Regular Season
1,2,1996-11-05,LAL,@,NYK,W,0,3:00,0.0,1.0,...,0,0,0.0,0.0,1,0,1.0,-1.1,Kobe Bryant,Regular Season
2,3,1996-11-06,LAL,@,CHH,L,0,7:00,2.0,3.0,...,0,0,0.0,0.0,3,0,5.0,0.7,Kobe Bryant,Regular Season
3,4,1996-11-08,LAL,@,TOR,L,0,17:00,3.0,8.0,...,3,0,0.0,1.0,0,3,10.0,5.2,Kobe Bryant,Regular Season
4,5,1996-11-10,LAL,,ATL,W,0,8:00,0.0,3.0,...,3,1,0.0,0.0,0,0,2.0,1.9,Kobe Bryant,Regular Season


In [3]:
# Read the salaries CSV into a DataFrame, then preview the first rows.
df_salary = pd.read_csv(filepath_or_buffer="data/salaries.csv")
df_salary.head(n=5)

Unnamed: 0,Season,Team,Lg,Salary,Player
0,2003-04,Cleveland Cavaliers,NBA,$4018920,Lebron James
1,2004-05,Cleveland Cavaliers,NBA,$4320360,Lebron James
2,2005-06,Cleveland Cavaliers,NBA,$4621800,Lebron James
3,2006-07,Cleveland Cavaliers,NBA,$5828090,Lebron James
4,2007-08,Cleveland Cavaliers,NBA,$13041250,Lebron James


### 1.1. Data Description - TABLE 1
|Term|Meaning|Term|Meaning|
|----|:-----:|----|:-----:|
|Pos|Position|eFG%|Effective Field Goal Percentage|
|Tm|Team|Lg|League|
|G|Games|+/-|Plus/Minus|
|GS|Games Started|GmSc|Game Score|
|MP|Minutes Played Per Game|PTS|Points per Game|
|FG|Field Goals Per Game|PF|Personal Fouls Per Game|
|FGA|Field Goal Attempts Per Game|TOV|Turnovers Per Game|
|FG%|Field Goal Percentage|BLK|Blocks Per Game|
|3P|3-Point Field Goals Per Game|STL|Steals Per Game|
|3PA|3-Point Field Goal Attempts Per Game|AST|Assists Per Game|
|3P%|3-Point Field Goal Percentage|TRB|Total Rebounds Per Game|
|2P|2-Point Field Goals Per Game|DRB|Defensive Rebounds Per Game|
|2PA|2-Point Field Goal Attempts Per Game|ORB|Offensive Rebounds Per Game|
|2P%|2-Point Field Goal Percentage|FT%|Free Throw Percentage|
|FT|Free Throws Per Game|FTA|Free Throw Attempts Per Game|

### 1.2. Data Description - TABLE 2
|Term|Meaning|Explanation|
|----|:-----:|:---------:|
|PER|Player Efficiency Rating|A measure of per-minute production standardized such that the league average is 15| 
|TS%|True Shooting Percentage|A measure of shooting efficiency that takes into account 2-point field goals, 3-point field goals, and free throws|
|3PAr|3-Point Attempt Rate|Percentage of FG Attempts from 3-Point Range|
|FTr|Free Throw Attempt Rate|Number of FT Attempts Per FG Attempt|
|ORB%|Offensive Rebound Percentage|An estimate of the percentage of available offensive rebounds a player grabbed while he was on the floor|
|DRB%|Defensive Rebound Percentage|An estimate of the percentage of available defensive rebounds a player grabbed while he was on the floor|
|TRB%|Total Rebound Percentage|An estimate of the percentage of available rebounds a player grabbed while he was on the floor|
|AST%|Assist Percentage|An estimate of the percentage of teammate field goals a player assisted while he was on the floor|
|STL%|Steal Percentage|An estimate of the percentage of opponent possessions that end with a steal by the player while he was on the floor|
|BLK%|Block Percentage|An estimate of the percentage of opponent two-point field goal attempts blocked by the player while he was on the floor|
|TOV%|Turnover Percentage|An estimate of turnovers committed per 100 plays|
|USG%|Usage Percentage|An estimate of the percentage of team plays used by a player while he was on the floor|
|OWS|Offensive Win Shares|An estimate of the number of wins contributed by a player due to his offense|
|DWS|Defensive Win Shares|An estimate of the number of wins contributed by a player due to his defense|
|WS|Win Shares|An estimate of the number of wins contributed by a player|
|WS/48|Win Shares Per 48 Minutes|An estimate of the number of wins contributed by a player per 48 minutes (league average is approximately .100)|
|OBPM|Offensive Box Plus/Minus|A box score estimate of the offensive points per 100 possessions a player contributed above a league-average player, translated to an average team|
|DBPM|Defensive Box Plus/Minus|A box score estimate of the defensive points per 100 possessions a player contributed above a league-average player, translated to an average team|
|BPM|Box Plus/Minus|A box score estimate of the points per 100 possessions a player contributed above a league-average player, translated to an average team|
|VORP|Value Over Replacement Player|A box score estimate of the points per 100 TEAM possessions that a player contributed above a replacement-level (-2.0) player, translated to an average team and pro-rated to an 82-game season|


# 2. Data Cleaning

## 2.1. Games Data

In [4]:
# Preview the first rows of the games table before any cleaning is applied.
df_games.head()

Unnamed: 0,G,Date,Tm,X,Opp,Result,GS,MP,FG,FGA,...,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,Player,RSorPO
0,1,1996-11-03,LAL,,MIN,W,0,6:00,0.0,1.0,...,1,0,0.0,1.0,1,1,0.0,-1.1,Kobe Bryant,Regular Season
1,2,1996-11-05,LAL,@,NYK,W,0,3:00,0.0,1.0,...,0,0,0.0,0.0,1,0,1.0,-1.1,Kobe Bryant,Regular Season
2,3,1996-11-06,LAL,@,CHH,L,0,7:00,2.0,3.0,...,0,0,0.0,0.0,3,0,5.0,0.7,Kobe Bryant,Regular Season
3,4,1996-11-08,LAL,@,TOR,L,0,17:00,3.0,8.0,...,3,0,0.0,1.0,0,3,10.0,5.2,Kobe Bryant,Regular Season
4,5,1996-11-10,LAL,,ATL,W,0,8:00,0.0,3.0,...,3,1,0.0,0.0,0,0,2.0,1.9,Kobe Bryant,Regular Season


In [5]:
# (rows, columns) of the games table before cleaning.
df_games.shape

(4254, 29)

In [6]:
# Drop the "X" column (in the preview it only holds "@" or a blank —
# presumably a home/away marker; confirm before relying on that).
# Reassigning with errors="ignore" instead of inplace=True keeps this cell
# idempotent: re-running it after the column is gone no longer raises KeyError.
df_games = df_games.drop(columns="X", errors="ignore")

In [7]:
# Preview again to confirm the "X" column is no longer present.
df_games.head()

Unnamed: 0,G,Date,Tm,Opp,Result,GS,MP,FG,FGA,FG%,...,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,Player,RSorPO
0,1,1996-11-03,LAL,MIN,W,0,6:00,0.0,1.0,0.0,...,1,0,0.0,1.0,1,1,0.0,-1.1,Kobe Bryant,Regular Season
1,2,1996-11-05,LAL,NYK,W,0,3:00,0.0,1.0,0.0,...,0,0,0.0,0.0,1,0,1.0,-1.1,Kobe Bryant,Regular Season
2,3,1996-11-06,LAL,CHH,L,0,7:00,2.0,3.0,0.67,...,0,0,0.0,0.0,3,0,5.0,0.7,Kobe Bryant,Regular Season
3,4,1996-11-08,LAL,TOR,L,0,17:00,3.0,8.0,0.38,...,3,0,0.0,1.0,0,3,10.0,5.2,Kobe Bryant,Regular Season
4,5,1996-11-10,LAL,ATL,W,0,8:00,0.0,3.0,0.0,...,3,1,0.0,0.0,0,0,2.0,1.9,Kobe Bryant,Regular Season


In [8]:
# Column dtypes and non-null counts for the games table.
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4254 entries, 0 to 4253
Data columns (total 28 columns):
G         4254 non-null int64
Date      4254 non-null object
Tm        4254 non-null object
Opp       4254 non-null object
Result    4254 non-null object
GS        4254 non-null int64
MP        4254 non-null object
FG        4250 non-null float64
FGA       4247 non-null float64
FG%       4237 non-null float64
3P        4254 non-null int64
3PA       4254 non-null int64
3P%       3750 non-null float64
FT        4253 non-null float64
FTA       4252 non-null float64
FT%       4119 non-null float64
ORB       4253 non-null float64
DRB       4252 non-null float64
TRB       4254 non-null int64
AST       4254 non-null int64
STL       4243 non-null float64
BLK       4253 non-null float64
TOV       4254 non-null int64
PF        4254 non-null int64
PTS       4245 non-null float64
GmSc      4035 non-null float64
Player    4254 non-null object
RSorPO    4254 non-null object
dtypes: float64(13), int64(8), object(7)

In [9]:
# Summary statistics for the numeric columns of the games table.
df_games.describe()

Unnamed: 0,G,GS,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc
count,4254.0,4254.0,4250.0,4247.0,4237.0,4254.0,4254.0,3750.0,4253.0,4252.0,...,4253.0,4252.0,4254.0,4254.0,4243.0,4253.0,4254.0,4254.0,4245.0,4035.0
mean,33.771039,0.952515,9.956,20.701907,0.478806,1.162435,3.478138,0.298587,6.402304,7.998589,...,1.29391,5.04445,6.345087,5.724024,1.745934,0.701152,3.092619,2.377292,27.444287,20.548178
std,23.250115,0.212698,3.801338,6.387853,0.121048,1.371055,2.67098,0.276708,3.89438,4.524951,...,1.277438,2.742909,3.12764,3.019336,1.40217,0.916192,1.859863,1.41267,9.671222,8.789507
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.9
25%,13.0,1.0,7.0,17.0,0.4,0.0,1.0,0.0,4.0,5.0,...,0.0,3.0,4.0,4.0,1.0,0.0,2.0,1.0,21.0,14.6
50%,31.0,1.0,10.0,21.0,0.48,1.0,3.0,0.29,6.0,8.0,...,1.0,5.0,6.0,5.0,2.0,0.0,3.0,2.0,27.0,20.9
75%,53.0,1.0,12.0,25.0,0.56,2.0,5.0,0.5,9.0,11.0,...,2.0,7.0,8.0,8.0,3.0,1.0,4.0,3.0,34.0,26.4
max,82.0,1.0,28.0,50.0,1.0,12.0,21.0,1.0,24.0,28.0,...,9.0,17.0,19.0,19.0,7.0,5.0,11.0,6.0,81.0,63.5


In [10]:
# Count missing values per column to decide how to handle them.
df_games.isna().sum()

G           0
Date        0
Tm          0
Opp         0
Result      0
GS          0
MP          0
FG          4
FGA         7
FG%        17
3P          0
3PA         0
3P%       504
FT          1
FTA         2
FT%       135
ORB         1
DRB         2
TRB         0
AST         0
STL        11
BLK         1
TOV         0
PF          0
PTS         9
GmSc      219
Player      0
RSorPO      0
dtype: int64

In [11]:
# Impute missing values in the field-goal columns with each column's mean.
shooting_cols = ["FG", "FGA", "FG%"]
mean_value = df_games[shooting_cols].mean()
df_games[shooting_cols] = df_games[shooting_cols].fillna(mean_value)

In [12]:
# Remove every row that still has a missing value in any column.
# NOTE(review): per the recorded shapes this discards 776 of 4254 rows, and
# most of the gaps are in 3P% (504), GmSc (219) and FT% (135). Presumably the
# percentage columns are empty when there were no attempts — confirm that
# dropping those games is intended.
df_games = df_games.dropna(axis=0, how="any")

In [13]:
# Re-check missing values per column after the fill and drop steps.
df_games.isna().sum()

G         0
Date      0
Tm        0
Opp       0
Result    0
GS        0
MP        0
FG        0
FGA       0
FG%       0
3P        0
3PA       0
3P%       0
FT        0
FTA       0
FT%       0
ORB       0
DRB       0
TRB       0
AST       0
STL       0
BLK       0
TOV       0
PF        0
PTS       0
GmSc      0
Player    0
RSorPO    0
dtype: int64

In [14]:
# (rows, columns) of the games table after removing rows with missing values.
df_games.shape

(3478, 28)

In [15]:
# Collect rows that exactly repeat an earlier row; an empty result (0 rows)
# means there is nothing to de-duplicate.
duplicates_games = df_games.loc[df_games.duplicated(keep="first")]
duplicates_games.shape

(0, 28)

## 2.2. Salary Data

In [16]:
# Preview the first rows of the salary table.
df_salary.head()

Unnamed: 0,Season,Team,Lg,Salary,Player
0,2003-04,Cleveland Cavaliers,NBA,$4018920,Lebron James
1,2004-05,Cleveland Cavaliers,NBA,$4320360,Lebron James
2,2005-06,Cleveland Cavaliers,NBA,$4621800,Lebron James
3,2006-07,Cleveland Cavaliers,NBA,$5828090,Lebron James
4,2007-08,Cleveland Cavaliers,NBA,$13041250,Lebron James


In [17]:
# NOTE(review): this cell sits in the salary section but inspects df_games;
# df_salary.shape may have been intended — confirm.
df_games.shape

(3478, 28)

In [18]:
# Give the abbreviated "Lg" column a descriptive name, then preview the result.
df_salary = df_salary.rename(columns={"Lg": "League"})
df_salary.head(n=5)

Unnamed: 0,Season,Team,League,Salary,Player
0,2003-04,Cleveland Cavaliers,NBA,$4018920,Lebron James
1,2004-05,Cleveland Cavaliers,NBA,$4320360,Lebron James
2,2005-06,Cleveland Cavaliers,NBA,$4621800,Lebron James
3,2006-07,Cleveland Cavaliers,NBA,$5828090,Lebron James
4,2007-08,Cleveland Cavaliers,NBA,$13041250,Lebron James


In [19]:
# Column dtypes and non-null counts for the salary table.
df_salary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
Season    50 non-null object
Team      50 non-null object
League    50 non-null object
Salary    50 non-null object
Player    50 non-null object
dtypes: object(5)
memory usage: 2.1+ KB


In [20]:
# Count missing values per column in the salary table.
df_salary.isna().sum()

Season    0
Team      0
League    0
Salary    0
Player    0
dtype: int64

In [21]:
# (rows, columns) of the salary table.
df_salary.shape

(50, 5)

In [22]:
# Collect salary rows that exactly repeat an earlier row; an empty result
# (0 rows) means there is nothing to de-duplicate.
duplicate_salary = df_salary.loc[df_salary.duplicated(keep="first")]
duplicate_salary.shape

(0, 5)

# 3. Exploratory Data Analysis

## 3.1. Games EDA

In [23]:
# Preview the cleaned games table at the start of the EDA section.
df_games.head()

Unnamed: 0,G,Date,Tm,Opp,Result,GS,MP,FG,FGA,FG%,...,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,Player,RSorPO
3,4,1996-11-08,LAL,TOR,L,0,17:00,3.0,8.0,0.38,...,3,0,0.0,1.0,0,3,10.0,5.2,Kobe Bryant,Regular Season
6,7,1996-11-13,LAL,SAS,L,0,6:00,2.0,2.0,1.0,...,0,0,1.0,0.0,0,2,6.0,5.6,Kobe Bryant,Regular Season
7,8,1996-11-15,LAL,LAC,W,0,7:00,1.0,4.0,0.25,...,0,0,0.0,0.0,1,4,4.0,-1.4,Kobe Bryant,Regular Season
8,9,1996-11-17,LAL,PHO,W,0,14:00,5.0,8.0,0.62,...,2,2,0.0,0.0,4,2,16.0,10.4,Kobe Bryant,Regular Season
9,10,1996-11-19,LAL,GSW,W,0,8:00,0.0,2.0,0.0,...,2,1,0.0,0.0,1,2,3.0,0.7,Kobe Bryant,Regular Season


In [24]:
# List the column names available in the games table.
df_games.columns

Index(['G', 'Date', 'Tm', 'Opp', 'Result', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', 'Player', 'RSorPO'],
      dtype='object')

### 3.1.1. Michael Jordan

![image](images/mjordan.jpg)

In [31]:
# Keep only the rows of the games table whose Player is Michael Jordan,
# then preview them.
is_jordan = df_games["Player"] == "Michael Jordan"
df_jordan = df_games.loc[is_jordan]
df_jordan.head(n=5)

Unnamed: 0,G,Date,Tm,Opp,Result,GS,MP,FG,FGA,FG%,...,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,Player,RSorPO
3012,10,1984-11-15,CHI,BOS,L,1,33:00,12.0,24.0,0.5,...,2,2,2.0,1.0,1,4,27.0,17.1,Michael Jordan,Regular Season
3014,12,1984-11-19,CHI,IND,L,1,39:00,11.0,26.0,0.42,...,5,2,2.0,1.0,3,3,34.0,20.8,Michael Jordan,Regular Season
3017,15,1984-11-24,CHI,POR,L,1,41:00,10.0,24.0,0.42,...,6,8,3.0,1.0,4,4,30.0,23.9,Michael Jordan,Regular Season
3023,21,1984-12-07,CHI,NYK,W,1,43:00,8.0,16.0,0.5,...,2,3,2.0,0.0,6,5,21.0,9.3,Michael Jordan,Regular Season
3025,23,1984-12-11,CHI,DET,L,1,37:00,13.0,28.0,0.46,...,8,6,2.0,0.0,3,4,27.0,16.2,Michael Jordan,Regular Season
