 # NBA Data: Pre-processing and Training Data Development

## Imports

In [2]:
import pickle
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## Load Data

In [3]:
nba_df = pickle.load(open("data_clean/nba_df2.pkl", "rb")).drop(columns = "game_id")
nba_df

Unnamed: 0,fga,fg_pct,fg2a,fg2_pct,fg3a,fg3_pct,fta,ft_pct,reb,ast,stl,blk,to,pf,win
0,76.0,0.447368,68.0,0.470588,8.0,0.250000,30.0,0.500000,38.0,20.0,9.0,4.0,18.0,34.0,loss
1,70.0,0.457143,63.0,0.492063,7.0,0.142857,34.0,0.735294,41.0,23.0,8.0,4.0,18.0,26.0,win
2,81.0,0.493827,66.0,0.545455,15.0,0.266667,34.0,0.617647,48.0,25.0,18.0,7.0,25.0,35.0,win
3,75.0,0.426667,62.0,0.483871,13.0,0.153846,40.0,0.700000,43.0,20.0,9.0,4.0,24.0,26.0,loss
4,77.0,0.493506,71.0,0.478873,6.0,0.666667,29.0,0.689655,52.0,25.0,10.0,7.0,25.0,33.0,win
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51332,97.0,0.350515,75.0,0.346667,22.0,0.363636,24.0,0.833333,55.0,20.0,5.0,7.0,8.0,22.0,loss
51333,100.0,0.410000,59.0,0.440678,41.0,0.365854,21.0,0.857143,40.0,30.0,9.0,4.0,14.0,19.0,loss
51334,93.0,0.580645,65.0,0.615385,28.0,0.500000,26.0,0.769231,52.0,34.0,7.0,9.0,15.0,20.0,win
51335,94.0,0.446809,60.0,0.500000,34.0,0.352941,20.0,0.800000,48.0,30.0,7.0,7.0,21.0,23.0,loss


## Dummy Variables
All independent variables in this dataset are numerical so there is no need for dummy variables

## Training & Test Set Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(nba_df.drop(columns = "win"), nba_df.win, 
                                                    test_size = 0.3, random_state = 610)

## Standard Scaler

In [5]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns = X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

Creating dummy features (no categorical variables)
Scale standardization
Split data into training and testing subsets


In [6]:
X_train, X_test, y_train, y_test = train_test_split(nba_df.drop(columns = "win"), nba_df.win, 
                                                    test_size = 0.3, random_state = 610)

In [7]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns = X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

## Checking Train/Test Split

In [8]:
X_train_scaled.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35929 entries, 0 to 35928
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   fga      35929 non-null  float64
 1   fg_pct   35929 non-null  float64
 2   fg2a     35929 non-null  float64
 3   fg2_pct  35929 non-null  float64
 4   fg3a     35929 non-null  float64
 5   fg3_pct  35929 non-null  float64
 6   fta      35929 non-null  float64
 7   ft_pct   35929 non-null  float64
 8   reb      35929 non-null  float64
 9   ast      35929 non-null  float64
 10  stl      35929 non-null  float64
 11  blk      35929 non-null  float64
 12  to       35929 non-null  float64
 13  pf       35929 non-null  float64
dtypes: float64(14)
memory usage: 3.8 MB


In [10]:
y_train.info(verbose = True)

<class 'pandas.core.series.Series'>
Int64Index: 35929 entries, 44118 to 28938
Series name: win
Non-Null Count  Dtype 
--------------  ----- 
35929 non-null  object
dtypes: object(1)
memory usage: 561.4+ KB


In [11]:
X_test_scaled.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15399 entries, 0 to 15398
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   fga      15399 non-null  float64
 1   fg_pct   15399 non-null  float64
 2   fg2a     15399 non-null  float64
 3   fg2_pct  15399 non-null  float64
 4   fg3a     15399 non-null  float64
 5   fg3_pct  15399 non-null  float64
 6   fta      15399 non-null  float64
 7   ft_pct   15399 non-null  float64
 8   reb      15399 non-null  float64
 9   ast      15399 non-null  float64
 10  stl      15399 non-null  float64
 11  blk      15399 non-null  float64
 12  to       15399 non-null  float64
 13  pf       15399 non-null  float64
dtypes: float64(14)
memory usage: 1.6 MB


In [12]:
y_test.info(verbose = True)

<class 'pandas.core.series.Series'>
Int64Index: 15399 entries, 33047 to 17857
Series name: win
Non-Null Count  Dtype 
--------------  ----- 
15399 non-null  object
dtypes: object(1)
memory usage: 240.6+ KB


## Save Data

In [13]:
def save_object(obj, filename):
    with open(filename, 'wb') as outp:  # Overwrites any existing file.
        pickle.dump(obj, outp, pickle.HIGHEST_PROTOCOL)

save_object(nba_df, 'data_clean/nba_df3.pkl')
save_object(X_train_scaled, 'data_clean/X_train_scaled.pkl')
save_object(X_test_scaled, 'data_clean/X_test_scaled.pkl')
save_object(y_train, 'data_clean/y_train.pkl')
save_object(y_test, 'data_clean/y_test.pkl')