In [1]:
import lime
import lime.lime_tabular
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree, model_selection, ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer

In [2]:
data = pd.read_csv('./match_features.csv', index_col = 0)
data.columns

Index(['MP', 'FG_AVG', 'FGA_AVG', 'FG%_AVG', '2P_AVG', '2PA_AVG', '2P%_AVG',
       '3P_AVG', '3PA_AVG', '3P%_AVG', 'FT_AVG', 'FT%_AVG', 'PTS_AVG',
       'FG.1_AVG', 'FGA.1_AVG', 'FG%.1_AVG', '2P.1_AVG', '2PA.1_AVG',
       '2P%.1_AVG', '3P.1_AVG', '3PA.1_AVG', '3P%.1_AVG', 'FT.1_AVG',
       'FTA.1_AVG', 'FT%.1_AVG', 'PTS.1_AVG', 'FG_players_AVG',
       'FGA_players_AVG', 'FG%_players_AVG', '3P_players_AVG',
       '3PA_players_AVG', '3P%_players_AVG', 'FT_players_AVG',
       'FT%_players_AVG', 'ORB_players_AVG', 'DRB_players_AVG',
       'TRB_players_AVG', 'AST_players_AVG', 'STL_players_AVG',
       'BLK_players_AVG', 'TOV_players_AVG', 'PF_players_AVG',
       'PTS_players_AVG', '+/-_players_AVG', 'TS%_players_AVG',
       '3PAr_players_AVG', 'FTr_players_AVG', 'ORB%_players_AVG',
       'DRB%_players_AVG', 'TRB%_players_AVG', 'AST%_players_AVG',
       'STL%_players_AVG', 'BLK%_players_AVG', 'TOV%_players_AVG',
       'USG%_players_AVG', 'ORtg_players_AVG', 'DRtg_players_AVG',


In [3]:
data.head()

Unnamed: 0,MP,FG_AVG,FGA_AVG,FG%_AVG,2P_AVG,2PA_AVG,2P%_AVG,3P_AVG,3PA_AVG,3P%_AVG,...,STL%_players_AVG,BLK%_players_AVG,TOV%_players_AVG,USG%_players_AVG,ORtg_players_AVG,DRtg_players_AVG,BPM_players_AVG,A/T_AVG,FG_AVG_AVG,y
0,0,-1.4,2.6,-0.0302,0.4,5.4,-0.0426,-1.8,-2.8,-0.0316,...,0.22196,-0.546554,2.443134,0.623933,-5.701625,-2.037104,-1.159788,-0.315775,0.976896,1
1,0,-4.1,-14.2,0.0274,-1.7,-6.1,0.0344,-2.4,-8.1,0.0152,...,-0.168279,0.533393,1.842403,-0.002582,4.486208,-4.874819,-0.06736,-0.451256,-3.010764,0
2,0,2.4,2.0,0.015,1.7,2.6,-0.0015,0.7,-0.6,0.0269,...,-0.227475,0.346944,0.75761,-0.010248,-0.189826,6.96484,-0.561669,-0.132743,0.644868,1
3,0,2.6,-1.2,0.0361,2.6,0.7,0.0498,0.0,-1.9,0.019,...,0.189513,0.999818,1.69374,0.003797,1.964042,-1.389299,0.341002,-0.400132,1.348361,0
4,0,0.4,-0.5,0.0062,4.4,4.8,0.0378,-4.0,-5.3,-0.0557,...,0.503607,0.028347,0.556315,-0.212838,-9.751847,-13.232417,-0.539127,0.054787,-1.667778,1


In [4]:
data = data.drop(columns = ['MP'])
data = data.dropna()

In [5]:
data.shape

(1142, 60)

# Train Test Split

In [17]:
X = data.drop(columns = ['y'])
y = data['y']
X_columns = X.columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=3244)

In [18]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1027, 59), (115, 59), (1027,), (115,))

In [19]:
X_train.head()

Unnamed: 0,FG_AVG,FGA_AVG,FG%_AVG,2P_AVG,2PA_AVG,2P%_AVG,3P_AVG,3PA_AVG,3P%_AVG,FT_AVG,...,AST%_players_AVG,STL%_players_AVG,BLK%_players_AVG,TOV%_players_AVG,USG%_players_AVG,ORtg_players_AVG,DRtg_players_AVG,BPM_players_AVG,A/T_AVG,FG_AVG_AVG
156,3.4,0.7,0.0336,-2.7,-5.9,0.0102,6.1,6.6,0.1038,-2.0,...,1.633675,-0.347577,0.142305,-1.477675,-0.206649,12.25093,8.610076,0.805773,0.693425,0.959576
1134,-1.1,0.2,-0.0128,-4.1,-8.3,0.0122,3.0,8.5,0.0034,0.5,...,1.38652,0.258996,-0.411682,4.012339,-0.005721,-4.933778,-14.632028,-0.066092,-0.683379,1.061486
14,2.4,9.6,-0.0199,4.4,8.3,0.0055,-2.0,1.3,-0.0638,-5.2,...,-0.386368,0.382041,-0.311137,-0.208688,-0.003684,-12.700208,-5.862722,-0.234228,0.253155,0.224417
739,-2.4,0.2,-0.0263,0.0,1.7,-0.0115,-2.4,-1.5,-0.0454,1.8,...,1.636186,0.160231,0.777394,1.941467,-0.206055,-9.111118,-5.467618,-0.244894,-0.162206,-1.051111
594,1.2,-2.0,0.0246,0.3,-5.0,0.058,0.9,3.0,0.0093,-2.5,...,1.078017,-0.379631,-1.397233,-2.005774,-0.002311,4.265979,8.713083,-0.842485,0.338746,-0.35866


In [20]:
X_train.shape

(1027, 59)

# Data Transformation

In [21]:
# standardize the coninuous features 
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
for i in range(X_train.shape[1]):
    X_train.iloc[:, i] = sc.fit_transform(X_train.iloc[:, i].values.reshape(-1,1))
    X_test.iloc[:, i] = sc.transform(X_test.iloc[:, i].values.reshape(-1,1))

X_train = X_train.dropna(axis = 0)
X_test = X_test.dropna(axis = 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a

In [22]:
X_train.head()

Unnamed: 0,FG_AVG,FGA_AVG,FG%_AVG,2P_AVG,2PA_AVG,2P%_AVG,3P_AVG,3PA_AVG,3P%_AVG,FT_AVG,...,AST%_players_AVG,STL%_players_AVG,BLK%_players_AVG,TOV%_players_AVG,USG%_players_AVG,ORtg_players_AVG,DRtg_players_AVG,BPM_players_AVG,A/T_AVG,FG_AVG_AVG
156,1.192552,0.119019,1.192984,-0.755245,-1.073703,0.392449,2.728638,1.41919,2.520668,-0.585396,...,1.10695,-0.980213,0.435013,-0.601659,-0.601235,1.89521,1.32605,1.033717,1.206123,0.613029
1134,-0.354415,0.007228,-0.368262,-1.17296,-1.500876,0.4459,1.31891,1.829939,0.011193,0.163821,...,0.939996,1.170986,-0.62569,1.765759,0.188589,-0.632111,-2.064436,-0.051313,-1.224926,0.671
14,0.848781,2.108907,-0.60716,1.363166,1.453736,0.266839,-0.954844,0.273418,-1.668456,-1.544394,...,-0.257598,1.607363,-0.433179,-0.054443,0.196595,-1.774304,-0.785197,-0.260556,0.428729,0.19483
739,-0.801316,0.007228,-0.822504,0.050348,0.279011,-0.187495,-1.136744,-0.331895,-1.208552,0.553414,...,1.108647,0.820717,1.651002,0.872753,-0.598899,-1.246464,-0.72756,-0.27383,-0.304681,-0.530759
594,0.436257,-0.484655,0.890156,0.139858,-0.913514,1.669931,0.363934,0.64093,0.158662,-0.735239,...,0.731601,-1.09389,-2.512698,-0.829387,0.201992,0.720879,1.341076,-1.017527,0.579861,-0.136855


# Feature Selection

In [23]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

df_feature_selection = pd.DataFrame(columns = ['Columns Number', 'Accuracy'])

for i in range(1,int(X_train.shape[1]/5)):
    selector = SelectKBest(f_classif, k=i*5)
    X_train_processed = selector.fit_transform(X_train, y_train)
    X_test_processed = selector.transform(X_test)

    classifier = RandomForestClassifier(n_jobs=2, random_state=0)

    classifier.fit(X_train_processed, y_train)
    y_pred = classifier.predict(X_test_processed)
    
    df_feature_selection.loc[i] = [5*i, str(accuracy_score(y_test, y_pred))]

In [24]:
df_feature_selection

Unnamed: 0,Columns Number,Accuracy
1,5,0.5391304347826087
2,10,0.5217391304347826
3,15,0.5391304347826087
4,20,0.4869565217391304
5,25,0.5304347826086957
6,30,0.4956521739130435
7,35,0.4869565217391304
8,40,0.4869565217391304
9,45,0.5739130434782609
10,50,0.5130434782608696


In [25]:
best = df_feature_selection[df_feature_selection['Accuracy'] 
                            == df_feature_selection.Accuracy.max()]['Columns Number'].tolist()
best = int(best[0])

In [26]:
selector = SelectKBest(f_classif, k=best)
selector.fit(X_train, y_train)

SelectKBest(k=45)

In [27]:
cols = selector.get_support(indices=True)
X_train = X_train.iloc[:,cols]
X_test = X_test.iloc[:,cols]

In [28]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1027, 45), (115, 45), (1027,), (115,))

# Write out CSV files

In [29]:
df_train = X_train.join(y_train)
df_test = X_test.join(y_test)

In [23]:
df_train.to_csv('./df_train_standardized_45_columns.csv')
df_test.to_csv('./df_test_standardized_45_columns.csv')