In [69]:
import lime
import lime.lime_tabular
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree, model_selection, ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer

In [74]:
# data = pd.read_csv('./match_features_winning_rate.csv', index_col = 0)
data = pd.read_csv('./match_features.csv', index_col = 0)
data.columns

Index(['MP', 'FG_AVG', 'FGA_AVG', 'FG%_AVG', '2P_AVG', '2PA_AVG', '2P%_AVG',
       '3P_AVG', '3PA_AVG', '3P%_AVG', 'FT_AVG', 'FT%_AVG', 'PTS_AVG',
       'FG.1_AVG', 'FGA.1_AVG', 'FG%.1_AVG', '2P.1_AVG', '2PA.1_AVG',
       '2P%.1_AVG', '3P.1_AVG', '3PA.1_AVG', '3P%.1_AVG', 'FT.1_AVG',
       'FTA.1_AVG', 'FT%.1_AVG', 'PTS.1_AVG', 'FG_players_AVG',
       'FGA_players_AVG', 'FG%_players_AVG', '3P_players_AVG',
       '3PA_players_AVG', '3P%_players_AVG', 'FT_players_AVG',
       'FT%_players_AVG', 'ORB_players_AVG', 'DRB_players_AVG',
       'TRB_players_AVG', 'AST_players_AVG', 'STL_players_AVG',
       'BLK_players_AVG', 'TOV_players_AVG', 'PF_players_AVG',
       'PTS_players_AVG', '+/-_players_AVG', 'TS%_players_AVG',
       '3PAr_players_AVG', 'FTr_players_AVG', 'ORB%_players_AVG',
       'DRB%_players_AVG', 'TRB%_players_AVG', 'AST%_players_AVG',
       'STL%_players_AVG', 'BLK%_players_AVG', 'TOV%_players_AVG',
       'USG%_players_AVG', 'ORtg_players_AVG', 'DRtg_players_AVG',


In [75]:
data.head()

Unnamed: 0,MP,FG_AVG,FGA_AVG,FG%_AVG,2P_AVG,2PA_AVG,2P%_AVG,3P_AVG,3PA_AVG,3P%_AVG,...,STL%_players_AVG,BLK%_players_AVG,TOV%_players_AVG,USG%_players_AVG,ORtg_players_AVG,DRtg_players_AVG,BPM_players_AVG,A/T_AVG,FG_AVG_AVG,y
0,0,-1.4,2.6,-0.0302,0.4,5.4,-0.0426,-1.8,-2.8,-0.0316,...,0.22196,-0.546554,2.443134,0.623933,-5.701625,-2.037104,-1.159788,-0.315775,0.976896,1
1,0,-4.1,-14.2,0.0274,-1.7,-6.1,0.0344,-2.4,-8.1,0.0152,...,-0.168279,0.533393,1.842403,-0.002582,4.486208,-4.874819,-0.06736,-0.451256,-3.010764,0
2,0,2.4,2.0,0.015,1.7,2.6,-0.0015,0.7,-0.6,0.0269,...,-0.227475,0.346944,0.75761,-0.010248,-0.189826,6.96484,-0.561669,-0.132743,0.644868,1
3,0,2.6,-1.2,0.0361,2.6,0.7,0.0498,0.0,-1.9,0.019,...,0.189513,0.999818,1.69374,0.003797,1.964042,-1.389299,0.341002,-0.400132,1.348361,0
4,0,0.4,-0.5,0.0062,4.4,4.8,0.0378,-4.0,-5.3,-0.0557,...,0.503607,0.028347,0.556315,-0.212838,-9.751847,-13.232417,-0.539127,0.054787,-1.667778,1


In [76]:
data = data.drop(columns = ['MP'])
data = data.dropna()

In [77]:
data.shape

(1142, 60)

# Removing Outlier

In [78]:
from scipy import stats

def remove_outliers(df,column_name):
    upper = df[column_name].mean() + 3*df[column_name].std()
    lower = df[column_name].mean() - 3*df[column_name].std()
    new_df= df[(df[column_name]<upper) & (df[column_name]>lower)]
    return new_df

for i in range(data.shape[1]):
    data = remove_outliers(data, data.columns.tolist()[i])


In [79]:
data.shape

(1001, 60)

# Train Test Split

In [80]:
X = data.drop(columns = ['y'])
y = data['y']
X_columns = X.columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=3244)

In [81]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((900, 59), (101, 59), (900,), (101,))

In [82]:
X_train.head()

Unnamed: 0,FG_AVG,FGA_AVG,FG%_AVG,2P_AVG,2PA_AVG,2P%_AVG,3P_AVG,3PA_AVG,3P%_AVG,FT_AVG,...,AST%_players_AVG,STL%_players_AVG,BLK%_players_AVG,TOV%_players_AVG,USG%_players_AVG,ORtg_players_AVG,DRtg_players_AVG,BPM_players_AVG,A/T_AVG,FG_AVG_AVG
56,-5.5,0.5,-0.0671,-7.4,-10.1,-0.0403,1.9,10.6,-0.0599,-4.6,...,2.389872,-0.311436,0.00286,0.232605,0.004141,-7.428417,-2.686049,-1.113646,-0.034056,-1.809924
519,0.7,-4.0,0.0294,3.6,3.9,0.0299,-2.9,-7.9,0.0098,-3.5,...,0.97465,0.029874,0.09212,0.112836,0.204319,1.121125,6.402181,0.954692,1.35173,1.565042
998,2.1,-7.1,0.0578,6.3,4.5,0.0756,-4.2,-11.6,-0.0032,0.6,...,-0.203658,0.501103,0.073335,1.81961,0.000308,3.923389,-3.440438,0.532124,0.141075,0.130514
476,2.0,8.1,-0.0192,2.8,7.7,-0.0253,-0.8,0.4,-0.0363,-1.1,...,-1.718513,-0.016908,-0.269538,-2.447746,0.004055,-4.302951,-14.184042,1.134337,1.025123,-0.0995
431,4.4,4.6,0.0254,3.1,0.1,0.0563,1.3,4.5,-0.0162,-3.7,...,0.310408,-0.220351,-0.589992,0.668314,0.20689,2.912007,2.165868,0.014653,0.195291,2.901785


In [83]:
X_train.shape

(900, 59)

# Data Transformation

In [84]:
# standardize the coninuous features 
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
for i in range(X_train.shape[1]):
    X_train.iloc[:, i] = sc.fit_transform(X_train.iloc[:, i].values.reshape(-1,1))
    X_test.iloc[:, i] = sc.transform(X_test.iloc[:, i].values.reshape(-1,1))

X_train = X_train.dropna(axis = 0)
X_test = X_test.dropna(axis = 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a

In [85]:
X_train.head()

Unnamed: 0,FG_AVG,FGA_AVG,FG%_AVG,2P_AVG,2PA_AVG,2P%_AVG,3P_AVG,3PA_AVG,3P%_AVG,FT_AVG,...,AST%_players_AVG,STL%_players_AVG,BLK%_players_AVG,TOV%_players_AVG,USG%_players_AVG,ORtg_players_AVG,DRtg_players_AVG,BPM_players_AVG,A/T_AVG,FG_AVG_AVG
56,-1.891069,0.081278,-2.255763,-2.15606,-1.836361,-0.975921,0.873825,2.359208,-1.609723,-1.458927,...,1.661613,-0.823898,0.143143,0.129171,0.23215,-1.039378,-0.341198,-1.399847,-0.055684,-1.027366
519,0.279622,-0.969734,1.095767,1.136854,0.70261,0.90996,-1.455081,-1.807898,0.201037,-1.112762,...,0.685045,0.382595,0.321257,0.074467,1.085768,0.288132,1.01922,1.296738,2.736298,0.989043
998,0.769779,-1.693765,2.082124,1.945114,0.811423,2.137662,-2.085826,-2.641319,-0.136695,0.177488,...,-0.128041,2.048339,0.283773,0.854032,0.215807,0.723248,-0.454123,0.745817,0.297158,0.131969
476,0.734767,1.856321,-0.592154,0.897369,1.391759,-0.572955,-0.436185,0.061668,-0.996611,-0.357494,...,-1.173361,0.217228,-0.400411,-1.095074,0.231783,-0.554079,-2.062334,1.530949,2.078275,-0.005455
431,1.575035,1.038867,0.956843,0.987176,0.013461,1.619179,0.582712,0.985189,-0.474426,-1.175701,...,0.226687,-0.501921,-1.039857,0.32818,1.096734,0.566208,0.385086,0.071167,0.406389,1.787694


In [86]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((900, 59), (101, 59), (900,), (101,))

# Feature Selection

In [87]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

df_feature_selection = pd.DataFrame(columns = ['Columns Number', 'Accuracy'])

for i in range(1,int(X_train.shape[1]/5)):
    selector = SelectKBest(f_classif, k=i*5)
    X_train_processed = selector.fit_transform(X_train, y_train)
    X_test_processed = selector.transform(X_test)

    classifier = RandomForestClassifier(n_jobs=2, random_state=0)

    classifier.fit(X_train_processed, y_train)
    y_pred = classifier.predict(X_test_processed)
    
    df_feature_selection.loc[i] = [5*i, str(accuracy_score(y_test, y_pred))]

In [88]:
df_feature_selection

Unnamed: 0,Columns Number,Accuracy
1,5,0.495049504950495
2,10,0.5247524752475248
3,15,0.5148514851485149
4,20,0.5346534653465347
5,25,0.4752475247524752
6,30,0.5445544554455446
7,35,0.5544554455445545
8,40,0.5544554455445545
9,45,0.5643564356435643
10,50,0.5346534653465347


In [89]:
best = df_feature_selection[df_feature_selection['Accuracy'] 
                            == df_feature_selection.Accuracy.max()]['Columns Number'].tolist()
best = int(best[0])

In [90]:
selector = SelectKBest(f_classif, k=best)
selector.fit(X_train, y_train)

SelectKBest(k=45)

In [91]:
cols = selector.get_support(indices=True)
X_train = X_train.iloc[:,cols]
X_test = X_test.iloc[:,cols]

In [92]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((900, 45), (101, 45), (900,), (101,))

# Write out CSV files

In [45]:
df_train = X_train.join(y_train)
df_test = X_test.join(y_test)

In [46]:
df_train.to_csv('./df_train_standarized_45_columns.csv')
df_test.to_csv('./df_test_standarized_45_columns.csv')