In [1]:
import lime
import lime.lime_tabular
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree, model_selection, ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer

In [2]:
# data = pd.read_csv('../data/match_features_win.csv', index_col = 0)
data = pd.read_csv('../data/match_features.csv', index_col = 0)
data.columns

Index(['MP', 'FG_AVG', 'FGA_AVG', 'FG%_AVG', '2P_AVG', '2PA_AVG', '2P%_AVG',
       '3P_AVG', '3PA_AVG', '3P%_AVG', 'FT_AVG', 'FT%_AVG', 'PTS_AVG',
       'FG.1_AVG', 'FGA.1_AVG', 'FG%.1_AVG', '2P.1_AVG', '2PA.1_AVG',
       '2P%.1_AVG', '3P.1_AVG', '3PA.1_AVG', '3P%.1_AVG', 'FT.1_AVG',
       'FTA.1_AVG', 'FT%.1_AVG', 'PTS.1_AVG', 'FG_players_AVG',
       'FGA_players_AVG', 'FG%_players_AVG', '3P_players_AVG',
       '3PA_players_AVG', '3P%_players_AVG', 'FT_players_AVG',
       'FT%_players_AVG', 'ORB_players_AVG', 'DRB_players_AVG',
       'TRB_players_AVG', 'AST_players_AVG', 'STL_players_AVG',
       'BLK_players_AVG', 'TOV_players_AVG', 'PF_players_AVG',
       'PTS_players_AVG', '+/-_players_AVG', 'TS%_players_AVG',
       '3PAr_players_AVG', 'FTr_players_AVG', 'ORB%_players_AVG',
       'DRB%_players_AVG', 'TRB%_players_AVG', 'AST%_players_AVG',
       'STL%_players_AVG', 'BLK%_players_AVG', 'TOV%_players_AVG',
       'USG%_players_AVG', 'ORtg_players_AVG', 'DRtg_players_AVG',


In [3]:
data.head()

Unnamed: 0,MP,FG_AVG,FGA_AVG,FG%_AVG,2P_AVG,2PA_AVG,2P%_AVG,3P_AVG,3PA_AVG,3P%_AVG,...,STL%_players_AVG,BLK%_players_AVG,TOV%_players_AVG,USG%_players_AVG,ORtg_players_AVG,DRtg_players_AVG,BPM_players_AVG,A/T_AVG,FG_AVG_AVG,y
0,0,-1.4,2.6,-0.0302,0.4,5.4,-0.0426,-1.8,-2.8,-0.0316,...,0.22196,-0.546554,2.443134,0.623933,-5.701625,-2.037104,-1.159788,-0.315775,0.976896,1
1,0,-4.1,-14.2,0.0274,-1.7,-6.1,0.0344,-2.4,-8.1,0.0152,...,-0.168279,0.533393,1.842403,-0.002582,4.486208,-4.874819,-0.06736,-0.451256,-3.010764,0
2,0,2.4,2.0,0.015,1.7,2.6,-0.0015,0.7,-0.6,0.0269,...,-0.227475,0.346944,0.75761,-0.010248,-0.189826,6.96484,-0.561669,-0.132743,0.644868,1
3,0,2.6,-1.2,0.0361,2.6,0.7,0.0498,0.0,-1.9,0.019,...,0.189513,0.999818,1.69374,0.003797,1.964042,-1.389299,0.341002,-0.400132,1.348361,0
4,0,0.4,-0.5,0.0062,4.4,4.8,0.0378,-4.0,-5.3,-0.0557,...,0.503607,0.028347,0.556315,-0.212838,-9.751847,-13.232417,-0.539127,0.054787,-1.667778,1


In [4]:
data = data.drop(columns = ['MP'])
data = data.dropna()

In [5]:
data.shape

(1142, 60)

# Removing Outlier

In [6]:
from scipy import stats

def remove_outliers(df,column_name):
    upper = df[column_name].mean() + 3*df[column_name].std()
    lower = df[column_name].mean() - 3*df[column_name].std()
    new_df= df[(df[column_name]<upper) & (df[column_name]>lower)]
    return new_df

for i in range(data.shape[1]):
    data = remove_outliers(data, data.columns.tolist()[i])


In [7]:
data.shape

(1001, 60)

# Train Test Split

In [8]:
X = data.drop(columns = ['y'])
y = data['y']
X_columns = X.columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [9]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 59), (201, 59), (800,), (201,))

In [10]:
X_train.head()

Unnamed: 0,FG_AVG,FGA_AVG,FG%_AVG,2P_AVG,2PA_AVG,2P%_AVG,3P_AVG,3PA_AVG,3P%_AVG,FT_AVG,...,AST%_players_AVG,STL%_players_AVG,BLK%_players_AVG,TOV%_players_AVG,USG%_players_AVG,ORtg_players_AVG,DRtg_players_AVG,BPM_players_AVG,A/T_AVG,FG_AVG_AVG
919,-4.7,-6.8,-0.02,-8.9,-16.0,-0.0147,4.2,9.2,0.0303,5.1,...,2.430715,-0.113888,-0.62035,4.384247,-0.203234,-0.292201,2.035861,0.140996,-0.819015,-0.175958
598,-1.9,2.9,-0.0361,0.3,7.5,-0.0684,-2.2,-4.6,-0.0128,3.6,...,-1.920106,-0.250308,0.047081,-2.355621,-0.398514,-10.843264,-3.917035,0.077499,-0.114248,-2.379104
1069,-0.5,3.1,-0.0202,-4.4,-3.9,-0.0422,3.9,7.0,0.0318,0.2,...,0.198642,-0.297794,0.270507,1.135591,0.009067,-6.694389,-8.831611,0.302174,-0.283942,0.717292
855,0.3,4.8,-0.0206,2.5,5.9,-0.0134,-2.2,-1.1,-0.0467,-0.3,...,-2.196693,-0.531835,0.161914,0.605864,-0.411165,-9.800951,0.681312,-0.051518,-0.398359,0.388583
1131,-1.6,-0.6,-0.0137,-1.7,-5.2,0.0219,0.1,4.6,-0.0463,5.2,...,0.124578,0.119547,-0.731026,2.332434,0.212445,-3.58191,-10.512507,0.43666,-0.60128,1.017479


In [11]:
X_train.shape

(800, 59)

# Data Transformation

In [13]:
# # standardize the coninuous features 
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# for i in range(X_train.shape[1]):
#     X_train.iloc[:, i] = sc.fit_transform(X_train.iloc[:, i].values.reshape(-1,1))
#     X_test.iloc[:, i] = sc.transform(X_test.iloc[:, i].values.reshape(-1,1))

# X_train = X_train.dropna(axis = 0)
# X_test = X_test.dropna(axis = 0)

In [14]:
X_train.head()

Unnamed: 0,FG_AVG,FGA_AVG,FG%_AVG,2P_AVG,2PA_AVG,2P%_AVG,3P_AVG,3PA_AVG,3P%_AVG,FT_AVG,...,AST%_players_AVG,STL%_players_AVG,BLK%_players_AVG,TOV%_players_AVG,USG%_players_AVG,ORtg_players_AVG,DRtg_players_AVG,BPM_players_AVG,A/T_AVG,FG_AVG_AVG
919,-4.7,-6.8,-0.02,-8.9,-16.0,-0.0147,4.2,9.2,0.0303,5.1,...,2.430715,-0.113888,-0.62035,4.384247,-0.203234,-0.292201,2.035861,0.140996,-0.819015,-0.175958
598,-1.9,2.9,-0.0361,0.3,7.5,-0.0684,-2.2,-4.6,-0.0128,3.6,...,-1.920106,-0.250308,0.047081,-2.355621,-0.398514,-10.843264,-3.917035,0.077499,-0.114248,-2.379104
1069,-0.5,3.1,-0.0202,-4.4,-3.9,-0.0422,3.9,7.0,0.0318,0.2,...,0.198642,-0.297794,0.270507,1.135591,0.009067,-6.694389,-8.831611,0.302174,-0.283942,0.717292
855,0.3,4.8,-0.0206,2.5,5.9,-0.0134,-2.2,-1.1,-0.0467,-0.3,...,-2.196693,-0.531835,0.161914,0.605864,-0.411165,-9.800951,0.681312,-0.051518,-0.398359,0.388583
1131,-1.6,-0.6,-0.0137,-1.7,-5.2,0.0219,0.1,4.6,-0.0463,5.2,...,0.124578,0.119547,-0.731026,2.332434,0.212445,-3.58191,-10.512507,0.43666,-0.60128,1.017479


In [15]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 59), (201, 59), (800,), (201,))

# Feature Selection

In [16]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

df_feature_selection = pd.DataFrame(columns = ['Columns Number', 'Accuracy'])

for i in range(1,int(X_train.shape[1]/5)):
    selector = SelectKBest(f_classif, k=i*5)
    X_train_processed = selector.fit_transform(X_train, y_train)
    X_test_processed = selector.transform(X_test)

    classifier = RandomForestClassifier(n_jobs=2, random_state=0)

    classifier.fit(X_train_processed, y_train)
    y_pred = classifier.predict(X_test_processed)
    
    df_feature_selection.loc[i] = [5*i, str(accuracy_score(y_test, y_pred))]

In [17]:
df_feature_selection

Unnamed: 0,Columns Number,Accuracy
1,5,0.5422885572139303
2,10,0.5870646766169154
3,15,0.6069651741293532
4,20,0.6019900497512438
5,25,0.6019900497512438
6,30,0.5870646766169154
7,35,0.6218905472636815
8,40,0.6119402985074627
9,45,0.6368159203980099
10,50,0.582089552238806


In [18]:
best = df_feature_selection[df_feature_selection['Accuracy'] 
                            == df_feature_selection.Accuracy.max()]['Columns Number'].tolist()
best = int(best[0])

In [19]:
selector = SelectKBest(f_classif, k=best)
selector.fit(X_train, y_train)

SelectKBest(k=45)

In [20]:
cols = selector.get_support(indices=True)
X_train = X_train.iloc[:,cols]
X_test = X_test.iloc[:,cols]

In [21]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 45), (201, 45), (800,), (201,))

# Write out CSV files

In [22]:
df_train = X_train.join(y_train)
df_test = X_test.join(y_test)

In [23]:
df_train.to_csv('./df_train.csv')
df_test.to_csv('./df_test.csv')