### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import gc
import warnings
warnings.filterwarnings("ignore")

### Reading the datasets

In [2]:
df = pd.read_csv('dataset/word2vec/tfidf_stem_1.csv', index_col=0)
df.head()

Unnamed: 0,антън,бих,боксьор,бъда,бях,възможност,двубо,джошуа,има,иска,...,успяв,харесв,дейст,милион,предполож,бавен,вали,дъжд,маро,очак
0,0.171644,0.154093,0.280958,0.129356,0.098191,0.196381,0.104618,0.154093,0.059032,0.06233,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.219982,0.0,0.0,0.0,0.0,0.0,0.0,0.197488,0.0,0.039942,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.228183,0.0,0.0,0.0,0.0,0.0,0.047943,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.093938,0.0,0.0,0.0,0.056475,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
df_class = pd.read_csv('dataset/features_norm.csv', index_col=0)
df_class.head()

Unnamed: 0,Sport,Interviewee,Opponent,Sex,Rank,Rank Opp.,Int. Age,Opp. Age,Health,Psychics,...,dayofweek,dayofyear,hour,month,quarter,week,weekday,weekofyear,year,Class
0,0.0,0.088889,0.391304,1.0,0.125461,0.774908,0.948718,0.409091,1.0,1.0,...,0.833333,0.794479,0.0,0.8,0.666667,0.787234,0.833333,0.787234,0.888889,0.0
1,0.0,0.733333,0.391304,1.0,0.118081,0.811808,0.692308,0.454545,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
2,0.0,0.377778,0.108696,1.0,0.774908,0.125461,0.666667,0.909091,1.0,1.0,...,0.833333,0.794479,0.0,0.8,0.666667,0.787234,0.833333,0.787234,0.888889,1.0
3,0.0,0.377778,0.73913,1.0,0.811808,0.118081,0.692308,0.454545,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.377778,0.130435,1.0,0.664207,0.059041,0.641026,1.0,1.0,1.0,...,0.833333,0.346626,0.0,0.3,0.333333,0.340426,0.833333,0.340426,0.777778,1.0


In [4]:
df['Class'] = df_class['Class']
df.head()

Unnamed: 0,антън,бих,боксьор,бъда,бях,възможност,двубо,джошуа,има,иска,...,харесв,дейст,милион,предполож,бавен,вали,дъжд,маро,очак,Class
0,0.171644,0.154093,0.280958,0.129356,0.098191,0.196381,0.104618,0.154093,0.059032,0.06233,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.219982,0.0,0.0,0.0,0.0,0.0,0.0,0.197488,0.0,0.039942,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.228183,0.0,0.0,0.0,0.0,0.0,0.047943,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.093938,0.0,0.0,0.0,0.056475,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [5]:
# df_norm = pd.read_csv('Dataset/pool_matches_numerical_binary_classified_normalized.csv', index_col=0)
# df_norm.head()
df_norm = df.copy()

### Do sampling if needed

Stratified Sampling (ratio = 0.1)

In [6]:
def sampling(df, sampling=True, ratio=0.1):
    if not sampling:
        print('NOT SAMPLING!')
        return df
    
    sample_1 = df.loc[df.Class==1].sample(frac=ratio, replace=False)
    print('label 1 sample size:', str(sample_1.shape[0]))
    sample_0 = df.loc[df.Class==0].sample(frac=ratio, replace=False)
    print('label 0 sample size:', str(sample_0.shape[0]))
    application = pd.concat([sample_1, sample_0], axis=0) #.sort_values('Match_ID')
    
    return application

In [7]:
application = sampling(df.copy(), sampling=False)

NOT SAMPLING!


Impute missing values

In [8]:
categorical_list = []
numerical_list = []
for i in application.columns.tolist():
    if application[i].dtype=='object':
        categorical_list.append(i)
    else:
        numerical_list.append(i)
print('Number of categorical features:', str(len(categorical_list)))
print('Number of numerical features:', str(len(numerical_list)))

Number of categorical features: 0
Number of numerical features: 1282


In [9]:
# from sklearn.preprocessing import Imputer
# application[numerical_list] = Imputer(strategy='median').fit_transform(application[numerical_list])

Deal with Categorical features: OneHotEncoding

In [10]:
# del df; gc.collect()
# application = pd.get_dummies(application, drop_first=True)
# print(application.shape)

Feature matrix and target

In [11]:
X = application.drop(['Class'], axis=1)
y = application.Class
feature_name = X.columns.tolist()

### Feature Selection
- select ___100___ features from ___226___
- ***xxx_support***: list to represent select this feature or not
- ***xxx_feature***: the name of selected features

### 1 Filter

#### 1.1 Pearson Correlation

___Note___
- Normalization: no
- Impute missing values: yes

In [12]:
def cor_selector(X, y):
    cor_list = []
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-100:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature

In [13]:
cor_support, cor_feature = cor_selector(X, y)
print(str(len(cor_feature)), 'selected features')

100 selected features


#### 1.2 Chi-2
___Note___
- Normalization: MinMaxScaler (values should be bigger than 0)
- Impute missing values: yes

In [14]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=100)
chi_selector.fit(X_norm, y)

SelectKBest(k=100, score_func=<function chi2 at 0x000000000C356598>)

In [15]:
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

100 selected features


### 2 Wrapper

___Note___
- Normalization: depend on the used model; yes for LR
- Impute missing values: depend on the used model; yes for LR

In [16]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=100, step=10, verbose=5)
rfe_selector.fit(X_norm, y)

Fitting estimator with 1281 features.
Fitting estimator with 1271 features.
Fitting estimator with 1261 features.
Fitting estimator with 1251 features.
Fitting estimator with 1241 features.
Fitting estimator with 1231 features.
Fitting estimator with 1221 features.
Fitting estimator with 1211 features.
Fitting estimator with 1201 features.
Fitting estimator with 1191 features.
Fitting estimator with 1181 features.
Fitting estimator with 1171 features.
Fitting estimator with 1161 features.
Fitting estimator with 1151 features.
Fitting estimator with 1141 features.
Fitting estimator with 1131 features.
Fitting estimator with 1121 features.
Fitting estimator with 1111 features.
Fitting estimator with 1101 features.
Fitting estimator with 1091 features.
Fitting estimator with 1081 features.
Fitting estimator with 1071 features.
Fitting estimator with 1061 features.
Fitting estimator with 1051 features.
Fitting estimator with 1041 features.
Fitting estimator with 1031 features.
Fitting esti

RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
  n_features_to_select=100, step=10, verbose=5)

In [17]:
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')

100 selected features


### 3. Embeded

#### 3.1 Logistics Regression L1
___Note___
- Normalization: Yes
- Impute missing values: Yes

In [18]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l1"), '1.25*median')
embeded_lr_selector.fit(X_norm, y)

SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
        norm_order=1, prefit=False, threshold='1.25*median')

In [19]:
embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')

1281 selected features


#### 3.2 Random Forest
___Note___
- Normalization: No
- Impute missing values: Yes

In [20]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold='1.25*median')
embeded_rf_selector.fit(X, y)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
        norm_order=1, prefit=False, threshold='1.25*median')

In [21]:
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')

1281 selected features


#### 3.3 LightGBM
___Note___
- Normalization: No
- Impute missing values: No

In [22]:
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

lgbc=LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
            reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

embeded_lgb_selector = SelectFromModel(lgbc, threshold='1.25*median')
embeded_lgb_selector.fit(X, y)

SelectFromModel(estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.2,
        importance_type='split', learning_rate=0.05, max_depth=-1,
        min_child_samples=20, min_child_weight=40, min_split_gain=0.01,
        n_estimators=500, n_jobs=-1, num_leaves=32, objective=None,
        random_state=None, reg_alpha=3, reg_lambda=1, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
        norm_order=1, prefit=False, threshold='1.25*median')

In [23]:
embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = X.loc[:,embeded_lgb_support].columns.tolist()
print(str(len(embeded_lgb_feature)), 'selected features')

1281 selected features


### Summary

In [24]:
pd.set_option('display.max_rows', None)
# put all selection together
feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embeded_lr_support,
                                    'Random Forest':embeded_rf_support, 'LightGBM':embeded_lgb_support})
# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
# display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(100)

Unnamed: 0,Chi-2,Feature,LightGBM,Logistics,Pearson,RFE,Random Forest,Total
1,True,чувст,True,True,True,True,True,6
2,True,форм,True,True,True,True,True,6
3,True,фенов,True,True,True,True,True,6
4,True,факт,True,True,True,True,True,6
5,True,участва,True,True,True,True,True,6
6,True,уважава,True,True,True,True,True,6
7,True,треньор,True,True,True,True,True,6
8,True,тренировъч,True,True,True,True,True,6
9,True,тежъ,True,True,True,True,True,6
10,True,такав,True,True,True,True,True,6


### Embeded Random Forest Features

In [25]:
embeded_rf_feature

['антън',
 'бих',
 'боксьор',
 'бъда',
 'бях',
 'възможност',
 'двубо',
 'джошуа',
 'има',
 'иска',
 'категор',
 'кличк',
 'лагер',
 'найсил',
 'намир',
 'откол',
 'отлич',
 'подар',
 'пока',
 'полош',
 'послаб',
 'прове',
 'ринг',
 'света',
 'силен',
 'среща',
 'тежк',
 'тренировъч',
 'фенов',
 'форм',
 'хубав',
 'щастлив',
 'битк',
 'бъде',
 'бързин',
 'видеоклип',
 'види',
 'видя',
 'възползва',
 'гледа',
 'говор',
 'гонг',
 'готов',
 'движен',
 'знам',
 'игр',
 'игров',
 'изненада',
 'изправ',
 'импровизира',
 'майк',
 'мисл',
 'надява',
 'найваж',
 'негов',
 'обработва',
 'ост',
 'план',
 'подготв',
 'подценяв',
 'приближава',
 'придържа',
 'работ',
 'разб',
 'слабос',
 'страшн',
 'тайсън',
 'твърд',
 'точност',
 'удар',
 'фасулс',
 'фокусир',
 'шокира',
 'бърз',
 'бързи',
 'голям',
 'игра',
 'качест',
 'лоурънс',
 'найиздръжлив',
 'найлек',
 'настр',
 'означав',
 'ока',
 'околи',
 'остав',
 'подоб',
 'полутеж',
 'притежав',
 'резк',
 'силн',
 'спаринг',
 'спече',
 'сравнени',
 'с

### Top Features

In [26]:
top_1_features_df = feature_selection_df.loc[feature_selection_df['Total']==6]
top_1_features = top_1_features_df['Feature']

In [27]:
top_2_features_df = feature_selection_df.loc[feature_selection_df['Total']==5]
top_2_features = top_2_features_df['Feature']

In [28]:
top_1_features

1          чувст
2           форм
3          фенов
4           факт
5        участва
6        уважава
7        треньор
8     тренировъч
9           тежъ
10         такав
11         стойк
12         стане
13        способ
14       специал
15          софи
16         смята
17         силен
18        решени
19        разбир
20         пулев
21         прият
22         почив
23     постижени
24        помогн
25      получава
26       получав
27          полу
28       оценява
29           общ
30     необходим
31         натат
32        направ
33      напослед
34        найваж
35          нада
36          мяст
37           моя
38       мотивир
39          кубр
40          край
41          клей
42        качест
43          кара
44          какв
45          казв
46          кажа
47        истинс
48     интересув
49           ина
50         изобщ
51           изл
52          земя
53         здрав
54       единств
55          дълж
56          дока
57          деца
58          гард
59         вяр

In [29]:
top_2_features

66          чужби
67          чудес
68           цели
69     състезател
70          среща
71          следв
72            път
73         публик
74         процес
75        продълж
76         прецен
77        предлож
78           пома
79       подготвя
80        подготв
81           план
82          обрат
83       нокаутир
84           ника
85           наяс
86          момче
87           млад
88           личн
89           кола
90           клуб
91            игр
92          други
93         доказв
94           добр
95          двубо
96          голям
97           голя
98           върх
99          втора
100         видим
101         важен
Name: Feature, dtype: object

### Filters

In [30]:
top_1_features = list(top_1_features)

In [31]:
top_1_features_class = top_1_features + ['Class']

In [32]:
top_2_features = list(top_2_features)

In [33]:
top_2_features_class = top_2_features + ['Class']

In [34]:
top_1_and_2_features = top_1_features + top_2_features

In [35]:
top_1_and_2_features_class = top_1_and_2_features + ['Class']

### New Dataframes

In [36]:
df_top_1 = df.loc[:, top_1_features_class]
df_top_1_and_2 = df.loc[:, top_1_and_2_features_class]

### Saving Dataframes

In [37]:
file_name = 'dataset/w2v_fs/df_top_1_stem_1.csv'
df_top_1.to_csv(file_name, sep=',', encoding='utf-8', header=True, index=True)

In [38]:
file_name = 'dataset/w2v_fs/df_top_1_and_2_stem_1.csv'
df_top_1_and_2.to_csv(file_name, sep=',', encoding='utf-8', header=True, index=True)