### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import gc
import warnings
warnings.filterwarnings("ignore")

### Reading the datasets

In [2]:
df = pd.read_csv('dataset/word2vec/tfidf_stem_3.csv', index_col=0)
df.head()

Unnamed: 0,антън,бих,боксьор,бъда,бях,възможност,двубой,джошуа,имам,искам,...,действ,милион,предполож,финалния,бавен,вали,дъжд,кортът,мароко,смята
0,0.167158,0.150066,0.273615,0.125975,0.095624,0.167158,0.101884,0.150066,0.084792,0.071534,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.212419,0.0,0.0,0.0,0.0,0.0,0.0,0.190698,0.0,0.045451,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.211327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.088935,0.0,0.0,0.0,0.07886,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
df_class = pd.read_csv('dataset/features_norm.csv', index_col=0)
df_class.head()

Unnamed: 0,Sport,Interviewee,Opponent,Sex,Rank,Rank Opp.,Int. Age,Opp. Age,Health,Psychics,...,dayofweek,dayofyear,hour,month,quarter,week,weekday,weekofyear,year,Class
0,0.0,0.088889,0.391304,1.0,0.125461,0.774908,0.948718,0.409091,1.0,1.0,...,0.833333,0.794479,0.0,0.8,0.666667,0.787234,0.833333,0.787234,0.888889,0.0
1,0.0,0.733333,0.391304,1.0,0.118081,0.811808,0.692308,0.454545,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
2,0.0,0.377778,0.108696,1.0,0.774908,0.125461,0.666667,0.909091,1.0,1.0,...,0.833333,0.794479,0.0,0.8,0.666667,0.787234,0.833333,0.787234,0.888889,1.0
3,0.0,0.377778,0.73913,1.0,0.811808,0.118081,0.692308,0.454545,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.377778,0.130435,1.0,0.664207,0.059041,0.641026,1.0,1.0,1.0,...,0.833333,0.346626,0.0,0.3,0.333333,0.340426,0.833333,0.340426,0.777778,1.0


In [4]:
df['Class'] = df_class['Class']
df.head()

Unnamed: 0,антън,бих,боксьор,бъда,бях,възможност,двубой,джошуа,имам,искам,...,милион,предполож,финалния,бавен,вали,дъжд,кортът,мароко,смята,Class
0,0.167158,0.150066,0.273615,0.125975,0.095624,0.167158,0.101884,0.150066,0.084792,0.071534,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.212419,0.0,0.0,0.0,0.0,0.0,0.0,0.190698,0.0,0.045451,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.211327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.088935,0.0,0.0,0.0,0.07886,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [5]:
# df_norm = pd.read_csv('Dataset/pool_matches_numerical_binary_classified_normalized.csv', index_col=0)
# df_norm.head()
df_norm = df.copy()

### Do sampling if needed

Stratified Sampling (ratio = 0.1)

In [6]:
def sampling(df, sampling=True, ratio=0.1):
    if not sampling:
        print('NOT SAMPLING!')
        return df
    
    sample_1 = df.loc[df.Class==1].sample(frac=ratio, replace=False)
    print('label 1 sample size:', str(sample_1.shape[0]))
    sample_0 = df.loc[df.Class==0].sample(frac=ratio, replace=False)
    print('label 0 sample size:', str(sample_0.shape[0]))
    application = pd.concat([sample_1, sample_0], axis=0) #.sort_values('Match_ID')
    
    return application

In [7]:
application = sampling(df.copy(), sampling=False)

NOT SAMPLING!


Impute missing values

In [8]:
categorical_list = []
numerical_list = []
for i in application.columns.tolist():
    if application[i].dtype=='object':
        categorical_list.append(i)
    else:
        numerical_list.append(i)
print('Number of categorical features:', str(len(categorical_list)))
print('Number of numerical features:', str(len(numerical_list)))

Number of categorical features: 0
Number of numerical features: 1454


In [9]:
# from sklearn.preprocessing import Imputer
# application[numerical_list] = Imputer(strategy='median').fit_transform(application[numerical_list])

Deal with Categorical features: OneHotEncoding

In [10]:
# del df; gc.collect()
# application = pd.get_dummies(application, drop_first=True)
# print(application.shape)

Feature matrix and target

In [11]:
X = application.drop(['Class'], axis=1)
y = application.Class
feature_name = X.columns.tolist()

### Feature Selection
- select ___100___ features from ___226___
- ***xxx_support***: list to represent select this feature or not
- ***xxx_feature***: the name of selected features

### 1 Filter

#### 1.1 Pearson Correlation

___Note___
- Normalization: no
- Impute missing values: yes

In [12]:
def cor_selector(X, y):
    cor_list = []
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-100:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature

In [13]:
cor_support, cor_feature = cor_selector(X, y)
print(str(len(cor_feature)), 'selected features')

100 selected features


#### 1.2 Chi-2
___Note___
- Normalization: MinMaxScaler (values should be bigger than 0)
- Impute missing values: yes

In [14]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=100)
chi_selector.fit(X_norm, y)

SelectKBest(k=100, score_func=<function chi2 at 0x000000000C3DA510>)

In [15]:
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

100 selected features


### 2 Wrapper

___Note___
- Normalization: depend on the used model; yes for LR
- Impute missing values: depend on the used model; yes for LR

In [16]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=100, step=10, verbose=5)
rfe_selector.fit(X_norm, y)

Fitting estimator with 1453 features.
Fitting estimator with 1443 features.
Fitting estimator with 1433 features.
Fitting estimator with 1423 features.
Fitting estimator with 1413 features.
Fitting estimator with 1403 features.
Fitting estimator with 1393 features.
Fitting estimator with 1383 features.
Fitting estimator with 1373 features.
Fitting estimator with 1363 features.
Fitting estimator with 1353 features.
Fitting estimator with 1343 features.
Fitting estimator with 1333 features.
Fitting estimator with 1323 features.
Fitting estimator with 1313 features.
Fitting estimator with 1303 features.
Fitting estimator with 1293 features.
Fitting estimator with 1283 features.
Fitting estimator with 1273 features.
Fitting estimator with 1263 features.
Fitting estimator with 1253 features.
Fitting estimator with 1243 features.
Fitting estimator with 1233 features.
Fitting estimator with 1223 features.
Fitting estimator with 1213 features.
Fitting estimator with 1203 features.
Fitting esti

RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
  n_features_to_select=100, step=10, verbose=5)

In [17]:
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')

100 selected features


### 3. Embeded

#### 3.1 Logistics Regression L1
___Note___
- Normalization: Yes
- Impute missing values: Yes

In [18]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l1"), '1.25*median')
embeded_lr_selector.fit(X_norm, y)

SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
        norm_order=1, prefit=False, threshold='1.25*median')

In [19]:
embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')

1453 selected features


#### 3.2 Random Forest
___Note___
- Normalization: No
- Impute missing values: Yes

In [20]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold='1.25*median')
embeded_rf_selector.fit(X, y)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
        norm_order=1, prefit=False, threshold='1.25*median')

In [21]:
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')

1453 selected features


#### 3.3 LightGBM
___Note___
- Normalization: No
- Impute missing values: No

In [22]:
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

lgbc=LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
            reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

embeded_lgb_selector = SelectFromModel(lgbc, threshold='1.25*median')
embeded_lgb_selector.fit(X, y)

SelectFromModel(estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.2,
        importance_type='split', learning_rate=0.05, max_depth=-1,
        min_child_samples=20, min_child_weight=40, min_split_gain=0.01,
        n_estimators=500, n_jobs=-1, num_leaves=32, objective=None,
        random_state=None, reg_alpha=3, reg_lambda=1, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
        norm_order=1, prefit=False, threshold='1.25*median')

In [23]:
embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = X.loc[:,embeded_lgb_support].columns.tolist()
print(str(len(embeded_lgb_feature)), 'selected features')

1453 selected features


### Summary

In [24]:
pd.set_option('display.max_rows', None)
# put all selection together
feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embeded_lr_support,
                                    'Random Forest':embeded_rf_support, 'LightGBM':embeded_lgb_support})
# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
# display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(100)

Unnamed: 0,Chi-2,Feature,LightGBM,Logistics,Pearson,RFE,Random Forest,Total
1,True,шанс,True,True,True,True,True,6
2,True,чувств,True,True,True,True,True,6
3,True,форма,True,True,True,True,True,6
4,True,феновет,True,True,True,True,True,6
5,True,факта,True,True,True,True,True,6
6,True,участва,True,True,True,True,True,6
7,True,уважава,True,True,True,True,True,6
8,True,треньор,True,True,True,True,True,6
9,True,тренировъч,True,True,True,True,True,6
10,True,тежък,True,True,True,True,True,6


### Embeded Random Forest Features

In [25]:
embeded_rf_feature

['антън',
 'бих',
 'боксьор',
 'бъда',
 'бях',
 'възможност',
 'двубой',
 'джошуа',
 'имам',
 'искам',
 'категори',
 'кличко',
 'лагер',
 'найсил',
 'намира',
 'отколко',
 'отлич',
 'подаря',
 'покаже',
 'полоша',
 'послаб',
 'проведох',
 'ринга',
 'света',
 'силен',
 'среща',
 'тежка',
 'тренировъч',
 'феновет',
 'форма',
 'хубав',
 'щастлив',
 'битка',
 'биткат',
 'бъдете',
 'бързин',
 'видеоклипове',
 'видите',
 'видят',
 'видях',
 'възползва',
 'гледаме',
 'говоря',
 'гонгът',
 'готов',
 'движени',
 'знам',
 'играт',
 'игров',
 'изненада',
 'изправ',
 'импровизира',
 'майк',
 'мисли',
 'надява',
 'найваж',
 'негови',
 'обработва',
 'остан',
 'план',
 'подготв',
 'подценяв',
 'подценява',
 'покажа',
 'приближава',
 'придържа',
 'работа',
 'работи',
 'разбер',
 'слабост',
 'страшна',
 'тайсън',
 'твърде',
 'точност',
 'удари',
 'фасулск',
 'фокусира',
 'шокира',
 'бързи',
 'голямо',
 'играем',
 'имах',
 'качеств',
 'лоурънс',
 'найиздръжлив',
 'найлек',
 'настран',
 'означав',
 'окаж

### Top Features

In [26]:
top_1_features_df = feature_selection_df.loc[feature_selection_df['Total']==6]
top_1_features = top_1_features_df['Feature']

In [27]:
top_2_features_df = feature_selection_df.loc[feature_selection_df['Total']==5]
top_2_features = top_2_features_df['Feature']

In [28]:
top_1_features

1           шанс
2         чувств
3          форма
4        феновет
5          факта
6        участва
7        уважава
8        треньор
9     тренировъч
10         тежък
11        такава
12        стойка
13         стане
14       срещата
15      способен
16       специал
17         софия
18         силен
19        самият
20        разбир
21           път
22         пулев
23       приятел
24       пресира
25        почивк
26     постижени
27        помогн
28        получи
29       оценява
30         остан
31          общо
32     нокаутира
33       неприят
34     необходим
35        нататъ
36        направ
37        найваж
38         надал
39         място
40           моя
41       миналат
42        кубрат
43          край
44          клей
45       качеств
46         карам
47        каквот
48          кажа
49       истинск
50    интересува
51         изобщ
52         играт
53          земя
54         здрав
55       единств
56         дължи
57         добри
58        децата
59         гол

In [29]:
top_2_features

68         чужбин
69        чудесно
70       фокусира
71          успях
72     състезател
73      страхотно
74       световен
75         радвам
76         процес
77        проблем
78        предлож
79         покаже
80      подобрият
81       подготвя
82        подготв
83           план
84          обрат
85          никак
86         наясно
87     напоследък
88         найдоб
89        мотивир
90         момчет
91           млад
92           клуб
93          искам
94          иначе
95        доказва
96         докажа
97         двубой
98          върха
99          видим
100         важен
101        бързин
Name: Feature, dtype: object

### Filters

In [30]:
top_1_features = list(top_1_features)

In [31]:
top_1_features_class = top_1_features + ['Class']

In [32]:
top_2_features = list(top_2_features)

In [33]:
top_2_features_class = top_2_features + ['Class']

In [34]:
top_1_and_2_features = top_1_features + top_2_features

In [35]:
top_1_and_2_features_class = top_1_and_2_features + ['Class']

### New Dataframes

In [36]:
df_top_1 = df.loc[:, top_1_features_class]
df_top_1_and_2 = df.loc[:, top_1_and_2_features_class]

### Saving Dataframes

In [37]:
file_name = 'dataset/w2v_fs/df_top_1_stem_3.csv'
df_top_1.to_csv(file_name, sep=',', encoding='utf-8', header=True, index=True)

In [38]:
file_name = 'dataset/w2v_fs/df_top_1_and_2_stem_3.csv'
df_top_1_and_2.to_csv(file_name, sep=',', encoding='utf-8', header=True, index=True)