### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import gc
import warnings
warnings.filterwarnings("ignore")

### Reading the datasets

In [2]:
df = pd.read_csv('dataset/features_norm.csv', index_col=0)
df.head()

Unnamed: 0,Sport,Interviewee,Opponent,Sex,Rank,Rank Opp.,Int. Age,Opp. Age,Health,Psychics,...,dayofweek,dayofyear,hour,month,quarter,week,weekday,weekofyear,year,Class
0,0.0,0.088889,0.391304,1.0,0.125461,0.774908,0.948718,0.409091,1.0,1.0,...,0.833333,0.794479,0.0,0.8,0.666667,0.787234,0.833333,0.787234,0.888889,0.0
1,0.0,0.733333,0.391304,1.0,0.118081,0.811808,0.692308,0.454545,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
2,0.0,0.377778,0.108696,1.0,0.774908,0.125461,0.666667,0.909091,1.0,1.0,...,0.833333,0.794479,0.0,0.8,0.666667,0.787234,0.833333,0.787234,0.888889,1.0
3,0.0,0.377778,0.73913,1.0,0.811808,0.118081,0.692308,0.454545,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.377778,0.130435,1.0,0.664207,0.059041,0.641026,1.0,1.0,1.0,...,0.833333,0.346626,0.0,0.3,0.333333,0.340426,0.833333,0.340426,0.777778,1.0


In [None]:
# df_norm = pd.read_csv('Dataset/pool_matches_numerical_binary_classified_normalized.csv', index_col=0)
# df_norm.head()
df_norm = df.copy()

### Do sampling if needed

Stratified Sampling (ratio = 0.1)

In [3]:
def sampling(df, sampling=True, ratio=0.1):
    if not sampling:
        print('NOT SAMPLING!')
        return df
    
    sample_1 = df.loc[df.Class==1].sample(frac=ratio, replace=False)
    print('label 1 sample size:', str(sample_1.shape[0]))
    sample_0 = df.loc[df.Class==0].sample(frac=ratio, replace=False)
    print('label 0 sample size:', str(sample_0.shape[0]))
    application = pd.concat([sample_1, sample_0], axis=0) #.sort_values('Match_ID')
    
    return application

In [4]:
application = sampling(df.copy(), sampling=False)

NOT SAMPLING!


Impute missing values

In [5]:
categorical_list = []
numerical_list = []
for i in application.columns.tolist():
    if application[i].dtype=='object':
        categorical_list.append(i)
    else:
        numerical_list.append(i)
print('Number of categorical features:', str(len(categorical_list)))
print('Number of numerical features:', str(len(numerical_list)))

Number of categorical features: 0
Number of numerical features: 23


In [None]:
# from sklearn.preprocessing import Imputer
# application[numerical_list] = Imputer(strategy='median').fit_transform(application[numerical_list])

Deal with Categorical features: OneHotEncoding

In [None]:
# del df; gc.collect()
# application = pd.get_dummies(application, drop_first=True)
# print(application.shape)

Feature matrix and target

In [6]:
X = application.drop(['Class'], axis=1)
y = application.Class
feature_name = X.columns.tolist()

### Feature Selection
- select ___100___ features from ___226___
- ***xxx_support***: list to represent select this feature or not
- ***xxx_feature***: the name of selected features

### 1 Filter

#### 1.1 Pearson Correlation

___Note___
- Normalization: no
- Impute missing values: yes

In [7]:
def cor_selector(X, y):
    cor_list = []
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-100:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature

In [8]:
cor_support, cor_feature = cor_selector(X, y)
print(str(len(cor_feature)), 'selected features')

22 selected features


#### 1.2 Chi-2
___Note___
- Normalization: MinMaxScaler (values should be bigger than 0)
- Impute missing values: yes

In [10]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=10)
chi_selector.fit(X_norm, y)

SelectKBest(k=10, score_func=<function chi2 at 0x000000000B0DB268>)

In [11]:
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

10 selected features


### 2 Wrapper

___Note___
- Normalization: depend on the used model; yes for LR
- Impute missing values: depend on the used model; yes for LR

In [12]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=100, step=10, verbose=5)
rfe_selector.fit(X_norm, y)

RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
  n_features_to_select=100, step=10, verbose=5)

In [13]:
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')

22 selected features


### 3. Embeded

#### 3.1 Logistics Regression L1
___Note___
- Normalization: Yes
- Impute missing values: Yes

In [14]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l1"), '1.25*median')
embeded_lr_selector.fit(X_norm, y)

SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
        norm_order=1, prefit=False, threshold='1.25*median')

In [15]:
embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')

22 selected features


#### 3.2 Random Forest
___Note___
- Normalization: No
- Impute missing values: Yes

In [16]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold='1.25*median')
embeded_rf_selector.fit(X, y)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
        norm_order=1, prefit=False, threshold='1.25*median')

In [17]:
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')

8 selected features


#### 3.3 LightGBM
___Note___
- Normalization: No
- Impute missing values: No

In [18]:
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

lgbc=LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
            reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

embeded_lgb_selector = SelectFromModel(lgbc, threshold='1.25*median')
embeded_lgb_selector.fit(X, y)

SelectFromModel(estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.2,
        importance_type='split', learning_rate=0.05, max_depth=-1,
        min_child_samples=20, min_child_weight=40, min_split_gain=0.01,
        n_estimators=500, n_jobs=-1, num_leaves=32, objective=None,
        random_state=None, reg_alpha=3, reg_lambda=1, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
        norm_order=1, prefit=False, threshold='1.25*median')

In [19]:
embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = X.loc[:,embeded_lgb_support].columns.tolist()
print(str(len(embeded_lgb_feature)), 'selected features')

22 selected features


### Summary

In [20]:
pd.set_option('display.max_rows', None)
# put all selection together
feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embeded_lr_support,
                                    'Random Forest':embeded_rf_support, 'LightGBM':embeded_lgb_support})
# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
# display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(100)

Unnamed: 0,Chi-2,Feature,LightGBM,Logistics,Pearson,RFE,Random Forest,Total
1,True,year,True,True,True,True,True,6
2,True,Rank Opp.,True,True,True,True,True,6
3,True,Rank,True,True,True,True,True,6
4,True,Prev. Match,True,True,True,True,True,6
5,True,Opponent,True,True,True,True,True,6
6,True,Interviewee,True,True,True,True,True,6
7,True,Confidence,True,True,True,True,True,6
8,True,day,True,True,True,True,False,5
9,True,Sex,True,True,True,True,False,5
10,True,Opp. Age,True,True,True,True,False,5


### Embeded Random Forest Features

In [21]:
embeded_rf_feature

['Interviewee',
 'Opponent',
 'Rank',
 'Rank Opp.',
 'Int. Age',
 'Prev. Match',
 'Confidence',
 'year']

### Top Features

In [22]:
top_1_features_df = feature_selection_df.loc[feature_selection_df['Total']==6]
top_1_features = top_1_features_df['Feature']

In [23]:
top_2_features_df = feature_selection_df.loc[feature_selection_df['Total']==5]
top_2_features = top_2_features_df['Feature']

In [24]:
top_1_features

1           year
2      Rank Opp.
3           Rank
4    Prev. Match
5       Opponent
6    Interviewee
7     Confidence
Name: Feature, dtype: object

In [25]:
top_2_features

8          day
9          Sex
10    Opp. Age
11    Int. Age
Name: Feature, dtype: object

### Filters

In [33]:
embeded_rf_feature_class = list(embeded_rf_feature) + ['Class']

In [27]:
top_1_features = list(top_1_features)

In [28]:
top_1_features_class = top_1_features + ['Class']

In [29]:
top_2_features = list(top_2_features)

In [30]:
top_2_features_class = top_2_features + ['Class']

In [31]:
top_1_and_2_features = top_1_features + top_2_features

In [32]:
top_1_and_2_features_class = top_1_and_2_features + ['Class']

### New Dataframes

In [34]:
df_top_1 = df.loc[:, top_1_features_class]
df_top_1_and_2 = df.loc[:, top_1_and_2_features_class]
df_embeded_rf_feature = df.loc[:, embeded_rf_feature_class]

### Saving Dataframes

In [35]:
file_name = 'dataset/fs/df_top_1.csv'
df_top_1.to_csv(file_name, sep=',', encoding='utf-8', header=True, index=True)

In [36]:
file_name = 'dataset/fs/df_top_1_and_2.csv'
df_top_1_and_2.to_csv(file_name, sep=',', encoding='utf-8', header=True, index=True)

In [37]:
file_name = 'dataset/fs/df_embeded_rf_feature.csv'
df_embeded_rf_feature.to_csv(file_name, sep=',', encoding='utf-8', header=True, index=True)