# Feature Selection

In [41]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['figure.facecolor'] = 'white' # Since I use a dark IDE

# To allow multiple outputs per cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection  import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

In [42]:
wba_data = pd.read_csv('wba_data_CLEAN.csv')
wba_data.head()

Unnamed: 0.1,Unnamed: 0,Subject,Gender,DOT,DOB,AgeM,AgeY,Ethnicity,PTA0.5,PTA1,...,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15
0,0,8,0,2016-05-12 00:00:00,2006-12-18 00:00:00,112,9.333333,0.0,20,20,...,-0.405697,-2.841202,1.594642,0.724436,-0.579289,-0.553364,0.704006,-0.409157,0.072477,-0.217327
1,1,22,0,2016-05-18 00:00:00,2006-10-27 00:00:00,114,9.5,0.0,20,20,...,1.479339,1.668937,-0.048483,0.608993,0.251156,-0.710949,0.604375,-0.61695,0.637559,-0.469144
2,2,24,1,2016-05-18 00:00:00,2006-07-02 00:00:00,118,9.833333,0.0,20,20,...,-0.343253,-0.215469,-0.777275,-1.70365,0.510768,-0.413348,-0.206985,-0.375959,0.339409,-0.049631
3,3,31,0,2016-05-19 00:00:00,2006-06-01 00:00:00,119,9.916667,0.0,20,20,...,-0.60394,1.065511,1.333238,0.796636,0.454654,-0.116502,0.247162,0.303749,-0.243954,-0.046242
4,4,42,0,2016-05-20 00:00:00,2008-06-03 00:00:00,95,7.916667,0.0,20,20,...,-0.434886,2.172756,0.895713,1.079165,1.445886,0.345314,-1.758196,0.268712,-0.148645,-0.204201


## Removing Unwanted Variables

Removing PoF variables

In [43]:
'abcdefghijk'[-4:]

'OAEPoFC'[-4:]

'hijk'

'PoFC'

In [44]:
features = list(wba_data.columns)
print(len(features))

for feature in wba_data.columns:
    # print(feature)
    if feature == "OverallPoF":
        continue
    elif feature[-3:] == "PoF":
        print("Removing {}".format(feature))
        features.remove(feature)
    elif feature[-4:] == 'PoFC':
        print("Removing {}".format(feature))
        features.remove(feature)


print(len(features))


153
Removing PTAAv4FAPoF
Removing PTAAv4FAPoFC
Removing TympPoF
Removing OAEPoF
Removing OAEPoFC
148


In [45]:
print(features)

['Unnamed: 0', 'Subject', 'Gender', 'DOT', 'DOB', 'AgeM', 'AgeY', 'Ethnicity', 'PTA0.5', 'PTA1', 'PTA2', 'PTA4', 'PTAAv4FA', 'ECV', 'TPP', 'SC', 'TympType', 'OAE1', 'OAE1.4', 'OAE2', 'OAE2.8', 'OAE4', 'OverallPoF', 'EarSide', 'Ear coded', 'Pressure', 'f(226.0000)', 'f(257.3256)', 'f(280.6155)', 'f(297.3018)', 'f(324.2099)', 'f(343.4884)', 'f(363.9133)', 'f(385.5527)', 'f(408.4789)', 'f(432.7683)', 'f(458.5020)', 'f(471.9372)', 'f(500.0000)', 'f(514.6511)', 'f(545.2539)', 'f(561.2310)', 'f(577.6763)', 'f(594.6036)', 'f(629.9605)', 'f(648.4198)', 'f(667.4199)', 'f(686.9768)', 'f(707.1068)', 'f(727.8266)', 'f(749.1535)', 'f(771.1054)', 'f(793.7005)', 'f(816.9577)', 'f(840.8964)', 'f(865.5366)', 'f(890.8987)', 'f(917.0040)', 'f(943.8743)', 'f(971.5319)', 'f(1000.0000)', 'f(1029.3022)', 'f(1059.4631)', 'f(1090.5077)', 'f(1122.4620)', 'f(1155.3527)', 'f(1189.2071)', 'f(1224.0535)', 'f(1259.9210)', 'f(1296.8396)', 'f(1334.8399)', 'f(1373.9536)', 'f(1414.2136)', 'f(1455.6532)', 'f(1498.3071)',

Removing PCAs ** have to run the cell 4 times?:

In [46]:
for feature in wba_data.columns:
    if feature[:2] == "PC":
        print("Removing {}".format(feature))
        features.remove(feature)

# features

Removing PC1
Removing PC2
Removing PC3
Removing PC4
Removing PC5
Removing PC6
Removing PC7
Removing PC8
Removing PC9
Removing PC10
Removing PC11
Removing PC12
Removing PC13
Removing PC14
Removing PC15


In [47]:
unwanted_cols = ['Unnamed: 0', 'DOB', 'DOT', 'AgeM']

for col in unwanted_cols:
    print("Removing {}".format(col))
    features.remove(col)

Removing Unnamed: 0
Removing DOB
Removing DOT
Removing AgeM


## Converting TympType to Numerical

In [48]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()

ohe_tymptype = encoder.fit_transform(wba_data[['TympType']])

print(ohe_tymptype.toarray()[1:10])

ohe_tymptype = pd.DataFrame(ohe_tymptype)

[[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]


In [49]:
wba_data['TympType'] = ohe_tymptype

## Train Test Split

In [59]:
y = wba_data['OverallPoF']

X = wba_data[features]
X = X.loc[:, X.columns != 'OverallPoF']

### TEMPORARY
X = X.loc[:, X.columns != 'TympType']
X = X.loc[:, X.columns != 'EarSide']



# X = X.drop(['Unnamed: 0', 'DOB', 'DOT', 'AgeM'], axis=1)
X.columns

Index(['Subject', 'Gender', 'AgeY', 'Ethnicity', 'PTA0.5', 'PTA1', 'PTA2',
       'PTA4', 'PTAAv4FA', 'ECV',
       ...
       'f(6168.8433)', 'f(6349.6042)', 'f(6535.6618)', 'f(6727.1713)',
       'f(6924.2925)', 'f(7127.1897)', 'f(7336.0323)', 'f(7550.9945)',
       'f(7772.2555)', 'f(8000.0000)'],
      dtype='object', length=126)

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y)

In [74]:
len(X_train)
len(y_train)
len(X_test)
len(y_test)

215

215

24

24

In [67]:
# X_train.head()
# y_train.head()
# X_test.head()
# y_test.head()

In [81]:
feature_select = SelectFromModel(
    RandomForestClassifier(n_estimators = 200)
    )

feature_select.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(n_estimators=200))

In [86]:
feature_select.get_params(deep=True)

{'estimator__bootstrap': True,
 'estimator__ccp_alpha': 0.0,
 'estimator__class_weight': None,
 'estimator__criterion': 'gini',
 'estimator__max_depth': None,
 'estimator__max_features': 'auto',
 'estimator__max_leaf_nodes': None,
 'estimator__max_samples': None,
 'estimator__min_impurity_decrease': 0.0,
 'estimator__min_impurity_split': None,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__n_estimators': 200,
 'estimator__n_jobs': None,
 'estimator__oob_score': False,
 'estimator__random_state': None,
 'estimator__verbose': 0,
 'estimator__warm_start': False,
 'estimator': RandomForestClassifier(n_estimators=200),
 'importance_getter': 'auto',
 'max_features': None,
 'norm_order': 1,
 'prefit': False,
 'threshold': None}

In [102]:
imp_indices = feature_select.get_support(indices=True)
print(imp_indices)

imp = []

for i in imp_indices:
    imp.append(X.columns[i])
print(imp)

[10 12 13 14 15 16 45 49 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
 67 68 69 70 72]
['TPP', 'OAE1', 'OAE1.4', 'OAE2', 'OAE2.8', 'OAE4', 'f(793.7005)', 'f(890.8987)', 'f(943.8743)', 'f(971.5319)', 'f(1000.0000)', 'f(1029.3022)', 'f(1059.4631)', 'f(1090.5077)', 'f(1122.4620)', 'f(1155.3527)', 'f(1189.2071)', 'f(1224.0535)', 'f(1259.9210)', 'f(1296.8396)', 'f(1334.8399)', 'f(1373.9536)', 'f(1414.2136)', 'f(1455.6532)', 'f(1498.3071)', 'f(1542.2108)', 'f(1587.4011)', 'f(1633.9155)', 'f(1731.0731)']


In [82]:
important = feature_select.get_support()

In [83]:
print(X.columns[important])

Index(['TPP', 'OAE1', 'OAE1.4', 'OAE2', 'OAE2.8', 'OAE4', 'f(793.7005)',
       'f(890.8987)', 'f(943.8743)', 'f(971.5319)', 'f(1000.0000)',
       'f(1029.3022)', 'f(1059.4631)', 'f(1090.5077)', 'f(1122.4620)',
       'f(1155.3527)', 'f(1189.2071)', 'f(1224.0535)', 'f(1259.9210)',
       'f(1296.8396)', 'f(1334.8399)', 'f(1373.9536)', 'f(1414.2136)',
       'f(1455.6532)', 'f(1498.3071)', 'f(1542.2108)', 'f(1587.4011)',
       'f(1633.9155)', 'f(1731.0731)'],
      dtype='object')


In [None]:
# sns.set(rc={'figure.figsize':(30,15)})
# ax = sns.boxplot(x="variable", y="value", data=wba_df)
# ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
# ax.axhline(1.0, ls='--', color='red')
# ax.axhline(0.8, ls='--', color='red')
# ax.axhline(0.6, ls='--', color='red')
# ax.axhline(0.4, ls='--', color='red')
# ax.axhline(0.2, ls='--', color='red')
# ax.axhline(0.0, ls='--', color='red')

# sns.set(rc={'figure.figsize':(20,10)})
# plt.setp(ax.get_xticklabels()[::2], visible=False)

# plt.tight_layout()
# plt.show();

In [77]:
selected_feat= X_train.columns[(feature_select.get_support())]
len(selected_feat)

32