# Feature Selection

In [101]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['figure.facecolor'] = 'white' # Since I use a dark IDE

# To allow multiple outputs per cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection  import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

In [102]:
wba_data = pd.read_csv('wba_data_CLEAN.csv')
wba_data.head()

Unnamed: 0.1,Unnamed: 0,Subject,Gender,DOT,DOB,AgeM,AgeY,Ethnicity,PTA0.5,PTA1,...,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15
0,0,8,0,2016-05-12 00:00:00,2006-12-18 00:00:00,112,9.333333,0.0,20,20,...,-0.405697,-2.841202,1.594642,0.724436,-0.579289,-0.553364,0.704006,-0.409157,0.072477,-0.217327
1,1,22,0,2016-05-18 00:00:00,2006-10-27 00:00:00,114,9.5,0.0,20,20,...,1.479339,1.668937,-0.048483,0.608993,0.251156,-0.710949,0.604375,-0.61695,0.637559,-0.469144
2,2,24,1,2016-05-18 00:00:00,2006-07-02 00:00:00,118,9.833333,0.0,20,20,...,-0.343253,-0.215469,-0.777275,-1.70365,0.510768,-0.413348,-0.206985,-0.375959,0.339409,-0.049631
3,3,31,0,2016-05-19 00:00:00,2006-06-01 00:00:00,119,9.916667,0.0,20,20,...,-0.60394,1.065511,1.333238,0.796636,0.454654,-0.116502,0.247162,0.303749,-0.243954,-0.046242
4,4,42,0,2016-05-20 00:00:00,2008-06-03 00:00:00,95,7.916667,0.0,20,20,...,-0.434886,2.172756,0.895713,1.079165,1.445886,0.345314,-1.758196,0.268712,-0.148645,-0.204201


Removing PoF variables

In [105]:
features = list(wba_data.columns)
print(len(features))

for feature in features:
    if feature == "OverallPoF":
        continue
    elif feature[-3:] == "PoF" or feature[-4:] == "PoFC":
        print("Removing {}".format(feature))
        features.remove(feature)

print(len(features))


153
Removing PTAAv4FAPoF
Removing TympPoF
Removing OAEPoF
150


Removing PCAs ** have to run the cell 4 times?:

In [86]:
for i in range(4):
    for feature in features:
        if feature[:2] == "PC":
            print("Removing {}".format(feature))
            features.remove(feature)

# features

Removing PC1
Removing PC3
Removing PC5
Removing PC7
Removing PC9
Removing PC11
Removing PC13
Removing PC15
Removing PC2
Removing PC6
Removing PC10
Removing PC14
Removing PC4
Removing PC12
Removing PC8


In [87]:
unwanted_cols = ['Unnamed: 0', 'DOB', 'DOT', 'AgeM']

for col in unwanted_cols:
    print("Removing {}".format(col))
    features.remove(col)

Removing Unnamed: 0
Removing DOB
Removing DOT
Removing AgeM


In [99]:
features

['Unnamed: 0',
 'Subject',
 'Gender',
 'DOT',
 'DOB',
 'AgeM',
 'AgeY',
 'Ethnicity',
 'PTA0.5',
 'PTA1',
 'PTA2',
 'PTA4',
 'PTAAv4FA',
 'PTAAv4FAPoFC',
 'ECV',
 'TPP',
 'SC',
 'TympType',
 'OAE1',
 'OAE1.4',
 'OAE2',
 'OAE2.8',
 'OAE4',
 'OAEPoFC',
 'OverallPoF',
 'EarSide',
 'Ear coded',
 'Pressure',
 'f(226.0000)',
 'f(257.3256)',
 'f(280.6155)',
 'f(297.3018)',
 'f(324.2099)',
 'f(343.4884)',
 'f(363.9133)',
 'f(385.5527)',
 'f(408.4789)',
 'f(432.7683)',
 'f(458.5020)',
 'f(471.9372)',
 'f(500.0000)',
 'f(514.6511)',
 'f(545.2539)',
 'f(561.2310)',
 'f(577.6763)',
 'f(594.6036)',
 'f(629.9605)',
 'f(648.4198)',
 'f(667.4199)',
 'f(686.9768)',
 'f(707.1068)',
 'f(727.8266)',
 'f(749.1535)',
 'f(771.1054)',
 'f(793.7005)',
 'f(816.9577)',
 'f(840.8964)',
 'f(865.5366)',
 'f(890.8987)',
 'f(917.0040)',
 'f(943.8743)',
 'f(971.5319)',
 'f(1000.0000)',
 'f(1029.3022)',
 'f(1059.4631)',
 'f(1090.5077)',
 'f(1122.4620)',
 'f(1155.3527)',
 'f(1189.2071)',
 'f(1224.0535)',
 'f(1259.9210)'

In [98]:
y = wba_data['OverallPoF']

X = wba_data[features]
X = X.loc[:, X.columns != 'OverallPoF']

# X = X.drop(['Unnamed: 0', 'DOB', 'DOT', 'AgeM'], axis=1)
X.columns

Index(['Unnamed: 0', 'Subject', 'Gender', 'DOT', 'DOB', 'AgeM', 'AgeY',
       'Ethnicity', 'PTA0.5', 'PTA1',
       ...
       'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11', 'PC12', 'PC13', 'PC14',
       'PC15'],
      dtype='object', length=149)

In [36]:
X_train, y_train, X_test, y_test = train_test_split(X, y, test_size=0.1, stratify=y)

In [40]:
feature_select = SelectFromModel(
    RandomForestClassifier(n_estimators = 100)
    )

feature_select.fit(X_train, y_train)

ValueError: could not convert string to float: '2016-10-05 00:00:00'

In [39]:
feature_select.get_support()

ValueError: Either fit the model before transform or set "prefit=True" while passing the fitted estimator to the constructor.