#### Going from 55 features down to the most important 10 without much loss of accuracy

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

In [2]:
# Load the cleaned training set that the feature selection below starts from.
input_path = './Dataset/train_selected_cleaned.csv'
df = pd.read_csv(input_path)

In [3]:
df.shape
# 56 columns as of now: 55 feature columns plus the 'prognosis' target column

(1200, 56)

In [4]:
# One boolean mask over the columns: True only for the 'prognosis' target.
is_target = df.columns == 'prognosis'

# Everything that is not the target is a feature. Selecting with a mask keeps
# both results two-dimensional, so y is a column array rather than a flat vector.
X = np.array(df.loc[:, ~is_target])
y = np.array(df.loc[:, is_target])

In [5]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
split = train_test_split(X, y, test_size=0.25, random_state=37)
X_train, X_test, y_train, y_test = split

In [6]:
# Base estimator handed to the feature-elimination step: a seeded forest of 1000 trees.
rf = RandomForestClassifier(
    random_state=0,
    n_estimators=1000,
)

In [10]:
# Recursive feature elimination wrapped around the random forest.
# Keep 10 features: that is the notebook's stated goal and it matches the
# recorded outputs (ten rank-1 entries in the ranking, ten selected names).
# n_features_to_select is passed by keyword because recent scikit-learn
# releases no longer accept it as a positional argument.
selector = RFE(rf, n_features_to_select=10)

In [11]:
# Flatten the target to one dimension before fitting the selector on the training split.
y_train_flat = y_train.ravel()
selector = selector.fit(X_train, y_train_flat)

In [9]:
selector.score(X_test, y_test) # score on the held-out 25% split: ~0.967, i.e. about 97% accuracy with 10 features

0.9666666666666667

In [11]:
print(selector.ranking_) # feature ranking, one entry per feature column; entries equal to 1 are the features kept

[13  1 27  1 19 31 28  1 20 26  1 46 43 33 45 12 14 17  1  1  9  2 15  4
 40 18 24 30 34 44 23 35 29  3 22 11  1  1 36  5 16  1 41  8 21 38 42  1
 37 32 39  6  7 25 10]


In [12]:
# The selector was fitted on the feature columns only (everything except
# 'prognosis'), so its per-feature results must be matched against that same
# column list. Indexing the full df.columns by position is only correct when
# 'prognosis' happens to be the last column.
feature_names = df.columns[df.columns != 'prognosis']

# support_ is the selector's boolean mask of kept features (the rank-1 entries).
# tolist() yields a plain Python list so later cells can append to it.
selected = feature_names[selector.support_].tolist()

In [13]:
# Show the names of the columns whose ranking is 1
selected

['skin_rash',
 'continuous_sneezing',
 'acidity',
 'fatigue',
 'nausea',
 'loss_of_appetite',
 'chest_pain',
 'fast_heart_rate',
 'bladder_discomfort',
 'muscle_pain']

In [14]:
# Keep the target column alongside the selected features. The membership
# check makes this cell safe to re-run without appending a duplicate entry.
if 'prognosis' not in selected:
    selected.append('prognosis')

In [15]:
# Boolean mask over the columns: True for every column named in `selected`.
keep_column = df.columns.isin(selected)
df = df.loc[:, keep_column]

In [16]:
# Write the column-filtered frame to disk, leaving out the row index.
output_path = './Dataset/train_selected_cleaned_reduced.csv'
df.to_csv(output_path, index=False)