# Determining Feature Importance with SciKitLearn Decision Trees

Use a SciKitLearn DecisionTree to determine the importance of a given set of features

In [6]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

In [7]:
# read the csv
Kyphosis_df = pd.read_csv("kyphosis_encoded.csv")
Kyphosis_df

Unnamed: 0,Kyphosis,Age,Number,Start
0,0,71,3,5
1,0,158,3,14
2,1,128,4,5
3,0,2,5,1
4,0,1,4,15
...,...,...,...,...
76,1,157,3,13
77,0,26,7,13
78,0,120,2,13
79,1,42,7,6


In [8]:
# create dataframe for features
X = Kyphosis_df.drop(['Kyphosis'],axis=1) # feature columns
X

Unnamed: 0,Age,Number,Start
0,71,3,5
1,158,3,14
2,128,4,5
3,2,5,1
4,1,4,15
...,...,...,...
76,157,3,13
77,26,7,13
78,120,2,13
79,42,7,6


In [9]:
# create dataframe for target
y = Kyphosis_df['Kyphosis'] # target column
y

0     0
1     0
2     1
3     0
4     0
     ..
76    1
77    0
78    0
79    1
80    0
Name: Kyphosis, Length: 81, dtype: int64

In [10]:
# initialise a decision tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X, y)

DecisionTreeClassifier()

In [11]:
# use decision tree to determine feature importance

labels = ['Age', 'Number', 'Start']

feature_importances = pd.DataFrame(decision_tree.feature_importances_,
                                   index = labels,
                                   columns=['importance']).sort_values('importance',ascending=False)

print(feature_importances)

        importance
Start     0.476114
Age       0.424621
Number    0.099265
