# Feature Importance

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('seaborn')
import seaborn as sn
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, confusion_matrix
import HelpfulFunctions as hp

## Loading Data Wisconsin Breast Cancer Dataset

In [3]:
from sklearn.datasets import load_breast_cancer
breastCancerData = load_breast_cancer()
X = pd.DataFrame(breastCancerData['data'],columns = breastCancerData['feature_names'])
y = breastCancerData['target']

In [4]:
feature_names = X.columns
n_feats = len(feature_names)

## Standardising the Data

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X_standardised = scaler.transform(X)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_standardised,y,test_size=0.3,random_state = 101)

## Fit Range of Models

In [7]:
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression(solver='lbfgs')
log_model.fit(X_train,y_train)

LogisticRegression()

In [8]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier()
knn_model.fit(X_train,y_train)

KNeighborsClassifier()

In [9]:
from sklearn.tree import DecisionTreeClassifier
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train,y_train)

DecisionTreeClassifier()

In [10]:
from sklearn.naive_bayes import GaussianNB
bayes_model = GaussianNB()
bayes_model.fit(X_train,y_train)

GaussianNB()

In [11]:
from sklearn.svm import SVC
svc_model = SVC(gamma = "auto")
svc_model.fit(X_train,y_train)

SVC(gamma='auto')

In [14]:
from sklearn.ensemble import RandomForestClassifier
rnd_model = RandomForestClassifier(n_estimators = 100)
rnd_model.fit(X_train,y_train)

RandomForestClassifier()

## Prediction Accuracy

In [15]:
model_list = [log_model,knn_model,tree_model,bayes_model,svc_model,mlp_model,rnd_model]

In [16]:
for model in model_list:
    predictions = model.predict(X_test)
    cm = confusion_matrix(y_test,predictions)
    print(model.__class__)
    print('Test Accuracy: %.3f' %model.score(X_test, y_test))

<class 'sklearn.linear_model._logistic.LogisticRegression'>
Test Accuracy: 0.977
<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Test Accuracy: 0.953
<class 'sklearn.tree._classes.DecisionTreeClassifier'>
Test Accuracy: 0.918
<class 'sklearn.naive_bayes.GaussianNB'>
Test Accuracy: 0.918
<class 'sklearn.svm._classes.SVC'>
Test Accuracy: 0.977
<class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>
Test Accuracy: 0.959
<class 'sklearn.ensemble._forest.RandomForestClassifier'>
Test Accuracy: 0.953


## Feature Importance using Individual Features

In [17]:
model_list = [log_model,knn_model,tree_model,bayes_model,svc_model,rnd_model]

In [18]:
from sklearn.model_selection import cross_val_score

In [19]:
import numpy

In [20]:
for model in model_list:

    print("----------------------------------------------------")    
    print(model.__class__)
    scores_list = []

    for i in range(n_feats):
        X_one_feature = X_train[:, i].reshape(-1, 1)
        scores = cross_val_score(model, X_one_feature, y_train, cv=5)
        scores_mean = scores.mean()
        scores_list.append(scores.mean())

    sorted_indices = numpy.argsort(np.array(scores_list) * -1) # negate to have descending

    for i in range(0,5): # top 5 features
        index = sorted_indices[i]
        print(i, ":", feature_names[index], scores_list[index])
    
print("----------------------------------------------------")

----------------------------------------------------
<class 'sklearn.linear_model._logistic.LogisticRegression'>
0 : worst concave points 0.9196518987341772
1 : worst area 0.9195569620253166
2 : mean concave points 0.9172468354430379
3 : worst perimeter 0.9170886075949367
4 : worst radius 0.9145253164556962
----------------------------------------------------
<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
0 : mean concave points 0.9046835443037974
1 : worst concave points 0.9046518987341774
2 : worst radius 0.8944620253164557
3 : worst area 0.889493670886076
4 : worst perimeter 0.8868354430379748
----------------------------------------------------
<class 'sklearn.tree._classes.DecisionTreeClassifier'>
0 : worst area 0.8845886075949366
1 : worst radius 0.8819303797468354
2 : worst concave points 0.8795569620253165
3 : worst perimeter 0.8568037974683544
4 : mean perimeter 0.8543987341772151
----------------------------------------------------
<class 'sklearn.naive_baye

## Feature Importance using Recursive Feature Elimination

In [21]:
from sklearn.feature_selection import RFE

In [22]:
# We cannot run this on all models
model_list = [log_model,tree_model,rnd_model]

In [43]:
for model in model_list:
    
    print("-------------------------------------------------")
    
    rfe = RFE(estimator=model, n_features_to_select=5)
    
    print(model.__class__)
    rfe.fit(X_train, y_train)

    for i in range(0,len(rnk)):
        if rfe.support_[i] == True:
            print(feature_names[i], end="\n")
    print("")


-------------------------------------------------
<class 'sklearn.linear_model._logistic.LogisticRegression'>
mean concave points
worst radius
worst texture
worst area
worst concave points

-------------------------------------------------
<class 'sklearn.tree._classes.DecisionTreeClassifier'>
area error
fractal dimension error
worst texture
worst area
worst concave points

-------------------------------------------------
<class 'sklearn.ensemble._forest.RandomForestClassifier'>
mean concave points
worst radius
worst perimeter
worst area
worst concave points

