### Import Libraries

In [1]:
import pandas as pd
import time
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import permutation_importance as pfi

### Function to Implement Permutation Feature Importance for Tree Based Algorithms

In [2]:
def PFI(indep_X,dep_Y,n):
    PFI_List=[]
    RF=RandomForestClassifier(n_estimators=10,criterion='entropy',random_state=0)
    DT=DecisionTreeClassifier(criterion='gini',max_features='sqrt',splitter='best',random_state=0)
    RFO=RF.fit(indep_X,dep_Y)
    DTO=DT.fit(indep_X,dep_Y)
    PFIModelList=[DTO,RFO]
    for i in PFIModelList:
        #feature_names = indep_X.columns
        #importances = i.feature_importances_
        #indices = np.argsort(importances)[::-1]
        
        #names = [feature_names[j] for j in indices]
        #print(names)
        #plt.figure(figsize=(10, 6))
        #plt.title("Feature Importances")
        #plt.bar(range(indep_X.shape[1]), importances[indices])
        #plt.xticks(range(indep_X.shape[1]), names, rotation=90)
        #plt.xlabel("Features")
        #plt.ylabel("Importance")
        #plt.show()
        
        result = pfi(i, indep_X, dep_Y, n_repeats=30, scoring='accuracy', random_state=0, n_jobs=1)
        feature_names = indep_X.columns
        importances = result.importances
        sorted_idx = result.importances_mean.argsort()[::-1]
        names = [feature_names[j] for j in sorted_idx]
        num_features_to_select = n
        selected_feature_names = names[:num_features_to_select]
        print(selected_feature_names)
        PFI_List.append(selected_feature_names)
        
        #feature_importances = result.importances_mean[sorted_idx]
        #feature_std = result.importances_std[sorted_idx]
        #feature_names = [f"feature_{i}" for i in sorted_idx]
        #num_features_to_select = n
        #selected_feature_indices = sorted_idx[:num_features_to_select]
        #selected_feature_names = feature_names[:num_features_to_select]
        #print(f"Top {num_features_to_select} features based on permutation importance:")
        #mean=result.importances_mean
        #std=result.importances_std
        #print(names)
        #print(std)
        #sorted_idx = np.argsort(result.importances_mean)[::-1]
        #feature_importance_df_perm = pd.DataFrame({'feature': feature_names[sorted_idx],'importance_mean': result.importances_mean[sorted_idx],'importance_std': result.importances_std[sorted_idx]})
        #print(feature_importance_df_perm)
    return PFI_List

### Function to Split Train - Test Set and Standard Scaling the input

In [3]:
def split_scale(indep_X,dep_Y):
    X_train,X_test,Y_train,Y_test=train_test_split(indep_X,dep_Y,test_size=0.25,random_state=0)
    sc=StandardScaler()
    X_train=sc.fit_transform(X_train)
    X_test=sc.transform(X_test)
    return X_train,X_test,Y_train,Y_test

### Function to Generate Metric Reports: Confusion Matrix, Classification Report & Accuracy Score

In [4]:
def cm_predict(classifier,X_test):
    y_pred=classifier.predict(X_test)
    from sklearn.metrics import confusion_matrix
    cm=confusion_matrix(Y_test,y_pred)
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import classification_report
    Accuracy=accuracy_score(Y_test,y_pred)
    report=classification_report(Y_test,y_pred)
    return classifier,Accuracy,report,X_test,Y_test,cm

### Logistic Regression Function

In [5]:
def logistic(X_train,Y_train,X_test):
    from sklearn.linear_model import LogisticRegression
    classifier=LogisticRegression(random_state=0)
    classifier.fit(X_train,Y_train)
    classifier,Accuracy,report,X_test,Y_test,cm=cm_predict(classifier,X_test)
    return classifier,Accuracy,report,X_test,Y_test,cm

### SVM Linear Function

In [6]:
def svm_linear(X_train,Y_train,X_test):
    from sklearn.svm import SVC
    classifier=SVC(kernel='rbf',random_state=0)
    classifier.fit(X_train,Y_train)
    classifier,Accuracy,report,X_test,Y_test,cm=cm_predict(classifier,X_test)
    return classifier,Accuracy,report,X_test,Y_test,cm

### SVM Non-Linear Function

In [7]:
def svm_NL(X_train,Y_train,X_test):
    from sklearn.svm import SVC
    classifier=SVC(kernel='rbf',random_state=0)
    classifier.fit(X_train,Y_train)
    classifier,Accuracy,report,X_test,Y_test,cm=cm_predict(classifier,X_test)
    return classifier,Accuracy,report,X_test,Y_test,cm

### Naive Bayes Function

In [8]:
def Naive(X_train,Y_train,X_test):
    from sklearn.naive_bayes import GaussianNB
    classifier=GaussianNB()
    classifier.fit(X_train,Y_train)
    classifier,Accuracy,report,X_test,Y_test,cm=cm_predict(classifier,X_test)
    return classifier,Accuracy,report,X_test,Y_test,cm

### KNN Function

In [9]:
def knn(X_train,Y_train,X_test):
    from sklearn.neighbors import KNeighborsClassifier
    classifier=KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2)
    classifier.fit(X_train,Y_train)
    classifier,Accuracy,report,X_test,Y_test,cm=cm_predict(classifier,X_test)
    return classifier,Accuracy,report,X_test,Y_test,cm

### Decision Tree Function

In [10]:
def Decision(X_train,Y_train,X_test):
    from sklearn.tree import DecisionTreeClassifier
    classifier=DecisionTreeClassifier(criterion='entropy',random_state=0)
    classifier.fit(X_train,Y_train)
    classifier,Accuracy,report,X_test,Y_test,cm=cm_predict(classifier,X_test)
    return classifier,Accuracy,report,X_test,Y_test,cm

### Random Forest Function

In [11]:
def random(X_train,Y_train,X_test):
    from sklearn.ensemble import RandomForestClassifier
    classifier=RandomForestClassifier(n_estimators=10,criterion='entropy',random_state=0)
    classifier.fit(X_train,Y_train)
    classifier,Accuracy,report,X_test,Y_test,cm=cm_predict(classifier,X_test)
    return classifier,Accuracy,report,X_test,Y_test,cm

### Function to create Data Frame

In [12]:
def PFI_classification(alog,asvml,asvmnl,aknn,anav,adt,arf):
    PFIdataframe=pd.DataFrame(index=['DecisionTree','RandomForest'],columns=['Logistic','SVML','SVMNL','KNN','NB','DecisionTree','RandomForest'])
    for number,idex in enumerate(PFIdataframe.index):
        PFIdataframe['Logistic'][idex]=alog[number]
        PFIdataframe['SVML'][idex]=asvml[number]
        PFIdataframe['SVMNL'][idex]=asvmnl[number]
        PFIdataframe['KNN'][idex]=aknn[number]
        PFIdataframe['NB'][idex]=anav[number]
        PFIdataframe['DecisionTree'][idex]=adt[number]
        PFIdataframe['RandomForest'][idex]=arf[number]
    return PFIdataframe

### Dataset Execution

In [24]:
dataset=pd.read_csv('prep.csv',index_col=None)
df=dataset
df=pd.get_dummies(df,drop_first=True)
indep_X=df.drop('classification_yes',1)
dep_Y=df['classification_yes']
PFI_df=PFI(indep_X,dep_Y,7)
alog=[]
asvml=[]
asvmnl=[]
aknn=[]
anav=[]
adt=[]
arf=[]
PFI_0=PFI_df[0]
PFI_1=PFI_df[1]
Final=(PFI_0,PFI_1)
for i in Final:
    X=indep_X[i]
    X_train,X_test,Y_train,Y_test=split_scale(X,dep_Y)

    classifier,Accuracy,report,X_test,Y_test,cm=logistic(X_train,Y_train,X_test)
    alog.append(Accuracy)

    classifier,Accuracy,report,X_test,Y_test,cm=svm_linear(X_train,Y_train,X_test)
    asvml.append(Accuracy)

    classifier,Accuracy,report,X_test,Y_test,cm=svm_NL(X_train,Y_train,X_test)
    asvmnl.append(Accuracy)

    classifier,Accuracy,report,X_test,Y_test,cm=knn(X_train,Y_train,X_test)
    aknn.append(Accuracy)

    classifier,Accuracy,report,X_test,Y_test,cm=Naive(X_train,Y_train,X_test)
    anav.append(Accuracy)

    classifier,Accuracy,report,X_test,Y_test,cm=Decision(X_train,Y_train,X_test)
    adt.append(Accuracy)

    classifier,Accuracy,report,X_test,Y_test,cm=random(X_train,Y_train,X_test)
    arf.append(Accuracy)

result=PFI_classification(alog,asvml,asvmnl,aknn,anav,adt,arf) 
result


['sc', 'hrmo', 'bgr', 'pcv', 'bu', 'rc', 'htn_yes']
['pcv', 'sc', 'bgr', 'hrmo', 'sod', 'age', 'bu']


Unnamed: 0,Logistic,SVML,SVMNL,KNN,NB,DecisionTree,RandomForest
DecisionTree,0.95,0.97,0.97,0.95,0.81,0.95,0.96
RandomForest,0.95,0.97,0.97,0.96,0.84,0.94,0.97


In [17]:
result
# for n=3

Unnamed: 0,Logistic,SVML,SVMNL,KNN,NB,DecisionTree,RandomForest
DecisionTree,0.95,0.96,0.96,0.96,0.85,0.94,0.96
RandomForest,0.94,0.94,0.94,0.95,0.84,0.95,0.96


In [19]:
result
# for n=4

Unnamed: 0,Logistic,SVML,SVMNL,KNN,NB,DecisionTree,RandomForest
DecisionTree,0.95,0.96,0.96,0.95,0.88,0.96,0.94
RandomForest,0.95,0.96,0.96,0.95,0.88,0.98,0.96


In [21]:
result
# for n=5

Unnamed: 0,Logistic,SVML,SVMNL,KNN,NB,DecisionTree,RandomForest
DecisionTree,0.95,0.95,0.95,0.96,0.87,0.95,0.97
RandomForest,0.96,0.96,0.96,0.98,0.88,0.94,0.97


In [23]:
result
# for n=6

Unnamed: 0,Logistic,SVML,SVMNL,KNN,NB,DecisionTree,RandomForest
DecisionTree,0.95,0.96,0.96,0.93,0.87,0.96,0.97
RandomForest,0.96,0.97,0.97,0.96,0.88,0.98,0.98


In [25]:
result
# for n=7

Unnamed: 0,Logistic,SVML,SVMNL,KNN,NB,DecisionTree,RandomForest
DecisionTree,0.95,0.97,0.97,0.95,0.81,0.95,0.96
RandomForest,0.95,0.97,0.97,0.96,0.84,0.94,0.97
