### Import Libraries

In [1]:
import pandas as pd
import time
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

### Function to Implement Recursive Feature Elimination for all Algorithms

In [2]:
def rfeFeature(indep_X,dep_Y,n):
    rfe_list=[]
    log_model=LogisticRegression(solver='lbfgs')
    RF=RandomForestClassifier(n_estimators=10,criterion='entropy',random_state=0)
    DT=DecisionTreeClassifier(criterion='gini',max_features='sqrt',splitter='best',random_state=0)
    svc_model=SVC(kernel='linear',random_state=0)
    rfeModelList=[log_model,RF,DT,svc_model]
    for i in rfeModelList:
        print(i)
        log_rfe=RFE(i,n)
        log_fit=log_rfe.fit(indep_X,dep_Y)
        log_rfe_feature=log_fit.transform(indep_X)
        rfe_list.append(log_rfe_feature)
    return rfe_list

### Function to Split Train - Test Set and Standard Scaling the input

In [3]:
def split_scale(indep_X,dep_Y):
    X_train,X_test,Y_train,Y_test=train_test_split(indep_X,dep_Y,test_size=0.25,random_state=0)
    sc=StandardScaler()
    X_train=sc.fit_transform(X_train)
    X_test=sc.transform(X_test)
    return X_train,X_test,Y_train,Y_test

### Function to Generate Metric Reports: Confusion Matrix, Classification Report & Accuracy Score

In [4]:
def cm_predict(classifier,X_test):
    y_pred=classifier.predict(X_test)
    from sklearn.metrics import confusion_matrix
    cm=confusion_matrix(Y_test,y_pred)
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import classification_report
    Accuracy=accuracy_score(Y_test,y_pred)
    report=classification_report(Y_test,y_pred)
    return classifier,Accuracy,report,X_test,Y_test,cm

### Logistic Regression Function

In [5]:
def logistic(X_train,Y_train,X_test):
    from sklearn.linear_model import LogisticRegression
    classifier=LogisticRegression(random_state=0)
    classifier.fit(X_train,Y_train)
    classifier,Accuracy,report,X_test,Y_test,cm=cm_predict(classifier,X_test)
    return classifier,Accuracy,report,X_test,Y_test,cm

### SVM Linear Function

In [6]:
def svm_linear(X_train,Y_train,X_test):
    from sklearn.svm import SVC
    classifier=SVC(kernel='rbf',random_state=0)
    classifier.fit(X_train,Y_train)
    classifier,Accuracy,report,X_test,Y_test,cm=cm_predict(classifier,X_test)
    return classifier,Accuracy,report,X_test,Y_test,cm

### SVM Non-Linear Function

In [7]:
def svm_NL(X_train,Y_train,X_test):
    from sklearn.svm import SVC
    classifier=SVC(kernel='rbf',random_state=0)
    classifier.fit(X_train,Y_train)
    classifier,Accuracy,report,X_test,Y_test,cm=cm_predict(classifier,X_test)
    return classifier,Accuracy,report,X_test,Y_test,cm

### Naive Bayes Function

In [8]:
def Naive(X_train,Y_train,X_test):
    from sklearn.naive_bayes import GaussianNB
    classifier=GaussianNB()
    classifier.fit(X_train,Y_train)
    classifier,Accuracy,report,X_test,Y_test,cm=cm_predict(classifier,X_test)
    return classifier,Accuracy,report,X_test,Y_test,cm

### KNN Function

In [9]:
def knn(X_train,Y_train,X_test):
    from sklearn.neighbors import KNeighborsClassifier
    classifier=KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2)
    classifier.fit(X_train,Y_train)
    classifier,Accuracy,report,X_test,Y_test,cm=cm_predict(classifier,X_test)
    return classifier,Accuracy,report,X_test,Y_test,cm

### Decision Tree Function

In [10]:
def Decision(X_train,Y_train,X_test):
    from sklearn.tree import DecisionTreeClassifier
    classifier=DecisionTreeClassifier(criterion='entropy',random_state=0)
    classifier.fit(X_train,Y_train)
    classifier,Accuracy,report,X_test,Y_test,cm=cm_predict(classifier,X_test)
    return classifier,Accuracy,report,X_test,Y_test,cm

### Random Forest Function

In [11]:
def random(X_train,Y_train,X_test):
    from sklearn.ensemble import RandomForestClassifier
    classifier=RandomForestClassifier(n_estimators=10,criterion='entropy',random_state=0)
    classifier.fit(X_train,Y_train)
    classifier,Accuracy,report,X_test,Y_test,cm=cm_predict(classifier,X_test)
    return classifier,Accuracy,report,X_test,Y_test,cm

### Function to create Data Frame

In [12]:
def rfe_classification(alog,asvml,asvmnl,aknn,anav,adt,arf):
    rfedataframe=pd.DataFrame(index=['Logistic','SVC','RandomForest','DecisionTree'],columns=['Logistic','SVML','SVMNL','KNN','NB','DecisionTree','RandomForest'])
    for number,idex in enumerate(rfedataframe.index):
        rfedataframe['Logistic'][idex]=alog[number]
        rfedataframe['SVML'][idex]=asvml[number]
        rfedataframe['SVMNL'][idex]=asvmnl[number]
        rfedataframe['KNN'][idex]=aknn[number]
        rfedataframe['NB'][idex]=anav[number]
        rfedataframe['DecisionTree'][idex]=adt[number]
        rfedataframe['RandomForest'][idex]=arf[number]
    return rfedataframe

### Dataset Execution

In [24]:
dataset=pd.read_csv('prep.csv',index_col=None)
df=dataset
df=pd.get_dummies(df,drop_first=True)
indep_X=df.drop('classification_yes',1)
dep_Y=df['classification_yes']
rfe_list=rfeFeature(indep_X,dep_Y,7)
alog=[]
asvml=[]
asvmnl=[]
aknn=[]
anav=[]
adt=[]
arf=[]
for i in rfe_list:
    X_train,X_test,Y_train,Y_test=split_scale(i,dep_Y)
    
    classifier,Accuracy,report,X_test,Y_test,cm=logistic(X_train,Y_train,X_test)
    alog.append(Accuracy)
    
    classifier,Accuracy,report,X_test,Y_test,cm=svm_linear(X_train,Y_train,X_test)
    asvml.append(Accuracy)
    
    classifier,Accuracy,report,X_test,Y_test,cm=svm_NL(X_train,Y_train,X_test)
    asvmnl.append(Accuracy)
    
    classifier,Accuracy,report,X_test,Y_test,cm=knn(X_train,Y_train,X_test)
    aknn.append(Accuracy)
    
    classifier,Accuracy,report,X_test,Y_test,cm=Naive(X_train,Y_train,X_test)
    anav.append(Accuracy)
    
    classifier,Accuracy,report,X_test,Y_test,cm=Decision(X_train,Y_train,X_test)
    adt.append(Accuracy)
    
    classifier,Accuracy,report,X_test,Y_test,cm=random(X_train,Y_train,X_test)
    arf.append(Accuracy)

result=rfe_classification(alog,asvml,asvmnl,aknn,anav,adt,arf)
result

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=0,
    shrinking=True, tol=0.001, verbose=False)


  old_joblib = LooseVersion(joblib_version) < LooseVersion('0.12')
  old_joblib = LooseVersion(joblib_version) < LooseVersion('0.12')
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  old_joblib = LooseVersion(joblib_version) < LooseVersion('0.12')
  old_joblib = LooseVersion(joblib_version) < LooseVersion('0.12')
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  old_joblib = LooseVersion(joblib_version) < LooseVersion('0.12')
  old_joblib = LooseVersion(joblib_version) < LooseVersion('0.12')
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  old_joblib = LooseVersion(joblib_version) < LooseVersion('0.12')
  old_joblib = LooseVersion(joblib_version) < LooseVersion('0.12')
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


Unnamed: 0,Logistic,SVML,SVMNL,KNN,NB,DecisionTree,RandomForest
Logistic,0.98,0.98,0.98,0.96,0.98,0.99,0.98
SVC,0.99,0.98,0.98,0.98,0.91,0.96,0.96
RandomForest,0.93,0.96,0.96,0.94,0.88,0.97,0.95
DecisionTree,0.99,0.99,0.99,0.99,0.99,1.0,0.99


In [17]:
result
# for n=3

Unnamed: 0,Logistic,SVML,SVMNL,KNN,NB,DecisionTree,RandomForest
Logistic,0.94,0.94,0.94,0.94,0.94,0.94,0.94
SVC,0.94,0.94,0.94,0.94,0.9,0.91,0.92
RandomForest,0.97,0.98,0.98,0.98,0.79,0.97,0.97
DecisionTree,0.87,0.87,0.87,0.87,0.87,0.87,0.87


In [19]:
result
# for n=4

Unnamed: 0,Logistic,SVML,SVMNL,KNN,NB,DecisionTree,RandomForest
Logistic,0.95,0.95,0.95,0.95,0.95,0.95,0.95
SVC,0.97,0.97,0.97,0.97,0.87,0.95,0.97
RandomForest,0.91,0.92,0.92,0.98,0.81,0.98,0.98
DecisionTree,0.96,0.96,0.96,0.96,0.96,0.96,0.96


In [21]:
result
# for n=5

Unnamed: 0,Logistic,SVML,SVMNL,KNN,NB,DecisionTree,RandomForest
Logistic,0.98,0.98,0.98,0.98,0.98,0.98,0.98
SVC,0.97,0.98,0.98,0.97,0.91,0.96,0.98
RandomForest,0.92,0.93,0.93,0.94,0.85,0.97,0.98
DecisionTree,0.99,0.99,0.99,0.99,0.99,0.99,0.99


In [23]:
result
# for n=6

Unnamed: 0,Logistic,SVML,SVMNL,KNN,NB,DecisionTree,RandomForest
Logistic,0.98,0.98,0.98,0.98,0.98,0.99,0.98
SVC,0.97,0.99,0.99,0.96,0.92,0.95,0.98
RandomForest,0.96,0.97,0.97,0.94,0.85,0.97,0.96
DecisionTree,0.99,0.99,0.99,0.99,0.99,0.99,0.99


In [25]:
result
# for n=7

Unnamed: 0,Logistic,SVML,SVMNL,KNN,NB,DecisionTree,RandomForest
Logistic,0.98,0.98,0.98,0.96,0.98,0.99,0.98
SVC,0.99,0.98,0.98,0.98,0.91,0.96,0.96
RandomForest,0.93,0.96,0.96,0.94,0.88,0.97,0.95
DecisionTree,0.99,0.99,0.99,0.99,0.99,1.0,0.99
