## 1. Importing important Libraries 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

## 2. Defining function for RFE feature 

In [2]:
def rfeFeature(indep_X,dep_Y,n):
    rfelist = []
    log_model = LogisticRegression(solver = 'lbfgs')
    RF = RandomForestClassifier(n_estimators = 10,criterion='entropy',random_state=0)
    NB = GaussianNB()
    DT = DecisionTreeClassifier(criterion = 'gini', max_features='sqrt',splitter='best',random_state = 0)
    svc_model = SVC(kernel = 'linear', random_state = 0)
    knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
    rfemodellist=[log_model,svc_model,RF,DT,NB,knn]
    for i in rfemodellist:
        print(i)
        log_rfe = RFE(estimator=i, n_features_to_select=n)
        log_fit = log_rfe.fit(indep_X, dep_Y)
        log_rfe_feature=log_fit.transform(indep_X)
        rfelist.append(log_rfe_feature)
        return rfelist

## 3. Creating function for split scaler:

In [3]:
def split_scaler(indep_X,dep_Y):
    X_train,X_test,y_train,y_test = train_test_split(indep_X,dep_Y,test_size =0.25, random_state = 0)
    sc=StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    return X_train,X_test,y_train,y_test

## 4. Confusion matrix prediction:

In [4]:
def cm_prediction(classifier,X_test,y_test):
    y_pred = classifier.predict(X_test)
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import classification_report 
    cm = confusion_matrix(y_test,y_pred)
    Accuracy = accuracy_score(y_test,y_pred)
    report = classification_report(y_test,y_pred)
    return classifier,Accuracy,report,X_test,y_test,cm

## 5. Function for Logistic Regression:

In [5]:
def logistic(X_train,y_train,X_test,y_test):
    from sklearn.linear_model import LogisticRegression
    classifier = LogisticRegression(random_state=0)
    classifier.fit(X_train,y_train)
    classifier,Accuracy,report,X_test,y_test,cm = cm_prediction(classifier,X_test,y_test)
    return classifier,Accuracy,report,X_test,y_test,cm

## 6. Function for support vecto machine-Linear:

In [6]:
def svm_linear(X_train,y_train,X_test,y_test):
    from sklearn.svm import SVC
    classifier = SVC(kernel='linear',random_state=0)
    classifier.fit(X_train,y_train)
    classifier,Accuracy,report,X_test,y_test,cm= cm_prediction(classifier,X_test,y_test)
    return classifier,Accuracy,report,X_test,y_test,cm 

## 7. Function for SVM Non-linear:

In [7]:
def svm_NL(X_train,y_train,X_test,y_test):
    from sklearn.svm import SVC
    classifier = SVC(kernel='rbf',random_state=0)
    classifier.fit(X_train,y_train)
    classifier,Accuracy,report,X_test,y_test,cm = cm_prediction(classifier,X_test,y_test)
    return classifier,Accuracy,report,X_test,y_test,cm

## 8. Function for Naive Bayes:

In [8]:
def Naive(X_train,y_train,X_test,y_test):
    from sklearn.naive_bayes import GaussianNB
    classifier = GaussianNB()
    classifier.fit(X_train,y_train)
    classifier,Accuracy,report,X_test,y_test,cm = cm_prediction(classifier,X_test,y_test)
    return classifier,Accuracy,report,X_test,y_test,cm

## 9. Function for KNN: 

In [9]:
def knn(X_train,y_train,X_test,y_test):
    from sklearn.neighbors import KNeighborsClassifier
    classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
    classifier.fit(X_train, y_train)
    classifier,Accuracy,report,X_test,y_test,cm=cm_prediction(classifier,X_test,y_test)
    return  classifier,Accuracy,report,X_test,y_test,cm

## 10. Function for Decision Tree:

In [10]:
def Decision(X_train,y_train,X_test,y_test):
    from sklearn.tree import DecisionTreeClassifier
    classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    classifier.fit(X_train, y_train)
    classifier,Accuracy,report,X_test,y_test,cm=cm_prediction(classifier,X_test,y_test)
    return  classifier,Accuracy,report,X_test,y_test,cm  

## 11. Function of Random Forest:

In [11]:
def random(X_train,y_train,X_test,y_test):
    from sklearn.ensemble import RandomForestClassifier
    classifier = RandomForestClassifier(n_estimators =10,criterion='entropy',random_state=0)
    classifier.fit(X_train,y_train)
    classifier,Accuracy,report,X_test,y_test,cm = cm_prediction(classifier,X_test,y_test)
    return classifier,Accuracy,report,X_test,y_test,cm

## 12. Creating Tabluar colume for RFE:

In [12]:
def rfe_classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf):
    rfedataframe = pd.DataFrame({
        'Logistic': acclog,
        'SVMl': accsvml,
        'SVMnl': accsvmnl,
        'KNN': accknn,
        'Navie': accnav,
        'Decision': accdes,
        'Random': accrf
    },index=['Logistic', 'SVC', 'Random', 'Decision Tree'])

    return rfedataframe

## 13. Calling functions:

In [20]:
dataset1=pd.read_csv("prep.csv",index_col=None)
df2=dataset1
df2 = pd.get_dummies(df2,dtype=int,drop_first=True)

indep_X=df2.drop('classification_yes', axis=1)
dep_Y=df2['classification_yes']


rfelist = rfeFeature(indep_X,dep_Y,5)       

acclog=[]
accsvml=[]
accsvmnl=[]
accknn=[]
accnav=[]
accdes=[]
accrf=[]

for i in rfelist:   
    
    X_train, X_test, y_train, y_test=split_scaler(i,dep_Y)   
   
    classifier,Accuracy,report,X_test,y_test,cm=logistic(X_train,y_train,X_test,y_test)
    acclog.append(Accuracy)
    
    classifier,Accuracy,report,X_test,y_test,cm=svm_linear(X_train,y_train,X_test,y_test)  
    accsvml.append(Accuracy)
    
    classifier,Accuracy,report,X_test,y_test,cm=svm_NL(X_train,y_train,X_test,y_test)  
    accsvmnl.append(Accuracy)
    
    classifier,Accuracy,report,X_test,y_test,cm=knn(X_train,y_train,X_test,y_test)  
    accknn.append(Accuracy)
    
    classifier,Accuracy,report,X_test,y_test,cm=Naive(X_train,y_train,X_test,y_test)  
    accnav.append(Accuracy)
    
    classifier,Accuracy,report,X_test,y_test,cm=Decision(X_train,y_train,X_test,y_test)  
    accdes.append(Accuracy)
    
    classifier,Accuracy,report,X_test,y_test,cm=random(X_train,y_train,X_test,y_test)  
    accrf.append(Accuracy)
    
result=rfe_classification(acclog,accsvml,accsvmnl,accknn,accnav,accdes,accrf)

LogisticRegression()


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [14]:
#3
result

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
Logistic,0.94,0.94,0.94,0.94,0.94,0.94,0.94
SVC,0.94,0.94,0.94,0.94,0.94,0.94,0.94
Random,0.94,0.94,0.94,0.94,0.94,0.94,0.94
Decision Tree,0.94,0.94,0.94,0.94,0.94,0.94,0.94


In [16]:
#2
result

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
Logistic,0.84,0.84,0.84,0.84,0.84,0.84,0.84
SVC,0.84,0.84,0.84,0.84,0.84,0.84,0.84
Random,0.84,0.84,0.84,0.84,0.84,0.84,0.84
Decision Tree,0.84,0.84,0.84,0.84,0.84,0.84,0.84


In [19]:
#4
result

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
Logistic,0.95,0.95,0.95,0.95,0.95,0.95,0.95
SVC,0.95,0.95,0.95,0.95,0.95,0.95,0.95
Random,0.95,0.95,0.95,0.95,0.95,0.95,0.95
Decision Tree,0.95,0.95,0.95,0.95,0.95,0.95,0.95


## Conclusion: Based on the report, for feature is 2,accuracy is 84 only. If the feature is 3, accuracy is 94. Sudden increase of 10 percent higher accuacy. Also if we check with 4 features,only 1 percent increased. Finalizing 3 feature model for better accuracy.