## Get all imports

In [1]:
import math
import numpy as np
import pandas as pd
from collections import Counter
from PPFS import PPIMBC
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import warnings
warnings.filterwarnings("ignore")

## Load data

In [2]:
# Load the data into a pandas dataframe (the file has no header row,
# so the columns are simply numbered 0..8)
df = pd.read_csv("../data/ecoli/ecoli.data", header=None)
# The first column is indices. We do not need this
df = df.drop([0], axis=1)
# Encode the class names in column 8 as integers with a single explicit lookup.
# An exact-match lookup is used instead of chained substring replacement
# because some names are prefixes of others ('im' vs 'imS', 'om' vs 'omL'),
# which makes substring replacement depend on the order of the list.
class_names = ['imS', 'imL', 'imU', 'cp', 'im', 'omL', 'om', 'pp']
class_codes = {name: code for code, name in enumerate(class_names)}
encoded = df[8].str.strip().map(class_codes)
# Fail loudly if the file contains a label that is not in the lookup
unknown = df.loc[encoded.isna(), 8].unique()
if len(unknown) > 0:
    raise ValueError(f"Unexpected class labels in column 8: {list(unknown)}")
df[8] = encoded.astype("int64")
# Check out the data
display(df.head())
# Get the data and the labels
data, Y = df.drop([8], axis=1), df[8].values
# One integer label per class, in the same order as `class_names`
labels = list(range(len(class_names)))
print("Data shape: ", data.shape, "Target Variable shape: ", Y.shape)

Unnamed: 0,1,2,3,4,5,6,7,8
0,0.49,0.29,0.48,0.5,0.56,0.24,0.35,3
1,0.07,0.4,0.48,0.5,0.54,0.35,0.44,3
2,0.56,0.4,0.48,0.5,0.49,0.37,0.46,3
3,0.59,0.49,0.48,0.5,0.52,0.45,0.36,3
4,0.23,0.32,0.48,0.5,0.55,0.25,0.35,3


Data shape:  (336, 7) Target Variable shape:  (336,)


## Inner CV for DecisionTree

In [3]:
# Scores one decision-tree configuration with 5-fold cross validation.
# Call it once per candidate configuration to compare hyper-parameters.
def inner_cv_dt(X, Y, **model_params):
    """Return the mean 5-fold cross-validated accuracy of a decision tree.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample. Converted with ``np.asarray`` so
        that both numpy arrays and pandas DataFrames can be row-indexed.
    Y : array-like
        Target labels, one per row of ``X``.
    **model_params
        Optional keyword arguments forwarded to ``DecisionTreeClassifier``.
        ``random_state`` defaults to 27 when not supplied, so calling the
        function with only ``X`` and ``Y`` behaves as before.

    Returns
    -------
    float
        Arithmetic mean of the per-fold accuracy scores.
    """
    # Positional indexing below needs arrays; a DataFrame would treat the
    # fold indices as column labels.
    X, Y = np.asarray(X), np.asarray(Y)
    model_params.setdefault("random_state", 27)

    kfold = KFold(n_splits=5, random_state=27, shuffle=True)
    scores = list()
    for train, test in kfold.split(X):
        x_train, x_test = X[train], X[test]
        y_train, y_test = Y[train], Y[test]

        model = DecisionTreeClassifier(**model_params)
        model.fit(x_train, y_train)
        preds = model.predict(x_test)
        score = accuracy_score(y_test, preds)
        scores.append(score)
    return sum(scores)/len(scores)

## Inner CV for SVM

In [4]:
# Scores one SVM configuration with 5-fold cross validation.
# Call it once per candidate configuration to compare hyper-parameters.
def inner_cv_svm(X, Y, **model_params):
    """Return the mean 5-fold cross-validated accuracy of an SVC.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample. Converted with ``np.asarray`` so
        that both numpy arrays and pandas DataFrames can be row-indexed.
    Y : array-like
        Target labels, one per row of ``X``.
    **model_params
        Optional keyword arguments forwarded to ``SVC``. ``random_state``
        defaults to 27 when not supplied, so calling the function with only
        ``X`` and ``Y`` behaves as before.

    Returns
    -------
    float
        Arithmetic mean of the per-fold accuracy scores.
    """
    # Positional indexing below needs arrays; a DataFrame would treat the
    # fold indices as column labels.
    X, Y = np.asarray(X), np.asarray(Y)
    model_params.setdefault("random_state", 27)

    kfold = KFold(n_splits=5, random_state=27, shuffle=True)
    scores = list()
    for train, test in kfold.split(X):
        x_train, x_test = X[train], X[test]
        y_train, y_test = Y[train], Y[test]

        # Fit the scaler on the training rows only, then apply the same
        # scaling to the held-out rows, so no test statistics leak in.
        scaler = StandardScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        model = SVC(**model_params)
        model.fit(x_train, y_train)
        preds = model.predict(x_test)
        score = accuracy_score(y_test, preds)
        scores.append(score)
    return sum(scores)/len(scores)

## Check score with decision tree

In [9]:
# Outer loop: stratified 5-fold cross validation with a fixed seed
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=27)
# Accuracy obtained on each held-out fold
scores = []
for train_idx, test_idx in kfold.split(data, Y):
    # Slice out the rows belonging to this fold
    x_train, x_test = data.iloc[train_idx], data.iloc[test_idx]
    y_train, y_test = Y[train_idx], Y[test_idx]

    # Feature selector (the proposed algorithm), wrapped around a balanced decision tree
    selector = PPIMBC(
        model=DecisionTreeClassifier(random_state=27, class_weight="balanced"),
        p_val_thresh=0.05,
        cv=0,
        num_simul=15,
        verbose=0,
        simul_type=0,
        simul_size=0.2,
        sig_test_type="non-parametric",
    )
    # The selection is learned from the training fold only ...
    x_train = selector.fit_transform(x_train, y_train)
    # ... and the same column subset is then applied to the held-out fold
    x_test = selector.transform(x_test)
    # Report how many features ended up in the Markov blanket
    print("Markov Blanket: ", len(selector.MB))
    x_train, x_test = x_train.values, x_test.values

    # Train a plain decision tree on the selected features and score it
    model = DecisionTreeClassifier(random_state=27)
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    score = accuracy_score(y_test, preds)
    scores.append(score)
    print("Score: ", score, "\n")

# Mean accuracy over the five folds
print("\n\nAverage Accuracy: ", round(sum(scores)/len(scores), 3))

Markov Blanket:  6
Score:  0.7941176470588235 

Markov Blanket:  6
Score:  0.835820895522388 

Markov Blanket:  6
Score:  0.835820895522388 

Markov Blanket:  6
Score:  0.8059701492537313 

Markov Blanket:  5
Score:  0.835820895522388 



Average Accuracy:  0.822


## Check score with SVM

In [8]:
# Outer loop: shuffled 5-fold cross validation with a fixed seed
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
# Accuracy obtained on each held-out fold
scores = []
for train_idx, test_idx in kfold.split(data):
    # Slice out the rows belonging to this fold
    x_train, x_test = data.iloc[train_idx], data.iloc[test_idx]
    y_train, y_test = Y[train_idx], Y[test_idx]

    # SVM requires scaled input: fit the scaler on the training fold only and
    # keep the frames' column names and row index intact for the selector
    scaler = StandardScaler()
    x_train = pd.DataFrame(
        scaler.fit_transform(x_train.values),
        columns=x_train.columns,
        index=x_train.index,
    )
    x_test = pd.DataFrame(
        scaler.transform(x_test.values),
        columns=x_test.columns,
        index=x_test.index,
    )

    # Feature selector (the proposed algorithm), wrapped around a balanced decision tree
    selector = PPIMBC(
        model=DecisionTreeClassifier(random_state=27, class_weight="balanced"),
        p_val_thresh=0.05,
        cv=0,
        num_simul=30,
        verbose=0,
        simul_type=0,
        simul_size=0.2,
        sig_test_type="non-parametric",
    )
    # The selection is learned from the training fold only ...
    x_train = selector.fit_transform(x_train, y_train)
    # ... and the same column subset is then applied to the held-out fold
    x_test = selector.transform(x_test)
    # Report how many features ended up in the Markov blanket
    print("Markov Blanket: ", len(selector.MB))
    x_train, x_test = x_train.values, x_test.values

    # Train a linear SVM on the selected features and score it
    model = LinearSVC(random_state=27)
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    score = accuracy_score(y_test, preds)
    scores.append(score)
    print("Score: ", score, "\n")
# Mean accuracy over the five folds
print("\n\nAverage Accuracy: ", round(sum(scores)/len(scores), 3))

Markov Blanket:  6
Score:  0.8970588235294118 

Markov Blanket:  6
Score:  0.8955223880597015 

Markov Blanket:  6
Score:  0.8507462686567164 

Markov Blanket:  6
Score:  0.835820895522388 

Markov Blanket:  6
Score:  0.8955223880597015 



Average Accuracy:  0.875
