# Feature Selection

In [0]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [0]:
import pandas as pd
import numpy as np
from scipy import stats 

from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import sklearn.metrics as mx

In [0]:
# Root URL under which every CSV used by this notebook is published
prefix = 'https://raw.githubusercontent.com/ptenteromano/Machine-Learning/master/data/'

# Original data - only used to build the feature dictionary
originalTrainUrl = prefix + 'census-income.data.csv'

# One (train, test) URL pair per missing-value strategy: mode, knn, dropna
trainModeUrl, testModeUrl = prefix + 'training_mode.csv', prefix + 'test_mode.csv'
trainKnnUrl, testKnnUrl = prefix + 'training_knn_Imputed.csv', prefix + 'test_knn_Imputed.csv'
trainDropNaUrl, testDropNaUrl = prefix + 'training_dropNa.csv', prefix + 'test_dropNa.csv'

# Column names, taken from the .names description file
col_names = [
    'Age', 'Workclass', 'FinalWeight', 'Education', 'EducationNum',
    'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Sex',
    'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'NativeCountry', 'Label',
]

# The original file carries no header row, so the names are supplied explicitly
trainOg = pd.read_csv(originalTrainUrl, header=None, names=col_names)

# The processed files keep their index in the first column
trainMode, testMode = (pd.read_csv(url, index_col=0) for url in (trainModeUrl, testModeUrl))
trainKNN, testKNN = (pd.read_csv(url, index_col=0) for url in (trainKnnUrl, testKnnUrl))
trainDropNa, testDropNa = (pd.read_csv(url, index_col=0) for url in (trainDropNaUrl, testDropNaUrl))

# Sanity check: the two imputed variants should have the same number of rows
print(trainMode.shape[0] == trainKNN.shape[0])
print(testMode.shape[0] == testKNN.shape[0])

True
True


In [0]:
# Split data into features and labels
def getX(df):
    """Return the feature part of `df`: every column except the last one."""
    labelPosition = df.shape[1] - 1
    return df.iloc[:, :labelPosition]

def getLabel(df):
    """Return the label part of `df`: its last column, as a Series."""
    labelPosition = df.shape[1] - 1
    return df.iloc[:, labelPosition]

In [0]:
# TRAINING - features and label of each dataset variant
trainingOg_X, trainingOg_Label = getX(trainOg), getLabel(trainOg)
trainingMode_X, trainingMode_Label = getX(trainMode), getLabel(trainMode)
trainingKnn_X, trainingKnn_Label = getX(trainKNN), getLabel(trainKNN)
trainingDropNa_X, trainingDropNa_Label = getX(trainDropNa), getLabel(trainDropNa)

# TESTING - features and label of each dataset variant
testMode_X, testMode_Label = getX(testMode), getLabel(testMode)
testKnn_X, testKnn_Label = getX(testKNN), getLabel(testKNN)
testDropNa_X, testDropNa_Label = getX(testDropNa), getLabel(testDropNa)

# Bundle each variant as a list that can be unpacked straight into a function call.
# Order: [ training features, training label, testing features, testing label ]
dataMode = [
    trainingMode_X,
    trainingMode_Label,
    testMode_X,
    testMode_Label,
]
dataKnn = [
    trainingKnn_X,
    trainingKnn_Label,
    testKnn_X,
    testKnn_Label,
]
dataDropNa = [
    trainingDropNa_X,
    trainingDropNa_Label,
    testDropNa_X,
    testDropNa_Label,
]

In [0]:
# Get Feature Dictionary - Mapping of Features -> new binary Feature columns
def getFeatureDict(df):
    """Map every feature of `df` to the column name(s) that represent it.

    Categorical (object-dtype) features map to an array of their distinct
    values, with placeholder values containing '?' removed. Integer features
    map one-to-one to their own name (a plain string).

    Categorical columns whose name contains 'Education' or 'Label', and
    integer columns whose name contains 'Label', are left out.

    Parameters
    ----------
    df : pandas.DataFrame
        The original, un-encoded data.

    Returns
    -------
    dict
        Categorical features first, then numeric ones, each in column order.
    """
    # Build filtered lists instead of deleting while enumerating: deleting in
    # place shifts the remaining items and silently skips the one that follows
    # each deletion.
    # 'Education' is treated as redundant (presumably covered by 'EducationNum').
    cat_attr = [
        c for c in df.select_dtypes(include=['object'])
        if 'Education' not in c and 'Label' not in c
    ]

    # Prune Label just in case
    num_attr = [c for c in df.select_dtypes(include=['int']) if 'Label' not in c]

    featDict = {}

    # Map feature values to feature, dropping bad (placeholder) values
    for f in cat_attr:
        values = np.asarray(df[f].unique(), dtype=object)
        # Only strings can hold the '?' marker; anything else is kept as-is.
        # A value that names an actual column of `df` is never dropped, which
        # mirrors the column lookup the earlier implementation attempted.
        keep = [
            not (isinstance(v, str) and '?' in v and v not in df.columns)
            for v in values
        ]
        featDict[f] = values[np.array(keep, dtype=bool)]

    # Add numeric values as 1-1 map
    for f in num_attr:
        featDict[f] = f

    return featDict

In [0]:
# Feature dictionary built from the original (un-encoded) training data:
# { feature name: array of its categorical values, or its own name when numeric }
featDict = getFeatureDict(trainOg)

In [0]:
# For every feature, print either how many values it maps to (categorical)
# or a marker saying it is continuous (numeric features map to a plain string)
for f in featDict:
    print(f, ' - Continuous' if type(featDict[f]) is str else len(featDict[f]))

Workclass 8
MaritalStatus 7
Occupation 14
Relationship 6
Race 5
Sex 2
NativeCountry 41
Age  - Continuous
FinalWeight  - Continuous
EducationNum  - Continuous
CapitalGain  - Continuous
CapitalLoss  - Continuous
HoursPerWeek  - Continuous


### Classification Algorithms

In [0]:
def KNN(trainingX, trainingY, testX):
    """Fit a bagged k-nearest-neighbours ensemble and predict labels for `testX`.

    Parameters
    ----------
    trainingX, trainingY : training features and their labels.
    testX : features to predict.

    Returns
    -------
    The labels predicted for `testX`.
    """
    # The wrapped estimator is passed positionally: its keyword was renamed
    # from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
    # name was later removed, so only the positional form runs on every version.
    knn = BaggingClassifier(KNeighborsClassifier())
    knn.fit(trainingX, trainingY)
    return knn.predict(testX)

def LoReg(trainingX, trainingY, testX):
    """Fit a bagged logistic-regression ensemble and predict labels for `testX`.

    Parameters
    ----------
    trainingX, trainingY : training features and their labels.
    testX : features to predict.

    Returns
    -------
    The labels predicted for `testX`.
    """
    # The wrapped estimator is passed positionally: its keyword was renamed
    # from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
    # name was later removed, so only the positional form runs on every version.
    loreg = BaggingClassifier(LogisticRegression())
    loreg.fit(trainingX, trainingY)
    return loreg.predict(testX)

def RForest(trainingX, trainingY, testX):
    """Fit a bagged random-forest ensemble and predict labels for `testX`.

    Parameters
    ----------
    trainingX, trainingY : training features and their labels.
    testX : features to predict.

    Returns
    -------
    The labels predicted for `testX`.
    """
    # The wrapped estimator is passed positionally: its keyword was renamed
    # from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
    # name was later removed, so only the positional form runs on every version.
    rf = BaggingClassifier(RandomForestClassifier())
    rf.fit(trainingX, trainingY)
    return rf.predict(testX)

def SVM(trainingX, trainingY, testX):
    """Fit a bagged linear-SVM ensemble and predict labels for `testX`.

    All warnings raised while fitting and predicting are suppressed; the
    linear SVM is capped at 500 iterations.

    Parameters
    ----------
    trainingX, trainingY : training features and their labels.
    testX : features to predict.

    Returns
    -------
    The labels predicted for `testX`.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # The wrapped estimator is passed positionally: its keyword was renamed
        # from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
        # name was later removed, so only the positional form runs everywhere.
        svm = BaggingClassifier(LinearSVC(max_iter=500))
        svm.fit(trainingX, trainingY)
        resultY = svm.predict(testX)
    return resultY


### Accuracy Functions

In [0]:
def knnAccuracy(trainingX, trainingY, testX, testY):
    """Train the bagged KNN classifier and return its accuracy on the test data.

    Parameters
    ----------
    trainingX, trainingY : training features and their labels.
    testX, testY : test features and their true labels.

    Returns
    -------
    Fraction of `testX` rows whose predicted label equals `testY`.
    """
    predicted = KNN(trainingX, trainingY, testX)
    # accuracy_score is documented as (y_true, y_pred). Accuracy is symmetric,
    # so the value is unchanged, but the correct order keeps this safe if the
    # metric is ever replaced by an asymmetric one (precision, recall, ...).
    return mx.accuracy_score(testY, predicted)

In [0]:
def loRegAccuracy(trainingX, trainingY, testX, testY):
    """Train the bagged logistic regression and return its accuracy on the test data.

    Parameters
    ----------
    trainingX, trainingY : training features and their labels.
    testX, testY : test features and their true labels.

    Returns
    -------
    Fraction of `testX` rows whose predicted label equals `testY`.
    """
    predicted = LoReg(trainingX, trainingY, testX)
    # accuracy_score is documented as (y_true, y_pred). Accuracy is symmetric,
    # so the value is unchanged, but the correct order keeps this safe if the
    # metric is ever replaced by an asymmetric one (precision, recall, ...).
    return mx.accuracy_score(testY, predicted)

In [0]:
def rfAccuracy(trainingX, trainingY, testX, testY):
    """Train the bagged random forest and return its accuracy on the test data.

    Parameters
    ----------
    trainingX, trainingY : training features and their labels.
    testX, testY : test features and their true labels.

    Returns
    -------
    Fraction of `testX` rows whose predicted label equals `testY`.
    """
    predicted = RForest(trainingX, trainingY, testX)
    # accuracy_score is documented as (y_true, y_pred). Accuracy is symmetric,
    # so the value is unchanged, but the correct order keeps this safe if the
    # metric is ever replaced by an asymmetric one (precision, recall, ...).
    return mx.accuracy_score(testY, predicted)

In [0]:
def svmAccuracy(trainingX, trainingY, testX, testY):
    """Train the bagged linear SVM and return its accuracy on the test data.

    Parameters
    ----------
    trainingX, trainingY : training features and their labels.
    testX, testY : test features and their true labels.

    Returns
    -------
    Fraction of `testX` rows whose predicted label equals `testY`.
    """
    predicted = SVM(trainingX, trainingY, testX)
    # accuracy_score is documented as (y_true, y_pred). Accuracy is symmetric,
    # so the value is unchanged, but the correct order keeps this safe if the
    # metric is ever replaced by an asymmetric one (precision, recall, ...).
    return mx.accuracy_score(testY, predicted)

### Test on full datasets

In [0]:
# Accuracy of the bagged KNN classifier on all features, per dataset variant
print("KNN --> Mode Imputed: ", knnAccuracy(*dataMode), sep="")
print("KNN --> Algo Imputed: ", knnAccuracy(*dataKnn), sep="")
print("KNN --> Drop NA: ", knnAccuracy(*dataDropNa), sep="")

KNN --> Mode Imputed: 0.7794361525704809
KNN --> Algo Imputed: 0.7782691480867269
KNN --> Drop NA: 0.6810647915620291


In [0]:
# Accuracy of the bagged logistic regression on all features, per dataset variant
print("LoReg --> Mode Imputed: ", loRegAccuracy(*dataMode), sep="")
print("LoReg --> Algo Imputed: ", loRegAccuracy(*dataKnn), sep="")
print("LoReg --> Drop NA: ", loRegAccuracy(*dataDropNa), sep="")

LoReg --> Mode Imputed: 0.7996437565260119
LoReg --> Algo Imputed: 0.8000122842577237
LoReg --> Drop NA: 0.7002941809571643


In [0]:
# Accuracy of the bagged random forest on all features, per dataset variant
print("Rand Forest --> Mode Imputed: ", rfAccuracy(*dataMode), sep="")
print("Rand Forest --> Algo Imputed: ", rfAccuracy(*dataKnn), sep="")
print("Rand Forest --> Drop NA: ", rfAccuracy(*dataDropNa), sep="")

Rand Forest --> Mode Imputed: 0.8562741846323936
Rand Forest --> Algo Imputed: 0.857379767827529
Rand Forest --> Drop NA: 0.7449235847025902


In [0]:
# Accuracy of the bagged linear SVM on all features, per dataset variant
print("SVM --> Mode Imputed: ", svmAccuracy(*dataMode), sep="")
print("SVM --> Algo Imputed: ", svmAccuracy(*dataKnn), sep="")
print("SVM --> Drop NA: ", svmAccuracy(*dataDropNa), sep="")

SVM --> Mode Imputed: 0.7945457895706652
SVM --> Algo Imputed: 0.7856397027209631
SVM --> Drop NA: 0.7327975891511803


# Feature Selection Algorithm

In [0]:
# accuracyMethod: (trainingX, trainingY, testX, testY) => number
def featureSelection(trainingX, trainingY, testX, testY, featureDict, accuracyMethod):
    """Greedy forward feature selection.

    Each round tries every not-yet-selected feature of `featureDict` on top of
    the features selected so far, scores the resulting column subset with
    `accuracyMethod`, and keeps the best candidate. The search stops as soon
    as a round fails to beat the best score so far, or once the selected
    columns are no fewer than the columns of `trainingX`.

    Parameters
    ----------
    trainingX, testX : DataFrames indexed by the column names found in `featureDict`.
    trainingY, testY : labels, passed through to `accuracyMethod` untouched.
    featureDict : maps a feature label to one column name (str) or to an
        iterable of column names.
    accuracyMethod : callable(trainingX, trainingY, testX, testY) -> number.

    Returns
    -------
    (selected feature labels, best accuracy reached)

    One progress line is printed for every accepted round.
    """
    chosenLabels = []      # KEYS of featureDict picked so far
    chosenColumns = []     # the data columns those keys expand to
    topScore = 0           # best accuracy over all rounds
    roundNo = 0

    # Never select more columns than the data actually has
    while len(chosenColumns) < len(trainingX.columns):
        roundNo += 1

        # Best candidate of the current round
        roundScore = 0
        roundLabels = []
        roundColumns = []

        for label, mapped in featureDict.items():
            # Already part of the selection
            if label in chosenLabels:
                continue

            # Columns of the candidate first, then everything already selected
            ownColumns = [mapped] if type(mapped) == str else list(mapped)
            trialColumns = ownColumns + chosenColumns

            score = accuracyMethod(
                trainingX[trialColumns], trainingY, testX[trialColumns], testY
            )

            # Strict comparison: on a tie the earlier candidate wins
            if score > roundScore:
                roundScore = score
                roundLabels = chosenLabels + [label]
                roundColumns = trialColumns

        # No improvement over the previous rounds: the selection is final
        if not roundScore > topScore:
            break

        topScore = roundScore
        chosenLabels = roundLabels
        chosenColumns = roundColumns
        print(roundNo, round(topScore, 4), chosenLabels)

    # Return feature set and accuracy
    return (chosenLabels, topScore)

## Callable functions - FS

In [0]:
# KNN feature Selection
def knnFeatureSelection(trainingX, trainingY, testX, testY, featDict):
    """Forward feature selection scored by the bagged KNN classifier.

    All arguments are forwarded unchanged to `featureSelection`, with
    `knnAccuracy` as the scoring callback; its result is returned as-is.
    """
    return featureSelection(
        trainingX, trainingY, testX, testY, featDict,
        accuracyMethod=lambda trX, trY, teX, teY: knnAccuracy(trX, trY, teX, teY),
    )

In [0]:
# Logistic Regression feature Selection
def loRegFeatureSelection(trainingX, trainingY, testX, testY, featDict):
    """Forward feature selection scored by the bagged logistic regression.

    All arguments are forwarded unchanged to `featureSelection`, with
    `loRegAccuracy` as the scoring callback; its result is returned as-is.
    """
    return featureSelection(
        trainingX, trainingY, testX, testY, featDict,
        accuracyMethod=lambda trX, trY, teX, teY: loRegAccuracy(trX, trY, teX, teY),
    )

In [0]:
# Random Forest feature Selection
def rfFeatureSelection(trainingX, trainingY, testX, testY, featDict):
    """Forward feature selection scored by the bagged random forest.

    All arguments are forwarded unchanged to `featureSelection`, with
    `rfAccuracy` as the scoring callback; its result is returned as-is.
    """
    return featureSelection(
        trainingX, trainingY, testX, testY, featDict,
        accuracyMethod=lambda trX, trY, teX, teY: rfAccuracy(trX, trY, teX, teY),
    )

In [0]:
# SVM feature Selection
def svmFeatureSelection(trainingX, trainingY, testX, testY, featDict):
    """Forward feature selection scored by the bagged linear SVM.

    All arguments are forwarded unchanged to `featureSelection`, with
    `svmAccuracy` as the scoring callback; its result is returned as-is.
    """
    return featureSelection(
        trainingX, trainingY, testX, testY, featDict,
        accuracyMethod=lambda trX, trY, teX, teY: svmAccuracy(trX, trY, teX, teY),
    )

### Running Feature Selection

In [0]:
# LoReg Feature Selection, run once per dataset variant (knn-imputed,
# mode-imputed, dropna). Each call prints one line per accepted round and
# returns (selected feature labels, best accuracy).
loregSetFS_dataKnn, loregAccuracyFS_dataKnn = loRegFeatureSelection(*dataKnn, featDict)
print("Knn-Done\n")
loregSetFS_dataMode, loregAccuracyFS_dataMode = loRegFeatureSelection(*dataMode, featDict)
print("Mode-Done\n")
loregSetFS_dataDropNa, loregAccuracyFS_dataDropNa = loRegFeatureSelection(*dataDropNa, featDict)
print("DropNA-Done\n")

1 0.8013 ['CapitalGain']
2 0.8121 ['CapitalGain', 'CapitalLoss']
3 0.8121 ['CapitalGain', 'CapitalLoss', 'NativeCountry']
Done

1 0.8007 ['CapitalGain']
2 0.8098 ['CapitalGain', 'CapitalLoss']
3 0.8105 ['CapitalGain', 'CapitalLoss', 'Relationship']
4 0.8433 ['CapitalGain', 'CapitalLoss', 'Relationship', 'EducationNum']
5 0.8498 ['CapitalGain', 'CapitalLoss', 'Relationship', 'EducationNum', 'Occupation']
6 0.8512 ['CapitalGain', 'CapitalLoss', 'Relationship', 'EducationNum', 'Occupation', 'Workclass']
7 0.8519 ['CapitalGain', 'CapitalLoss', 'Relationship', 'EducationNum', 'Occupation', 'Workclass', 'Race']
Done

1 0.7563 ['Workclass']
2 0.7673 ['Workclass', 'Relationship']
3 0.815 ['Workclass', 'Relationship', 'Occupation']
4 0.8167 ['Workclass', 'Relationship', 'Occupation', 'NativeCountry']
5 0.8173 ['Workclass', 'Relationship', 'Occupation', 'NativeCountry', 'Race']
6 0.8178 ['Workclass', 'Relationship', 'Occupation', 'NativeCountry', 'Race', 'MaritalStatus']
Done



In [0]:
# Random Forest Feature Selection, run once per dataset variant (knn-imputed,
# mode-imputed, dropna). Each call prints one line per accepted round and
# returns (selected feature labels, best accuracy).
rfSetFS_dataKnn, rfAccuracyFS_dataKnn = rfFeatureSelection(*dataKnn, featDict)
print("Knn-Done\n")
rfSetFS_dataMode, rfAccuracyFS_dataMode = rfFeatureSelection(*dataMode, featDict)
print("Mode-Done\n")
rfSetFS_dataDropNa, rfAccuracyFS_dataDropNa = rfFeatureSelection(*dataDropNa, featDict)
print("DropNa-Done\n")

1 0.813 ['CapitalGain']
2 0.8342 ['CapitalGain', 'CapitalLoss']
3 0.8364 ['CapitalGain', 'CapitalLoss', 'EducationNum']
4 0.8605 ['CapitalGain', 'CapitalLoss', 'EducationNum', 'MaritalStatus']
Done

1 0.8133 ['CapitalGain']
2 0.8339 ['CapitalGain', 'CapitalLoss']
3 0.8364 ['CapitalGain', 'CapitalLoss', 'EducationNum']
4 0.8607 ['CapitalGain', 'CapitalLoss', 'EducationNum', 'MaritalStatus']
Done

1 0.7563 ['Workclass']
2 0.7668 ['Workclass', 'Relationship']
3 0.8195 ['Workclass', 'Relationship', 'Occupation']
4 0.8256 ['Workclass', 'Relationship', 'Occupation', 'NativeCountry']
5 0.829 ['Workclass', 'Relationship', 'Occupation', 'NativeCountry', 'Race']
6 0.8299 ['Workclass', 'Relationship', 'Occupation', 'NativeCountry', 'Race', 'MaritalStatus']
7 0.8314 ['Workclass', 'Relationship', 'Occupation', 'NativeCountry', 'Race', 'MaritalStatus', 'Sex']
Done



In [0]:
# SVM Feature Selection, run once per dataset variant (knn-imputed,
# mode-imputed, dropna). Each call prints one line per accepted round and
# returns (selected feature labels, best accuracy).
svmSetFS_dataKnn, svmAccuracyFS_dataKnn = svmFeatureSelection(*dataKnn, featDict)
print("Knn-Done\n")
svmSetFS_dataMode, svmAccuracyFS_dataMode = svmFeatureSelection(*dataMode, featDict)
print("Mode-Done\n")
svmSetFS_dataDropNa, svmAccuracyFS_dataDropNa = svmFeatureSelection(*dataDropNa, featDict)
print("DropNa-Done\n")

1 0.7951 ['CapitalGain']
2 0.7991 ['CapitalGain', 'NativeCountry']
3 0.8015 ['CapitalGain', 'NativeCountry', 'HoursPerWeek']
Knn-Done

1 0.8023 ['CapitalGain']
Mode-Done

1 0.7563 ['Workclass']
2 0.7671 ['Workclass', 'Relationship']
3 0.8134 ['Workclass', 'Relationship', 'Occupation']
4 0.8159 ['Workclass', 'Relationship', 'Occupation', 'Race']
5 0.8173 ['Workclass', 'Relationship', 'Occupation', 'Race', 'NativeCountry']
DropNa-Done



In [0]:
# KNN Feature Selection, run once per dataset variant (knn-imputed,
# mode-imputed, dropna). Each call prints one line per accepted round and
# returns (selected feature labels, best accuracy).
knnSetFS_dataKnn, knnAccuracyFS_dataKnn = knnFeatureSelection(*dataKnn, featDict)
print("Knn-Done\n")
knnSetFS_dataMode, knnAccuracyFS_dataMode = knnFeatureSelection(*dataMode, featDict)
print("Mode-Done\n")
knnSetFS_dataDropNa, knnAccuracyFS_dataDropNa = knnFeatureSelection(*dataDropNa, featDict)
print("DropNa-Done\n")

1 0.813 ['CapitalGain']
2 0.8335 ['CapitalGain', 'CapitalLoss']
3 0.8339 ['CapitalGain', 'CapitalLoss', 'Relationship']


# Feature Removal Algorithm

In [0]:
# accuracyMethod: (trainingX, trainingY, testX, testY) => number
def featureRemoval(trainingX, trainingY, testX, testY, featureDict, accuracyMethod):
    """Greedy backward feature elimination.

    Starts from every column of `trainingX`, scores that full set as the
    baseline, then repeatedly drops the single feature of `featureDict` whose
    removal gives the highest accuracy. The search stops as soon as no
    removal beats the best accuracy so far, or when no feature is left.

    Parameters
    ----------
    trainingX, testX : DataFrames sharing the same columns.
    trainingY, testY : labels, passed through to `accuracyMethod` untouched.
    featureDict : maps a feature label to one column name (str) or to an
        iterable of column names.
    accuracyMethod : callable(trainingX, trainingY, testX, testY) -> number.

    Returns
    -------
    (remaining feature labels, best accuracy reached)

    Prints the score of every candidate removal (tab-indented) and one
    summary line for the baseline and for each accepted removal.
    """
    # KEYS in featureDict - the actual features still in play
    selectedFeatureLabels = list(featureDict.keys())

    # The data columns still in play
    selectedFeatureColumns = list(trainingX.columns)

    # Score the full feature set first. Starting from 0 instead would make the
    # first removal always "win", even when it hurts accuracy.
    bestAccuracyAllTime = 0
    if selectedFeatureColumns:
        bestAccuracyAllTime = accuracyMethod(
            trainingX[selectedFeatureColumns], trainingY,
            testX[selectedFeatureColumns], testY,
        )
        print(round(bestAccuracyAllTime, 4), selectedFeatureLabels)

    while len(selectedFeatureLabels) > 0:

        # Best candidate of the current round
        bestAccuracy = 0
        bestFeatureLabels = selectedFeatureLabels
        bestFeatureColumns = selectedFeatureColumns

        # Try removing each remaining feature in turn
        for f in selectedFeatureLabels:

            # Columns owned by f. A numeric feature maps to a single name, and
            # is wrapped so membership is an exact match, not a substring test.
            mapped = featureDict[f]
            ownColumns = {mapped} if type(mapped) == str else set(mapped)

            # A fresh list per candidate: everything selected except f's columns
            featureCols = [col for col in selectedFeatureColumns if col not in ownColumns]
            if not featureCols:
                # Nothing would be left to train on
                continue

            # Compare by value; identity is not guaranteed for equal strings
            featureLabels = [s for s in selectedFeatureLabels if s != f]

            subsetTrainingX = trainingX[featureCols]
            subsetTestingX = testX[featureCols]

            # Run accuracy test
            accuracy = accuracyMethod(subsetTrainingX, trainingY, subsetTestingX, testY)
            print('\t' + str(f) + ' ' + str(accuracy))

            # Strict comparison: on a tie the earlier candidate wins
            if accuracy > bestAccuracy:
                bestAccuracy = accuracy
                bestFeatureLabels = featureLabels
                bestFeatureColumns = featureCols

        # Accept the removal only when it improves on everything seen so far
        if bestAccuracy > bestAccuracyAllTime:
            bestAccuracyAllTime = bestAccuracy
            selectedFeatureLabels = bestFeatureLabels
            selectedFeatureColumns = bestFeatureColumns
            print(round(bestAccuracyAllTime, 4), selectedFeatureLabels)

        else:
            # We've reached the best features we could have gotten
            break

    return (selectedFeatureLabels, bestAccuracyAllTime)

## Callable functions - FR

In [0]:
# KNN feature Removal
def knnFeatureRemoval(trainingX, trainingY, testX, testY, featDict):
    """Backward feature removal scored by the bagged KNN classifier.

    All arguments are forwarded unchanged to `featureRemoval`, with
    `knnAccuracy` as the scoring callback; its result is returned as-is.
    """
    return featureRemoval(
        trainingX, trainingY, testX, testY, featDict,
        accuracyMethod=lambda trX, trY, teX, teY: knnAccuracy(trX, trY, teX, teY),
    )

In [0]:
# Logistic Regression feature Removal
def loRegFeatureRemoval(trainingX, trainingY, testX, testY, featDict):
    """Backward feature removal scored by the bagged logistic regression.

    All arguments are forwarded unchanged to `featureRemoval`, with
    `loRegAccuracy` as the scoring callback; its result is returned as-is.
    """
    return featureRemoval(
        trainingX, trainingY, testX, testY, featDict,
        accuracyMethod=lambda trX, trY, teX, teY: loRegAccuracy(trX, trY, teX, teY),
    )

In [0]:
# Random Forest feature Removal
def rfFeatureRemoval(trainingX, trainingY, testX, testY, featDict):
    """Backward feature removal scored by the bagged random forest.

    All arguments are forwarded unchanged to `featureRemoval`, with
    `rfAccuracy` as the scoring callback; its result is returned as-is.
    """
    return featureRemoval(
        trainingX, trainingY, testX, testY, featDict,
        accuracyMethod=lambda trX, trY, teX, teY: rfAccuracy(trX, trY, teX, teY),
    )

In [0]:
# SVM feature Removal
def svmFeatureRemoval(trainingX, trainingY, testX, testY, featDict):
    """Backward feature removal scored by the bagged linear SVM.

    All arguments are forwarded unchanged to `featureRemoval`, with
    `svmAccuracy` as the scoring callback; its result is returned as-is.
    """
    return featureRemoval(
        trainingX, trainingY, testX, testY, featDict,
        accuracyMethod=lambda trX, trY, teX, teY: svmAccuracy(trX, trY, teX, teY),
    )

### Running Feature Removal

In [0]:
# LoReg Feature Removal, run once per dataset variant (knn-imputed,
# mode-imputed, dropna). Each call returns
# (remaining feature labels, best accuracy).
loregSetFR_dataKnn, loregAccuracyFR_dataKnn = loRegFeatureRemoval(*dataKnn, featDict)
print("Knn-Done\n")
loregSetFR_dataMode, loregAccuracyFR_dataMode = loRegFeatureRemoval(*dataMode, featDict)
print("Mode-Done\n")
loregSetFR_dataDropNa, loregAccuracyFR_dataDropNa = loRegFeatureRemoval(*dataDropNa, featDict)
print("DropNA-Done\n")

In [0]:
# Random Forest Feature Removal, run once per dataset variant (knn-imputed,
# mode-imputed, dropna). Each call returns
# (remaining feature labels, best accuracy).
rfSetFR_dataKnn, rfAccuracyFR_dataKnn = rfFeatureRemoval(*dataKnn, featDict)
print("Knn-Done\n")
rfSetFR_dataMode, rfAccuracyFR_dataMode = rfFeatureRemoval(*dataMode, featDict)
print("Mode-Done\n")
rfSetFR_dataDropNa, rfAccuracyFR_dataDropNa = rfFeatureRemoval(*dataDropNa, featDict)
print("DropNA-Done\n")

In [0]:
# SVM Feature Removal, run once per dataset variant (knn-imputed,
# mode-imputed, dropna). Each call returns
# (remaining feature labels, best accuracy).
svmSetFR_dataKnn, svmAccuracyFR_dataKnn = svmFeatureRemoval(*dataKnn, featDict)
print("Knn-Done\n")
svmSetFR_dataMode, svmAccuracyFR_dataMode = svmFeatureRemoval(*dataMode, featDict)
print("Mode-Done\n")
svmSetFR_dataDropNa, svmAccuracyFR_dataDropNa = svmFeatureRemoval(*dataDropNa, featDict)
print("DropNA-Done\n")

In [0]:
# KNN Feature Removal, run once per dataset variant (knn-imputed,
# mode-imputed, dropna). Each call returns
# (remaining feature labels, best accuracy).
knnSetFR_dataKnn, knnAccuracyFR_dataKnn = knnFeatureRemoval(*dataKnn, featDict)
print("Knn-Done\n")
knnSetFR_dataMode, knnAccuracyFR_dataMode = knnFeatureRemoval(*dataMode, featDict)
print("Mode-Done\n")
knnSetFR_dataDropNa, knnAccuracyFR_dataDropNa = knnFeatureRemoval(*dataDropNa, featDict)
print("DropNA-Done\n")