## Genetic Algorithms for Feature Selection


## Libraries

In [31]:
import pandas as pd
import numpy as np
from numpy import NaN
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import confusion_matrix,recall_score,accuracy_score,f1_score,precision_score,classification_report

## Loading the DataSet / Preprocessing


In [32]:
# Read the training workbook into the DataFrame used by every later cell.
df = pd.read_excel("Training_Data.xlsx")

### Renaming the "Depression " column (removing the trailing space from its name)

In [33]:
# The source header is 'Depression ' (with a trailing space); map it to 'Depression'.
df = df.rename(columns={'Depression ': 'Depression'})

### Checking NaN values

In [34]:
# Per-column count of missing values (displayed as the cell output).
df.isnull().sum(axis="index")

PatientId                                0
EncounterId                              0
DischargeDisposision                     0
Gender                                   0
Race                                    93
DiabetesMellitus                      3857
ChronicKidneyDisease                  3906
Anemia                                3002
Depression                            5108
ChronicObstructivePulmonaryDisease    3954
Age                                      0
ChronicDiseaseCount                      0
LengthOfStay                             0
EmergencyVisit                           0
InpatientVisit                           0
OutpatientVisit                          0
TotalVisits                              0
BMIMin                                   0
BMIMax                                   0
BMIMedian                                0
BMIMean                                  0
BPDiastolicMin                           0
BPDiastolicMax                           0
BPDiastolic

### Replacing the labels in "Depression", "ChronicObstructivePulmonaryDisease" ("COPD"), "DiabetesMellitus" ("DM"), "ChronicKidneyDisease" ("CKD") and "Anemia" with 1 ("Yes") and NaN with 0 ("No")
Each of these columns is already named after its condition, so its values (the condition label or NaN) are converted into a yes/no indicator.

In [35]:
# Turn each condition column into a 1/0 indicator: the condition's label
# becomes 1 and NaN becomes 0.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on a column selection: that chained in-place form
# is deprecated and leaves `df` unchanged when pandas Copy-on-Write is active.
condition_labels = {
    'Depression': 'Depression ',
    'ChronicObstructivePulmonaryDisease': 'COPD',
    'DiabetesMellitus': 'DM',
    'ChronicKidneyDisease': 'CKD',
    'Anemia': 'Anemia',
}
for condition_column, present_label in condition_labels.items():
    df[condition_column] = df[condition_column].replace([present_label, np.nan], [1, 0])

### Changing the NaN values in Race to "UnKnown", as unknown is already a category

In [36]:
# Missing Race values become the 'UnKnown' category. Assign the result back:
# an in-place replace on the `df.Race` selection is deprecated chained mutation
# and does not update `df` when pandas Copy-on-Write is active.
df['Race'] = df['Race'].fillna('UnKnown')

### Fixing Encounter ID to int

In [37]:
# Convert every EncounterId to a Python int. An id that is not purely numeric
# has each alphabetic character replaced by its Unicode code point (ord) before
# the conversion, e.g. 'A12' -> '6512' -> 6512.
values = np.array(df["EncounterId"])
for position, encounter in enumerate(values):
    converted = encounter
    if not str.isnumeric(str(encounter)):
        # Walk the original text; replace() substitutes every occurrence of
        # the letter at once, so repeated letters are handled on first sight.
        for ch in encounter:
            if str.isalpha(ch):
                converted = converted.replace(ch, str(ord(ch)))
    values[position] = int(converted)

df["EncounterId"] = values

### Replacing zero values with the column mean (computed over the non-zero entries)

In [38]:
# Measurement columns in which a 0 is replaced by the column mean.
List = ['BMIMin', 'BMIMax', 'BMIMedian', 'BMIMean',
        'BPDiastolicMin', 'BPDiastolicMax', 'BPDiastolicMedian','BPDiastolicMean',
        'BPSystolicMin', 'BPSystolicMax', 'BPSystolicMedian','BPSystolicMean',
        'TemperatureMin', 'TemperatureMax', 'TemperatureMedian', 'TemperatureMean',
        'HeartRateMin', 'HeartRateMax','HeartRateMedian', 'HeartRateMean',
        'PulseRateMin', 'PulseRateMax','PulseRateMedian', 'PulseRateMean',
        'RespiratoryRateMin','RespiratoryRateMax', 'RespiratoryRateMedian', 'RespiratoryRateMean',
        'CardiacTroponin','Hemoglobin', 'SerumSodium', 'SerumCreatinine',
        'BNP', 'NT-proBNP']

for i in List:
    arr = np.array(df[i])
    arr = arr[arr != 0]
    if arr.size == 0:
        # Every value is 0: there is no non-zero mean to substitute, and
        # replacing with the NaN mean of an empty array would corrupt the column.
        continue
    M = arr.mean()
    # Assign back instead of df[i].replace(..., inplace=True): the chained
    # in-place form is deprecated and is a no-op under pandas Copy-on-Write.
    df[i] = df[i].replace(0, M)


### Label Encoding

In [39]:
# Categorical columns (the target 'ReadmissionWithin_90Days' included) that
# are converted to integer codes.
List = ['DischargeDisposision', 'Gender', 'Race','ReadmissionWithin_90Days']

# One encoder object is reused; fit_transform re-fits it on every column, so
# after the loop it holds the classes of the last column in List.
label_encoder = preprocessing.LabelEncoder()

for categorical_column in List:
    df[categorical_column] = label_encoder.fit_transform(df[categorical_column])

## Genetic Algorithm

### Populate Chromosomes
    Populating n distinct chromosomes, where n = ChromoCount

In [40]:
def populate(ChromoCount, n_genes = 56):
    """Build an initial population of distinct random 0/1 chromosomes.

    Parameters
    ----------
    ChromoCount : int
        Number of distinct chromosomes to generate.
    n_genes : int, optional
        Length of every chromosome (one gene per candidate feature).
        Defaults to 56, the length that was previously hard-coded.

    Returns
    -------
    list[list[int]]
        ``ChromoCount`` pairwise-distinct lists of 0/1 values.

    Raises
    ------
    ValueError
        If more distinct chromosomes are requested than exist for
        ``n_genes`` bits (the sampling loop could never finish).
    """
    if ChromoCount > 2 ** n_genes:
        raise ValueError(
            f"cannot build {ChromoCount} distinct chromosomes of {n_genes} genes"
        )

    List = []
    # Rejection sampling: draw random bit strings and keep only unseen ones.
    while len(List) < ChromoCount:
        x = np.random.randint(2, size = n_genes, dtype = int).tolist()
        if x not in List:
            List.append(x)
    return List

### Crossover function
    Creates 2 children per iteration from two randomly chosen parents and appends each child only if it does not already exist.
    Chromosomes is the current 2D list of chromosomes; it is filled until the number of chromosomes equals ChromoCount.
   

In [41]:
def crossover(Chromosomes,ChromoCount):
    """Refill the population to ``ChromoCount`` using single-point crossover.

    Two distinct parents are drawn at random from the chromosomes present on
    entry, a cut point is drawn, and the two complementary children are
    appended when they are not already in the population. New children are
    appended to the passed ``Chromosomes`` list itself.

    Parameters
    ----------
    Chromosomes : list[list[int]]
        Current population; its entries on entry act as the parent pool.
    ChromoCount : int
        Desired population size.

    Returns
    -------
    list[list[int]]
        The first ``ChromoCount`` chromosomes (parents first, then children).

    Raises
    ------
    ValueError
        If children are needed but there are fewer than two parents, or the
        chromosomes have fewer than two genes (no cut point exists).
    """
    if len(Chromosomes) >= ChromoCount:
        return Chromosomes[:ChromoCount]

    x = len(Chromosomes)      # number of parents
    if x < 2:
        # With a single parent the "pick a different second parent" loop
        # below could never terminate.
        raise ValueError("crossover needs at least two parent chromosomes")
    y = len(Chromosomes[0])   # number of genes
    if y < 2:
        raise ValueError("chromosomes need at least two genes to be crossed over")

    # NOTE: the loop ends only once enough previously unseen children exist.
    while len(Chromosomes) < ChromoCount:
        P1 = np.random.randint(x)
        P2 = np.random.randint(x)
        while P1 == P2:
            P2 = np.random.randint(x)
        # Cut point in 1..y-1 (randint's upper bound is exclusive), so both
        # halves are non-empty and the last cut position is reachable.
        RC = np.random.randint(1, y)
        child1 = Chromosomes[P1][:RC] + Chromosomes[P2][RC:]
        child2 = Chromosomes[P2][:RC] + Chromosomes[P1][RC:]
        if child1 not in Chromosomes:
            Chromosomes.append(child1)
        if child2 not in Chromosomes:
            Chromosomes.append(child2)

    # The second child of the final pair may overshoot the requested size.
    return Chromosomes[:ChromoCount]

### Mutation
    Inverts one bit (0 to 1 and vice versa) at a random row and column

In [42]:
def Mutate(Chromosomes):
    """Flip one randomly chosen gene of the population in place.

    A row (chromosome) and a column (gene) are drawn uniformly at random; a
    gene equal to 1 becomes 0 and any other value becomes 1.

    Returns
    -------
    tuple
        ``(Chromosomes, row)``: the same list object that was passed in and
        the index of the chromosome that was altered.
    """
    n_rows = len(Chromosomes)
    n_cols = len(Chromosomes[0])
    row = np.random.randint(n_rows)
    column = np.random.randint(n_cols)

    chosen = Chromosomes[row]
    chosen[column] = 0 if chosen[column] == 1 else 1

    return Chromosomes, row

### Training 
    Receives two arguments (X, Y), where X holds the DataFrame columns to train on and Y is the
    label (target)

In [43]:
def train_(X,Y):
    """Return the hold-out accuracy (in percent) of AdaBoost trained on ``X``.

    The data is split 80/20 with a fixed ``random_state`` and an
    ``AdaBoostClassifier`` is fitted on the training part.

    Parameters
    ----------
    X : feature columns to train on.
    Y : label (target) values aligned with ``X``.

    Returns
    -------
    float
        Accuracy on the 20% test split, scaled to 0-100.
    """
    X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=0)
    # Seed the classifier as well as the split, so the same feature subset
    # always receives the same score.
    ADB = AdaBoostClassifier(random_state=0)
    ADB.fit(X_train, y_train)
    pred = ADB.predict(X_test)

    return accuracy_score(y_test,pred)*100

### GA (Genetic Algorithm)
    Receives up to three arguments: DF is the DataFrame whose columns are to be selected from,
    ChromoCount is the maximum number of chromosomes in each iteration (40 by default), and
    iterations is the maximum number of iterations.

In [44]:
def GA(DF, ChromoCount = 40, iterations = 300):
    """Select a feature subset of ``DF`` with a genetic algorithm.

    A chromosome is a 0/1 list with one gene per feature column; a gene equal
    to 1 keeps that column. The fitness of a chromosome is the accuracy that
    ``train_`` reports for the kept columns against
    ``DF["ReadmissionWithin_90Days"]``. Each generation keeps the best
    ``ChromoCount // 2`` chromosomes, refills the population with
    ``crossover`` and, with probability 0.25, flips one gene via ``Mutate``.

    Parameters
    ----------
    DF : pandas.DataFrame
        Data to select features from. Every column except the last one is a
        candidate feature (the last column is assumed to be the target).
    ChromoCount : int, optional
        Population size per generation (default 40).
    iterations : int, optional
        Maximum number of generations (default 300).

    Returns
    -------
    tuple
        ``(chromosome, accuracy)``: the top-ranked chromosome when the search
        stops early, otherwise the best chromosome seen during the run.

    Raises
    ------
    ValueError
        If the chromosome length produced by ``populate`` differs from the
        number of feature columns in ``DF``.
    """
    target = "ReadmissionWithin_90Days"
    CS = populate(ChromoCount)      # CS = chromosomes
    # Use the DataFrame that was passed in (not the notebook-level `df`);
    # [:-1] drops the target column.
    columns = np.array(DF.columns)[:-1]
    if CS and len(CS[0]) != len(columns):
        raise ValueError(
            f"chromosome length {len(CS[0])} does not match the "
            f"{len(columns)} feature columns of DF"
        )

    def fitness(chromosome):
        # Score the columns whose gene is 1.
        selected = columns[np.array(chromosome) == 1]
        return train_(DF[selected], DF[target])

    Last5 = []                      # best accuracy of the last five generations
    accuracy = []                   # accuracy[k] is the score of CS[k]
    maxone = [1,0,1]                # placeholder, returned only if nothing scores above 0
    maxi = 0
    for i in range(iterations):
        # Score only the chromosomes that have no cached accuracy yet.
        for j in range(len(accuracy), len(CS)):
            acc = fitness(CS[j])
            if acc > maxi:
                maxi = acc
                # Copy: Mutate edits population lists in place, which would
                # otherwise silently alter the remembered best chromosome.
                maxone = list(CS[j])
            accuracy.append(acc)
        tempdf = pd.DataFrame({"Chromosome":CS,"Accuracy":accuracy})
        tempdf = tempdf.sort_values(by = ["Accuracy"],ascending=False)

        # Selection: keep the better half together with its scores.
        CS = list(np.array(tempdf["Chromosome"].head(ChromoCount//2)))
        accuracy = list(np.array(tempdf["Accuracy"].head(ChromoCount//2),dtype='float64'))

        Last5.append(accuracy[0])
        if len(Last5) > 5:
            Last5.pop(0)

        # NOTE(review): with the default iterations=300, i never exceeds 299,
        # so the plateau test can only fire when more iterations are requested.
        if (len(Last5)==5 and np.std(Last5) <= 0.001 and i > 300) or (accuracy[0] >= 80):
            return CS[0], accuracy[0]
        if (iterations - i) != 1:             # don't crossover / mutate after the last generation

            CS = crossover(CS,ChromoCount)
            threshold = 0.75
            anynum = np.random.uniform(0,1)
            if anynum > threshold:
                CS,r = Mutate(CS)
                if r < len(accuracy):
                    # An already scored chromosome changed: refresh its score
                    # and the best-so-far record.
                    accuracy[r] = fitness(CS[r])
                    if accuracy[r] > maxi:
                        maxi = accuracy[r]
                        maxone = list(CS[r])

    return maxone, maxi

In [45]:
%%time
CS,ACCURACY = GA(df)

CPU times: user 1h 5min 7s, sys: 45.6 s, total: 1h 5min 52s
Wall time: 1h 5min 56s


In [47]:
# Decode the returned chromosome: a gene equal to 1 keeps the feature column
# at the same position (the last column of df is left out of the candidates).
columns = np.array(df.columns)[:-1]
selected = np.array(CS)
selectedColumns = columns[np.equal(selected, 1)]

print(f'The Selected Columns are:\n{selectedColumns} \n Accuracy:{ACCURACY}')

The Selected Columns are:
['EncounterId' 'DischargeDisposision' 'ChronicKidneyDisease' 'Anemia'
 'Depression' 'ChronicObstructivePulmonaryDisease' 'Age'
 'ChronicDiseaseCount' 'LengthOfStay' 'EmergencyVisit' 'InpatientVisit'
 'OutpatientVisit' 'TotalVisits' 'BMIMax' 'BMIMedian' 'BPDiastolicMin'
 'BPDiastolicMedian' 'BPDiastolicMean' 'BPSystolicMin' 'BPSystolicMax'
 'BPSystolicMedian' 'BPSystolicMean' 'TemperatureMean' 'HeartRateMean'
 'PulseRateMax' 'PulseRateMedian' 'PulseRateMean' 'RespiratoryRateMedian'
 'ACEInhibitors' 'TotalMedicine' 'CardiacTroponin' 'SerumSodium' 'BNP'
 'NT-proBNP'] 
 Accuracy:75.36829699469652
