# Import the necessary libraries

In [None]:
import random
from random import randint
from typing import Any, List, Tuple

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Breast Cancer Dataset



---


*   read the dataset into a pandas DataFrame and split it into train and test sets
*   defined the list of classifiers used for performance analysis
*   printed the size of each split


---




In [None]:
# Load the raw table; the path points at a mounted Google Drive folder.
df = pd.read_csv("/content/drive/MyDrive/test_data/data.csv")

# Binary target: 1 where the diagnosis label is 'M', 0 for every other label.
y = np.where(df["diagnosis"] == 'M', 1, 0)

# Keep only the feature columns: drop the identifier, the label itself and
# the "Unnamed: 32" column.
df.drop(columns=["id", "diagnosis", "Unnamed: 32"], inplace=True)

# Display names for the models below, listed in the same order as models_list.
classifiers = [
    'LinearSVM',
    'RadialSVM',
    'Logistic',
    'RandomForest',
    'NaiveBayes',
    'DecisionTree',
    'knn',
]

# One estimator per entry of `classifiers`; seeds are fixed where the
# estimator accepts one so repeated runs are comparable.
models_list = [
    svm.SVC(kernel='linear'),
    svm.SVC(kernel='rbf'),
    LogisticRegression(max_iter=4000),
    RandomForestClassifier(n_estimators=200, random_state=0),
    GaussianNB(),
    DecisionTreeClassifier(random_state=0),
    KNeighborsClassifier(),
]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.20,
    random_state=42,
)

# Report the shape of every split.
print(f'Total X_train data : {X_train.shape}\n')
print(f'Total X_test data : {X_test.shape}\n')
print(f'Total y_train data : {y_train.shape}\n')
print(f'Total y_test data : {y_test.shape}')

Total X_train data : (455, 30)

Total X_test data : (114, 30)

Total y_train data : (455,)

Total y_test data : (114,)


# Probability Vector Function

---


*   creates an array whose size equals the number of features
*   initially fills every cell of the array with 0.5




---



In [None]:
def probability_vector(no_of_features):
    """Build the starting probability vector.

    Every feature begins with a selection probability of 0.5.

    Args:
        no_of_features: Number of entries in the vector.

    Returns:
        A Python list of length ``no_of_features`` whose items are NumPy
        float64 values, all equal to 0.5.
    """
    return list(np.full(no_of_features, 0.5))

# Initial population function



---


*   the size parameter indicates the size of the population.
*   no_of_features indicates the number of features present in the dataset.
*   roughly 30% of the features (int(no_of_features * 0.3)) are randomly set to False, whereas the remaining ~70% stay True.
*   returns a list of boolean chromosomes of the desired size.



---

In [None]:
def initial_population(size: int, no_of_features: int,
                       false_fraction: float = 0.3) -> List[np.ndarray]:
    """Create a random population of boolean feature masks.

    Each chromosome is a boolean array with one entry per feature. Exactly
    ``int(no_of_features * false_fraction)`` entries are False (feature
    dropped) and the rest are True (feature kept); the positions of the
    False entries are randomised independently for every chromosome.

    Args:
        size: Number of chromosomes to generate.
        no_of_features: Length of each chromosome.
        false_fraction: Share of features switched off in every chromosome.
            Defaults to 0.3, the value that was previously hard-coded.

    Returns:
        A list of ``size`` boolean NumPy arrays of length ``no_of_features``.

    Raises:
        ValueError: If ``false_fraction`` is outside the range [0, 1].
    """
    if not 0.0 <= false_fraction <= 1.0:
        raise ValueError("false_fraction must be between 0 and 1")

    disabled_count = int(no_of_features * false_fraction)

    population = []
    for _ in range(size):
        chromosome = np.ones(no_of_features, dtype=bool)
        # Switch off a fixed-size prefix, then shuffle in place so the
        # disabled features land at random positions.
        chromosome[:disabled_count] = False
        np.random.shuffle(chromosome)
        population.append(chromosome)

    return population

# Fitness Function



---


*   calculates the fitness of each individual in the population using accuracy_score
*   of the first two individuals, the one with the higher accuracy is declared the winner and the other the loser


---




In [None]:
def fitness_function(population: list, model: Any) -> Tuple[np.ndarray, np.ndarray]:
    """Score chromosomes with ``model`` and pick a winner and a loser.

    For every chromosome the model is fitted on the columns of the
    module-level ``X_train`` selected by the boolean mask and scored with
    ``accuracy_score`` on the same columns of ``X_test``. Only the first two
    chromosomes take part in the comparison.

    Args:
        population: Boolean feature masks; must hold at least two entries.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            refitted in place for every chromosome.

    Returns:
        ``(winner, loser)``: the first two chromosomes ordered by accuracy.
        On a tie the second chromosome is returned as the winner.

    Raises:
        IndexError: If ``population`` has fewer than two chromosomes.
    """
    scores_list = []

    for chromosome in population:
        model.fit(X_train.iloc[:, chromosome], y_train)
        predicted = model.predict(X_test.iloc[:, chromosome])
        scores_list.append(accuracy_score(y_test, predicted))

    first, second = population[0], population[1]

    # Strict comparison: equal scores make the second chromosome the winner.
    if scores_list[0] > scores_list[1]:
        return first, second
    return second, first

# Function to update the probability vector

In [None]:
def probability_vector_updater(winner, loser, prob, no_of_features, Total_population):
    """Shift the probability vector towards the winning chromosome.

    For every feature on which ``winner`` and ``loser`` disagree, the
    probability moves by ``1 / Total_population``: up when the winner has the
    feature switched on, down when it has it switched off. Features on which
    both agree are left untouched.

    The result is clamped to [0, 1]. Checking the bound only before adding
    the step lets a value overshoot it (for example 0.9 + 1/3), so the clamp
    is applied to the stepped value instead. ``prob`` itself is not modified;
    a new array is returned.

    Args:
        winner: Boolean mask of the better chromosome.
        loser: Boolean mask of the worse chromosome.
        prob: Current probabilities (list or NumPy array).
        no_of_features: Number of leading entries to process.
        Total_population: Population size; the step is its reciprocal.

    Returns:
        A new NumPy float array of length ``no_of_features`` holding the
        updated probabilities.

    Raises:
        ZeroDivisionError: If ``Total_population`` is 0.
    """
    step = 1 / Total_population

    winner_bits = np.asarray(winner[:no_of_features], dtype=bool)
    loser_bits = np.asarray(loser[:no_of_features], dtype=bool)
    # np.array copies, so the caller's vector stays untouched.
    updated = np.array(prob[:no_of_features], dtype=float)

    disagree = winner_bits != loser_bits
    raise_mask = disagree & winner_bits
    lower_mask = disagree & ~winner_bits

    updated[raise_mask] = np.minimum(updated[raise_mask] + step, 1.0)
    updated[lower_mask] = np.maximum(updated[lower_mask] - step, 0.0)

    return updated

# Function to iterate and update the probabilities in the probability vector

In [None]:
def probability_vector_iterator(no_of_features, iteration, total_population, model=None):
    """Repeatedly update the probability vector from pairwise contests.

    A fixed random population is generated once. In every iteration two
    neighbouring chromosomes are drawn from it, compared with
    ``fitness_function``, and the probability vector is nudged towards the
    winner by ``probability_vector_updater``.

    Args:
        no_of_features: Number of features (length of each chromosome).
        iteration: Number of pairwise contests to run.
        total_population: Size of the population; also sets the update step
            (``1 / total_population``).
        model: Estimator used to score chromosomes. Defaults to
            ``models_list[5]`` (the decision tree), the model that was
            previously hard-coded.

    Returns:
        The probability vector after the last update. When ``iteration`` is
        0 this is the untouched initial vector from ``probability_vector``.

    Raises:
        ValueError: If contests are requested but ``total_population`` is
            smaller than 2, so no pair of chromosomes can be drawn.
    """
    if iteration > 0 and total_population < 2:
        raise ValueError(
            "total_population must be at least 2 to draw a pair of chromosomes"
        )

    if model is None:
        model = models_list[5]

    prob = probability_vector(no_of_features)
    population = initial_population(total_population, no_of_features)

    for _ in range(iteration):
        # randint's upper bound is exclusive, so index + 1 is always a valid
        # position in the population.
        index = np.random.randint(0, total_population - 1)
        contenders = [population[index], population[index + 1]]
        winner, loser = fitness_function(contenders, model)
        prob = probability_vector_updater(winner, loser, prob, no_of_features, total_population)

    return prob

probability_vector_iterator(30, 390, 200)

array([0.725, 0.47 , 0.405, 0.71 , 0.51 , 0.495, 0.175, 0.57 , 0.475,
       0.425, 0.495, 0.285, 0.415, 0.59 , 0.48 , 0.47 , 0.475, 0.9  ,
       0.385, 0.58 , 0.585, 0.715, 0.56 , 0.58 , 0.235, 0.43 , 0.455,
       0.19 , 0.545, 0.67 ])