

# Import the necessary libraries



In [None]:
import random
from random import randint
from typing import Any, List, Tuple

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Breast Cancer Dataset


*   read the dataset into a pandas DataFrame and split it into training and test sets
*   defined the list of classifiers used for the performance analysis
*   the sizes of the resulting splits are shown below



In [None]:
# Load the raw CSV; every later cell reads the module-level `df` and `y`.
df = pd.read_csv("/content/drive/MyDrive/test_data/data.csv")

# Binary target: 1 where diagnosis == 'M', 0 for every other value.
y = np.where(df["diagnosis"] == 'M', 1, 0)

# Remove the identifier, the target and the "Unnamed: 32" column so that only
# feature columns remain in `df`.
df.drop(columns=["id", "diagnosis", "Unnamed: 32"], inplace=True)

# Display names and estimators are declared side by side and then unzipped,
# so `classifiers[i]` always labels `models_list[i]`.
classifiers, models_list = map(list, zip(
    ('LinearSVM', svm.SVC(kernel='linear')),
    ('RadialSVM', svm.SVC(kernel='rbf')),
    ('Logistic', LogisticRegression(max_iter=4000)),
    ('RandomForest', RandomForestClassifier(n_estimators=200, random_state=0)),
    ('NaiveBayes', GaussianNB()),
    ('DecisionTree', DecisionTreeClassifier(random_state=0)),
    ('knn', KNeighborsClassifier()),
))

# Fixed 80/20 hold-out split shared by the genetic-algorithm cells below.
X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.20, random_state=42
)

print(f'Total X_train data : {X_train.shape}\n')
print(f'Total X_test data : {X_test.shape}\n')
print(f'Total y_train data : {y_train.shape}\n')
print(f'Total y_test data : {y_test.shape}')

Total X_train data : (455, 30)

Total X_test data : (114, 30)

Total y_train data : (455,)

Total y_test data : (114,)


# Function to determine best classifier

In [None]:
def best_classifier(X, y):
    """Fit every estimator in ``models_list`` and rank them by accuracy.

    The data is split 80/20 (``random_state=42``); each estimator from the
    module-level ``models_list`` is fitted on the training part and scored
    on the held-out part.

    Note: the estimators in ``models_list`` are fitted in place, so they
    remain trained on this split after the call.

    Args:
        X: Feature table passed to ``train_test_split``.
        y: Target labels aligned with ``X``.

    Returns:
        A DataFrame with columns ``Classifier`` (names taken from the
        module-level ``classifiers`` list) and ``Accuracy`` (percentage),
        sorted from most to least accurate.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )

    scores_list = []
    for model in models_list:
        model.fit(X_train, y_train)
        predicted_labels = model.predict(X_test)
        # accuracy_score takes (y_true, y_pred); keep that order so swapping
        # in a non-symmetric metric later does not silently break.
        scores_list.append(accuracy_score(y_test, predicted_labels) * 100)

    # Use a distinct local name so the module-level `df` is not shadowed.
    results = pd.DataFrame({"Classifier": classifiers, "Accuracy": scores_list})
    return results.sort_values(by="Accuracy", ascending=False)

# Rank every classifier using all remaining columns of `df` as features.
best_classifier(df, y)

Unnamed: 0,Classifier,Accuracy
4,NaiveBayes,97.368421
3,RandomForest,96.491228
0,LinearSVM,95.614035
2,Logistic,95.614035
6,knn,95.614035
1,RadialSVM,94.736842
5,DecisionTree,93.859649


# Initial population function



---


*   the `size` parameter sets the number of individuals in the population.
*   `no_of_features` is the number of features present in the dataset.
*   roughly 20% of the features are randomly set to False (excluded), while the remaining ~80% stay True (included).
*   returns a list of boolean chromosomes of the requested size.



---



In [None]:
def initial_population(size: int, no_of_features: int) -> list:
    """Build the starting population of boolean feature masks.

    Every chromosome has ``int(no_of_features * 0.2)`` genes set to False
    and the rest set to True; the positions are randomised with
    ``np.random.shuffle``.

    Args:
        size: Number of chromosomes to create.
        no_of_features: Length of each chromosome.

    Returns:
        A list of ``size`` boolean NumPy arrays of length ``no_of_features``.
    """
    n_disabled = int(no_of_features * 0.2)

    individuals = []
    for _ in range(size):
        mask = np.full(no_of_features, True)
        mask[:n_disabled] = False
        # Shuffle in place so the disabled genes land at random positions.
        np.random.shuffle(mask)
        individuals.append(mask)

    return individuals

# Fitness function


*   takes the population of feature masks returned by the initial_population function.
*   for each individual in the population, drops the features whose mask value is 'False'.
*   measures the accuracy of the model using only the active ('True') features.
*   stores the accuracy scores in scores_list.
*   sorts the individuals into fitted_population in descending order of accuracy score.
*   returns the sorted scores and the sorted fitted_population.



In [None]:
def fitness_function(population: list, model: Any) -> Tuple[np.ndarray, np.ndarray]:
    """Score every chromosome and return scores and chromosomes, best first.

    For each chromosome the model is fitted on the columns of the
    module-level ``X_train`` that the boolean mask selects and evaluated
    with ``accuracy_score`` on the same columns of ``X_test``.

    Args:
        population: Sequence of boolean masks, one per chromosome.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            refitted in place for every chromosome.

    Returns:
        ``(scores, chromosomes)``: the accuracy scores in descending order
        and the population as an array with rows in the matching order.
    """
    scores_list = []

    for chromosome in population:
        if not np.any(chromosome):
            # A mask that selects no column cannot be fitted; give it the
            # worst possible score instead of crashing the whole run.
            scores_list.append(0.0)
            continue
        model.fit(X_train.iloc[:, chromosome], y_train)
        predicted = model.predict(X_test.iloc[:, chromosome])
        scores_list.append(accuracy_score(y_test, predicted))

    scores = np.array(scores_list)
    fitted_population = np.array(population)
    # argsort is ascending; reversing the index vector gives best-first order.
    best_first = np.argsort(scores)[::-1]

    # Plain row indexing also works for an empty population, where the
    # array is one-dimensional.
    return scores[best_first], fitted_population[best_first]

# Selection Function


*   the size of the population is counted
*   the top 80% of the fitted population (rounded down to an even number) is kept for crossover



In [None]:
def selection_function(fitted_population: np.ndarray) -> list:
    """Keep the leading 80% of an already-ranked population.

    The number kept is ``int(0.8 * len(fitted_population))`` rounded down
    to an even number, so the survivors can be paired up for crossover.

    Args:
        fitted_population: Chromosomes ordered best first.

    Returns:
        A list holding the first ``updated_size`` chromosomes.
    """
    updated_size = int(0.8 * len(fitted_population))
    # Drop one survivor if the count is odd so every parent has a partner.
    updated_size -= updated_size % 2

    return list(fitted_population[:updated_size])

# Function to perform two point crossover

In [None]:
def multi_point_crossover(selected_population: list) -> list:
    """Recombine consecutive parents by swapping their second quarter.

    Parents are taken in pairs ``(0, 1), (2, 3), ...``. The cut points sit
    at one quarter and one half of the chromosome length; each child keeps
    its own parent's genes outside that window and receives the other
    parent's genes inside it.

    If the population has an odd number of members, the last one has no
    partner and is carried over as an unchanged copy instead of raising
    ``IndexError``.

    Args:
        selected_population: Sequence of equal-length 1-D arrays.

    Returns:
        A list of child chromosomes, the same length as the input.
    """
    new_population = []

    # zip stops at the shorter slice, so an unpaired last parent is skipped.
    pairs = zip(selected_population[0::2], selected_population[1::2])
    for parent_1, parent_2 in pairs:
        first_cut = len(parent_1) // 4
        second_cut = len(parent_1) // 2
        child_1 = np.concatenate(
            (parent_1[:first_cut], parent_2[first_cut:second_cut], parent_1[second_cut:])
        )
        child_2 = np.concatenate(
            (parent_2[:first_cut], parent_1[first_cut:second_cut], parent_2[second_cut:])
        )
        new_population.append(child_1)
        new_population.append(child_2)

    if len(selected_population) % 2:
        # Copy so later in-place changes cannot touch the caller's array.
        new_population.append(np.copy(selected_population[-1]))

    return new_population

# Function to perform mutation

In [None]:
def mutation(new_population: np.ndarray, prob: float, number_of_features) -> list:
    """Apply bit-flip mutation to every chromosome.

    Each gene is flipped independently with probability ``prob``, so a
    chromosome of length ``n`` receives ``prob * n`` flips on average.

    The previous implementation compared ``random.random()`` (always below
    1) with ``int(prob * number_of_features)``. That flipped one gene
    unconditionally whenever ``prob * number_of_features >= 1`` and
    practically never otherwise, so ``prob`` did not act as a probability.

    Args:
        new_population: Iterable of boolean chromosomes.
        prob: Per-gene flip probability; 0 leaves every gene unchanged and
            1 flips every gene.
        number_of_features: Kept for backward compatibility with existing
            callers; each chromosome's own length is used instead.

    Returns:
        A list of new boolean arrays. The input chromosomes are not modified.
    """
    mutated_population = []

    for chromosome in new_population:
        genes = np.asarray(chromosome, dtype=bool)
        # np.random.random draws from [0, 1), so `< prob` is True with
        # probability `prob` for each gene.
        flip_mask = np.random.random(genes.shape) < prob
        mutated_population.append(np.logical_xor(genes, flip_mask))

    return mutated_population

# generation_generator function


---




*   creates the initial population of the desired size
*   prints the fitness score of the best-fitted individual in the population
*   performs selection, crossover and mutation
*   moves on to the next generation


---



In [None]:
def generation_generator(size_of_population, number_of_features, model, number_of_generation, prob):
    """Run the genetic feature-selection loop and return the last population.

    Each generation scores the current population with ``fitness_function``,
    prints the best score, and then builds the next population through
    ``selection_function``, ``multi_point_crossover`` and ``mutation``.

    Args:
        size_of_population: Number of chromosomes in the initial population.
        number_of_features: Length of every chromosome.
        model: Estimator handed to ``fitness_function``.
        number_of_generation: Number of generations to run.
        prob: Mutation probability forwarded to ``mutation``.

    Returns:
        The population produced by the final mutation step. NOTE: this is
        the offspring of the last generation; it has not been re-scored or
        re-sorted, so its first element is not guaranteed to be the best.
    """
    population = initial_population(size_of_population, number_of_features)

    for generation in range(number_of_generation):
        fitness_score, fitted_population = fitness_function(population, model)
        # fitness_function returns scores in descending order, so the best
        # individual is at index 0 (index 1 reported the runner-up and
        # raised IndexError for a single-member population).
        print(f'The fitness score in generation : {generation + 1} is {fitness_score[0]}')
        selected_population = selection_function(fitted_population)
        new_population = multi_point_crossover(selected_population)
        population = mutation(new_population, prob, number_of_features)

    return population


# Run the genetic search: 200 chromosomes over 30 features, scored with
# models_list[5] (the DecisionTreeClassifier), for 5 generations, prob = 0.08.
final_population = generation_generator(200, 30, models_list[5], 5, 0.08)

The fitness score in generation : 1 is 0.9736842105263158
The fitness score in generation : 2 is 0.9649122807017544
The fitness score in generation : 3 is 0.9736842105263158
The fitness score in generation : 4 is 0.9736842105263158
The fitness score in generation : 5 is 0.9736842105263158


# best_features function



---


*   selects from the dataset the feature columns chosen by the first individual of the final population
*   drops the remaining (unselected) feature columns


---




In [None]:
def best_features(final_population):
    """Return the columns of the module-level ``df`` picked by the first chromosome.

    Args:
        final_population: Sequence of boolean masks; only element 0 is used.

    Returns:
        A DataFrame with just the columns whose mask value is True.
    """
    leading_mask = final_population[0]
    return df.iloc[:, leading_mask]

# Display the columns of `df` selected by the first chromosome of the final population.
best_features(final_population)

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,concavity_mean,concave points_mean,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,concavity_worst,symmetry_worst
0,17.99,10.38,122.80,1001.0,0.30010,0.14710,0.9053,8.589,153.40,0.006399,0.04904,0.01587,0.03003,0.006193,25.380,17.33,184.60,2019.0,0.7119,0.4601
1,20.57,17.77,132.90,1326.0,0.08690,0.07017,0.7339,3.398,74.08,0.005225,0.01308,0.01340,0.01389,0.003532,24.990,23.41,158.80,1956.0,0.2416,0.2750
2,19.69,21.25,130.00,1203.0,0.19740,0.12790,0.7869,4.585,94.03,0.006150,0.04006,0.02058,0.02250,0.004571,23.570,25.53,152.50,1709.0,0.4504,0.3613
3,11.42,20.38,77.58,386.1,0.24140,0.10520,1.1560,3.445,27.23,0.009110,0.07458,0.01867,0.05963,0.009208,14.910,26.50,98.87,567.7,0.6869,0.6638
4,20.29,14.34,135.10,1297.0,0.19800,0.10430,0.7813,5.438,94.44,0.011490,0.02461,0.01885,0.01756,0.005115,22.540,16.67,152.20,1575.0,0.4000,0.2364
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.24390,0.13890,1.2560,7.673,158.70,0.010300,0.02891,0.02454,0.01114,0.004239,25.450,26.40,166.10,2027.0,0.4107,0.2060
565,20.13,28.25,131.20,1261.0,0.14400,0.09791,2.4630,5.203,99.04,0.005769,0.02423,0.01678,0.01898,0.002498,23.690,38.25,155.00,1731.0,0.3215,0.2572
566,16.60,28.08,108.30,858.1,0.09251,0.05302,1.0750,3.425,48.55,0.005903,0.03731,0.01557,0.01318,0.003892,18.980,34.12,126.70,1124.0,0.3403,0.2218
567,20.60,29.33,140.10,1265.0,0.35140,0.15200,1.5950,5.772,86.22,0.006522,0.06158,0.01664,0.02324,0.006185,25.740,39.42,184.60,1821.0,0.9387,0.4087
