In [3]:
from google.colab import drive

# Mount Google Drive into the Colab VM.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

# Build the data directory from the mount point so the path is absolute and
# keeps working even if the notebook's working directory is not /content.
root_path = MOUNT_POINT + '/My Drive/ML_DATA/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import MinMaxScaler
from collections import OrderedDict
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.base import clone
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter presumably also hides library warnings raised
# by the model-fitting cells below (e.g. convergence notices) — consider narrowing it.
warnings.filterwarnings('ignore')

Attribute Information:

1. Sample code number: id number
2. Clump Thickness: 1 - 10
3. Uniformity of Cell Size: 1 - 10
4. Uniformity of Cell Shape: 1 - 10
5. Marginal Adhesion: 1 - 10
6. Single Epithelial Cell Size: 1 - 10
7. Bare Nuclei: 1 - 10
8. Bland Chromatin: 1 - 10
9. Normal Nucleoli: 1 - 10
10. Mitoses: 1 - 10
11. Class: (2 for benign, 4 for malignant)

In [5]:
# Fixed seed so the row shuffle below gives the same order on every run.
RANDOM_SEED = 42

columns = ["id number", "Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape", "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin" , "Normal Nucleoli", "Mitoses", "Class"]

# The file is read without a header row, so the column names are assigned explicitly.
df = pd.read_csv(root_path+"breast-cancer-wisconsin.csv", header=None)
df.columns = columns

# Shuffle all rows; seeded so that downstream results can be reproduced.
df = df.sample(frac=1, random_state=RANDOM_SEED)

df.info()

df.head(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 699 entries, 355 to 461
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   id number                    699 non-null    int64 
 1   Clump Thickness              699 non-null    int64 
 2   Uniformity of Cell Size      699 non-null    int64 
 3   Uniformity of Cell Shape     699 non-null    int64 
 4   Marginal Adhesion            699 non-null    int64 
 5   Single Epithelial Cell Size  699 non-null    int64 
 6   Bare Nuclei                  699 non-null    object
 7   Bland Chromatin              699 non-null    int64 
 8   Normal Nucleoli              699 non-null    int64 
 9   Mitoses                      699 non-null    int64 
 10  Class                        699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 65.5+ KB


Unnamed: 0,id number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
355,857774,4,1,1,1,3,1,2,2,1,2
429,1276091,2,1,1,1,2,1,2,1,1,2
408,1236837,2,3,2,2,2,2,3,1,1,2
5,1017122,8,10,10,8,7,10,9,7,1,4
528,807657,6,1,3,2,2,1,1,1,1,2
77,1133041,5,3,1,2,2,1,2,1,1,2
488,1076352,3,6,4,10,3,3,3,4,1,4
438,558538,4,1,3,3,2,1,1,1,1,2
54,1110524,10,5,5,6,8,8,7,1,1,4
122,1174131,10,10,10,2,10,10,5,3,3,4


In [0]:
# Drop every row that contains the "?" placeholder in any column.
df = df[df.ne("?").all(axis=1)]

# "Bare Nuclei" was loaded with object dtype; now that the placeholder rows are
# gone, cast it to integers so that every feature column is numeric.
df = df.astype({"Bare Nuclei": int})

In [0]:
# Label column ("Class") is the prediction target.
target = df["Class"]

# Everything except the sample identifier and the label is used as a feature.
train_df = df.drop(columns=["id number", "Class"])

In [0]:
# Rescale every feature column with a MinMaxScaler fitted on the full feature table.
scaler = MinMaxScaler()

cols = train_df.columns

sc_train = scaler.fit_transform(train_df)

# Re-attach both the column names and the original row index: `train_df` and
# `target` share the index of `df`, and a fresh RangeIndex here would no longer
# line up with `target` in any index-aligned pandas operation.
train_df_scaled = pd.DataFrame(sc_train, columns = cols, index = train_df.index)

In [0]:
def decode(chromosome, features):
  """Translate a bit-mask chromosome into the list of selected features.

  The gene at position ``k`` of ``chromosome`` belongs to ``features[k]``; a
  feature is kept when its gene equals 1. Only the first ``len(features)``
  genes are inspected.

  Returns a new list holding the kept features in their original order.
  """
  return [label for pos, label in enumerate(features) if chromosome[pos] == 1]

def score(clf, X, y, features, cv = 3):
    """Return the mean cross-validated score of ``clf`` on a subset of columns.

    Parameters
    ----------
    clf : estimator handed unchanged to ``cross_val_score``.
    X : table that supports ``X[features]`` column selection (e.g. a DataFrame).
    y : target values for the rows of ``X``.
    features : list of column labels to evaluate.
    cv : cross-validation setting forwarded to ``cross_val_score`` (default 3).

    Returns
    -------
    The ``.mean()`` of the per-fold scores returned by ``cross_val_score``.
    """
    # Forward the caller's ``cv`` so the requested fold count is actually used.
    return cross_val_score(clf, X[features], y, cv=cv).mean()

def select_parents(population_list):
  """Choose two parents by running two independent 3-way tournaments.

  For each tournament three population indices are drawn uniformly at random
  (with replacement) and the index whose individual has the highest
  ``"score"`` wins.

  Returns a two-element list with the winning indices, first tournament first.
  """
  winners = []
  for _ in range(2):
    contenders = np.random.randint(len(population_list), size = 3)
    contender_scores = [population_list[idx]["score"] for idx in contenders]
    winners.append(contenders[np.argmax(contender_scores)])

  return winners


def _mutate(chromosome, p_m):
  """Flip each gene of ``chromosome`` in place with independent probability ``p_m``."""
  for i in range(len(chromosome)):
    if np.random.rand() < p_m:
      chromosome[i] = 1 - chromosome[i]
  return chromosome


def _fitness(clf, X, y, features, chromosome, k_fold):
  """Score the feature subset encoded by ``chromosome``; ``-inf`` if it cannot be scored."""
  selected = decode(chromosome, features)
  # An all-zero chromosome selects no columns; a model cannot be fitted on that,
  # so rank it below every real candidate instead of calling the scorer.
  if len(selected) == 0:
    return float("-inf")
  value = score(clf, X, y, selected, k_fold)
  # A NaN score would win every ``np.argmax`` tournament and never compare as
  # greater or smaller than anything, so treat it as the worst possible score.
  if np.isnan(value):
    return float("-inf")
  return value


def genetic_algo(clf, X, y, features, p_c=1, p_m = 0.2, ini_pop=100, num_gen = 30, k_fold = 3):
  """Search for a good feature subset with a steady-state genetic algorithm.

  Every individual is a 0/1 chromosome with one gene per entry of ``features``.
  Its fitness is the cross-validated score of ``clf`` on the selected columns.

  Parameters
  ----------
  clf : estimator passed on to ``score``.
  X : feature table, indexable by a list of column labels.
  y : target values.
  features : column labels, one per gene.
  p_c : probability of applying two-point crossover to a pair of parents.
  p_m : per-gene mutation (bit-flip) probability.
  ini_pop : population size; ``ini_pop // 2`` parent pairs are bred per generation.
  num_gen : number of generations.
  k_fold : cross-validation setting passed on to ``score``.

  Returns
  -------
  None. The final population (sorted by ascending score) and the feature list
  of the best individual are printed.
  """
  n_genes = len(features)

  population = np.random.randint(2, size=(ini_pop, n_genes))
  population_list = [
      {"gen": 0,
       "chromosome": chromosome,
       "score": _fitness(clf, X, y, features, chromosome, k_fold)}
      for chromosome in population
  ]

  for gen in range(1, num_gen+1):
    print("procesing gen no: ", gen)
    for _ in range(ini_pop//2):

      parent_index = select_parents(population_list)

      parent_1 = population_list[parent_index[0]]["chromosome"]
      parent_2 = population_list[parent_index[1]]["chromosome"]

      # Always work on copies: mutating a child must never change a chromosome
      # that is still part of the population (its stored score would go stale).
      child_1 = list(parent_1)
      child_2 = list(parent_2)

      # Two-point crossover. Cut points range over 0..n_genes inclusive so that
      # the exchanged segment can also contain the last gene.
      if np.random.rand() < p_c:
        lo, hi = sorted(np.random.randint(n_genes + 1, size = 2))
        child_1[lo:hi] = list(parent_2[lo:hi])
        child_2[lo:hi] = list(parent_1[lo:hi])

      _mutate(child_1, p_m)
      _mutate(child_2, p_m)

      children = [
          {"gen": gen,
           "chromosome": child,
           "score": _fitness(clf, X, y, features, child, k_fold)}
          for child in (child_1, child_2)
      ]

      # Steady-state replacement: starting with the better child, each child
      # takes the place of the current worst individual if it scores strictly
      # higher. A better individual is never evicted while a worse one stays.
      children.sort(key=lambda c: c["score"], reverse=True)
      for child in children:
        worst = min(range(len(population_list)), key=lambda i: population_list[i]["score"])
        if child["score"] > population_list[worst]["score"]:
          population_list[worst] = child

  population_list = sorted(population_list, key=lambda k: k['score'])
  print(population_list)
  print(decode(population_list[-1]["chromosome"], features))

In [24]:
# Baseline: SVC on all features, reported as the mean 3-fold CV score.
clf = SVC()
clf.fit(train_df, target)
svc_cv_scores = cross_val_score(clf, train_df, target, cv=3)
print(svc_cv_scores.mean())

0.9707344720096865


In [26]:
# Baseline: MLPClassifier on all features, reported as the mean 3-fold CV score.
clf = MLPClassifier()
clf.fit(train_df, target)
mlp_cv_scores = cross_val_score(clf, train_df, target, cv=3)
print(mlp_cv_scores.mean())

0.9663420666202952


In [25]:
# Genetic feature selection with an SVC as the wrapped classifier.
clf = SVC()
genetic_algo(
    clf,
    train_df,
    target,
    train_df.columns,
    p_c=1,
    p_m=0.2,
    ini_pop=100,
    num_gen=10,
    k_fold=3,
)

procesing gen no:  1
0.9648736378390911
0.9707473529639076
0.9516835407166963
0.9663420666202952
0.9590321250998274
0.9487531236313987
0.9341139191591313
0.9648865187933122
0.9516899811938068
0.9722029007908906
0.9341203596362418
0.9634180900121082
0.9487917664940619
0.9663485070974058
0.937044336244429
0.9619303397995723
0.9648929592704228
0.9707473529639076
0.9692724837055929
0.9253613107659016
0.9678104954014993
0.951715743102249
0.9634116495349975
0.9736648890949842
0.8975126877399077
0.9561017080145296
0.9707409124867971
0.9663420666202952
0.9707409124867971
0.9692789241827034
0.9575636963186233
0.9604876729268105
0.9634116495349975
0.956056624674756
0.9736648890949842
0.9736777700492052
0.9560823865831981
0.9692724837055929
0.9575636963186233
0.9692724837055929
0.9663420666202952
0.9605005538810315
0.9575636963186233
0.9678169358786098
0.9678169358786098
0.9560759461060876
0.9663420666202952
0.9692853646598141
0.9692724837055929
0.9692918051369247
0.9692724837055929
0.95316485045

In [30]:
# Genetic feature selection with an MLPClassifier as the wrapped classifier.
clf = MLPClassifier()
genetic_algo(
    clf,
    train_df,
    target,
    train_df.columns,
    p_c=1,
    p_m=0.3,
    ini_pop=50,
    num_gen=10,
    k_fold=3,
)

procesing gen no:  1
procesing gen no:  2
procesing gen no:  3
procesing gen no:  4
procesing gen no:  5
procesing gen no:  6
procesing gen no:  7
procesing gen no:  8
procesing gen no:  9
procesing gen no:  10
[{'gen': 8, 'chromosome': [1, 1, 0, 1, 1, 1, 1, 0, 1], 'score': 0.9692789241827034}, {'gen': 1, 'chromosome': [1, 1, 1, 1, 0, 1, 0, 0, 0], 'score': 0.969285364659814}, {'gen': 0, 'chromosome': array([1, 1, 1, 1, 0, 1, 0, 0, 0]), 'score': 0.969285364659814}, {'gen': 9, 'chromosome': [1, 1, 0, 1, 0, 1, 0, 1, 0], 'score': 0.9692853646598141}, {'gen': 3, 'chromosome': [1, 1, 0, 0, 0, 1, 1, 0, 1], 'score': 0.9707280315325759}, {'gen': 2, 'chromosome': [1, 1, 0, 1, 0, 1, 1, 0, 1], 'score': 0.9707280315325759}, {'gen': 6, 'chromosome': [1, 1, 1, 1, 0, 1, 1, 1, 1], 'score': 0.9707344720096865}, {'gen': 2, 'chromosome': [1, 1, 0, 0, 0, 1, 1, 0, 0], 'score': 0.9707344720096865}, {'gen': 10, 'chromosome': [1, 1, 1, 1, 1, 1, 0, 0, 1], 'score': 0.9707409124867971}, {'gen': 10, 'chromosome': 