In [10]:
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
import sys
sys.path.append('../')
from gaGen import GenGeneticAlgorithm
import copy

In [11]:
def evaluate(X_train : pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame, random_state = None):
    
    clf = DecisionTreeClassifier(random_state= random_state)
    
    clf.fit(X_train, y_train)

    predictions = clf.predict(X_test)
    precision = precision_score(predictions,y_test, average='macro')
    f1 = f1_score(predictions,y_test, average='macro')
    recall = recall_score(predictions,y_test, average='macro')
    
    return {"precision":precision, "f1":f1, "recall":recall}

In [12]:
dataset_name= "glass"

data = pd.read_csv(f"../Datasets/{dataset_name}.csv")

data.head(10)

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.51793,12.79,3.5,1.12,73.03,0.64,8.77,0.0,0.0,'build wind float'
1,1.51643,12.16,3.52,1.35,72.89,0.57,8.53,0.0,0.0,'vehic wind float'
2,1.51793,13.21,3.48,1.41,72.64,0.59,8.43,0.0,0.0,'build wind float'
3,1.51299,14.4,1.74,1.54,74.55,0.0,7.59,0.0,0.0,tableware
4,1.53393,12.3,0.0,1.0,70.16,0.12,16.19,0.0,0.24,'build wind non-float'
5,1.51655,12.75,2.85,1.44,73.27,0.57,8.79,0.11,0.22,'build wind non-float'
6,1.51779,13.64,3.65,0.65,73.0,0.06,8.93,0.0,0.0,'vehic wind float'
7,1.51837,13.14,2.84,1.28,72.85,0.55,9.07,0.0,0.0,'build wind float'
8,1.51545,14.14,0.0,2.68,73.39,0.08,9.07,0.61,0.05,headlamps
9,1.51789,13.19,3.9,1.3,72.33,0.55,8.44,0.0,0.28,'build wind non-float'


In [13]:
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [14]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state= 42)

before_scores= []
selected_scores= []
all_scores = []
columns_number= []

for index,(train_index, test_index) in enumerate(skf.split(X, y)):
    
    fold_x_train =X.iloc[train_index]
    fold_y_train = y.iloc[train_index]
    
    fold_x_test =X.iloc[test_index]
    fold_y_test = y.iloc[test_index]
    
    score_before = evaluate(fold_x_train, fold_y_train, fold_x_test, fold_y_test)
    before_scores.append(score_before)
    
    print(f'''For the original data:
    Columns : {fold_x_train.columns}
    Columns Number : {len(fold_x_train.columns)} 
    precision : {score_before["precision"]}
    f1        : {score_before["f1"]}
    recall    : {score_before["recall"]}
          ''')
    print("Ga Gen started with inputs ......")
    ga= GenGeneticAlgorithm(
        X_train= fold_x_train,
        X_test= fold_x_test,
        y_train= fold_y_train,
        y_test= fold_y_test,
        rnd_state=None
    )

    selected_new_X_train , selected_new_X_test, all__new_X_train ,all_new_X_test = ga.run(population_size= 20, generation_limit= 20, mutation_probability= 0.1)
    
    print("Ga Gen finished ...")

    score_after= evaluate(all__new_X_train, fold_y_train, all_new_X_test, fold_y_test)
    all_scores.append(score_after)
    
    print(f'''\nFor the new generated data with gaGen without selection:
    Columns : {all_new_X_test.columns}
    Columns Number : {len(all_new_X_test.columns)} 
    precision : {score_after["precision"]}
    f1        : {score_after["f1"]}
    recall    : {score_after["recall"]}
    ''')
    score_after= evaluate(selected_new_X_train, fold_y_train, selected_new_X_test, fold_y_test)
    selected_scores.append(score_after)
    
    columns_number.append(len(selected_new_X_test.columns))
    
    print(f'''\nFor the new generated data with gaGen with selection:
    Columns : {selected_new_X_test.columns}
    Columns Number : {len(selected_new_X_test.columns)} 
    precision : {score_after["precision"]}
    f1        : {score_after["f1"]}
    recall    : {score_after["recall"]}
--------------------------------------------------------------------------------------''')
    

For the original data:
    Columns : Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe'], dtype='object')
    Columns Number : 9 
    precision : 0.5535714285714285
    f1        : 0.5136752136752137
    recall    : 0.503968253968254
          
Ga Gen started with inputs ......


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Ga Gen finished ...

For the new generated data with gaGen without selection:
    Columns : Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'RI_Na___addition',
       ...
       'K_Fe___multiplication', 'Ca_Ba___addition', 'Ca_Ba___average',
       'Ca_Ba___multiplication', 'Ca_Fe___addition', 'Ca_Fe___average',
       'Ca_Fe___multiplication', 'Ba_Fe___addition', 'Ba_Fe___average',
       'Ba_Fe___multiplication'],
      dtype='object', length=117)
    Columns Number : 117 
    precision : 0.6190476190476191
    f1        : 0.560515873015873
    recall    : 0.5370370370370371
    

For the new generated data with gaGen with selection:
    Columns : Index(['Na', 'Al', 'Si', 'K', 'Ba', 'Fe', 'RI_Na___addition',
       'RI_Na___multiplication', 'RI_Mg___addition', 'RI_Al___addition',
       'RI_Al___multiplication', 'RI_Si___addition', 'RI_Si___average',
       'RI_Si___multiplication', 'RI_K___addition', 'RI_Ca___addition',
       'RI_Ca___multiplication', 'RI_Ba___addition'

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Ga Gen finished ...

For the new generated data with gaGen without selection:
    Columns : Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'RI_Na___addition',
       ...
       'K_Fe___multiplication', 'Ca_Ba___addition', 'Ca_Ba___average',
       'Ca_Ba___multiplication', 'Ca_Fe___addition', 'Ca_Fe___average',
       'Ca_Fe___multiplication', 'Ba_Fe___addition', 'Ba_Fe___average',
       'Ba_Fe___multiplication'],
      dtype='object', length=117)
    Columns Number : 117 
    precision : 0.6309523809523809
    f1        : 0.6011904761904762
    recall    : 0.5861111111111111
    

For the new generated data with gaGen with selection:
    Columns : Index(['Na', 'Mg', 'Al', 'Ba', 'RI_Na___addition', 'RI_Na___multiplication',
       'RI_Mg___average', 'RI_Si___addition', 'RI_Si___average',
       'RI_K___addition', 'RI_K___multiplication', 'RI_Ca___average',
       'RI_Ba___average', 'RI_Fe___addition', 'RI_Fe___average',
       'RI_Fe___multiplication', 'Na_Mg___multiplica

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Ga Gen finished ...

For the new generated data with gaGen without selection:
    Columns : Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'RI_Na___addition',
       ...
       'K_Fe___multiplication', 'Ca_Ba___addition', 'Ca_Ba___average',
       'Ca_Ba___multiplication', 'Ca_Fe___addition', 'Ca_Fe___average',
       'Ca_Fe___multiplication', 'Ba_Fe___addition', 'Ba_Fe___average',
       'Ba_Fe___multiplication'],
      dtype='object', length=117)
    Columns Number : 117 
    precision : 0.44047619047619047
    f1        : 0.4039682539682539
    recall    : 0.38472222222222224
    

For the new generated data with gaGen with selection:
    Columns : Index(['Al', 'Si', 'Fe', 'RI_Na___average', 'RI_Na___multiplication',
       'RI_Al___multiplication', 'RI_Si___addition', 'RI_K___addition',
       'RI_K___multiplication', 'RI_Ba___multiplication',
       'RI_Fe___multiplication', 'Na_Mg___addition', 'Na_Mg___average',
       'Na_Mg___multiplication', 'Na_Al___multiplicatio

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Ga Gen finished ...

For the new generated data with gaGen without selection:
    Columns : Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'RI_Na___addition',
       ...
       'K_Fe___multiplication', 'Ca_Ba___addition', 'Ca_Ba___average',
       'Ca_Ba___multiplication', 'Ca_Fe___addition', 'Ca_Fe___average',
       'Ca_Fe___multiplication', 'Ba_Fe___addition', 'Ba_Fe___average',
       'Ba_Fe___multiplication'],
      dtype='object', length=117)
    Columns Number : 117 
    precision : 0.8011904761904762
    f1        : 0.7999999999999999
    recall    : 0.8011904761904762
    

For the new generated data with gaGen with selection:
    Columns : Index(['RI', 'Ca', 'Ba', 'Fe', 'RI_Na___average', 'RI_Al___multiplication',
       'RI_Si___addition', 'RI_Si___multiplication', 'RI_K___addition',
       'RI_K___average', 'RI_K___multiplication', 'RI_Ca___multiplication',
       'Na_Si___average', 'Na_Si___multiplication', 'Na_Ca___addition',
       'Na_Ca___average', 'Na_Ba_

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Ga Gen finished ...

For the new generated data with gaGen without selection:
    Columns : Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'RI_Na___addition',
       ...
       'K_Fe___multiplication', 'Ca_Ba___addition', 'Ca_Ba___average',
       'Ca_Ba___multiplication', 'Ca_Fe___addition', 'Ca_Fe___average',
       'Ca_Fe___multiplication', 'Ba_Fe___addition', 'Ba_Fe___average',
       'Ba_Fe___multiplication'],
      dtype='object', length=117)
    Columns Number : 117 
    precision : 0.6230158730158729
    f1        : 0.6063492063492063
    recall    : 0.5952380952380952
    

For the new generated data with gaGen with selection:
    Columns : Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'Ba', 'RI_Na___addition',
       'RI_Na___multiplication', 'RI_Mg___multiplication', 'RI_Al___addition',
       'RI_Al___average', 'RI_Al___multiplication', 'RI_Si___multiplication',
       'RI_Ca___average', 'RI_Fe___addition', 'RI_Fe___average',
       'Na_Mg___addition', 'Na_Al___average'

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Ga Gen finished ...

For the new generated data with gaGen without selection:
    Columns : Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'RI_Na___addition',
       ...
       'K_Fe___multiplication', 'Ca_Ba___addition', 'Ca_Ba___average',
       'Ca_Ba___multiplication', 'Ca_Fe___addition', 'Ca_Fe___average',
       'Ca_Fe___multiplication', 'Ba_Fe___addition', 'Ba_Fe___average',
       'Ba_Fe___multiplication'],
      dtype='object', length=117)
    Columns Number : 117 
    precision : 0.6230158730158729
    f1        : 0.5619047619047619
    recall    : 0.5777777777777778
    

For the new generated data with gaGen with selection:
    Columns : Index(['Na', 'Mg', 'Al', 'K', 'Ba', 'Fe', 'RI_Na___average',
       'RI_Mg___multiplication', 'RI_K___addition', 'RI_Ba___addition',
       'RI_Ba___average', 'Na_Mg___multiplication', 'Mg_Al___addition',
       'Mg_Si___addition', 'Mg_K___addition', 'Mg_Fe___multiplication',
       'Al_Si___addition', 'Al_Si___multiplication',

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Ga Gen finished ...

For the new generated data with gaGen without selection:
    Columns : Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'RI_Na___addition',
       ...
       'K_Fe___multiplication', 'Ca_Ba___addition', 'Ca_Ba___average',
       'Ca_Ba___multiplication', 'Ca_Fe___addition', 'Ca_Fe___average',
       'Ca_Fe___multiplication', 'Ba_Fe___addition', 'Ba_Fe___average',
       'Ba_Fe___multiplication'],
      dtype='object', length=117)
    Columns Number : 117 
    precision : 0.9553571428571429
    f1        : 0.9553571428571429
    recall    : 0.9553571428571429
    

For the new generated data with gaGen with selection:
    Columns : Index(['Ca', 'Ba', 'Fe', 'RI_Na___addition', 'RI_Al___average',
       'RI_Al___multiplication', 'RI_Si___addition', 'RI_Si___multiplication',
       'RI_K___addition', 'RI_Ca___addition', 'RI_Ca___average',
       'RI_Ca___multiplication', 'RI_Ba___average', 'RI_Fe___average',
       'Na_Mg___average', 'Na_Al___multiplication'

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
sums = {}
counts = {}

for score in before_scores:
    for key, value in score.items():
        sums[key] = sums.get(key, 0) + value
        counts[key] = counts.get(key, 0) + 1

means = {key: sums[key] / counts[key] for key in sums}

print("Original Data:")
for prop, mean in means.items():
    print(prop + ":", mean)

Original Data:
precision: 0.6201190476190476
f1: 0.6069691800574154
recall: 0.6215873015873016


In [16]:
sums = {}
counts = {}

for score in all_scores:
    for key, value in score.items():
        sums[key] = sums.get(key, 0) + value
        counts[key] = counts.get(key, 0) + 1

means = {key: sums[key] / counts[key] for key in sums}

print("Generated without fs Data:")
for prop, mean in means.items():
    print(prop + ":", mean)

Generated without fs Data:
precision: 0.6491666666666666
f1: 0.619694240944241
recall: 0.6189580327080327


In [17]:
sums = {}
counts = {}

for score in selected_scores:
    for key, value in score.items():
        sums[key] = sums.get(key, 0) + value
        counts[key] = counts.get(key, 0) + 1

means = {key: sums[key] / counts[key] for key in sums}

print("Generated with fs Data:")
for prop, mean in means.items():
    print(prop + ":", mean)

Generated with fs Data:
precision: 0.6369642857142856
f1: 0.6019569929554449
recall: 0.5979996392496393


In [18]:
print(f"Columns generation: {np.mean(columns_number)}")

Columns generation: 51.0
