In [1]:
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
import sys
sys.path.append('../')
from gaGen import GenGeneticAlgorithm
import copy

In [2]:
def evaluate(X_train : pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame, random_state = None):
    
    clf = DecisionTreeClassifier(random_state= random_state)
    
    clf.fit(X_train, y_train)

    predictions = clf.predict(X_test)
    precision = precision_score(predictions,y_test, average='macro')
    f1 = f1_score(predictions,y_test, average='macro')
    recall = recall_score(predictions,y_test, average='macro')
    
    return {"precision":precision, "f1":f1, "recall":recall}

In [3]:
dataset_name= "LSVT voice"

data = pd.read_csv(f"../Datasets/{dataset_name}.csv")

data.replace('?', np.nan, inplace=True)

data.head(10)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V302,V303,V304,V305,V306,V307,V308,V309,V310,Class
0,0.088112,0.041697,0.00048,-4e-06,0.000422,2.458381,1e-06,47.021079,1366.43039,-7.103323,...,7.088978,19.753255,54.335046,145.52863,375.097397,921.296579,2137.079844,4697.131077,9931.208257,1
1,0.161798,0.057364,0.000677,5e-06,0.000206,2.592066,1e-06,93.557936,2582.922776,-23.284761,...,7.977363,22.203504,60.993338,163.560972,421.010306,1036.092589,2404.072562,5284.082128,11165.095662,2
2,0.554508,0.642913,0.007576,-1e-06,0.006488,12.691326,0.000695,52.988422,466.682635,-45.30868,...,5.078616,14.135923,38.641654,103.466808,264.654626,649.65709,1507.384591,3315.804236,6974.600636,2
3,0.031089,0.027108,0.000314,0.0,0.000216,0.754288,0.0,13.982754,417.217249,-1.207741,...,5.610448,15.626164,42.943275,115.014975,296.320795,728.284936,1689.586636,3713.818933,7851.13936,1
4,0.076177,0.039071,0.000302,2.7e-05,0.001102,1.270034,4.9e-05,56.373996,1608.31741,-3.49199,...,6.902199,19.117609,52.715873,141.113865,363.511021,893.246151,2071.625622,4554.204815,9623.566242,2
5,0.224738,0.122597,0.001437,3e-06,0.000932,2.787218,3e-06,51.817177,1113.744107,-36.569114,...,6.795204,18.941736,52.039946,139.465979,359.217612,882.761062,2044.991935,4496.777621,9504.445938,2
6,0.032581,0.024174,0.00028,-4e-06,0.000236,0.906992,0.0,17.916749,539.927763,-1.421599,...,5.987349,16.65658,45.664487,122.416506,315.209203,775.269129,1799.191348,3954.80686,8351.598804,1
7,0.090836,0.042273,0.000494,-4e-06,0.00046,3.282792,1e-06,18.997218,663.965399,-2.600535,...,4.997977,14.058076,38.996714,107.356655,285.229355,722.96285,1797.956272,4024.020067,8956.809979,2
8,0.148758,0.084805,0.000982,-1.3e-05,0.000832,1.321324,2e-06,35.99167,899.310287,-14.569332,...,6.585153,18.355605,50.417628,135.161654,348.306253,853.665328,1981.334589,4357.928487,9219.158652,2
9,0.055767,0.025802,0.000285,-8e-06,0.000401,1.155191,1e-06,47.307997,1392.038999,-4.919125,...,7.190456,20.027376,54.98401,147.380361,379.623517,933.131317,2164.248742,4757.142198,10054.337528,1


In [4]:
X = data.iloc[:, :-1]

y = data.iloc[:, -1]

In [5]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state= 42)

before_scores= []
selected_scores= []
all_scores = []
columns_number= []

for index,(train_index, test_index) in enumerate(skf.split(X, y)):
    
    fold_x_train =X.iloc[train_index]
    fold_y_train = y.iloc[train_index]
    
    fold_x_test =X.iloc[test_index]
    fold_y_test = y.iloc[test_index]
    
    score_before = evaluate(fold_x_train, fold_y_train, fold_x_test, fold_y_test)
    before_scores.append(score_before)
    
    print(f'''For the original data:
    Columns : {fold_x_train.columns}
    Columns Number : {len(fold_x_train.columns)} 
    precision : {score_before["precision"]}
    f1        : {score_before["f1"]}
    recall    : {score_before["recall"]}
          ''')
    print("Ga Gen started with inputs ......")
    ga= GenGeneticAlgorithm(
        X_train= fold_x_train,
        X_test= fold_x_test,
        y_train= fold_y_train,
        y_test= fold_y_test,
        rnd_state=None
    )

    selected_new_X_train , selected_new_X_test, all__new_X_train ,all_new_X_test = ga.run(population_size= 10, generation_limit= 20, mutation_probability= 0.1)
    
    print("Ga Gen finished ...")

    score_after= evaluate(all__new_X_train, fold_y_train, all_new_X_test, fold_y_test)
    all_scores.append(score_after)
    
    print(f'''\nFor the new generated data with gaGen without selection:
    Columns : {all_new_X_test.columns}
    Columns Number : {len(all_new_X_test.columns)} 
    precision : {score_after["precision"]}
    f1        : {score_after["f1"]}
    recall    : {score_after["recall"]}
    ''')
    score_after= evaluate(selected_new_X_train, fold_y_train, selected_new_X_test, fold_y_test)
    selected_scores.append(score_after)
    
    columns_number.append(len(selected_new_X_test.columns))
    
    print(f'''\nFor the new generated data with gaGen with selection:
    Columns : {selected_new_X_test.columns}
    Columns Number : {len(selected_new_X_test.columns)} 
    precision : {score_after["precision"]}
    f1        : {score_after["f1"]}
    recall    : {score_after["recall"]}
--------------------------------------------------------------------------------------''')
    

For the original data:
    Columns : Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       ...
       'V301', 'V302', 'V303', 'V304', 'V305', 'V306', 'V307', 'V308', 'V309',
       'V310'],
      dtype='object', length=310)
    Columns Number : 310 
    precision : 0.55
    f1        : 0.5357142857142857
    recall    : 0.5476190476190476
          
Ga Gen started with inputs ......


In [None]:
sums = {}
counts = {}

for score in before_scores:
    for key, value in score.items():
        sums[key] = sums.get(key, 0) + value
        counts[key] = counts.get(key, 0) + 1

means = {key: sums[key] / counts[key] for key in sums}

print("Original Data:")
for prop, mean in means.items():
    print(prop + ":", mean)

Original Data:
precision: 0.74
f1: 0.736403993992235
recall: 0.7488688285398812


In [None]:
sums = {}
counts = {}

for score in all_scores:
    for key, value in score.items():
        sums[key] = sums.get(key, 0) + value
        counts[key] = counts.get(key, 0) + 1

means = {key: sums[key] / counts[key] for key in sums}

print("Generated without fs Data:")
for prop, mean in means.items():
    print(prop + ":", mean)

Generated without fs Data:
precision: 0.7533333333333333
f1: 0.7479768781665334
recall: 0.7630914673561733


In [None]:
sums = {}
counts = {}

for score in selected_scores:
    for key, value in score.items():
        sums[key] = sums.get(key, 0) + value
        counts[key] = counts.get(key, 0) + 1

means = {key: sums[key] / counts[key] for key in sums}

print("Generated with fs Data:")
for prop, mean in means.items():
    print(prop + ":", mean)

Generated with fs Data:
precision: 0.7516666666666667
f1: 0.7495611534059811
recall: 0.7573098470157295


In [None]:
print(f"Columns generation: {np.mean(columns_number)}")

Columns generation: 84.8
