In [1]:
import numpy as np
import pandas as pd
from scipy.io.arff import loadarff 
import time
import scipy.spatial.distance as dt

In [2]:
## CARGAR LOS DATOS
def load_df(data_name, ind):
    train_set = pd.DataFrame()
    test_set = pd.DataFrame()
    i = ind
    a = [1,2,3,4,5]
    # print from ind-th index to (n+i)th index.
    while i < 5 + (ind - 1):
        txt = 'Instancias_APC/{d_name}_{n_data}.arff'
        #print(txt.format(d_name = data_name, n_data = a[i % 5]))
        raw_data = loadarff(txt.format(d_name = data_name, n_data = a[i % 5]))
        raw_df_data = pd.DataFrame(raw_data[0])
        train_set = pd.concat([train_set, raw_df_data], ignore_index=True)
        i = i + 1
        
    
    raw_data = loadarff(txt.format(d_name = data_name, n_data = ind))
    raw_df_data = pd.DataFrame(raw_data[0])
    test_set = pd.concat([test_set, raw_df_data], ignore_index=True)
    
    columns = train_set.columns[:-1] 
    for column in columns:
        min_value = min(test_set[column].min(),train_set[column].min())
        max_value = max(test_set[column].max(),train_set[column].max())
        train_set[column] = (train_set[column] - min_value) / (max_value - min_value)
        test_set[column] = (test_set[column] - min_value) / (max_value - min_value)
    

    if data_name == 'diabetes':
        
        values = train_set['class'].unique()
        train_set.loc[train_set['class'] == values[0], 'class'] = 0
        train_set.loc[train_set['class'] == values[1], 'class'] = 1
        test_set.loc[test_set['class'] == values[0], 'class'] = 0
        test_set.loc[test_set['class'] == values[1], 'class'] = 1
        
    return train_set, test_set

In [3]:
## GREEDY
def lookForEnemy(X_train, y_train, e_index):
    e_class = y_train[e_index]
    enemy = X_train.loc[y_train[y_train != e_class].index, :]
    A = enemy.copy()
    A = (A - X_train.iloc[e_index])**2
    A['d'] = 0
    A['d'] = np.sqrt(A.sum(axis = 1).values)
    index = A[A['d'] == A['d'].min()].index[0]                        
    return index

def lookForFriend(X_train, y_train, e_index):
    e_class = y_train[e_index]
    friend = X_train.loc[y_train[y_train == e_class].index, :]
    A = friend.copy()
    A = (A - X_train.iloc[e_index])**2
    A['d'] = 0
    A['d'] = np.sqrt(A.sum(axis = 1).values)
    index = A[A['d'] == (A[A['d'] != 0.0]['d'].min())].index[0]
    return index
            
        
def greedy(X_train, y_train):
    w = np.zeros(X_train.shape[1])
    for i in range(X_train.shape[0]):
        ec_index = lookForEnemy(X_train, y_train, i)
        ac_index = lookForFriend(X_train, y_train, i)
        w = w + abs(X_train.iloc[i].values - X_train.loc[ec_index].values) - abs(X_train.iloc[i].values - X_train.loc[ac_index].values)

    w_max = max(w)
    w[w>=0.1] = w[w>=0.1]/w_max
    w[w<0.1] = 0
    
    return w

In [4]:
## BUSQUEDA LOCAL
def busqueda_local(X_train, y_train, w, f_value, max_iter):
    iter_c = 0
    iter_eval = 0
    
    columns_t = np.arange(len(X_train.columns))
    
    max_eval = 20 * len(X_train.columns)
    
    while iter_c < max_iter and iter_eval < max_eval:
        
        if len(columns_t) == 0:
            columns_t = np.arange(len(X_train.columns))
        
        np.random.shuffle(columns_t)
        s = np.random.normal(loc = 0, scale = 0.3)

        w_new = w.copy()
        w_new[columns_t[0]] += s
        
        w_new[w_new > 1] = 1
        w_new[w_new < 0.1] = 0
        
        y_pred = validar_knn_train(X_train.copy(), y_train.copy(), w_new.copy())
        class_v, red_v, f_value_new = func(y_train.copy(), y_pred.copy(), w_new.copy())
        
        if f_value_new > f_value:
            w = w_new.copy()
            f_value = f_value_new
            iter_eval = 0
        else:
            iter_eval += 1
        
        columns_t = np.delete(columns_t, 0)
        iter_c += 1
        
    return w, f_value, iter_c

In [5]:
## VALIDACION

def validar_knn(X_train, y_train, x_test, w_true):
    w = w_true.copy()
    w[w<0.1] = 0
    w[w > 1] = 1
    X_train_p = pd.concat([X_train, x_test], ignore_index=True)
    dm = dt.pdist(X_train_p, metric = "euclidean", w = w)
    a = pd.DataFrame(dt.squareform(dm)) 
    indexes = a.loc[X_train.shape[0]:, :X_train.shape[0] - 1].idxmin(axis=1)
    y_pred = y_train[indexes].values
    return y_pred

def validar_knn_train(X_train, y_train, w_true):
    w = w_true.copy()
    y_pred = np.zeros(len(y_train))
    w[w < 0.1] = 0
    w[w > 1] = 1
    dm = dt.pdist(X_train, metric = "euclidean", w = w)
    a = pd.DataFrame(dt.squareform(dm))
    np.fill_diagonal(a.values, 99999)
    indexes = a.idxmin(axis = 1)
    y_pred = y_train[indexes].values 
    return y_pred
    
def func(y_true, y_pred, w_true):
    w = w_true.copy()
    w[w < 0] = 0
    w[w > 1] = 1
    arr_p = np.where((y_true-y_pred) == 0)
    aciertos = len(arr_p[0])
    
    tasa_class = 100.0*(aciertos/len(y_true))
    tasa_red = 100.0*(len(w[w<0.1])/len(w))

    return tasa_class, tasa_red, 0.8*tasa_class + 0.2*tasa_red

In [12]:
## CUERPO DEL PROGRAMA
datasets_names = ['diabetes', 'ozone-320', 'spectf-heart']

# Cambiamos esta variable para seleccionar el tipo de algoritmo que vamos a usar
using = 'knn' #Values 'greedy' 'busqueda_local' 'knn' 

np.random.seed(1)

for name in datasets_names:
    print("******** Exp :", name, "**************")
    print("Partition", "%_class", "%_red", "Fit", "T")
    mean_t = []
    mean_fit = []
    mean_class = []
    mean_red = []
    for i in range(5):
        train = pd.DataFrame()
        test = pd.DataFrame()
        data_name = name
        train, test = load_df(data_name, i+1)

        X_train = train
        if name == 'diabetes':
            y_train = train['class'].astype(int)
            X_train = X_train.drop(columns = ['class'])
        else:
            y_train = train['Class'].astype(int)
            X_train = X_train.drop(columns = ['Class'])

        x_test = test
        if name == 'diabetes':
            y_test = test['class'].astype(int)
            x_test = x_test.drop(columns = ['class'])
        else:
            y_test = test['Class'].astype(int)
            x_test = x_test.drop(columns = ['Class'])
        
        inicio = time.time()
        w = np.random.uniform(0, 1, X_train.shape[1])
        
        if using == 'greedy':
            w_bl = greedy(X_train.copy(), y_train.copy())
        elif using == 'busqueda_local':
            w_bl,a,b = busqueda_local(X_train.copy(), y_train.copy(), w.copy(), 0.0, 15000)
        elif using == 'knn':
            w_bl = np.ones(X_train.shape[1])
    
        y_pred = validar_knn(X_train.copy(), y_train.copy(), x_test.copy(), w_bl)
        fin = time.time()
        tiempo = (fin-inicio)
        class_v, red_v, f_value = func(y_test.copy(), y_pred.copy(), w_bl)
        print( i + 1, ",", class_v , "," , red_v, ",", f_value, ",", tiempo)
        mean_fit.append(f_value)
        mean_t.append(tiempo)
        mean_class.append(class_v)
        mean_red.append(red_v)

    print("Media %class", np.array(mean_class).mean())
    print("Media %red", np.array(mean_red).mean())
    print("Media fitness", np.array(mean_fit).mean())
    print("Media tiempo", np.array(mean_t).mean())


******** Exp : diabetes **************
Partition %_class %_red Fit T
1 , 66.88311688311688 , 0.0 , 53.50649350649351 , 0.005523681640625
2 , 66.88311688311688 , 0.0 , 53.50649350649351 , 0.02256035804748535
3 , 68.83116883116884 , 0.0 , 55.06493506493507 , 0.0
4 , 69.48051948051948 , 0.0 , 55.58441558441558 , 0.008001089096069336
5 , 69.73684210526315 , 0.0 , 55.78947368421052 , 0.0019998550415039062
Media %class 68.36295283663705
Media %red 0.0
Media fitness 54.69036226930964
Media tiempo 0.007616996765136719
******** Exp : ozone-320 **************
Partition %_class %_red Fit T
1 , 78.125 , 0.0 , 62.5 , 0.0
2 , 85.9375 , 0.0 , 68.75 , 0.0
3 , 79.6875 , 0.0 , 63.75 , 0.005000114440917969
4 , 76.5625 , 0.0 , 61.25 , 0.0
5 , 81.25 , 0.0 , 65.0 , 0.016118526458740234
Media %class 80.3125
Media %red 0.0
Media fitness 64.25
Media tiempo 0.00422372817993164
******** Exp : spectf-heart **************
Partition %_class %_red Fit T
1 , 77.14285714285715 , 0.0 , 61.71428571428572 , 0.00832271575

In [146]:
name = 'ozone-320'
np.random.seed(0)

for i in range(5):
    train = pd.DataFrame()
    test = pd.DataFrame()
    data_name = name
    train, test = load_df(data_name, i+1)

    X_train = train
    if name == 'diabetes':
        y_train = train['class'].astype(int)
        X_train = X_train.drop(columns = ['class'])
    else:
        y_train = train['Class'].astype(int)
        X_train = X_train.drop(columns = ['Class'])

    x_test = test
    if name == 'diabetes':
        y_test = test['class'].astype(int)
        x_test = x_test.drop(columns = ['class'])
    else:
        y_test = test['Class'].astype(int)
        x_test = x_test.drop(columns = ['Class'])

    w_bl = agg(X_train.copy(), y_train.copy(), tipo_cruce = 1)
    y_pred = validar_knn(X_train.copy(), y_train.copy(), x_test.copy(), w_bl)


    class_v, red_v, f_value = func(y_test.copy(), y_pred.copy(), w_bl)
    print(class_v , "," , red_v, ",", f_value)


78.125 , 38.88888888888889 , 70.27777777777777
81.25 , 40.27777777777778 , 73.05555555555556
82.8125 , 34.72222222222222 , 73.19444444444444
73.4375 , 30.555555555555557 , 64.86111111111111
82.8125 , 34.72222222222222 , 73.19444444444444


In [None]:
import random
def operar_cruces(tipo_cruce , next_gen, n_esperado_cruces):
    p_cru = 0.7
    if tipo_cruce == 1:
        alpha = 0.3
        iterator = 0
        while iterator < n_esperado_cruces:
            w_cru_1 = []
            w_cru_2 = []
            for j in range(len(next_gen[iterator])):
                cmax = max(next_gen[iterator][j], next_gen[iterator+1][j])
                cmin = min(next_gen[iterator][j], next_gen[iterator+1][j])
                i_cru = cmax - cmin
                w_cru_1.append(random.uniform(cmin - i_cru * alpha, cmax + i_cru * alpha))
                w_cru_2.append(random.uniform(cmin - i_cru * alpha, cmax + i_cru * alpha))

            w_cru_1 = np.array(w_cru_1)
            w_cru_1[w_cru_1 > 1] = 1
            w_cru_1[w_cru_1 < 0] = 0
            next_gen[iterator] = w_cru_1.copy()
            w_cru_2 = np.array(w_cru_2)
            w_cru_2[w_cru_2 > 1] = 1
            w_cru_2[w_cru_2 < 0] = 0
            next_gen[iterator+1] = w_cru_2.copy()
            
            iterator = iterator + 2

    elif tipo_cruce == 2:
        #Cruce aritmetico
        for i in range(0,n_esperado_cruces,2):
            alpha = np.random.uniform(0, 1)
            w_cru_1 = alpha * next_gen[i] + (1-alpha) * next_gen[i + 1]
            w_cru_2 = alpha * next_gen[i+1] + (1-alpha) * next_gen[i]

            w_cru_1 = np.array(w_cru_1)
            w_cru_1[w_cru_1 > 1] = 1
            w_cru_1[w_cru_1 < 0] = 0
            next_gen[i] = w_cru_1.copy()
                
            w_cru_2 = np.array(w_cru_2)
            w_cru_2[w_cru_2 > 1] = 1
            w_cru_2[w_cru_2 < 0] = 0
            next_gen[i+1] = w_cru_2.copy()
    
    return next_gen

def agg(X_train, y_train, tipo_cruce):
    ws = []
    ws_fitness = []
    vecinos = 50
    iter_ = 0
    max_iters = 15000
    #Generamos los primeros 50 cromosomas
    for i in range(vecinos):
        w = np.random.uniform(0, 1, X_train.shape[1])
        y_pred = validar_knn_train(X_train.copy(), y_train.copy(), np.array(w))
        class_v, red_v, f_value = func(y_train.copy(), y_pred, np.array(w))    
        ws.append(w)
        ws_fitness.append(f_value)
        
    while iter_ < max_iters:
        
        next_gen = []
        next_gen_fitness = []
           
        # Escogemos los padres
        for i in range(vecinos):
            i_1 = random.randint(0, len(ws)-1)
            i_2 = random.randint(0, len(ws)-1)

            if ws_fitness[i_1] > ws_fitness[i_2]:
                next_gen.append(ws[i_1])
            else:
                next_gen.append(ws[i_2])
  

        #Cruzamos los padres
        #Cruce blx
        new_next_gen = operar_cruces(tipo_cruce, next_gen.copy(), int(0.7 * vecinos))

        #Mutamos
        numero_mutados = int(len(new_next_gen) * 0.1)
        for i in range(numero_mutados):
            i_1 = random.randint(0, len(new_next_gen)-1)
            
            columns_t = np.arange(len(X_train.columns))
        
            np.random.shuffle(columns_t)
            s = np.random.normal(0, 0.3)

            new_next_gen[i_1][columns_t[0]] += s

            if  new_next_gen[i_1][columns_t[0]] > 1:
                new_next_gen[i_1][columns_t[0]] = 1
                

        # AQUI REEVALUAMOS LA NUEVAMOS TODA LA NUEVA POBLACION
        # ITERACIONES += 50
        for i in range(vecinos):
            y_pred = validar_knn_train(X_train.copy(), y_train.copy(), np.array(new_next_gen[i]))
            class_v, red_v, f_value = func(y_train.copy(), y_pred, np.array(new_next_gen[i]))    
            next_gen_fitness.append(f_value)
            
        iter_ += 50
        
        #Si la mejor solucion de la familia anterior no esta en la siguiente generacion
        #sustituimos la peor de la actual generacion por la mejor de la anterior
      
        idx_worst = np.where(next_gen_fitness == np.min(next_gen_fitness))[0][0]
        idx_best = np.where(ws_fitness == np.max(ws_fitness))[0][0]
  
        if abs((new_next_gen[idx_worst] - ws[idx_best]).sum()) > 0:
            new_next_gen[idx_worst] = ws[idx_best]
            next_gen_fitness[idx_worst] = ws_fitness[idx_best]
           
        ws = new_next_gen.copy()
        ws_fitness = next_gen_fitness.copy()
    
    idx_best = np.where(ws_fitness == np.max(ws_fitness))[0][0]

    return ws[idx_best]



def age(X_train, y_train, tipo_cruce):
    ws = []
    ws_fitness = []
    vecinos = 50
    iter_ = 0
    max_iters = 15000
    
    #Generamos los primeros 50 cromosomas
    for i in range(vecinos):
        w = np.random.uniform(0, 1, X_train.shape[1])
        y_pred = validar_knn_train(X_train.copy(), y_train.copy(), np.array(w))
        class_v, red_v, f_value = func(y_train.copy(), y_pred, np.array(w))    
        ws.append(w)
        ws_fitness.append(f_value)

    while iter_ < max_iters:
        
        next_gen = []
        dict_arr = {}
        
        for i in range(len(ws)):
            i_1 = random.randint(0, len(ws)-1)
            i_2 = random.randint(0, len(ws)-1)

            if ws_fitness[i_1] > ws_fitness[i_2]:
                next_gen.append(ws[i_1])
            else:
                next_gen.append(ws[i_2])
                

        #Cruzamos los padres
        next_gen = operar_cruces(tipo_cruce, next_gen, len(next_gen))
        
        s = np.random.normal(0, 0.3)
        #Mutamos
        for i in range(len(next_gen)):
            probability = np.random.uniform(0, 1)

            if probability <= 0.1:
                columns_t = np.arange(len(X_train.columns))

                np.random.shuffle(columns_t)
                s = np.random.normal(0, 0.3)

                next_gen[i][columns_t[0]] += s

                if  next_gen[i][columns_t[0]] > 1:
                    next_gen[i][columns_t[0]] = 1
                
        #Nos quedamos con los dos mejores a continuacion
        for i in range(2):
            y_pred = validar_knn_train(X_train.copy(), y_train.copy(), np.array(next_gen[i]))
            class_v, red_v, f_value = func(y_train.copy(), y_pred, np.array(next_gen[i]))    
            dict_arr[i-2] = f_value
                                                                                                              
        iter_ += 2
        
        idx_worst, idx_worst_2 = np.argpartition(ws_fitness, 1)[0:2] 
        
        
        dict_arr[idx_worst] = ws_fitness[idx_worst]
        dict_arr[idx_worst_2] = ws_fitness[idx_worst_2]
        
        #print(dict_arr)
        
        sorted(dict_arr.items(), key=lambda item: item[1], reverse=True)
        
        keysList = list(dict_arr.keys())
        changed = False
        for idx in range(2):
            if keysList[idx] < 0:
                if keysList[idx + 2] > 0 and changed == False:
                    ws[keysList[idx + 2]] = next_gen[keysList[idx]].copy()
                    ws_fitness[keysList[idx + 2]] = dict_arr[keysList[idx]]
                    changed = True
                elif keysList[idx + 2] < 0 or changed == False:
                    ws[keysList[idx + 3]] = next_gen[keysList[idx]].copy()
                    ws_fitness[keysList[idx + 3]] = dict_arr[keysList[idx]]

    
    idx_best = np.where(ws_fitness == np.max(ws_fitness))[0][0]

    return ws[idx_best]


71.42857142857143 , 12.5 , 59.642857142857146


In [54]:
w_bl = age(X_train.copy(), y_train.copy(), tipo_cruce = 1)

y_pred = validar_knn_train(x_test.copy(), y_test.copy(), w_bl)
class_v, red_v, f_value = func(y_test.copy(), y_pred.copy(), w_bl)
print(class_v , "," , red_v, ",", f_value)

70.39473684210526 , 37.5 , 63.81578947368421


In [9]:
w = np.array([1,0,2,3])


In [13]:
print(A, B)

1 0
