In [None]:
import numpy as np
import matplotlib.pyplot as plt

import joblib

import random
from deap import base
from deap import creator
from deap import tools

from deap import algorithms
from time import time
import multiprocessing

import array
import random
import json

import os

import pandas as pd

In [None]:
# Load the persisted document matrix from disk. joblib.load unpickles the file,
# so only point it at files from a trusted source.
# NOTE(review): the clustering code further down subtracts 7-element centroid
# slices from each row, so rows are presumably 7-feature document vectors — confirm.
matrix_np = joblib.load('matrix_np.pkl')

In [None]:
#reduced_matrix_np = joblib.load('reduced_matrix_np.pkl')

# Functions

## Modify parameters

In [None]:
# Data set read by the objective function: fobjQuadraticError passes this global
# to auxiliarFunction as `documents`.
matrix_algorithm = matrix_np
# Number of clusters (centroids) encoded in each individual of the genetic
# algorithm; the genome length is derived from it in the GA cell.
NUM_CLUSTERS = 7

# Distance to cluster function

In [None]:
def auxiliarFunction(documents, dimensions, number_topics, cluster_dim=7):
    """Assign each document to its nearest centroid and accumulate squared distances.

    Parameters
    ----------
    documents : iterable of array-like
        Document vectors. Each one is compared against every centroid with the
        Euclidean norm, so it must be broadcast-compatible with a vector of
        ``cluster_dim`` values.
    dimensions : sequence of numbers
        Flat encoding of the centroids: the first ``cluster_dim`` values are
        centroid 0, the next ``cluster_dim`` values are centroid 1, and so on.
        Values beyond ``number_topics * cluster_dim`` are ignored.
    number_topics : int
        Number of centroids encoded in ``dimensions``.
    cluster_dim : int, optional
        Number of coordinates per centroid. Defaults to 7, the width that was
        previously hard-coded.

    Returns
    -------
    (list, list)
        ``counter_docs[i]`` is the number of documents whose nearest centroid
        is ``i``; ``distance_to_cluster_2[i]`` is the sum of the squared
        distances of those documents to centroid ``i`` (0 for an empty cluster).

    Raises
    ------
    ValueError
        If ``cluster_dim`` is not positive or ``dimensions`` is too short to
        hold ``number_topics`` centroids.
    """
    if cluster_dim < 1:
        raise ValueError("cluster_dim must be a positive integer")

    needed = number_topics * cluster_dim
    if len(dimensions) < needed:
        # A short slice could otherwise broadcast silently and yield wrong distances.
        raise ValueError(
            "dimensions holds %d values but %d centroids of width %d need %d"
            % (len(dimensions), number_topics, cluster_dim, needed)
        )

    # Cut the flat genome into centroids once instead of re-slicing (and
    # re-converting) it for every single document.
    centroids = [
        np.asarray(dimensions[start:start + cluster_dim])
        for start in range(0, needed, cluster_dim)
    ]

    counter_docs = [0] * number_topics
    distance_to_cluster_2 = [0] * number_topics

    for document in documents:
        distance_to_cluster = [np.linalg.norm(document - centroid) for centroid in centroids]

        # list.index returns the first minimum, so ties go to the lowest cluster index.
        best_distance = min(distance_to_cluster)
        best_distance_index = distance_to_cluster.index(best_distance)

        counter_docs[best_distance_index] += 1
        distance_to_cluster_2[best_distance_index] += pow(best_distance, 2)

    return counter_docs, distance_to_cluster_2

In [None]:
def fobjQuadraticError(k):
    """Objective function: mean per-cluster quadratic error of the centroids in ``k``.

    ``k`` is a flat sequence of centroid coordinates, 7 values per centroid.
    The documents come from the module-level ``matrix_algorithm``.

    Returns a one-element tuple holding the fitness value:
    * 5 (a fixed penalty) when at least one centroid attracts no document;
    * otherwise the average, over the clusters, of each cluster's summed
      squared distance divided by its document count.
    """
    num_topics = int(len(k) / 7)

    docs_per_cluster, squared_dist_per_cluster = auxiliarFunction(
        documents=matrix_algorithm,
        dimensions=k,
        number_topics=num_topics,
    )

    # Guard clause: a solution that leaves a cluster empty gets the flat penalty.
    if 0 in docs_per_cluster:
        return (5,)

    accumulated = 0
    for n_docs, squared_dist in zip(docs_per_cluster, squared_dist_per_cluster):
        accumulated += (1 / n_docs) * squared_dist

    return (accumulated / num_topics,)

# Genetic Algorithm

## Folders

In [None]:
# Make sure the output folder for this objective function exists.
path = "./results/fobjQuadraticError"
# Remember whether the folder was already there so the message below is truthful.
already_present = os.path.isdir(path)
try:
    # exist_ok avoids a spurious failure if the folder appears between the
    # check above and this call.
    os.makedirs(path, exist_ok=True)
except OSError:
    print ("Creation of the directory %s failed" % path)
else:
    if already_present:
        print ("Directory %s already exists" % path)
    else:
        print ("Successfully created the directory %s " % path)

## Run the genetic algorithm and save results

In [None]:
if __name__ == "__main__":

    # ------------------------------------------------------------------
    # Run configuration
    # ------------------------------------------------------------------
    # Coordinates per centroid; fobjQuadraticError slices the genome in
    # groups of 7, so this value has to stay in sync with it.
    FEATURES_PER_CLUSTER = 7
    CLUSTER_DIM = FEATURES_PER_CLUSTER * NUM_CLUSTERS  # genome length

    N_GENERATIONS = 4000      # total number of generations to evolve
    CHECKPOINT_EVERY = 200    # dump intermediate results every this many generations
    RESULTS_DIR = "./results/fobjQuadraticError"

    # ------------------------------------------------------------------
    # DEAP setup: minimisation problem, individuals are lists of floats
    # ------------------------------------------------------------------
    creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
    creator.create("Individual", list, fitness=creator.FitnessMin)

    toolbox = base.Toolbox()

    # Every gene is drawn uniformly from [0, 1].
    toolbox.register("attr_flt", random.uniform, 0, 1)
    toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_flt, n=CLUSTER_DIM)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)


    def customMutate(ind, indpb):
        """Replace each gene, with probability ``indpb``, by a fresh value in [0, 1).

        The individual is modified in place and also returned (as the bare
        individual, not wrapped in a tuple).
        """
        for i in range(len(ind)):
            if random.random() < indpb:
                ind[i] = random.random()
        return ind


    def evalfobj(individual):
        """Fitness of ``individual`` under the quadratic-error objective."""
        return fobjQuadraticError(individual)

    def evalfobjInertia(individual):
        """Fitness of ``individual`` under ``fobjInertia``.

        NOTE(review): ``fobjInertia`` is not defined in the visible part of
        this notebook and this function is not registered below — confirm it
        is provided elsewhere before using it.
        """
        return fobjInertia(individual)


    def _dump_history(folder, tag):
        """Write the per-generation statistics lists into ``folder``, tagging files with ``tag``."""
        history = (
            ("best_gen_fitness", min_iterations),
            ("worst_gen_fitness", max_iterations),
            ("mean_gen_fitness", avg_iterations),
            ("std_gen_fitness", std_iterations),
            ("best_gen_individuals", ind_iterations),
        )
        for stem, values in history:
            joblib.dump(values, os.path.join(folder, "{}_iter{}.pkl".format(stem, tag)))


    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", customMutate, indpb = 0.1)
    toolbox.register("select", tools.selTournament, tournsize=30)
    toolbox.register("evaluate", evalfobj)

    # Fitness evaluations are spread over worker processes. The pool is created
    # after the creator classes and evaluation function exist.
    pool = multiprocessing.Pool(processes = 4)
    toolbox.register('map', pool.map)

    try:
        numberInd = 100

        random.seed(169)
        min_iterations = []   # best (lowest) fitness of each generation
        max_iterations = []   # worst (highest) fitness of each generation
        avg_iterations = []   # mean fitness of each generation
        std_iterations = []   # fitness standard deviation of each generation
        ind_iterations = []   # best individual of each generation

        CXPB, MUTPB = 0.8, 0.35   # crossover / mutation probabilities

        # Initial population and its fitness.
        pop = toolbox.population(n=numberInd)
        fitnesses = list(toolbox.map(toolbox.evaluate, pop))
        for ind, fit in zip(pop, fitnesses):
            ind.fitness.values = fit

        fits = [ind.fitness.values[0] for ind in pop]

        t = 0
        g = 0
        # Begin the evolution
        for g in range(1, N_GENERATIONS + 1):

            start_time = time()

            # A new generation
            print("-- Generation %i --" % g)

            # Select parents and work on copies so the current population is untouched.
            offspring = toolbox.select(pop, len(pop))
            offspring = list(map(toolbox.clone, offspring))

            # Crossover on consecutive pairs; mated children need re-evaluation.
            for child1, child2 in zip(offspring[::2], offspring[1::2]):
                if random.random() < CXPB:
                    toolbox.mate(child1, child2)
                    del child1.fitness.values
                    del child2.fitness.values

            for mutant in offspring:
                if random.random() < MUTPB:
                    toolbox.mutate(mutant)
                    del mutant.fitness.values

            # Only individuals whose fitness was invalidated are evaluated again.
            invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
            fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
            for ind, fit in zip(invalid_ind, fitnesses):
                ind.fitness.values = fit

            # Generational replacement.
            pop[:] = offspring

            fits = [ind.fitness.values[0] for ind in pop]

            # Best individual of the population the statistics below describe, so
            # ind_iterations[i] and min_iterations[i] refer to the same generation.
            # Population members are never modified in place afterwards (offspring
            # are clones), so storing the reference is safe.
            individ = min(pop, key=lambda candidate: candidate.fitness.values[0])

            length = len(pop)
            mean = sum(fits) / length
            sum2 = sum(x*x for x in fits)
            std = abs(sum2 / length - mean**2)**0.5
            print("  Min %s" % min(fits))

            min_iterations.append(min(fits))
            max_iterations.append(max(fits))
            avg_iterations.append(mean)
            std_iterations.append(std)
            ind_iterations.append(individ)

            if g % CHECKPOINT_EVERY == 0:
                path = "{}/iteration{}".format(RESULTS_DIR, g)
                already_present = os.path.isdir(path)
                try:
                    os.makedirs(path, exist_ok=True)
                except OSError:
                    print ("Creation of the directory %s failed" % path)
                else:
                    if already_present:
                        print ("Directory %s already exists" % path)
                    else:
                        print ("Successfully created the directory %s " % path)

                print("-----------------------------------------------------------------------------------------")
                print("Guardamos valores")
                _dump_history(path, g)
                joblib.dump(pop[:], os.path.join(path, "last_pop{}.pkl".format(g)))

                print('-----------------------------------------------------------------------------------------')

            print("--- {} s seconds to perform iteration number: {} ---".format((time() - start_time), g))
    finally:
        # Always release the worker processes, including when the run is
        # interrupted, and leave the toolbox usable (serially) afterwards.
        pool.terminate()
        pool.join()
        toolbox.register('map', map)

    final_pop = pop[:]

    # Final results go directly into the results folder.
    os.makedirs(RESULTS_DIR, exist_ok=True)
    _dump_history(RESULTS_DIR, N_GENERATIONS)
    joblib.dump(final_pop, os.path.join(RESULTS_DIR, "final_pop_iter{}.pkl".format(N_GENERATIONS)))
