In [16]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
import random
from functools import reduce
import operator
import random

In [1]:
class CrossAdaptiveGA():
    """
    Genetic algorithm that searches for a subset of data columns ("nodes")
    to use as features for predicting a target column.

    An individual is a dict ``{'set': [...], 'fitness': value}``:
    ``set`` is a 0/1 vector with one entry per data column (entry ``i``
    stands for column ``'Node' + str(i + 1)``) and ``fitness`` is -1 until
    the individual has been evaluated.
    """

    def __init__(self, alpha, tournament_n, max_pob, cross_operators, mut_operators):
        """
        Store the hyper-parameters; data is attached later through ``fit``.

        alpha           -- stagnation threshold read by ``run``
        tournament_n    -- number of individuals competing in each tournament
        max_pob         -- population size
        cross_operators -- list of callables ``f(set_1, set_2)`` returning a
                           list of new individuals
        mut_operators   -- list of callables ``f(set)`` returning a bit vector
        """
        self._train_data = None
        self._test_data = None
        self._target_node = None
        self._data_header = None

        self._init_universe = None
        self._poblation = None
        self._tournament_n = tournament_n
        self._cross_operators = cross_operators
        self._mut_operators = mut_operators
        self._alpha = alpha
        self._mut_prob = 0.05  # initial per-individual mutation probability
        self._max_pob = max_pob

    def fit(self, train_data, test_data, init_universe, target_node):
        """
        Adapt data for analysis.

        Both frames get their columns renamed IN PLACE to
        'Node1' .. 'NodeN' (N = number of columns of ``train_data``), so the
        caller's DataFrames are modified as a side effect.

        init_universe -- 0-based column positions allowed in the initial
                         random individuals
        target_node   -- name (after renaming) of the column to predict
        """
        x_heads = ["Node" + str(i + 1) for i in range(len(train_data.columns))]
        train_data.columns = x_heads
        test_data.columns = x_heads
        self._train_data = train_data
        self._test_data = test_data
        self._init_universe = init_universe
        self._target_node = target_node
        self._data_header = x_heads

    def __cross(self, p1, p2):
        """Apply a randomly chosen crossover operator to the parents' bit vectors."""
        cross_operator = random.choice(self._cross_operators)
        return cross_operator(p1['set'], p2['set'])

    def __tournament(self):
        """
        Select two distinct parents by tournament.

        For each parent, ``tournament_n`` individuals (fewer if the pool is
        smaller) are drawn at random from the whole pool and the fittest of
        them wins. The winner is removed from the working copy so the second
        tournament cannot return the same individual. ``self._poblation``
        itself is left untouched.

        Raises ValueError when the population has fewer than two individuals.
        """
        candidates = list(self._poblation)
        if len(candidates) < 2:
            raise ValueError("tournament needs at least two individuals in the poblation")

        parents = []
        for _ in range(2):
            # Clamp so a tournament size larger than the pool cannot make
            # random.sample raise, and a non-positive size still yields a pick.
            sample_size = max(1, min(self._tournament_n, len(candidates)))
            contenders = random.sample(range(len(candidates)), sample_size)
            # The winner always comes from the sampled contenders; ties go to
            # the contender that was drawn first.
            winner_index = max(contenders, key=lambda idx: candidates[idx]['fitness'])
            parents.append(candidates.pop(winner_index))
        return parents

    def __generate_individual(self):
        """
        Will generate an individual in canonical form
        using a vector of size len(data header) with 1 and 0
        indicating the presence or absence of a node in the set.
        Only positions listed in the initial universe may be 1.
        """
        universe = set(self._init_universe)  # O(1) membership tests
        individual = []
        for i in range(len(self._data_header)):
            if i in universe:
                individual.append(random.randint(0, 1))
            else:
                individual.append(0)
        return individual

    def __generate_poblation(self, quantity):
        """
        Generate a pob of size equal to quantity,
        all in canonical form and not yet evaluated (fitness -1)
        """
        return [{'set': self.__generate_individual(), 'fitness': -1}
                for _ in range(quantity)]

    def __convert_individual_set(self, i_set):
        """Translate a 0/1 vector into the list of column names whose bit is 1."""
        return ['Node' + str(i + 1) for i, bit in enumerate(i_set) if bit == 1]

    def __evaluate_individual(self, individual):
        """
        Will evaluate the individual using naive bayes model,
        converting the canonical set into a header structure
        for pandas to read on.

        A GaussianNB is trained on the selected columns of the train data.
        For every target value seen in the train data, the fraction of test
        rows with that value that are predicted correctly is computed (1 when
        the test data has no such rows). The fitness is the geometric mean of
        those fractions. An individual with no usable column scores 0.0.
        """
        # Never feed the label itself to the classifier: that would leak the
        # answer and yield a meaningless perfect score.
        feature_names = [name for name in self.__convert_individual_set(individual['set'])
                         if name != self._target_node]
        if len(feature_names) == 0:
            return 0.0

        clf = GaussianNB()
        clf.fit(self._train_data[feature_names], self._train_data[self._target_node])

        test_target = self._test_data[self._target_node]
        precisions = []
        for val in set(self._train_data[self._target_node]):
            # Boolean mask instead of a string-built query: evaluated once per
            # class and independent of how the label prints.
            class_rows = self._test_data[test_target == val]
            if len(class_rows) == 0:
                precisions.append(1)
                continue
            Y_pred_c = clf.predict(class_rows[feature_names])
            precisions.append(Y_pred_c.tolist().count(val) / len(Y_pred_c))

        return reduce(operator.mul, precisions, 1) ** (1 / len(precisions))  # calculating g-mean

    def __evaluate_poblation(self, poblation):
        """
        Evaluating each set with Naive Bayes model,
        storing the fitness value into the fitness entry (in place)
        and returning the same list
        """
        for member in poblation:
            member['fitness'] = self.__evaluate_individual(member)
        return poblation

    def __mutate(self, merged_pob):
        """
        Build mutated copies of some individuals.

        Each individual is selected with probability ``_mut_prob``. A randomly
        chosen mutation operator is applied to a COPY of its bit vector, so an
        operator that edits its argument in place cannot alter the (already
        evaluated) source individual. Returns the list of new, unevaluated
        individuals; ``merged_pob`` is not modified.
        """
        mutations = []
        for x in merged_pob:
            if (random.random() < self._mut_prob):
                print("Mutation!")
                mut_operator = random.choice(self._mut_operators)
                mutations.append({
                    'set': mut_operator(list(x['set'])),
                    'fitness': -1
                })
        return mutations

In [102]:
class CrossAdaptiveGA(CrossAdaptiveGA):
    """Extends the previously defined CrossAdaptiveGA with the evolutionary main loop."""

    def run(self, max_iter):
        """
        Evolve the population and return the best individual found.

        train data, test data and initial universe must be set before hand
        (through ``fit``).

        max_iter -- maximum number of generations to run; ``None`` means no
                    limit. With a limit of 0 the freshly generated,
                    unevaluated population is returned as is.

        The loop also stops when the best fitness reaches 1 or after the
        population has been regenerated three times because of stagnation.

        Each generation:
          * if the average fitness gain of the previous generation lies in
            [0, alpha), a stagnation counter grows and the mutation
            probability is raised by ``counter * 0.05``; otherwise the
            counter is reset;
          * after 3 consecutive stagnant generations everything but the first
            individual is replaced by random individuals and the mutation
            probability goes back to 0.05;
          * max_pob/4 parent pairs are chosen by tournament, crossed, and the
            offspring plus mutated copies compete with the current
            population; the best ``max_pob`` individuals survive.

        Returns the individual dict at the head of the final population.
        """
        # Starts at 1 so the very first pass is never counted as stagnant
        # (for any alpha <= 1); it is recomputed at the end of every generation.
        average_change_rate = 1
        past_average_score = 0
        max_score = 0
        unchanged_rate = 0
        wild_fox = 0  # number of population resets performed so far
        iteration = 0
        self._mut_prob = 0.05
        self._poblation = self.__generate_poblation(self._max_pob)

        while max_iter is None or iteration < max_iter:
            iteration += 1

            if (average_change_rate >= 0 and average_change_rate < self._alpha):
                unchanged_rate += 1
                # Adaptive mutation probability
                self._mut_prob += unchanged_rate * 0.05
            else:
                unchanged_rate = 0

            if (unchanged_rate >= 3):
                unchanged_rate = 0
                wild_fox += 1
                # Keep the head of the population, regenerate the rest.
                self._poblation = [self._poblation[0]] + self.__generate_poblation(self._max_pob - 1)
                print("Poblation has been erased!")
                self._mut_prob = 0.05
                print("Mutation probability has been reseted!")

            if wild_fox >= 3:
                break

            # --- tournament ---
            # will make quarter of poblation pairs in order to
            # generate sons of half the quantity of pob
            self._poblation = self.__evaluate_poblation(self._poblation)
            parents = [self.__tournament() for _ in range(int(self._max_pob / 4))]

            # --- crossing and mutation ---
            sons = []
            for first_parent, second_parent in parents:
                sons.extend(self.__cross(first_parent, second_parent))
            # Discard offspring that select no node at all.
            sons = [x for x in sons if 1 in set(x['set'])]
            sons = self.__evaluate_poblation(sons)

            # merge poblation
            merged_poblation = self._poblation + sons
            mutations = self.__mutate(merged_poblation)
            if (len(mutations) > 0):
                mutations = self.__evaluate_poblation(mutations)
                merged_poblation = merged_poblation + mutations

            # sort poblation, best first, and keep the fittest max_pob
            merged_poblation = sorted(merged_poblation, key=operator.itemgetter('fitness'), reverse=True)
            merged_poblation = merged_poblation[:self._max_pob]

            # saving results
            actual_average_score = sum([x['fitness'] for x in merged_poblation])
            average_change_rate = (- past_average_score + actual_average_score) / len(merged_poblation)
            past_average_score = actual_average_score
            max_score = merged_poblation[0]['fitness']
            self._poblation = merged_poblation
            print("Poblation change rate: " + str(average_change_rate))
            print("Max score: " + str(max_score))
            if (max_score == 1):
                break

        return self._poblation[0]  # returning best individual
            

In [67]:
def basic_cross_sum(p1,p2):
    """
    Crossover that combines two bit vectors position by position.

    For every position of ``p1`` (``p2`` is indexed with the same position):
      * both values 0        -> first son 0, second son 0
      * values adding up to 1 -> first son 1, second son 0
      * anything else         -> first son 0, second son 1

    Returns a list with the two sons as unevaluated individuals
    (``'fitness'`` set to -1).
    """
    exclusive_bits = []
    shared_bits = []
    for position, left in enumerate(p1):
        right = p2[position]
        if left == 0 and right == 0:
            exclusive, shared = 0, 0
        elif left + right == 1:
            exclusive, shared = 1, 0
        else:
            exclusive, shared = 0, 1
        exclusive_bits.append(exclusive)
        shared_bits.append(shared)

    return [
        {'set': exclusive_bits, 'fitness': -1},
        {'set': shared_bits, 'fitness': -1},
    ]

In [50]:
def basic_cross_single_point(p1,p2):
    """
    Single-point crossover.

    A cut position is drawn uniformly from ``0 .. len(p1) - 1``. The first
    son takes ``p1`` before the cut and ``p2`` from the cut on; the second
    son is the mirror image. Both are returned as unevaluated individuals
    (``'fitness'`` set to -1).
    """
    cut = random.randint(0, len(p1) - 1)
    head_1, tail_1 = p1[:cut], p1[cut:]
    head_2, tail_2 = p2[:cut], p2[cut:]
    return [
        {'set': head_1 + tail_2, 'fitness': -1},
        {'set': head_2 + tail_1, 'fitness': -1},
    ]

In [28]:
def basic_cross_multi_point(p1,p2):
    """
    Position-wise random crossover.

    For every position of ``p1`` a coin is thrown: on 0 the first son
    inherits from ``p1`` and the second from ``p2``; otherwise the two
    values are swapped. Returns both sons as unevaluated individuals
    (``'fitness'`` set to -1).
    """
    first_child = []
    second_child = []
    for position in range(len(p1)):
        swap = random.randint(0, 1) != 0
        gene_a, gene_b = p1[position], p2[position]
        if swap:
            gene_a, gene_b = gene_b, gene_a
        first_child.append(gene_a)
        second_child.append(gene_b)

    return [
        {'set': first_child, 'fitness': -1},
        {'set': second_child, 'fitness': -1},
    ]

In [29]:
def basic_bit_mutation(p):
    """
    Turn on at most one bit of a 0/1 vector.

    Positions are scanned from the start; for every position holding 0 a
    coin is thrown and the first one that comes up 1 has its bit set, after
    which scanning stops. Bits that are already 1 are never changed, and the
    vector may come back unchanged when no coin comes up 1.

    The mutation is applied to a copy: ``p`` itself is never modified, so an
    individual that is still part of the population keeps the bit vector its
    fitness was computed for.

    Returns a new list with the (possibly) mutated bits.
    """
    mutated = list(p)
    for i, bit in enumerate(mutated):
        if bit == 0 and random.randint(0, 1) == 1:
            mutated[i] = 1  # assignment (the original compared with == and did nothing)
            break
    return mutated

In [133]:
# Configure the GA: stagnation threshold 0.05, tournaments of 5,
# a population of 40, three crossover operators and one mutation operator.
ga_model = CrossAdaptiveGA(
    alpha=0.05,
    tournament_n=5,
    max_pob=40,
    cross_operators=[
        basic_cross_sum,
        basic_cross_single_point,
        basic_cross_multi_point,
    ],
    mut_operators=[basic_bit_mutation],
)

In [36]:
# Load the train (v8) and test (v9) samples; the files have no header row
# and use two spaces as the column separator (hence the python engine).
X_train, X_test = (
    pd.read_csv(dataset_path, delimiter='  ', header=None, engine='python')
    for dataset_path in (
        "/home/a20114261/sdelrio/alarm_datasets/Alarm10/Alarm10_s5000_v8.txt",
        "/home/a20114261/sdelrio/alarm_datasets/Alarm10/Alarm10_s5000_v9.txt",
    )
)


In [125]:
# 0-based position of the column to predict.
TargetNodeIndex = 20
# Initial universe: 15 distinct positions drawn from 15..84, with the target
# position dropped if it happens to be drawn (so 14 or 15 positions remain).
init_pob = list(filter(
    lambda position: position != TargetNodeIndex,
    random.sample(range(15,85),15),
))

In [134]:

# Attach the data; column position TargetNodeIndex is named 'Node<index + 1>'.
ga_model.fit(
    train_data=X_train,
    test_data=X_test,
    init_universe=init_pob,
    target_node='Node'+str(TargetNodeIndex+1),
)

In [136]:
#%%time
# Run the search, then report the best fitness and the nodes it selected.
set_result = ga_model.run(10)
print("Fitness was: "+str(set_result['fitness']))
for i, bit in enumerate(set_result['set']):
    if bit == 1:
        print("Node"+str(i+1))

Mutation!
Mutation!
Mutation!
Mutation!
Mutation!
Poblation change rate: 0.26702685215606653
Max score: 0.3783053988504781
Mutation!
Poblation change rate: 0.07091291883642267
Max score: 0.4318414751866778
Mutation!
Mutation!
Mutation!
Poblation change rate: 0.03076609574547353
Max score: 0.4318414751866778
Mutation!
Mutation!
Mutation!
Mutation!
Mutation!
Mutation!
Poblation change rate: 0.01910169191090949
Max score: 0.4318414751866778
Mutation!
Mutation!
Mutation!
Mutation!
Mutation!
Mutation!
Mutation!
Mutation!
Mutation!
Mutation!
Mutation!
Mutation!
Mutation!
Mutation!
Poblation change rate: 0.023573162000544733
Max score: 0.4372043650846669
Poblation has been erased!
Mutation probability has been reseted!
Mutation!
Poblation change rate: -0.13860866623601115
Max score: 0.44539177576494443
Mutation!
Mutation!
Poblation change rate: 0.0961709430207894
Max score: 0.44539177576494443
Mutation!
Mutation!
Mutation!
Mutation!
Mutation!
Mutation!
Mutation!
Mutation!
Mutation!
Poblation 