In [1]:
import pandas as pd;
import random;
from statistics import fmean, stdev;
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import train_test_split
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.metrics import DistanceMetric
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import LocalOutlierFactor

from itertools import combinations


OBJECTS/VARIABLES:

Antibody - list of Numerical values (Reals?) , represented by a row of a dataframe?
Population - List[Antibodies], represented by the total dataframe?

Target Size (i.e number of antibodies to be created) = (Size of Majority Class) - (Size of minority class)



FUNCTIONS:

Initializing :- input: original dataframe

                         do: Get bounds of minority class by taking the highest and lowest values in each of the [n] dimensions

                         How: Do we take the whole minority class? or do we sample part or parts of it to generate our bounds. 

                         output: Upper and lower bounds of the minority class


Creation :- Input: Bounds of the min class

        do: Create a set of antibodies

        input: minority Dataframe 

        How:
            Possibilities:
            (As the Malhanabois paper does it) Take a random value between the bounds of the minority class feature as the datapoint

            (Nikhil Just sample minority class based on imbalance rate (doesn't require bounds)

            (Adam) Take a random value, as in the paper's method, but off of a weighted curve? as in we could randomize, 
                but add some preference for values close to the boundary, or close to the center, etc.
                        -We could set this as a parameter, bell curve, linear. This same function could be a parameter in the mutation stage.
                        -If the density within the bounds is concentrated on one side, add bias towards that side in the random value;
        
        Challenges: 
            How do we deal with the different data categories (e.g nominal, ordinal, and continuous)? Continuous is easy, just a number in a range. Ordinal is ???, nominal is difficult, even if one-hot encoded, we might random to have two values that should be exclusive (e.g an item being both blue and red). How do other imputation algorithms work with these problems? DO they even work with these problems?
        output: Initial Population as a DF?



**Initialization**

In [2]:

df = pd.read_csv("./Data/GeneratedSyntheticData-NosielessInformativeEasy.csv")

columns = df.columns.to_list()
columns_drop = columns.pop(-1)

#drop NaN rows, could implement imputer as well
df.dropna(inplace=True)

labels = df.drop(columns, axis=1)

#df= df.drop("5", axis=1)
#df= df.drop("Unnamed: 0", axis=1)

df

Unnamed: 0,0,1,2,3,4,5
0,-4.539317,5.021729,-7.042357,-7.428090,4.943687,0.0
1,4.300671,6.131065,-6.394796,-5.981102,4.579452,1.0
2,-5.004336,4.473384,-6.048647,-5.556982,4.051438,0.0
3,2.732848,7.791876,-7.495527,-6.541127,3.910933,1.0
4,-4.193859,5.393440,-5.317542,-5.605152,4.807354,0.0
...,...,...,...,...,...,...
295,5.349610,4.226061,-4.841005,-6.342083,4.820905,1.0
296,-5.247255,5.149173,-6.669314,-5.848687,3.793496,0.0
297,3.308685,5.513754,-3.805559,-5.564981,4.670373,1.0
298,-4.510472,4.712267,-4.550419,-3.915542,4.798225,0.0


In [3]:
#TODO Gaussian generation can be optimized by altering how the loops work, putting the col loop on the outside and pre-generated fmean, stdev for the col
        #Currently it generates those values for every antibody
        

def get_bounds(minorityDF) -> dict:
    out = {}
    for col in minorityDF:
        colMax = minorityDF[col].max()
        colMin = minorityDF[col].min()
        out[col] = (colMin, colMax)
    return out

#This only works for continuous values. We will have to code a version for binary fields (We assume any categorical columns have been encoded)

####### Creation ################
# minorityDF - dataframe containing the minority class
# totalPopulation - The total number of antibodies to create
# weightingFunction - Can choose between uniform, triangular, ...
# mode - for use with a triangular function - set to the percentage of the range you wish to be most represented (between 0.0 and 1.0)
def Creation(minorityDF, totalPopulation : int, binaryColumns : list, weightingFunction : str = "uniform", mode : float = 0.5): 
    
    if(minorityDF.isnull().values.any()):
        raise ValueError("Minority Class DataFrame contains NaN")
    
    population = [] #Initializing the empty population
    if mode < 0.0 or mode > 1:
        raise Exception("mode must be between float value between 0.0 and 1.0")
    
    if weightingFunction not in ('uniform', 'triangular', 'gaussian'):
        raise Exception("Unknown function chosen, please use one of 'uniform', 'triangular', or 'gaussian'")

    bounds = get_bounds(minorityDF)
    
    if weightingFunction in ["uniform", "triangular"]: #If Generating via uniform or triangular distribution, loop through bounds of columns
        for i in range(totalPopulation): #For every antibody to be created

            antibody = [] #Initializing a single antibody
            for key,bnd in bounds.items(): #Iterate through the columns/dimensions/features of the minority class for each antibody 
                if key in binaryColumns:
                    antibody += [random.randint(int(bnd[0]),int(bnd[1]))]
                else:
                    if weightingFunction == "uniform":
                        antibody += [round(random.uniform(bnd[0],bnd[1]),4)] #Add a random value between the lower and upper bounds to the antibody

                    elif (weightingFunction == "triangular"):
                        
                        tri_tip = ( ((bnd[1]-bnd[0]) * mode) + bnd[0] ) #multiplying the difference by the percentage, plus the low bound gives us the point between the two, but percentile

                        if tri_tip < bnd[0]: #Error checks to make sure that the emphasized point isn't outside the bounds
                            tri_tip = bnd[0]
                        elif tri_tip > bnd[1]:
                            tri_tip = bnd[1]

                        antibody += [round( random.triangular(bnd[0],bnd[1], tri_tip), 5)]

            population+=[antibody] #add the created antibody to the population

    elif (weightingFunction == 'gaussian'): #If Generating via Gaussian, loop through columns of dataframe

        for i in range(totalPopulation): #For every antibody to be created

            antibody = [] #Initializing a single antibody
            for bnd in minorityDF: #Iterate over columns in the dataframe
                values = minorityDF[bnd].tolist() #convert series to list
                if bnd in binaryColumns:
                    antibody += [random.randint(bounds[bnd][0],bounds[bnd][1])]
                else:
                    antibody += [round(random.gauss(fmean(values) , stdev(values)), 5)] #using median and stdeviation of values, radomize over gauss

        
            population+=[antibody] #add the created antibody to the population

            
    popDF = pd.DataFrame(population, columns = minorityDF.columns.values)
    return popDF, bounds
    

#Creation(df,1000,['5'], weightingFunction='gaussian')


Function For Experiments with bounds

**Fitness Function**

Requirements: Needs to be calced fast bc of multiple iterations
Posiibilities: - Binary Classification F1 Score, Mahalanobis Distance?
               - Other Types as well? : Linear Regression, Multiilabel Classification

Do we just impute our values and then do something similar to StudentPerformance and see what happens? No bc we need input from the fitness function to do our generations.

Is the data just our training set?
Inputs: Model(initialized outside function or inside?) fit with data that has been encoded and the label

Want to do kfold cv (not every loop bc very slow, once afterwards to evaluate)

when we do k fold, call fitness funciton k times i.e. once for every train test split.

if doing grid search, do it before calling this?

In [4]:
# calculates the fitness score for one train/test split dataset
# run on original dataset without random values first to be abe to compare

def fitness( model, feat, label, iterations, scorer):
    #scorer is the name of the function wee aree using to evaluate our dataset
    #it should be a function with signature scorer(model, feature, label) which should return only a single value.
    return cross_val_score(model, feat, label, cv = iterations, scoring = scorer)

def distance( x, y, metric):
    
    #get the distance between two sets of data x and y, they should be the same size
    #metric is the string metric to be used to measure distance

    dist = DistanceMetric.get_metric(metric)
    return dist.pairwise(x,y)
    
#Original features, original labels are the original df before any oversampling
#Population_features, population_labels are the generated population we want to evaluate
#Here scorer has to be a function that takes y_pred, y_true and returns a score, not implemented yet
def fitnessBasic(model, original_features, original_labels, population_features, population_labels, scorer):

    #train test split makes train set smaller, we should sample the population based on he difference of the majority class and minority class in origin_feat_train
    origin_feat_train, origin_feat_test, origin_labels_train, origin_labels_test = train_test_split(original_features, original_labels, test_size=0.33)
    
    train_features = pd.concat([origin_feat_train, population_features],ignore_index=True)
    train_labels = pd.concat([origin_labels_train, population_labels],ignore_index=True)

    model.fit(train_features, train_labels.values.ravel())
    predictions = model.predict(origin_feat_test)

    #need more params?
    #hard coded f1_score, find a way to pass in function for scoring?
    score = f1_score(origin_labels_test.values.ravel(), predictions)
    return score

#Original features, original labels are the original df before any oversampling
#Population_features, population_labels are the generated population we want to evaluate
#Here scorer has to be a function that takes y_pred, y_true and returns a score, not implemented yet
def fitnessCV(model, original_features, original_labels, population_features, population_labels, scorer, iterations):
    #TODO: train_features or train_labels had 1 extra row, need to fix
    #train test split makes train set smaller, we should sample the population based on he difference of the majority class and minority class in origin_feat_train
    origin_feat_train, origin_feat_test, origin_labels_train, origin_labels_test = train_test_split(original_features, original_labels, test_size=0.33)
    
    train_features = pd.concat([origin_feat_train, population_features],ignore_index=True)
    train_labels = pd.concat([origin_labels_train, population_labels],ignore_index=True)

    #look into group parameter of cross_validate
    #here scoring can be multiple values
    
    cval_scores = cross_validate(model, train_features, train_labels.values.ravel(), scoring = scorer, cv = iterations, return_train_score = True, return_estimator = True)

    #look at format of scores, get estimators and use them to predict test

    test_scores = []
    cval_test_scores =cval_scores['test_score']
    count = 0 
    for estimator in cval_scores['estimator']:
        
        estimator.fit(train_features, train_labels.values.ravel())
        predictions = estimator.predict(origin_feat_test)

        #hard coded f1_score, find a way to pass in function for scoring?
        score = f1_score(origin_labels_test, predictions) 

        #here I just took the mean of the 2 scores, could we use something else?
        mean_score = (score + cval_test_scores[count])/2
        count+=1
        test_scores.append(mean_score)
    
    #here I just took the mean of the array of all scores, could we use something else?
    return fmean(test_scores)

    
    

**Mutation**

In [5]:
####### Mutation ################
def mutatePopulation (antiPopulation, bounds, binaryColumns : list, mutationRate : float = 1.0):
    #antiPopulation is the population of antibodies to be mutated
    #bounds is a dictionary of the bounds of each column in the population
    #binaryColumns is a list of the columns that are binary
    #mutationRate denotes how much the antibodies can mutate each round, 1.0 is the default, 0.0 is no mutation, 2.0 is double mutation rate
    #returns a new mutated population of antibodies
    antiPopulation = antiPopulation.copy()
    for col in antiPopulation:
        if bounds[col][0] == bounds[col][1]:
            continue
        elif col in binaryColumns: #Binary Columns must be handled differently than continuous
            
            antiPopulation[col] = antiPopulation[col].map(lambda x : (random.randint(0,1)))
        else:
            bnd_range = (bounds[col][1] - bounds[col][0])*mutationRate #total range of bounds is high - low

            #Setting the low and high bounds to be centered around 0
            hi_bnd = bnd_range/2 
            low_bnd = (0-bnd_range/2)

            #print("Low bound around 0 = " + str(low_bnd) +"| Hi bnd around 0 = "+ str(hi_bnd))
            #print(round(random.uniform(low_bnd,hi_bnd),4))

            antiPopulation[col] = antiPopulation[col].map(lambda x : x+round(random.uniform(low_bnd,hi_bnd),4))
        
    return antiPopulation

#First round: Compare the base dataset, to the dataset+the created/mutated points
#

def comparePopulations(population1, population2, labels1, labels2, estimator, iterations, scorer):

    score1 = fmean(fitness(estimator, population1, labels1.values.ravel(), iterations, scorer))
    score2 = fmean(fitness(estimator, population2, labels2.values.ravel(), iterations, scorer))

    if score1 > score2:
        winning_population = population1
        winning_labels = labels1
    else:
        winning_population = population2
        winning_labels = labels2

    for col in winning_labels:
        winning_population = winning_population.join(winning_labels[col])

    return winning_population

def comparePopulations2(population1, population2, labels1, labels2, estimator, iterations, scorer):
    score1 = fmean(fitness(estimator, population1, labels1.values.ravel(), iterations, scorer))
    score2 = fmean(fitness(estimator, population2, labels2.values.ravel(), iterations, scorer))
    
    print("score1: " +str(score1))
    print("score2: " +str(score2))

    if abs(score1 - score2) < 0.005:
        return False
    elif (score1>score2):
        return False
    else:
        return True

#takes in the previous population's score, will need to add variable in AIS to track this from previous round
# original features and original labels are the original df split into features and labels
# population features and population labels are the population df split into features and labels, this is the new population we mutated this round
# estimator, iterations, scorer not changed from old compare populaitons
def comparePopulationsCV(prev_score, original_features, original_labels, population_features, population_labels, estimator, iterations, scorer, min_change = 0.005):
    score1 = prev_score
    score2 = fitnessCV(estimator, original_features, original_labels, population_features, population_labels, scorer, iterations)
    
    # print("score1: " +str(score1))
    # print("score2: " +str(score2))

    #is 0.005 too big?
    if abs(score1 - score2) < 0.005:
        return False, score1
    elif (score1>score2):
        return False, score1
    else:
        return True, score2

#need a comparePopulationsBasic for fitnessBasic


In [6]:
#do we need this?
create1 = Creation(df,10000,['5'], weightingFunction='uniform')
create2 = Creation(df,10000,['5'], weightingFunction='uniform')

pop1 = create1[0]
pop2 = create2[0]
bounds1 = create1[1]
bounds2 = create2[1]
columns1 = pop1.columns.to_list()
columns1_drop = columns1.pop(-1)

labels1 = pop1.drop(columns1, axis=1)
pop1 = pop1.drop(columns1_drop, axis=1)

columns2 = pop2.columns.to_list()
columns2_drop = columns2.pop(-1)

labels2 = pop2.drop(columns2, axis=1)
pop2 = pop2.drop(columns2_drop, axis=1)

In [7]:
mutatePopulation(labels1, bounds1, ['5'])


Unnamed: 0,5
0,0
1,1
2,1
3,1
4,0
...,...
9995,1
9996,0
9997,0
9998,0


In [8]:
#TODO : add parameter that defines which column is the label
#separate a df into features and labels
def separate_df(df, label_col):

    columns = df.columns.to_list()
    columns_drop = columns.pop(columns.index(label_col))
    labels = df.drop(columns, axis=1)
    features = df.drop(columns_drop, axis=1)

    return features, labels

joe = pd.read_csv("./Data/GeneratedSyntheticData-NosielessInformativeEasy.csv")
feat,labels = separate_df(joe, '5')
feat

Unnamed: 0,0,1,2,3,4
0,-4.539317,5.021729,-7.042357,-7.428090,4.943687
1,4.300671,6.131065,-6.394796,-5.981102,4.579452
2,-5.004336,4.473384,-6.048647,-5.556982,4.051438
3,2.732848,7.791876,-7.495527,-6.541127,3.910933
4,-4.193859,5.393440,-5.317542,-5.605152,4.807354
...,...,...,...,...,...
295,5.349610,4.226061,-4.841005,-6.342083,4.820905
296,-5.247255,5.149173,-6.669314,-5.848687,3.793496
297,3.308685,5.513754,-3.805559,-5.564981,4.670373
298,-4.510472,4.712267,-4.550419,-3.915542,4.798225


In [9]:
def getBinaryColumns(df) -> list:
        return list(df.columns[df.nunique() == 2])

In [10]:
def extractBinaryMinorityClass( preparedFeatures, labels) -> pd.DataFrame:
        #preparedFeatures is the dataframe of features, labels is the dataframe of labels
        #returns a dataframe of the minority class
        #get counts of each class from labels
        for col in labels:
                counts = labels[col].value_counts()
                #get the minority class
                minorityLabel = counts.idxmin()

        minorityClass = labels[labels == minorityLabel]
        minorityClass = minorityClass.dropna()
        minorityClass = minorityClass.index.values
        minorityClass = preparedFeatures.loc[minorityClass]
        minorityClass[labels.columns[0]]=minorityLabel
        return minorityClass

In [11]:
#takes a population, generates its LOF score, ranks the data by it and splits it into n_blocks groups of similar data
def lof(original_df, population, n_neighbor:int = 20, n_blocks:int = 4):

    size = len(original_df.index)

    df = pd.concat([original_df,population],ignore_index=True)
    lof = LocalOutlierFactor(n_neighbors = n_neighbor)
    
    y_pred = lof.fit_predict(df)
    X_scores = lof.negative_outlier_factor_

    df["lof"]=X_scores
    population_with_lof = population.copy()
    population_with_lof["lof"] = X_scores[size:]

    population_with_lof = population_with_lof.sort_values(by = ['lof'], ignore_index=True)
    population_with_lof = population_with_lof.drop(columns=['lof'])

    sizeof_block = int(len(population_with_lof.index)/n_blocks)
    i = 0 
    j = int(0)
    result = []
    
    while(i < n_blocks):
        k = int(j+ sizeof_block)
        p = population_with_lof.iloc[j:k]
        result.append(p)
        #result.append(population[j:k])
        j+=sizeof_block
        i+=1
    

    return result

def get_best_population(df, original_features, original_labels, antibody_population, previous_result, label, model, K_folds, scorer):
    
    result = lof(df, antibody_population)
    
    
    p1 = pd.concat([result[0],result[1],result[2],previous_result[3]],ignore_index=True)
    p1_features, p1_labels = separate_df(p1, label_col=label)
    p1_score = fitnessCV(model, original_features, original_labels, p1_features, p1_labels, scorer, K_folds)

    p2 = pd.concat([result[0],previous_result[1],result[2],result[3]],ignore_index=True)
    p2_features, p2_labels = separate_df(p2, label_col=label)
    p2_score = fitnessCV(model, original_features, original_labels, p2_features, p2_labels, scorer, K_folds)

    p3 = pd.concat([result[0],result[1],previous_result[2],result[3]],ignore_index=True)
    p3_features, p3_labels = separate_df(p3, label_col=label)
    p3_score = fitnessCV(model, original_features, original_labels, p3_features, p3_labels, scorer, K_folds)

    p4 = pd.concat([previous_result[0],result[1],result[2],result[3]],ignore_index=True)
    p4_features, p4_labels = separate_df(p4, label_col=label)
    p4_score = fitnessCV(model, original_features, original_labels, p4_features, p4_labels, scorer, K_folds)

    scores = [p1_score,p2_score,p3_score,p4_score]
    max_score = max(scores)

    if(max_score == p1_score):
        return p1, p1_score
        
    if(max_score == p2_score):
        return p2, p2_score

    if(max_score == p3_score):
        return p3, p3_score
    
    if(max_score == p4_score):
        return p4, p4_score

def comparePopulations_lof( population_score, old_score, min_change):
    print("old_score: " +str(old_score))
    print("population_score: " +str(population_score))
    if abs(population_score - old_score) < min_change:
        return False, old_score
    elif (old_score > population_score):
        return False, old_score
    else:
        return True, population_score
    

In [12]:
#minorityDF      - the minority dataframe
#df              - the original dataframe
#max_rounds      - the maximum number of rounds(loops) of AIS 
#stopping_cond   - the number of rounds without significant changes to accuracy before stopping the function
#totalPopulation - the number of elements we want to add to the minority class
#model           - the model to be used to evaluate the dataset during AIS
#K-folds         - the number of segments for k-fold cross validation
#scorer          - the scoring metric when evaluating the dataset

def AIS(minorityDF,df, label, max_rounds, stopping_cond, totalPopulation, model, K_folds, scorer,  min_change = 0.05, use_lof : bool = True):

        #add code to find binary columns for creation
        binaryColumns = getBinaryColumns(minorityDF)

        current_population, bounds = Creation(minorityDF,totalPopulation,binaryColumns, weightingFunction='uniform')
        
        antibody_population = mutatePopulation(current_population,bounds,binaryColumns)
        
        count = 0
        no_change = 0

        original_gen, original_labels = separate_df(df, label)
        #created population split into features and labels
        current_gen, current_labels = separate_df(current_population, label_col=label)

        
        current_score = fitnessCV(model, original_gen, original_labels, current_gen, current_labels, scorer, K_folds)

        # #the next generation antibody population concatenated to the original dataframe
        # next_df = pd.concat([df,antibody_population],ignore_index=True) #TODO:REMOVE
        #next_df split into features and labels
        next_gen, next_labels = separate_df(antibody_population, label_col=label)

        if(use_lof==False):
            while( (count < max_rounds) and (no_change < stopping_cond) ):
                count+=1
         
                change_flg, score = comparePopulationsCV(current_score, original_gen, original_labels, next_gen, next_labels, model, K_folds, scorer, min_change)
                if (change_flg):
                    
                    no_change = 0

                    current_population = antibody_population.copy()

                    #need to update bounds
                    bounds = get_bounds(current_population)
                    antibody_population = mutatePopulation(current_population,bounds,['5'])
                    next_gen, next_labels = separate_df(antibody_population, label_col=label)
                    
                else:
                 
                    no_change+=1

                    bounds = get_bounds(current_population)
                    antibody_population = mutatePopulation(current_population,bounds,['5'])
                    next_gen, next_labels = separate_df(antibody_population, label_col=label)
                    
                current_score = score #Score will only change if the new population is better than the old population
        
        else:
            current_population_lof = lof(df, current_population)
            while( (count < max_rounds) and (no_change < stopping_cond) ):

                count+=1
                
                best_population, best_population_score = get_best_population(df, original_gen, original_labels, antibody_population, current_population_lof, label, model, K_folds, scorer)
                
                change_flg, score = comparePopulations_lof(best_population_score, current_score, min_change)
                if (change_flg):

                    no_change = 0

                    current_population = best_population.copy()
                    current_population_lof = lof(df, current_population)

                    #need to update bounds
                    bounds = get_bounds(current_population)
                    antibody_population = mutatePopulation(current_population,bounds,['5'])
                    
                    
                else:

                    no_change+=1

                    bounds = get_bounds(current_population)
                    
                    antibody_population = mutatePopulation(current_population,bounds,['5'])
                    
                current_score = score #Score will only change if the new population is better than the old population

        return current_population, count

In [13]:
def AIS_Resample( preparedDF, labels, max_rounds, stopping_cond, model, K_folds, scorer):
        #preparedDF is the dataframe of features, labels is the dataframe of labels
        minorityDF = extractBinaryMinorityClass(preparedDF, labels)
        
        #PreparedDF + Labels = the overall Population
        overallPopulation = pd.concat([preparedDF,labels],axis=1)
        #The number of elements we want to add to the minority class
        requiredPopulation = len(overallPopulation) - (len(minorityDF)*2)
        
        oversamples,_ = AIS(minorityDF,overallPopulation,labels.columns, max_rounds,stopping_cond,requiredPopulation,model,K_folds,scorer)
        concatDF = pd.concat([overallPopulation,oversamples],ignore_index=True)
        return (separate_df(concatDF, labels.columns[0]))

In [14]:
#TEST BLOCK 

#not imbalanced
#df= pd.read_csv("./Data/GeneratedSyntheticData-NosiyEasy.csv",index_col=0)

#Gets a perfect score always
#df= pd.read_csv("./Data/GeneratedSyntheticData-NosieLessInformativeEasy.csv") 

#Gets a perfect score after a few iterations
df = pd.read_csv("./Data/GeneratedSyntheticData-NosieLessInformativeHard.csv",index_col=0)
df.dropna()

#not imbalanced
#df = pd.read_csv("./Data/GeneratedSyntheticData-NosiyHard.csv",index_col=0)

minority = df[df['5']==1] 
majority = df[df['5']==0]

data, labels = separate_df(df, '5')
# initial_population, bounds = Creation(minority,120,['5'], weightingFunction='uniform')

# antibody_population = mutatePopulation(initial_population,bounds,['5'],1.0)

# current_df = pd.concat([df,initial_population],ignore_index=True)
# current_gen, current_labels = separate_df(current_df, '5')


# next_df = pd.concat([df,antibody_population],ignore_index=True)
# next_gen, next_labels = separate_df(next_df, '5')

randomForest = RandomForestClassifier()

# AIS_Resample(data, labels, 20, 5, randomForest, 5, 'f1')

x = AIS_Resample(data, labels, 20, 10, randomForest, 5, 'f1')

old_score: 0.8706687866580755
population_score: 0.877491266054345
old_score: 0.8706687866580755
population_score: 0.8832231192587635
old_score: 0.8706687866580755
population_score: 0.8694760080804285
old_score: 0.8706687866580755
population_score: 0.8823306071730219
old_score: 0.8706687866580755
population_score: 0.8844627216565139
old_score: 0.8706687866580755
population_score: 0.8703288631769308
old_score: 0.8706687866580755
population_score: 0.8742719687862461
old_score: 0.8706687866580755
population_score: 0.9044573153926487
old_score: 0.8706687866580755
population_score: 0.886414671518079
old_score: 0.8706687866580755
population_score: 0.8669301833335605


Testing datasets with no oversampling

In [15]:
#df = pd.read_csv("./Data/GeneratedSyntheticData-NosiyHard.csv",index_col=0)
df= pd.read_csv("./Data/GeneratedSyntheticData-NosieLessInformativeHard.csv") 

minority = df[df['5']==1] 
majority = df[df['5']==0]

joe, mama = separate_df(df)
score1 = fitness(randomForest, joe, mama.values.ravel(), 5, 'f1_macro')
score1

TypeError: separate_df() missing 1 required positional argument: 'label_col'

Testing new fitness functions

In [16]:
df= pd.read_csv("./Data/GeneratedSyntheticData-NosieLessInformativeHard.csv",index_col=0)

randomForest = RandomForestClassifier()

minority = df[df['5']==1] 
majority = df[df['5']==0]


initial_population, bounds = Creation(minority,120,['5'], weightingFunction='uniform')

antibody_population = mutatePopulation(initial_population,bounds,['5'],1.0)


current_gen, current_labels = separate_df(initial_population)

next_gen, next_labels = separate_df(antibody_population)

original_feat, original_label = separate_df(df)



#x = fitness2(randomForest, original_feat, original_label, next_gen, next_labels, "not_implemented_yet")

scores = fitnessCV(randomForest, original_feat, original_label, next_gen, next_labels, 'f1', 5)
scores


TypeError: separate_df() missing 1 required positional argument: 'label_col'

Trying out LOF

In [17]:
df= pd.read_csv("./Data/GeneratedSyntheticData-NosieLessInformativeHard.csv",index_col=0) 

size = len(df.index)
minority = df[df['5']==1] 
majority = df[df['5']==0]


initial_population, bounds = Creation(minority,120,['5'], weightingFunction='uniform')

antibody_population = mutatePopulation(initial_population,bounds,['5'],1.0)

current_df = pd.concat([df,initial_population],ignore_index=True)
current_gen, current_labels = separate_df(current_df)


next_df = pd.concat([df,antibody_population],ignore_index=True)
next_gen, next_labels = separate_df(next_df)

clf = LocalOutlierFactor(n_neighbors=20)

y_pred = clf.fit_predict(next_df)
#n_errors = (y_pred != ground_truth).sum()
X_scores = clf.negative_outlier_factor_

outliers = (X_scores < -2).sum()
outliers
next_df["lof"]=X_scores

#use size of original df

antibody_population["lof"] = X_scores[size:]
#print(antibody_population)
#print(next_df[300:])
#sorted_df = next_df.sort_values(by = ['lof'], ignore_index=True)
#sorted_df
sorted_pop = antibody_population.sort_values(by = ['lof'], ignore_index=True)
sorted_pop



TypeError: separate_df() missing 1 required positional argument: 'label_col'

TEST LOF


In [None]:
df= pd.read_csv("./Data/GeneratedSyntheticData-NosieLessInformativeHard.csv",index_col=0) 

size = len(df.index)
minority = df[df['5']==1] 
majority = df[df['5']==0]

initial_population, bounds = Creation(minority,120,['5'], weightingFunction='uniform')

antibody_population = mutatePopulation(initial_population,bounds,['5'],1.0)

current_df = pd.concat([df,initial_population],ignore_index=True)
current_gen, current_labels = separate_df(current_df,'5')

next_df = pd.concat([df,antibody_population],ignore_index=True)
next_gen, next_labels = separate_df(next_df,'5')

original_gen, original_labels = separate_df(df,'5')

previous = lof(df, initial_population)
randomForest = RandomForestClassifier()
rizz = get_best_population(df, original_gen, original_labels, antibody_population, previous, original_labels.columns, randomForest, 5, 'f1' )
rizz
    

(          0       1       2       3       4  5
 0   -0.8432 -3.5282 -0.0501  2.5187 -0.5025  1
 1    1.2280  2.2951  3.2678 -3.2986  0.8151  1
 2    1.7260 -2.4066  3.8643 -3.2133  1.7390  1
 3    1.4017 -0.2919  4.6156 -2.6526 -0.3847  1
 4    0.4966 -1.4200  1.1175  2.0849  0.1040  1
 ..      ...     ...     ...     ...     ... ..
 115 -3.2798  0.7419  2.7816  0.9615  1.1917  1
 116 -2.3698  1.6707  0.8360  1.0882  1.1563  1
 117 -3.2923 -0.9204  2.7438  0.7083  0.7086  1
 118 -1.0064 -0.4529  1.5155 -3.1937  1.4584  1
 119 -3.0842 -0.1180  0.2250 -1.6448  1.1125  1
 
 [120 rows x 6 columns],
 0.8754321352496983)