In [2]:
import pandas as pd;
import random;
from statistics import fmean, stdev;

from sklearn import svm
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import DistanceMetric
from sklearn.ensemble import RandomForestClassifier

OBJECTS/VARIABLES:

Antibody - list of Numerical values (Reals?) , represented by a row of a dataframe?
Population - List[Antibodies], represented by the total dataframe?

Target Size (i.e number of antibodies to be created) = (Size of Majority Class) - (Size of minority class)



FUNCTIONS:

Initializing :- input: original dataframe

                         do: Get bounds of minority class by taking the highest and lowest values in each of the [n] dimensions

                         How: Do we take the whole minority class? or do we sample part or parts of it to generate our bounds. 

                         output: Upper and lower bounds of the minority class


Creation :- Input: Bounds of the min class

        do: Create a set of antibodies

        input: minority Dataframe 

        How:
            Possibilities:
            (As the Malhanabois paper does it) Take a random value between the bounds of the minority class feature as the datapoint

            (Nikhil Just sample minority class based on imbalance rate (doesn't require bounds)

            (Adam) Take a random value, as in the paper's method, but off of a weighted curve? as in we could randomize, 
                but add some preference for values close to the boundary, or close to the center, etc.
                        -We could set this as a parameter, bell curve, linear. This same function could be a parameter in the mutation stage.
                        -If the density within the bounds is concentrated on one side, add bias towards that side in the random value;
        
        Challenges: 
            How do we deal with the different data categories (e.g nominal, ordinal, and continuous)? Continuous is easy, just a number in a range. Ordinal is ???, nominal is difficult, even if one-hot encoded, we might random to have two values that should be exclusive (e.g an item being both blue and red). How do other imputation algorithms work with these problems? DO they even work with these problems?
        output: Initial Population as a DF?



**Initialization**

In [57]:

df = pd.read_csv("./Data/GeneratedSyntheticData-MajorImbalanceNoiseLess.csv")

columns = df.columns.to_list()
columns_drop = columns.pop(-1)

#drop NaN rows, could implement imputer as well
df.dropna(inplace=True)

labels = df.drop(columns, axis=1)

#df= df.drop("5", axis=1)
df= df.drop("Unnamed: 0", axis=1)

df

Unnamed: 0,0,1,2,3,4,5
0,-0.959988,-9.999716,9.847539,-0.555990,-0.636094,0.0
1,-0.186506,-10.844136,10.987090,-0.520116,-0.996379,0.0
2,-0.589801,-10.489529,10.047838,0.891494,0.774165,0.0
3,0.045731,-9.905474,8.408245,0.314930,2.835709,0.0
4,1.510713,-10.549865,10.039717,0.324333,1.067905,0.0
...,...,...,...,...,...,...
295,1.778231,10.844854,-9.486344,0.335167,0.015292,1.0
296,0.335803,-10.064810,7.958462,-0.760555,1.123511,0.0
297,-1.188499,-10.304653,10.862577,-0.361274,1.299642,0.0
298,-1.536921,-10.027084,11.225361,2.045722,0.720253,0.0


In [8]:

count_nan = df.isnull().sum()
count_nan


Unnamed: 0    0
0             0
1             0
2             0
3             0
4             0
dtype: int64

In [58]:
#TODO Gaussian generation can be optimized by altering how the loops work, putting the col loop on the outside and pre-generated fmean, stdev for the col
        #Currently it generates those values for every antibody
        

def get_bounds(minorityDF) -> dict:
    out = {}
    for col in minorityDF:
        colMax = df[col].max()
        colMin = df[col].min()
        out[col] = (colMin, colMax)
    return out

#This only works for continuous values. We will have to code a version for binary fields (We assume any categorical columns have been encoded)

####### Creation ################
# minorityDF - dataframe containing the minority class
# totalPopulation - The total number of antibodies to create
# weightingFunction - Can choose between uniform, triangular, ...
# mode - for use with a triangular function - set to the percentage of the range you wish to be most represented (between 0.0 and 1.0)
def Creation(minorityDF, totalPopulation : int, weightingFunction : str = "uniform", mode : float = 0.5): 
    
    if(minorityDF.isnull().values.any()):
        raise ValueError("Minority Class DataFrame contains NaN")
    
    population = [] #Initializing the empty population
    if mode < 0.0 or mode > 1:
        raise Exception("mode must be between float value between 0.0 and 1.0")
    
    if weightingFunction not in ('uniform', 'triangular', 'gaussian'):
        raise Exception("Unknown function chosen, please use one of 'uniform', 'triangular', or 'gaussian'")

    bounds = get_bounds(minorityDF)
    for i in range(totalPopulation): #For every antibody to be created

        antibody = [] #Initializing a single antibody
        if weightingFunction in ["uniform", "triangular"]: #If Generating via uniform or triangular distribution, loop through bounds of columns
            
            for key,bnd in bounds.items(): #Iterate through the columns/dimensions/features of the minority class for each antibody 
                if weightingFunction == "uniform":
                    antibody += [round(random.uniform(bnd[0],bnd[1]),4)] #Add a random value between the lower and upper bounds to the antibody

                elif (weightingFunction == "triangular"):
                    
                    tri_tip = ( ((bnd[1]-bnd[0]) * mode) + bnd[0] ) #multiplying the difference by the percentage, plus the low bound gives us the point between the two, but percentile

                    if tri_tip < bnd[0]: #Error checks to make sure that the emphasized point isn't outside the bounds
                        tri_tip = bnd[0]
                    elif tri_tip > bnd[1]:
                        tri_tip = bnd[1]

                    antibody += [round( random.triangular(bnd[0],bnd[1], tri_tip), 5)]

            population+=[antibody] #add the created antibody to the population

        elif (weightingFunction == 'gauss'): #If Generating via Gaussian, loop through columns of dataframe

            for bnd in minorityDF: #Iterate over columns in the dataframe
                values = minorityDF[bnd].tolist() #convert series to list

                antibody += [round(random.gauss(fmean(values) , stdev(values)), 5)] #using median and stdeviation of values, radomize over gauss

        
            population+=[antibody] #add the created antibody to the population

            
    popDF = pd.DataFrame(population, columns = minorityDF.columns.values)
    return popDF, bounds
    

Creation(df,10000, weightingFunction='uniform')


(           0       1        2       3       4       5
 0     2.4368  1.9177   1.6608  0.2362 -0.3091  0.5899
 1    -2.2719 -4.1058  11.3859 -0.1084  2.8513  0.8354
 2    -1.2484  8.8318   5.5858 -1.7250 -0.4610  0.2861
 3     0.6588  7.8523  -2.1024  1.0521  1.3414  0.0859
 4     2.5999  1.8873  10.7960  0.9912 -1.0562  0.1227
 ...      ...     ...      ...     ...     ...     ...
 9995 -1.5252 -4.5717  -5.7480 -0.3311  0.7952  0.2663
 9996  2.7150 -1.1466   2.6633  0.7149 -0.7419  0.2963
 9997  1.2404  1.6063   4.7684 -2.3146 -1.3115  0.9316
 9998 -1.8125  5.9054  -6.2362 -1.4305  2.6445  0.9794
 9999 -1.2711  3.2615  11.1042 -0.3715 -2.0625  0.0001
 
 [10000 rows x 6 columns],
 {'0': (-2.83587246523818, 3.3073122195178764),
  '1': (-10.844136084811906, 12.021357377821474),
  '2': (-11.191763096168955, 12.60735524166222),
  '3': (-2.746516438764028, 2.361250880746925),
  '4': (-2.538653010973764, 3.1111367939679653),
  '5': (0.0, 1.0)})

**Mutation**

In [55]:
def mutationRound (antiPopulation, bounds):
    for col in antiPopulation:

        bnd_range = bounds[col][1] - bounds[col][0] #total range of bounds is high - low

        #Setting the low and high bounds to be centered around 0
        hi_bnd = bnd_range/2 
        low_bnd = (0-bnd_range/2)

        #print("Low bound around 0 = " + str(low_bnd) +"| Hi bnd around 0 = "+ str(hi_bnd))
        #print(round(random.uniform(low_bnd,hi_bnd),4))

        antiPopulation[col] = antiPopulation[col].map(lambda x : x+round(random.uniform(low_bnd,hi_bnd),4))
        
    return antiPopulation

test = Creation(df,10000, weightingFunction='uniform')


mutationRound(test[0], test[1])

Unnamed: 0,0,1,2,3,4
0,2.4522,20.2589,2.6026,0.7352,-0.6994
1,-0.2516,-3.0509,15.8073,0.4868,-1.0507
2,-1.5835,-1.7165,-8.8795,2.3028,1.7523
3,1.3244,-4.7403,-0.2460,-0.3598,-0.0231
4,1.1640,9.3075,2.0310,-0.4171,-1.6128
...,...,...,...,...,...
9995,3.6913,3.7779,-2.3678,1.9760,-2.2025
9996,-1.2980,3.1290,-4.6459,-1.6600,3.4355
9997,1.4693,6.7726,6.4101,1.9729,-2.0304
9998,-2.7345,0.0533,-0.2604,-0.4206,-0.4365


**Fitness Function**

Requirements: Needs to be calced fast bc of multiple iterations
Posiibilities: - Binary Classification F1 Score, Mahalanobis Distance?
               - Other Types as well? : Linear Regression, Multiilabel Classification

Do we just impute our values and then do something similar to StudentPerformance and see what happens? No bc we need input from the fitness function to do our generations.

Is the data just our training set?
Inputs: Model(initialized outside function or inside?) fit with data that has been encoded and the label

Want to do kfold cv (not every loop bc very slow, once afterwards to evaluate)

when we do k fold, call fitness funciton k times i.e. once for every train test split.

if doing grid search, do it before calling this?

In [None]:
# calculates the fitness score for one train/test split dataset
# run on original dataset without random values first to be abe to compare


# def fitness(train_feat, test_feat, train_label, test_label, model):

#     model.fit(train_feat,train_label)
#     predictions = model.predict(test_feat) 

#     return f1_score(test_label, predictions, average='macro')

# def kfold_cv(n, feat, label):

#     kf = KFold(n_splits = n , random_state=None, shuffle=False)
#     for train_index, test_index in kf.split(df):
#         train_feat, test_feat = feat[train_index], feat[test_index]
#         train_label, test_label = label[train_index], label[test_index]

def fitness( model, feat, label, iterations, scorer):
    #scorer is the name of the function wee aree using to evaluate our dataset
    #it should be a function with signature scorer(model, feature, label) which should return only a single value.
    return cross_val_score(model, feat, label, cv = iterations, scoring = scorer)

def distance( x, y, metric):
    
    #get the distance between two sets of data x and y, they should be the same size
    #metric is the string metric to be used to measure distance

    dist = DistanceMetric.get_metric(metric)
    return dist.pairwise(x,y)

In [None]:



randomForest = RandomForestClassifier()
#randomForest = randomForest.fit(df,labels)
clf = svm.SVC(random_state=0)

fitness(clf, df, labels.values.ravel(), 5, 'recall_macro')


array([0.65982906, 0.65413105, 0.68304843, 0.68490028, 0.67054264])