In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import (LabelEncoder, KBinsDiscretizer, StandardScaler, MinMaxScaler)
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.utils import np_utils
from keras.layers.core import Dense, Activation, Dropout
from keras.optimizers import Adam, Nadam
from sklearn.utils import class_weight
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from keras.callbacks import ModelCheckpoint
import imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.metrics import confusion_matrix
import sklearn
import tensorflow as tf
from google.colab import files
import pickle
from sklearn.linear_model import (LogisticRegression, LogisticRegressionCV)
from sklearn.datasets import make_classification
from sklearn.ensemble import (GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor)
from imblearn.over_sampling import SMOTE
import collections
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
import itertools
import random
from random import sample
from imblearn.pipeline import Pipeline, make_pipeline

import warnings
import sys

# Silence every warning category for the rest of the notebook, unless the
# interpreter was started with explicit -W options (sys.warnoptions is then
# non-empty and those options are left in charge).
if not sys.warnoptions:
    warnings.simplefilter("ignore")

# Always hide FutureWarning, even when the blanket filter above was skipped.
warnings.simplefilter(action='ignore', category=FutureWarning)



In [None]:
# Mount Google Drive at /content/drive; the test-dataset paths used later in
# this notebook are located under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
def labelencode(data):
    """Encode the last column of `data` as integer class labels.

    A fresh LabelEncoder is fitted on that column on every call, so the
    mapping from original values to integers is specific to this frame.
    Returns the encoded labels produced by `fit_transform`.
    """
    target_column = data.iloc[:, -1]
    encoder = LabelEncoder()
    return encoder.fit_transform(target_column)

In [None]:
import glob

# Collect the test CSV files. glob returns matches in a file-system-dependent
# order, so the list is sorted to keep test_files / test_X_ls / test_Y_ls in a
# stable, reproducible order across runs.
test_files = sorted(glob.glob('/content/drive/My Drive/TESTING DATASETS/clas_testing_datasets/*.csv'))

# test_X_ls[i] / test_Y_ls[i] hold the features / encoded labels of test_files[i].
test_X_ls = []
test_Y_ls = []
for filename in test_files:
    df = pd.read_csv(filename)
    # The last column is treated as the class label.
    Y = labelencode(df)
    X = df.iloc[:,:-1]
    # Keep numeric feature columns only.
    # NOTE(review): _get_numeric_data is a private pandas method.
    X = X._get_numeric_data()
    test_Y_ls.append(Y)
    test_X_ls.append(X)

In [None]:
# prepare test data

def scale(X, y):
  '''
  Min-max scale every column of X (MinMaxScaler with default settings).

  Returns a pair:
    - a new DataFrame with the scaled features, same column names as X and a
      fresh default index;
    - a copy of that DataFrame with the labels y added as column 'class'.
  '''
  scaler = preprocessing.MinMaxScaler()
  features = pd.DataFrame(data = scaler.fit_transform(X), columns = X.columns)

  labelled = features.copy()
  labelled['class'] = y

  return features, labelled

def make_seq(inc):
  '''
  Return the increasing sequence inc, 2*inc, 3*inc, ... up to 100.

  Values are rounded to 2 decimals and never exceed 100: the final element is
  capped at 100 so the result is always a valid list of percentile ranks.
  Each value is computed as i * inc rather than by repeated addition, so
  rounding errors do not accumulate and make_seq(100 / b) always has exactly
  b elements ending in 100.

  Raises ValueError if inc is not positive.
  '''
  if inc <= 0:
    raise ValueError('inc must be positive, got %r' % (inc,))

  end = 100
  # Number of steps needed to reach `end`; the small tolerance absorbs
  # floating-point noise when end / inc is an integer up to rounding.
  steps = max(1, int(np.ceil(end / inc - 1e-9)))

  return [min(round(i * inc, 2), end) for i in range(1, steps + 1)]

def percentiles(x, b):
  '''
  Return the percentiles of x at the ranks produced by make_seq(100 / b),
  i.e. at steps of 100 / b percent.
  '''
  step = 100/b
  ranks = make_seq(step)
  return np.percentile(x, ranks)

def qsa(x, lb, ub, b):
  '''
  Return a list of length b representing feature x: the fraction of values of
  x that fall into each of b equal-width bins spanning [lb, ub].

  Bin k covers [lb + k*width, lb + (k+1)*width); the value ub itself is counted
  in the last bin. Values outside [lb, ub] (for example a value that lands a
  rounding error below lb after rescaling) are counted in the nearest edge bin
  instead of wrapping around to the other end or raising IndexError.

  An empty x yields a list of b zeros.
  Raises ValueError if b < 1, if ub <= lb, or if x contains NaN/inf.
  '''
  if b < 1:
    raise ValueError('b must be at least 1, got %r' % (b,))
  if ub <= lb:
    raise ValueError('ub must be greater than lb')

  width = (ub - lb) / b
  values = np.asarray(x, dtype=float).ravel()

  if values.size == 0:
    return [0.0] * b
  if not np.isfinite(values).all():
    raise ValueError('x contains NaN or infinite values')

  # Same floor division as the per-element formula, applied to the whole
  # array; clipping keeps every index inside [0, b-1].
  bin_idx = np.floor_divide(values - lb, width)
  bin_idx = np.clip(bin_idx, 0, b - 1).astype(int)

  counts = np.bincount(bin_idx, minlength=b)
  return (counts / values.size).tolist()

def bin_edge(X, scaled_data, lb, ub, b, qsa_):
  '''
  Return a representation of features in a dataset: one row per unordered pair
  of columns of X.

  scaled_data must contain the columns of X plus a 'class' column; rows with
  class 1 and class 0 are rescaled separately to the range (lb, ub). Each
  column is then summarised per class, either with qsa (when qsa_ is truthy:
  b-bin histogram over [lb, ub]) or with percentiles (b quantile values).

  The row for the pair (c0, c1) is the concatenation
    summary(pos, c0) + summary(neg, c0) + summary(pos, c1) + summary(neg, c1).

  A column's summary does not depend on the pair it appears in, so it is
  computed once per column and reused for every pair.
  '''
  col_names = X.columns

  def rescale(rows):
    # Scale the given class's rows to (lb, ub), column by column.
    scaled = MinMaxScaler(feature_range=(lb, ub)).fit_transform(rows[col_names])
    return pd.DataFrame(data = scaled, columns = col_names)

  pos_scaler = rescale(scaled_data[scaled_data['class'] == 1])
  neg_scaler = rescale(scaled_data[scaled_data['class'] == 0])

  def summarise(values):
    if qsa_:
      return qsa(values, lb, ub, b)
    # quantile
    return percentiles(values, b)

  combos = list(itertools.combinations(col_names,2))

  # Per-column (positive, negative) summaries; skipped when there is no pair
  # to build, so nothing is computed that would not be used.
  summaries = {}
  if combos:
    for col in col_names:
      summaries[col] = (summarise(pos_scaler[col]), summarise(neg_scaler[col]))

  training_data = []

  for combo in combos:
    bin_all = []
    for col in combo:
      pos_summary, neg_summary = summaries[col]
      bin_all.extend(pos_summary)
      bin_all.extend(neg_summary)
    training_data.append(bin_all)

  return training_data

def create_testdata(X, Y, bins, qsa_):
  '''
  Return test data to be fed into mlps: one feature row per pair of columns
  of X, as built by bin_edge.

  X is min-max scaled first (see scale), then each class is rescaled to the
  fixed range (-10, 10) inside bin_edge.

  bins and qsa_ are forwarded to bin_edge (number of bins/quantiles per
  column and class, and whether to use the qsa histogram or percentiles).
  They must match the settings the models were trained with; for models
  trained on 200-bin qsa features pass bins=200, qsa_=True.
  '''
  X, scaled_data = scale(X, Y)
  return list(bin_edge(X, scaled_data, -10, 10, bins, qsa_))

# Output recommendation results

### Output all transformations per dataset

In [None]:
# recommend all
def recommend(path, b, X, Y, qsa_):
  '''
  Recommend transformations for every pair of columns of one dataset.

  path: glob pattern matching the saved model files; the model name is the
        file name without its extension.
  b, qsa_: forwarded to create_testdata to build one feature row per column
        pair from X and the labels Y.

  Every model scores every pair (column 1 of its prediction, the score for the
  positive label). For each pair the names of the models scoring >= 0.5 are
  listed, best score first, and printed.

  Returns (dict_, cnt): dict_ maps each pair (col_a, col_b) to its list of
  recommended model names; cnt is the number of pairs with a non-empty list.
  '''
  # iterate over all models
  models_ls = glob.glob(path)

  colnames = X.columns
  combos = list(itertools.combinations(colnames,2))

  if not combos:
    # Fewer than two columns: there is no pair to score, so skip model loading.
    print('No transformation is recommended for this dataset!!!')
    return {}, 0

  # for one test data; built once and shared by all models
  test_data = np.array(create_testdata(X, Y, b, qsa_))

  row = []
  scores = []

  for m in models_ls:
    # file name without directory and without extension
    row.append(m.split('/')[-1].rsplit('.', 1)[0])
    # public Keras loading API (tf.python.* is a private module path)
    model = tf.keras.models.load_model(m)
    pred = model.predict(test_data, verbose=1)
    # the score for positive label
    scores.append(pred[:,1])

  # rows: models, columns: column pairs
  result = pd.DataFrame(scores, columns = combos, index = row)

  dict_ = {}

  cnt = 0

  for combo in combos:
    sorted_result = result.sort_values(axis = 0, by = combo, ascending = False)
    rcm_trans = sorted_result[sorted_result[combo] >= 0.5].index.tolist()
    dict_[combo] = rcm_trans
    print(combo,': ',rcm_trans)
    if len(rcm_trans) > 0:
      cnt += 1

  if cnt == 0:
    print('No transformation is recommended for this dataset!!!')

  return dict_, cnt

In [None]:
# classifiers
def lr_smote(x_train, y_train):
  """Mean 5-fold cross-validated F1 of a SMOTE + logistic regression pipeline.

  SMOTE (random_state=42) is part of the pipeline passed to cross_val_score;
  the classifier is a liblinear LogisticRegression (max_iter=500, random_state=0).
  """
  classifier = LogisticRegression(max_iter=500, solver="liblinear", random_state=0)
  pipeline = make_pipeline(SMOTE(random_state=42), classifier)
  fold_scores = cross_val_score(pipeline, x_train, y_train, scoring='f1', cv=5)
  return np.mean(fold_scores)
def random_forest(data, labels):
  """Mean 5-fold cross-validated F1 of a depth-3 random forest (random_state=0)."""
  forest = RandomForestClassifier(max_depth=3, random_state=0)
  fold_scores = cross_val_score(forest, data, labels, cv=5, scoring='f1')
  return np.mean(fold_scores)
  
def rf_smote(x_train, y_train):
  """Mean 5-fold cross-validated F1 of a SMOTE + random forest pipeline.

  SMOTE (random_state=42) is part of the pipeline passed to cross_val_score;
  the classifier is a depth-3 RandomForestClassifier (random_state=0).
  """
  forest = RandomForestClassifier(max_depth=3, random_state=0)
  pipeline = make_pipeline(SMOTE(random_state=42), forest)
  fold_scores = cross_val_score(pipeline, x_train, y_train, scoring='f1', cv=5)
  return np.mean(fold_scores)
  
def log_reg(max_iter, X, Y):
  """Mean 5-fold cross-validated F1 of logistic regression on features X and labels Y.

  Uses the liblinear solver with random_state=0 and the given max_iter.
  """
  classifier = LogisticRegression(max_iter=max_iter, solver="liblinear", random_state=0)
  fold_scores = cross_val_score(classifier, X, Y, cv=5, scoring='f1')
  return np.mean(fold_scores)

def smote(X,Y):
  """Applies smote to fix class imbalance.

  Returns (x, y): the resampled features as a DataFrame with the columns of X,
  and the resampled labels.
  """
  smt = SMOTE()
  # fit_sample is the old name of fit_resample and no longer exists in current
  # imbalanced-learn releases; use the new name, falling back for old installs.
  resample = getattr(smt, 'fit_resample', None) or smt.fit_sample
  x_train, y_train = resample(X, Y)
  x = pd.DataFrame(data = x_train, columns = X.columns)
  return(x, y_train)


## Transform functions

In [None]:
# binary transformation
def add(data, feature1, feature2):
  """Return a copy of data with a column 'new' = feature1 + feature2.

  The input frame is left untouched; an existing 'new' column is overwritten
  in the copy.
  """
  total = data[feature1] + data[feature2]
  return data.assign(new=total)

def multiply(data, feature1, feature2):
  """Return a copy of data with a column 'new' = feature1 * feature2.

  The input frame is left untouched; an existing 'new' column is overwritten
  in the copy.
  """
  product = data[feature1] * data[feature2]
  return data.assign(new=product)

def subtract(data, feature1, feature2):
  """Return a copy of data with a column 'new' = |feature1 - feature2|.

  The absolute difference is stored, so the order of the two features does
  not matter. The input frame is left untouched; an existing 'new' column is
  overwritten in the copy.
  """
  gap = abs(data[feature1] - data[feature2])
  return data.assign(new=gap)

# Apply transformations

In [None]:
def apply_trans(dict_, X, Y):
  '''
  Apply one recommended transformation to a dataset.

  dict_ maps a column pair (col_a, col_b) to its recommendation: either a
  single transformation name, or a list of names ordered best first (the
  format returned by recommend). Known names are 'add', 'subtract' and
  'multiply'.

  X is min-max scaled (see scale); then the first pair in dict_ that has a
  recommendation gets its best transformation applied, which adds a column
  'new'. If no pair has a recommendation, the scaled data is returned
  without a new column.

  Returns (transformed DataFrame, Y). Raises KeyError for an unknown
  transformation name.
  '''
  methods = {'add': add, 'subtract':subtract, 'multiply':multiply}

  label = Y
  test_original, _ = scale(X.copy(), label)
  test_new = test_original.copy()

  # corresponding to one transformation per dataset
  for combo, trans in dict_.items():
    # Accept both a bare name and a ranked list of names.
    candidates = [trans] if isinstance(trans, str) else list(trans)
    if not candidates:
      continue  # nothing recommended for this pair, try the next one
    test_new = methods[candidates[0]](test_new, combo[0], combo[1])
    break

  return test_new, label


In [None]:
def eval(avg, file_name, test_original, test_new, label, *args):
  '''
  Score a dataset before (test_original) and after (test_new) transformation,
  print the comparison and return it.

  avg truthy: the score is the mean of log_reg (max_iter=500), random_forest
    and gb (third argument 0.04).
  avg falsy: args[0] names the model:
    'lr' - logistic regression, 'rf' - random forest; when the smaller of the
           class-0 / class-1 counts is below 40% of the labels, the SMOTE
           variants (lr_smote / rf_smote) are used instead;
    'gb' - gb with third argument 0.02.

  NOTE(review): gb is not defined in the visible part of this notebook;
  confirm it is defined before the averaged or 'gb' modes are used.

  Returns (score_before, score_after, diff, file_name) where
  diff = score_after - score_before.
  Raises ValueError when avg is falsy and the model name is missing or unknown.
  '''
  if avg:
    def score(data):
      # random_forest takes (data, labels) only.
      return (log_reg(500, data, label)
              + random_forest(data, label)
              + gb(data, label, 0.04)) / 3
  else:
    if not args:
      raise ValueError("a model name ('lr', 'rf' or 'gb') is required when avg is False")
    model = args[0]

    def imbalanced():
      # Minority class holds less than 40% of the labels.
      count_0 = sum(1 for y in label if y == 0)
      count_1 = sum(1 for y in label if y == 1)
      return min(count_0, count_1) / len(label) < 0.4

    if model == 'lr':
      if imbalanced():
        score = lambda data: lr_smote(data, label)
      else:
        score = lambda data: log_reg(500, data, label)
    elif model == 'rf':
      if imbalanced():
        score = lambda data: rf_smote(data, label)
      else:
        score = lambda data: random_forest(data, label)
    elif model == 'gb':
      score = lambda data: gb(data, label, 0.02)
    else:
      raise ValueError("unknown model %r; expected 'lr', 'rf' or 'gb'" % (model,))

  score_before = score(test_original)
  score_after = score(test_new)

  if score_after > score_before:
    improve = ' improved'
  else:
    improve = ' not improved'

  diff = score_after - score_before

  print(file_name, ' original score: ',score_before,'; score after: ',score_after, '; difference: ', diff ,improve)

  return score_before, score_after, diff, file_name

In [None]:
# combination of transformations
def multiple_testing(path, b, n, model, qsa_):
  '''
  Run up to n rounds of recommend + apply_trans on every test dataset and
  score each dataset before and after with eval.

  Reads the notebook-level lists test_files, test_X_ls and test_Y_ls.
  path, b, qsa_ are forwarded to recommend; model is forwarded to eval
  (called with avg=False).

  For each dataset the rounds stop as soon as recommend reports no
  recommendation; features added in earlier rounds are kept. If no round
  applied a transformation, the scaled original data is scored on both sides.

  Returns (score_before_ls, score_after_ls, diff_ls, file_ls), one entry per
  dataset; file_ls holds the file names without directory and '.csv'.
  '''
  score_before_ls = []
  score_after_ls = []
  diff_ls = []
  file_ls = []

  for i in range(len(test_files)):
    file_name = test_files[i].split('/')[-1][:-4]
    X = test_X_ls[i].copy()
    Y = test_Y_ls[i].copy()
    X_original, _scaled = scale(X,Y)

    transformed = False
    for _round in range(n):
      res, cnt = recommend(path, b, X, Y, qsa_)
      if cnt == 0:
        # Nothing more to apply: further rounds would give the same answer.
        break
      X, _label = apply_trans(res, X, Y)
      transformed = True

    if not transformed:
      # Compare like with like: scaled original against scaled original.
      X = X_original

    score_before, score_after, diff, _name = eval(False, file_name, X_original, X, test_Y_ls[i], model)
    score_before_ls.append(score_before)
    score_after_ls.append(score_after)
    diff_ls.append(diff)
    file_ls.append(file_name)

  return score_before_ls, score_after_ls, diff_ls, file_ls
