In [None]:
#Load Libraries
# Warning configuration for the whole notebook.
# All warnings are silenced unless the interpreter was started with explicit
# -W options (in which case sys.warnoptions is non-empty and is respected).
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")

# FutureWarning is silenced unconditionally, even when -W options were given.
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pickle
import pandas as pd 
import statistics as st
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import plotly.graph_objects as go
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import SMOTE
import collections
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from scipy.io import arff 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

from imblearn.pipeline import Pipeline, make_pipeline
import torch
from torch import nn
from torch.utils.data import TensorDataset, Dataset, DataLoader, RandomSampler, SequentialSampler, WeightedRandomSampler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, roc_auc_score



In [None]:
# Run this cell to mount your Google Drive.
# Colab-only: makes the Drive contents available under /content/drive, which
# is the root of every absolute data path used by the cells below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#functions to read in and clean data
def labelencode(data):
  """Encode the target column of ``data`` as integer class ids.

  The target is taken positionally: it is the LAST column of the frame.
  A fresh ``LabelEncoder`` is fitted on that column, so the classes are
  mapped to ``0 .. n_classes - 1`` (0 and 1 for a binary target).

  Parameters
  ----------
  data : pandas.DataFrame
      Dataset whose last column holds the class labels.

  Returns
  -------
  numpy.ndarray
      One encoded label per row of ``data``.
  """
  # NOTE: the original cell mixed 2- and 4-space indentation between the
  # docstring and the body, which is an IndentationError; the body is now
  # indented consistently with the rest of the notebook (2 spaces).
  labelencoder = LabelEncoder()
  Y = data.iloc[:, -1]
  return labelencoder.fit_transform(Y)
def get_data(list_of_files):
  """Load every CSV in ``list_of_files`` and split it into features / labels.

  For each file the last column is encoded with ``labelencode`` and the
  remaining columns are reduced to their numeric subset.

  Parameters
  ----------
  list_of_files : iterable of str
      Paths of the CSV files to read, in the order the results are returned.

  Returns
  -------
  tuple (X_ls, Y_ls)
      ``X_ls[k]`` is the numeric feature frame of the k-th file and
      ``Y_ls[k]`` the matching encoded label vector.
  """
  features_per_file = []
  labels_per_file = []
  for path in list_of_files:
    frame = pd.read_csv(path)
    labels_per_file.append(labelencode(frame))
    # Everything except the last (label) column; non-numeric columns are
    # dropped. NOTE(review): _get_numeric_data is a private pandas helper.
    numeric_features = frame.iloc[:, :-1]._get_numeric_data()
    features_per_file.append(numeric_features)
  return features_per_file, labels_per_file


In [None]:
# Classification Models

def random_forest(data, labels):
  """Mean 5-fold cross-validated F1 score of a depth-3 random forest.

  Parameters
  ----------
  data : array-like
      Feature matrix (X).
  labels : array-like
      Class labels (Y); scored with scikit-learn's ``'f1'`` scorer.

  Returns
  -------
  float
      Average of the five per-fold F1 scores.
  """
  forest = RandomForestClassifier(max_depth=3, random_state=0)
  fold_scores = cross_val_score(forest, data, labels, cv=5, scoring='f1')
  return np.mean(fold_scores)
  
def rf_smote(x_train, y_train):
  """Mean 5-fold cross-validated F1 score of SMOTE + depth-3 random forest.

  SMOTE is a step of the (imblearn) pipeline handed to ``cross_val_score``,
  so oversampling is part of each fold's fit rather than being applied to
  the full dataset up front.

  Parameters
  ----------
  x_train : array-like
      Feature matrix.
  y_train : array-like
      Class labels.

  Returns
  -------
  float
      Average of the five per-fold F1 scores.
  """
  oversampler = SMOTE(random_state=42)
  forest = RandomForestClassifier(max_depth=3, random_state=0)
  imba_pipeline = make_pipeline(oversampler, forest)
  fold_scores = cross_val_score(imba_pipeline, x_train, y_train, scoring='f1', cv=5)
  return np.mean(fold_scores)
  
def log_reg(max_iter, X, Y):
  """Mean 5-fold cross-validated F1 score of a liblinear logistic regression.

  Parameters
  ----------
  max_iter : int
      Iteration cap passed to ``LogisticRegression``.
  X : array-like
      Feature matrix.
  Y : array-like
      Class labels.

  Returns
  -------
  float
      Average of the five per-fold F1 scores.
  """
  classifier = LogisticRegression(max_iter=max_iter, solver="liblinear", random_state=0)
  fold_scores = cross_val_score(classifier, X, Y, cv=5, scoring='f1')
  return np.mean(fold_scores)
  
def lr_smote(x_train, y_train):
  """Mean 5-fold cross-validated F1 score of SMOTE + logistic regression.

  The classifier is a liblinear ``LogisticRegression`` capped at 500
  iterations; SMOTE is a pipeline step, so it is fitted as part of each
  cross-validation fold.

  Parameters
  ----------
  x_train : array-like
      Feature matrix.
  y_train : array-like
      Class labels.

  Returns
  -------
  float
      Average of the five per-fold F1 scores.
  """
  oversampler = SMOTE(random_state=42)
  classifier = LogisticRegression(max_iter=500, solver="liblinear", random_state=0)
  imba_pipeline = make_pipeline(oversampler, classifier)
  fold_scores = cross_val_score(imba_pipeline, x_train, y_train, scoring='f1', cv=5)
  return np.mean(fold_scores)


In [None]:
#Automating the feature transformations

def feature_trans(y, data, features_list, perc,transformation, imb):
  """Label each feature by whether a transformation improves the model.

  A baseline F1 score is computed on ``data``. Then, for every entry of
  ``features_list``, the chosen transformation is applied to that entry and
  the model is re-scored. An entry is a *positive* sample when the score
  (rounded to 3 decimals) improves on the rounded baseline by at least
  ``perc``; otherwise it is a *negative* sample.

  Parameters
  ----------
  y : array-like
      Class labels of ``data``.
  data : pandas.DataFrame
      Feature matrix the baseline model is trained on.
  features_list : list of list
      Each entry is itself a list of column names (e.g. ``[[col], ...]``);
      it is passed as-is to the transformation helper.
  perc : float
      Minimum score improvement required for a positive sample.
  transformation : str
      One of ``'sqr'``, ``'square'``, ``'freq'``, ``'log'``, ``'sig'``,
      dispatched to ``square_root``, ``square``, ``freq``, ``log`` and
      ``sigmoid`` respectively (helpers defined elsewhere in the notebook).
  imb : str
      ``'sm'`` scores with ``lr_smote`` (SMOTE + logistic regression); any
      other value scores with plain ``log_reg``.

  Returns
  -------
  tuple (positive_samples, negative_samples)
      Two flat lists of column names.

  Raises
  ------
  RuntimeError
      If ``transformation`` is not one of the supported names (raised when
      the first feature is processed).
  """
  def _score(features):
    # Bug fix: log_reg's signature is (max_iter, X, Y); the original calls
    # omitted max_iter and raised TypeError. 500 mirrors the iteration cap
    # that lr_smote hard-codes, so both scoring paths are comparable.
    if imb == 'sm':
      return lr_smote(features, y)
    return log_reg(500, features, y)

  pos_samples = []
  neg_samples = []

  original_score = _score(data)

  for col in features_list:
    if transformation == 'sqr':
      x = square_root(data, col)
    elif transformation == 'square':
      x = square(data, col)
    elif transformation == 'freq':
      x = freq(data, col)
    elif transformation == 'log':
      x = log(data, col)
    elif transformation == 'sig':
      x = sigmoid(data, col)
    else:
      raise RuntimeError('No specified transformation!')

    new_score = _score(x)

    # Both scores are rounded before comparing, as in the original.
    if round(new_score, 3) - round(original_score, 3) >= perc:
      pos_samples.append(col)
    else:
      neg_samples.append(col)

  # Each entry of features_list is a list of names: flatten one level.
  positive_samples = [name for group in pos_samples for name in group]
  negative_samples = [name for group in neg_samples for name in group]

  return(positive_samples, negative_samples)

In [None]:
#Percentile Binning
def make_seq(inc):
  """Return the increasing sequence ``inc, 2*inc, ...`` capped at 100.

  The running value is rounded to 2 decimals after every step (so the
  accumulation does not drift), and generation stops once the next value
  would exceed 100. For an ``inc`` that divides 100 the last element is
  exactly 100; e.g. ``make_seq(0.5)`` yields the 200 values 0.5 .. 100.0.

  Parameters
  ----------
  inc : int or float
      Step between consecutive values; must be positive and large enough to
      survive the 2-decimal rounding.

  Returns
  -------
  list
      The generated values, all within (0, 100].

  Raises
  ------
  ValueError
      If a step does not advance the sequence (``inc <= 0`` or an ``inc``
      that is rounded away); the original looped forever in that case.
  """
  end = 100
  seq = []
  current = 0

  while current < end:
    nxt = round(current + inc, 2)
    if nxt <= current:
      raise ValueError("inc must advance the sequence after rounding to 2 decimals, got %r" % (inc,))
    if nxt > end:
      # Never emit a value above 100: it would not be a valid percentile.
      break
    seq.append(nxt)
    current = nxt
  return(seq)

def binning_quantile(features_list, data, y, class_, bins):
  """Build a fixed-size percentile representation for each listed feature.

  For every feature the values are split by class (``y == 1`` and
  ``y == 0``), each half is min-max scaled to [-10, 10] independently, and
  the percentiles returned by ``make_seq(0.5)`` are computed for each half.
  The positive-class percentiles followed by the negative-class percentiles
  form one row of the output.

  Parameters
  ----------
  features_list : iterable
      Column names of ``data`` to encode.
  data : pandas.DataFrame
      Dataset holding the features.
  y : array-like
      Class label (0/1) of every row of ``data``, matched by position.
  class_ : int
      Label attached to every produced row (1 = positive sample,
      0 = negative sample in the calling cells).
  bins : int
      Unused; kept for signature parity with ``binning_normalized``. The
      percentile grid is fixed by ``make_seq(0.5)``.

  Returns
  -------
  tuple (training_data, labels)
      ``training_data`` has one row per feature; ``labels`` repeats
      ``class_`` once per feature. Both are empty arrays when
      ``features_list`` is empty.
  """
  # NOTE: the original cell indented the docstring deeper than the body,
  # which is an IndentationError; indentation is now consistent.
  b = {"bin_edges": [], "class": []}
  quantiles = make_seq(0.5)
  # Positional view of the labels, so a pandas Series with a non-default
  # index is not accidentally looked up by label.
  y_arr = np.asarray(y)

  for feature in features_list:
    values = np.asarray(data[feature])

    halves = []
    for class_value in (1, 0):  # positive half first, then negative half
      column = values[y_arr == class_value].reshape(-1, 1)
      scaler = MinMaxScaler(feature_range=(-10, 10))
      scaled = scaler.fit(column).transform(column)
      # One vectorised call replaces the per-percentile loop.
      halves.append(np.percentile(scaled, quantiles))

    b["bin_edges"].append(np.concatenate(halves))
    b["class"].append(class_)

  training_data = np.array(b["bin_edges"])
  labels = np.array(b["class"])

  return(training_data, labels)

In [None]:
#Quantile Sketch Array
def binning_norm(data, b):
  """Return the fraction of values that fall into each of ``b`` uniform bins.

  The bin edges come from a ``KBinsDiscretizer`` (uniform strategy) fitted on
  ``data``. Every bin is half-open ``[lower, upper)`` except the last one,
  which also includes its upper edge.

  Parameters
  ----------
  data : array-like
      Values of a single feature (a list, a 1-D array or an ``(n, 1)``
      column).
  b : int
      Number of bins requested from the discretizer.

  Returns
  -------
  list of float
      Per-bin count divided by the number of values.

  Notes
  -----
  The first bin now starts at the fitted first edge; the original started it
  at a hard-coded -10, which silently dropped values below -10.
  NOTE(review): for a constant feature scikit-learn warns ("Feature 0 is
  constant ...") and presumably returns fewer than ``b`` bins, so the result
  can be shorter than ``b`` - confirm how callers should handle that row.
  """
  values = np.asarray(data, dtype=float).ravel()

  model = KBinsDiscretizer(n_bins=b, encode='ordinal', strategy='uniform')
  model.fit(values.reshape(-1, 1))

  edges = model.bin_edges_[0]
  lower_edges = edges[:-1]
  upper_edges = edges[1:]
  last = len(upper_edges) - 1

  bin_values = []
  for idx, (lower, upper) in enumerate(zip(lower_edges, upper_edges)):
    if idx == last:
      in_bin = (values >= lower) & (values <= upper)
    else:
      in_bin = (values >= lower) & (values < upper)
    # Normalise the count by the total number of values.
    bin_values.append(int(np.count_nonzero(in_bin)) / len(values))
  return(bin_values)
    
def binning_normalized(features_list, data, y, class_, bins):
  """Build a fixed-size normalised-histogram representation per feature.

  For every feature the values are split by class (``y == 1`` and
  ``y == 0``), each half is min-max scaled to [-10, 10] independently and
  summarised with ``binning_norm(half, bins)``. The positive-class bins
  followed by the negative-class bins form one row of the output.

  Parameters
  ----------
  features_list : iterable
      Column names of ``data`` to encode.
  data : pandas.DataFrame
      Dataset holding the features.
  y : array-like
      Class label (0/1) of every row of ``data``, matched by position.
  class_ : int
      Label attached to every produced row.
  bins : int
      Number of bins requested for each class half.

  Returns
  -------
  tuple (training_data, labels)
      ``training_data`` has one row per feature; ``labels`` repeats
      ``class_`` once per feature. Both are empty arrays when
      ``features_list`` is empty.
  """
  b = {"bin_edges": [], "class": []}
  # Positional view of the labels, so a pandas Series with a non-default
  # index is not accidentally looked up by label.
  y_arr = np.asarray(y)

  for feature in features_list:
    values = np.asarray(data[feature])

    bin_all = []
    for class_value in (1, 0):  # positive half first, then negative half
      column = values[y_arr == class_value].reshape(-1, 1)
      scaler = MinMaxScaler(feature_range=(-10, 10))
      scaled = scaler.fit(column).transform(column)
      bin_all.extend(binning_norm(scaled, bins))

    b["bin_edges"].append(bin_all)
    b["class"].append(class_)

  training_data = np.array(b["bin_edges"])
  labels = np.array(b["class"])

  return(training_data, labels)

In [None]:
# Read in the datasets from the mounted Drive.
# NOTE: the name `files` is rebound later by `from google.colab import files`,
# so this list of paths is only valid until one of those cells runs.
# NOTE(review): glob.glob does not guarantee an ordering, so the order of
# X_ls / Y_ls may differ between runs - confirm nothing depends on it.
files = glob.glob('/content/drive/My Drive/Auto-AI-2019-20/New cleaned classification dataset/*.csv')   # list of CSV file paths
X_ls, Y_ls = get_data(files)

In [None]:
# with open('new_x_ls.pickle', 'wb') as handle:
#     pickle.dump(new_x_ls, handle, protocol=pickle.HIGHEST_PROTOCOL)
# with open('new_y_ls.pickle', 'wb') as handle:
#     pickle.dump(new_y_ls, handle, protocol=pickle.HIGHEST_PROTOCOL)

# from google.colab import files
# files.download('new_x_ls.pickle') 
# files.download('new_y_ls.pickle') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Load the previously saved feature frames (new_x_ls) and label vectors
# (new_y_ls) from Drive; the cells below iterate over them in parallel.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('/content/drive/My Drive/Auto-AI-2019-20/mira/new_x_ls.pickle', 'rb') as handle:
  new_x_ls = pickle.load(handle)
with open('/content/drive/My Drive/Auto-AI-2019-20/mira/new_y_ls.pickle', 'rb') as handle:
  new_y_ls = pickle.load(handle)

In [None]:
# Label every feature of every dataset as a positive or negative sample for
# the "freq" transformation (improvement threshold 0.01).
pos_list, neg_list = [], []
i = 1  # 1-based counter, used only in the progress message

for dataset, label in zip(new_x_ls, new_y_ls):
  print("Training dataset " + str(i) + "...")

  scaled_dataset = scale(dataset)

  # feature_trans expects each feature wrapped in its own list.
  features = [[col] for col in scaled_dataset.columns]

  # Share of the minority class decides whether SMOTE is used.
  count_0 = sum(1 for y in label if y == 0)
  count_1 = sum(1 for y in label if y == 1)
  min_count = min(count_0, count_1)

  positive_samples, negative_samples = feature_trans(
      label, scaled_dataset, features, 0.01, "freq",
      "sm" if min_count/len(label) < 0.4 else "no")

  pos_list.append(positive_samples)
  neg_list.append(negative_samples)

  i += 1

In [None]:
# Persist the per-dataset positive / negative feature-name lists locally.
# NOTE(review): the file names say "rf", but the lists above are produced by
# feature_trans, which scores with logistic regression (log_reg / lr_smote) -
# confirm the naming is intentional.
import pickle
with open('pos_rf_freq.pickle', 'wb') as handle:
    pickle.dump(pos_list, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('neg_rf_freq.pickle', 'wb') as handle:
    pickle.dump(neg_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Colab-only: trigger a browser download of the two pickles.
# This import rebinds the global name `files` (previously the list of CSV paths).
from google.colab import files
files.download('pos_rf_freq.pickle') 
files.download('neg_rf_freq.pickle') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the positive / negative feature-name lists from Drive, replacing any
# pos_list / neg_list computed in this session.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('/content/drive/My Drive/Auto-AI-2019-20/mira/pos_rf_freq.pickle', 'rb') as handle:
  pos_list = pickle.load(handle)
with open('/content/drive/My Drive/Auto-AI-2019-20/mira/neg_rf_freq.pickle', 'rb') as handle:
  neg_list = pickle.load(handle)

In [None]:
# Build the quantile-sketch-array training set: one row per labelled feature,
# produced by binning_normalized with 200 bins per class half.
training_data = []
training_labels = []

for i, (dataset, label) in enumerate(zip(new_x_ls,new_y_ls)):
  has_pos = len(pos_list[i]) != 0
  has_neg = len(neg_list[i]) != 0

  if has_pos:
    pos_training, pos_labels = binning_normalized(pos_list[i], dataset, label, 1, 200)
  # The negative list is also processed when the positive list is empty,
  # even if it is empty itself (it then contributes no rows).
  if has_neg or not has_pos:
    neg_training, neg_labels = binning_normalized(neg_list[i], dataset, label, 0, 200)

  # Positive rows are appended before negative rows of the same dataset.
  if has_pos:
    training_data.extend(pos_training)
    training_labels.extend(pos_labels)
  if has_neg or not has_pos:
    training_data.extend(neg_training)
    training_labels.extend(neg_labels)


Feature 0 is constant and will be replaced with 0.



In [None]:
# Persist the quantile-sketch-array training set built by the
# binning_normalized loop.
# Bug fix: the original dumped `training` (not defined until it is loaded
# from Drive in a later cell) and `label` (the leftover loop variable, i.e.
# the labels of the last dataset only). The objects to save are
# training_data / training_labels, as in the percentile-binning save cell.
import pickle
with open('train_rf_freq_qsa.pickle', 'wb') as handle:
    pickle.dump(training_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('label_rf_freq_qsa.pickle', 'wb') as handle:
    pickle.dump(training_labels, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Colab-only: trigger a browser download of the two pickles.
from google.colab import files
files.download('train_rf_freq_qsa.pickle')
files.download('label_rf_freq_qsa.pickle')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the quantile-sketch-array training rows and labels from Drive.
# NOTE: `label` is also the loop variable of the dataset loops in this
# notebook, so running one of those loops afterwards overwrites this value.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('/content/drive/My Drive/Auto-AI-2019-20/mira/train_rf_freq_qsa.pickle', 'rb') as handle:
  training = pickle.load(handle)
with open('/content/drive/My Drive/Auto-AI-2019-20/mira/label_rf_freq_qsa.pickle', 'rb') as handle:
  label = pickle.load(handle)

In [None]:
# Remove entry 594 from the training rows and from the labels so the two
# stay aligned.
# NOTE(review): the index is hard-coded and the reason is not recorded here;
# presumably it is the row built from the constant feature reported by the
# "Feature 0 is constant" warning above - confirm.
# NOTE: if this cell is run twice it deletes a second, unrelated entry.
del training[594]
del label[594]

In [None]:
# Build the percentile-binning training set: one row per labelled feature,
# produced by binning_quantile.
training_data = []
training_labels = []

for i, (dataset, label) in enumerate(zip(new_x_ls,new_y_ls)):
  has_pos = len(pos_list[i]) != 0
  has_neg = len(neg_list[i]) != 0

  if has_pos:
    pos_training, pos_labels = binning_quantile(pos_list[i], dataset, label, 1, 200)
  # The negative list is also processed when the positive list is empty,
  # even if it is empty itself (it then contributes no rows).
  if has_neg or not has_pos:
    neg_training, neg_labels = binning_quantile(neg_list[i], dataset, label, 0, 200)

  # Positive rows are appended before negative rows of the same dataset.
  if has_pos:
    training_data.extend(pos_training)
    training_labels.extend(pos_labels)
  if has_neg or not has_pos:
    training_data.extend(neg_training)
    training_labels.extend(neg_labels)

In [None]:
# Persist the percentile-binning training set (rows and labels built by the
# binning_quantile loop) locally.
with open('train_rf_freq_perc.pickle', 'wb') as handle:
    pickle.dump(training_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('label_rf_freq_perc.pickle', 'wb') as handle:
    pickle.dump(training_labels, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Colab-only: trigger a browser download of the two pickles.
from google.colab import files
files.download('train_rf_freq_perc.pickle') 
files.download('label_rf_freq_perc.pickle') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>