<a href="https://colab.research.google.com/github/Arnab9Codes/LSTM-based-oversampling/blob/master/data_generation_version_1_11_27_2019_thesis_replication_done.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive at /content/drive; preprocessing() reads the datasets
# from a folder underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [0]:
from google.colab import files

In [0]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score

from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import MinMaxScaler

import os
import warnings
warnings.filterwarnings("ignore")

import time
from datetime import datetime

In [0]:

def preprocessing(dat_data,validation,seed,data_dir='/content/drive/My Drive/codes_first_try/data/'):
    """Load one dataset file and split it into train and test DataFrames.

    The last column of the file is treated as the target. The label
    ' negative' (with the leading space) is encoded as 0 and every other
    value as 1. All remaining columns are converted to float features.

    Parameters
    ----------
    dat_data : str
        File name of the dataset (.dat or .csv) inside ``data_dir``.
    validation : float or int
        Passed to ``train_test_split`` as ``test_size``.
    seed : int
        Passed to ``train_test_split`` as ``random_state``.
    data_dir : str, optional
        Folder containing the dataset files. Defaults to the Google Drive
        location this notebook was written for; change it if the data lives
        elsewhere.

    Returns
    -------
    (train_Data, test_Data) : tuple of pandas.DataFrame
        Each frame holds the feature columns followed by the encoded target
        as its last column, with default integer column labels.
    """
    df=pd.read_csv(os.path.join(data_dir,dat_data)) #reading the .dat file or .csv file

    pos=df.shape[1]-1 #index of the target column (last column)

    X=df.iloc[:,:pos].to_numpy().astype(float) #feature values
    #vectorized label encoding: ' negative' -> 0, anything else -> 1
    Y=(df.iloc[:,pos]!=' negative').to_numpy().astype(int)

    X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=validation,random_state=seed)

    #glue the target back on as the last column so each split travels as one frame
    train_Data=pd.DataFrame(np.column_stack((X_train,Y_train)))
    test_Data=pd.DataFrame(np.column_stack((X_test,Y_test)))

    return train_Data,test_Data
    

In [0]:
#function for making a series of observations predicting a target
def create_dataset(dataset,look_back=1):
    """Turn a 2-D array of observations into (window, next-row) training pairs.

    For every index ``i`` such that row ``i+look_back`` exists, the window
    ``dataset[i:i+look_back]`` is paired with the target row
    ``dataset[i+look_back]``.

    Parameters
    ----------
    dataset : array-like of shape (n_rows, n_features)
        Observations in order.
    look_back : int, optional
        Number of consecutive rows in each input window (default 1).

    Returns
    -------
    (datax, datay) : tuple of numpy.ndarray
        ``datax`` has shape (n_rows-look_back, look_back, n_features) and
        ``datay`` has shape (n_rows-look_back, n_features). When there are
        not enough rows for a single window, both arrays are empty but keep
        the trailing dimensions.
    """
    dataset=np.asarray(dataset)

    #the last usable start index is n_rows-look_back-1, so there are
    #n_rows-look_back windows (the previous "-1" dropped the final pair)
    n_windows=len(dataset)-look_back

    if n_windows<=0:
        trailing=dataset.shape[1:]
        return (np.empty((0,look_back)+trailing,dtype=dataset.dtype),
                np.empty((0,)+trailing,dtype=dataset.dtype))

    datax=np.stack([dataset[i:i+look_back,:] for i in range(n_windows)])
    datay=dataset[look_back:,:].copy() #copy so the result does not alias the input

    return datax,datay

In [0]:
#function for sample generation using LSTM based technique

def LSTM_based_oversampling(dat_data,train_data,validation,seed,series_length):
    """Generate synthetic minority-class rows with a small LSTM.

    The minority class of ``train_data`` is found by counting labels in the
    last column (label 0 wins ties). Minority rows are min-max scaled, turned
    into (window, next-row) pairs with ``create_dataset`` and split with
    ``train_test_split``. An LSTM is trained on the train part to predict the
    next row, and its predictions on the held-out windows, scaled back to the
    original range, are returned as the synthetic samples.

    Parameters
    ----------
    dat_data : str
        Dataset name. Not used by the computation; kept for the call signature.
    train_data : pandas.DataFrame
        Training rows with the 0/1 target in the last column.
    validation : float or int
        ``test_size`` for the split of the windowed minority data; the
        held-out part determines how many synthetic rows are produced.
    seed : int
        ``random_state`` for that split. It does not seed the network itself.
    series_length : int
        Window length passed to ``create_dataset``.

    Returns
    -------
    pandas.DataFrame
        Synthetic rows with the same columns as ``train_data``; the target
        column is set to the minority class label.

    Raises
    ------
    ValueError
        If there are too few minority rows to build at least two windows.
    """
    df=train_data

    pos=df.shape[1]-1 #index of the target column (last column)
    col=df.columns[pos]

    #count labels; anything that is not 0 is counted as class 1
    labels=df.iloc[:,pos].to_numpy()
    zeros=int((labels==0).sum())
    ones=len(labels)-zeros
    min_class=0 if zeros<=ones else 1

    #only the minority samples are used to train the generator
    df_min=df[df[col]==min_class]

    scaler=MinMaxScaler(feature_range=(0,1))
    df_minor=scaler.fit_transform(df_min)

    x,y=create_dataset(df_minor,series_length) #converting into a series dataset for prediction

    if len(x)<2:
        #train_test_split cannot produce a non-empty train and test part from fewer than two windows
        raise ValueError('Not enough minority samples ('+str(len(df_min))+
                         ') to build LSTM training windows of length '+str(series_length))

    Xtrain,xtest,Ytrain,ytest=train_test_split(x,y,test_size=validation,random_state=seed)

    #LSTM based neural network: one LSTM layer followed by a dense layer that outputs a full row
    model=Sequential()
    model.add(LSTM(20,input_shape=(Xtrain.shape[1],Xtrain.shape[2])))
    model.add(Dense(Xtrain.shape[2]))
    model.compile(loss='mse',optimizer='adam')
    model.fit(Xtrain,Ytrain,epochs=500,verbose=0)

    #predictions on the held-out windows are the newly generated minority rows
    prediction=model.predict(xtest)
    prediction=scaler.inverse_transform(prediction) #converting back to original scale

    #reuse the training column labels so the assignment below overwrites the
    #target column instead of appending a new one when labels are not 0..n-1
    synthetic_data=pd.DataFrame(prediction,columns=df.columns)
    synthetic_data[col]=min_class

    return synthetic_data
    

In [0]:
def svc_param_selection(X,Y,folds):
    """Grid-search an RBF-kernel SVC over C and gamma and return the winner.

    Parameters
    ----------
    X : array-like
        Training features.
    Y : array-like
        Training targets.
    folds : int or cross-validation splitter
        Passed to ``GridSearchCV`` as ``cv``.

    Returns
    -------
    The ``best_estimator_`` of the fitted grid search (an SVC created with
    ``probability=True``).
    """
    #the same candidate values are tried for both C and gamma
    candidate_values=[0.0001,0.001, 0.01, 0.1, 1, 10]

    grid_search=GridSearchCV(
        SVC(kernel='rbf',probability=True),
        {'C':list(candidate_values),'gamma':list(candidate_values)},
        cv=folds,
    )
    grid_search.fit(X,Y)

    return grid_search.best_estimator_
    

In [0]:
def svm_classification(train,test,synthetic_data):
    """Compare an SVM trained on the original data with one trained on original + synthetic data.

    Two RBF SVCs are tuned with ``svc_param_selection`` (5-fold grid search):
    one on ``train`` only and one on ``train`` with the rows of
    ``synthetic_data`` appended. Both are scored on ``test`` with ``f1_score``.

    Parameters
    ----------
    train : pandas.DataFrame
        Original training rows, target in the last column.
    test : pandas.DataFrame
        Test rows, target in the last column.
    synthetic_data : pandas.DataFrame
        Generated rows with the same column layout. Every synthetic row gets
        the same label: 0 if the first row's target value is below 0.5,
        otherwise 1. The frame is not modified.

    Returns
    -------
    (f1_score_without_extra, f1_score_with_extra) : tuple of float
    """
    pos=synthetic_data.shape[1]-1   #target column index; same layout as train and test

    values=train.values
    X=values[:,:pos].astype(float) #features of the original training data
    Y=values[:,pos].astype(int) #targets of the original training data

    extra_val=synthetic_data.values
    extra_X=extra_val[:,:pos].astype(float)

    #build the synthetic labels in a fresh array rather than writing into
    #extra_val, which may share memory with the caller's DataFrame
    n_extra=extra_val.shape[0]
    if n_extra>0:
        extra_label=0 if extra_val[0][pos]<0.5 else 1
    else:
        extra_label=0 #no synthetic rows: the value is irrelevant, the array is empty
    extra_Y=np.full(n_extra,extra_label,dtype=int)

    #original + synthetic training set
    extrain=np.concatenate((X,extra_X),axis=0)
    eytrain=np.concatenate((Y,extra_Y),axis=0)

    test_val=test.values
    test_X=test_val[:,:pos].astype(float)
    test_Y=test_val[:,pos].astype(int)

    #GridSearchCV refits the best estimator on the full training data by
    #default, so the returned models are ready to predict without another fit
    rbf_svc=svc_param_selection(X,Y,5)
    rbf_svc_extra=svc_param_selection(extrain,eytrain,5)

    predictions=rbf_svc.predict(test_X)
    predictions_extra=rbf_svc_extra.predict(test_X)

    f1_score_without_extra=f1_score(test_Y,predictions)
    f1_score_with_extra=f1_score(test_Y,predictions_extra)

    return  f1_score_without_extra,f1_score_with_extra
    

In [0]:
# Experiment driver: for every dataset and every seed, split the data,
# generate synthetic minority rows with the LSTM, and compare the F1 score of
# an SVM trained without and with the synthetic rows. One report file per
# dataset is appended to in the current working directory.
seed_list=[0,5,9,11,19]
dataset_list=['ecoli-0_vs_1.dat',
              #'diabetes.csv',
              'glass1.dat',
              'glass0.dat',
              'iris0.dat',
              'wisconsin.dat',
              'yeast1.dat',
              'yeast3.dat',
              'page-blocks0.dat',
              'segment0.dat'
                ]

for dat_data in dataset_list:

    with open(dat_data+'_experiements_.txt','a') as file:
        file.write('date: ')
        file.write(str(datetime.now()))
        file.write("\n\n")
        without_extra=[]
        with_extra=[]
        for seed in seed_list:
            # Use the seed value itself (not its position in seed_list) so the
            # seed written to the report is the one the run actually used.
            train,test=preprocessing(dat_data,0.30,seed) #dat_data,validation,seed
            synthetic_data=LSTM_based_oversampling(dat_data,train,0.40,seed,5) #dat_data,train_data,validation,seed,series_length
            f1_score_without_extra,f1_score_with_extra=svm_classification(train,test,synthetic_data)

            without_extra.append(f1_score_without_extra)
            with_extra.append(f1_score_with_extra)

            # one line per seed: seed,f1 without extra,f1 with extra
            file.write(str(seed))
            file.write(',')
            file.write(str(f1_score_without_extra))
            file.write(',')
            file.write(str(f1_score_with_extra))

            file.write('\n')

        without_extra=np.array(without_extra)
        with_extra=np.array(with_extra)

        # summary over all seeds for this dataset
        file.write('\n Mean:\n\n')
        file.write('Without exra\t\tWith extra\n\n')
        file.write(str(np.mean(without_extra)))
        file.write('\t')
        file.write(str(np.mean(with_extra)))
        file.write('\n\nImprovemnet: ')
        file.write(str((np.mean(with_extra)-np.mean(without_extra))*100))
        file.write(' %')
        file.write('\n\nStandard deviatiion without extra: ')
        file.write(str(np.std(without_extra,axis=0)))
        file.write('\nStandard deviation with extra: ')
        file.write(str(np.std(with_extra,axis=0)))
        file.write('\n')
    print(dat_data,' is complete.\n')
    # files.download(...) did not work for the author in Colab; the report
    # files can be found through the Files panel on the left instead.
    #files.download(dat_data+'_experiements_.txt')

drive.flush_and_unmount()# this is to save the changes in google drive

ecoli-0_vs_1.dat  is complete.

glass1.dat  is complete.

glass0.dat  is complete.

iris0.dat  is complete.

wisconsin.dat  is complete.

yeast1.dat  is complete.

yeast3.dat  is complete.

page-blocks0.dat  is complete.

segment0.dat  is complete.

