# Imports

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import RandomOverSampler

# Train

In [None]:
def train_rf(train_data,train_labels,test_data,test_labels):
    """Fit random forests of depth 1..34 and report the best one.

    The held-out set is cut in half by position: the first half is printed
    as "Test", the second half as "Validation".  For every depth a
    100-tree forest (``random_state=0``) is fitted on the training data and
    its accuracies are printed.  The depth with the highest validation
    accuracy is selected (ties go to the deeper forest because the
    comparison is ``>=``), and precision / recall / F1 (macro, micro and
    the default binary average) plus the confusion matrix of that model on
    the "Test" half are printed after the sweep.

    Args:
        train_data: Training feature rows.
        train_labels: Training labels, one per row of ``train_data``.
        test_data: Held-out feature rows (sliceable sequence or array).
        test_labels: Held-out labels, one per row of ``test_data``.

    Returns:
        Tuple ``(best_val_acc, best_test_acc)``: the best validation
        accuracy and the "Test"-half accuracy of that same model.
    """
    # Split the *arguments*; the previous version read the notebook globals
    # `labels_test` / `test_data_balanced` and ignored these parameters.
    test_size = len(test_labels) // 2
    test_half_data = test_data[:test_size]
    test_half_labels = test_labels[:test_size]
    val_half_data = test_data[test_size:]
    val_half_labels = test_labels[test_size:]

    best_test_acc = 0
    best_val_acc = 0
    macro_p = 0
    micro_p = 0
    macro_f1 = 0
    micro_f1 = 0
    macro_r = 0
    micro_r = 0
    r = 0
    p = 0
    f1 = 0
    cf = None  # bound up front so the final print can never hit an unbound name

    for depth in range(1,35):
        clf = RandomForestClassifier(n_estimators=100, max_depth=depth,random_state=0)
        clf.fit(train_data,train_labels)

        # Score each split once per depth instead of re-scoring for every use.
        test_acc = clf.score(test_half_data,test_half_labels)
        val_acc = clf.score(val_half_data,val_half_labels)

        print('Max Depth',depth)
        print('Train Acc',clf.score(train_data,train_labels))
        print('Test Acc',test_acc)
        print('Validation Acc',val_acc)

        if val_acc < best_val_acc:
            continue

        # New best (or tied) model on validation: record its "Test" metrics.
        best_val_acc = val_acc
        best_test_acc = test_acc

        # One prediction pass feeds every metric below.
        predicted = clf.predict(test_half_data)

        macro_p = precision_score(test_half_labels,predicted,average='macro')
        micro_p = precision_score(test_half_labels,predicted,average='micro')

        macro_f1 = f1_score(test_half_labels,predicted,average='macro')
        micro_f1 = f1_score(test_half_labels,predicted,average='micro')

        macro_r = recall_score(test_half_labels,predicted,average='macro')
        micro_r = recall_score(test_half_labels,predicted,average='micro')

        # Default averaging (binary): metrics for the positive class only.
        p = precision_score(test_half_labels,predicted)
        r = recall_score(test_half_labels,predicted)
        f1 = f1_score(test_half_labels,predicted)

        cf = confusion_matrix(test_half_labels,predicted)

    print("macro_p ",macro_p)
    print("micro_p ",micro_p)
    print("macro_r ",macro_r)
    print("micro_r ",micro_r)
    print("macro_f1 ",macro_f1)
    print("micro_f1 ",micro_f1)
    print("precission ",p)
    print("recall ",r)
    print("f1 ",f1)
    print(cf)
    return best_val_acc,best_test_acc
    
    


# Running 5 Times

In [None]:
# ---------------------------------------------------------------------------
# Data preparation.  Nothing here depends on the run index, so it is done
# once instead of being repeated on every one of the five runs.
# ---------------------------------------------------------------------------
filename = "330_Days_Data.csv"
dataset = pd.read_csv(filename,sep=",")
data = np.array(dataset)
data = data[:,1:]  # drop the first CSV column

n_features = np.shape(data)[1]
n_days = int(filename.split("_")[0])  # rows per sample, taken from the file name
n_days_used = 30
n_samples = int(len(data)/n_days)
percentage = 0.7

# Split on a sample boundary (a multiple of n_days) so that `row % n_days`
# is the day index inside a sample for both the train and the test part.
split_row = int(n_samples*percentage)*n_days
train_data = data[:split_row]
test_data = data[split_row:]

# Fit the scaler on the training rows only, then apply it to both parts.
scaler = MinMaxScaler(feature_range=(0,1))
scaler.fit(train_data)
train_data = scaler.transform(train_data)
test_data = scaler.transform(test_data)

# Rows whose day index is >= n_days - n_days_used (300 for the 330-day file,
# i.e. the last 30 days of each sample) are labelled 1, all others 0.
# NOTE(review): the threshold was a literal 300 before; it is assumed to be
# n_days - n_days_used -- confirm.
first_positive_day = n_days - n_days_used

# Every row is kept (no row-level balancing happens here); the lists are
# plain Python lists so downstream code sees the same types as before.
train_data_balanced = list(train_data)
test_data_balanced = list(test_data)
labels_train = [int(row % n_days >= first_positive_day) for row in range(len(train_data))]
labels_test = [int(row % n_days >= first_positive_day) for row in range(len(test_data))]

# One entry per run with the (best_val_acc, best_test_acc) pair returned by
# train_rf for each augmentation strategy.
run_results = []

for inter in range(5):
    # Generated samples for this run: check that the file holds whole
    # (n_days_used x n_features) windows, then flatten them to one row per
    # day.  All generated rows are labelled as the positive class.
    gen_filename = 'generated_samples_'+str(inter)+'_50000.csv'
    gen_samples = np.genfromtxt(gen_filename,delimiter=",")
    gen_samples = gen_samples.reshape(-1,n_days_used,n_features)
    gen_samples = np.reshape(gen_samples,(-1,n_features))
    label_gen = np.ones(len(gen_samples))

    aug_data = np.vstack((gen_samples,train_data_balanced))
    aug_label = np.hstack((label_gen,labels_train))

    # Classical oversamplers, seeded with the run index.
    sm = SMOTE(random_state=inter)
    smote_samples, smote_labels = sm.fit_resample(train_data_balanced, labels_train)

    ad = ADASYN(random_state=inter)
    adasyn_samples, adasyn_labels = ad.fit_resample(train_data_balanced, labels_train)

    rd = RandomOverSampler(random_state=inter)
    rd_samples, rd_labels = rd.fit_resample(train_data_balanced, labels_train)

    print("data augmented by gan")
    gan = train_rf(aug_data,aug_label,test_data_balanced,labels_test)
    print("data augmented by SMOTE")
    smote = train_rf(smote_samples,smote_labels,test_data_balanced,labels_test)
    print("data augmented by ADASYN")
    adasyn = train_rf(adasyn_samples,adasyn_labels,test_data_balanced,labels_test)
    print("data augmented by Random Oversampler")
    random_oversampling = train_rf(rd_samples,rd_labels,test_data_balanced,labels_test)
    print("original data")
    original = train_rf(train_data_balanced,labels_train,test_data_balanced,labels_test)

    # Keep every run's scores; the individual names above are overwritten
    # on the next iteration.
    run_results.append({
        "gan": gan,
        "smote": smote,
        "adasyn": adasyn,
        "random_oversampling": random_oversampling,
        "original": original,
    })
