# **COMP90051 Assignment 1 2023**


### Imports

In [None]:
import json
import torch
import torch.cuda
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import time, os
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline


from sklearn.preprocessing import OneHotEncoder


import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from torch.utils import data
from torch.utils.data import TensorDataset, DataLoader

from google.colab import drive
drive.mount('/content/drive/')

### Utilities


In [1]:

def load_data(path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        path: Path of a text file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager closes the file, so no explicit close() is needed.
    # Blank lines (e.g. a trailing newline) are skipped rather than being
    # handed to json.loads, which would raise on them.
    with open(path, encoding="utf-8") as file:
        return [json.loads(row) for row in file if row.strip()]
def l_df(data):
  """Wrap the given records in a pandas DataFrame and return it."""
  frame = pd.DataFrame(data)
  return frame


#takes in a df object and builds a tensor matrix of width = 5000 i.e the number of unique words
#returns the tensor matrix
def build_freq_dataset_bow(data_src,col_name,vocab_size=5000):
    """Build a bag-of-words count matrix from a column of token-id lists.

    Args:
        data_src: DataFrame holding the samples.
        col_name: Name of the column whose cells are iterables of integer
            token ids.
        vocab_size: Width of the returned matrix, i.e. the number of distinct
            token ids. Defaults to 5000, the value previously hard-coded.

    Returns:
        A float tensor of shape ``(len(data_src), vocab_size)`` where entry
        ``[r, t]`` is the number of times token ``t`` occurs in row ``r``.

    Raises:
        IndexError: If a token id is outside the matrix width.
    """
    freq_dataset = torch.zeros([data_src.shape[0], vocab_size])
    for row, token_ids in enumerate(data_src[col_name]):
        for token_id in token_ids:
            freq_dataset[row, token_id] += 1
    return freq_dataset

#takes in a df
def build_padded_dataset(data_src,col_name,max_col):
    """Turn a column of token-id lists into a fixed-width, zero-padded tensor.

    Every token id is shifted by +1 so that 0 can be used as the padding
    value; rows shorter than ``max_col`` are right-padded with zeros.

    The input DataFrame and the lists stored in it are left untouched.
    (``DataFrame.copy(deep=True)`` does not copy Python objects held in the
    cells, so shifting/padding the lists in place would corrupt the caller's
    data and make a second call on the same records return different values.)

    Args:
        data_src: DataFrame holding the samples.
        col_name: Name of the column whose cells are sequences of integer
            token ids.
        max_col: Width of the returned tensor (maximum sequence length).

    Returns:
        A float tensor of shape ``(len(data_src), max_col)``.

    Raises:
        ValueError: If a row holds more than ``max_col`` tokens.
    """
    column = data_src[col_name]
    dataset = torch.zeros([len(column), max_col])

    for row_ind, token_ids in enumerate(column):
        length = len(token_ids)
        if length > max_col:
            raise ValueError(
                f"row {row_ind} has {length} tokens, which exceeds max_col={max_col}"
            )
        if length > 0:
            # Shift ids by one on a fresh tensor; the source list is not modified.
            shifted = torch.tensor(token_ids, dtype=torch.float32) + 1
            dataset[row_ind, :length] = shifted
    return dataset

#extracts the labels from the source df and returns a tensor.
#this is used to build the tensor data loader
def extract_output_tensor(data_src,col_name):
    """Copy one DataFrame column into a float tensor of shape (rows, 1)."""
    n_rows = data_src.shape[0]
    labels = torch.zeros([n_rows, 1])
    for position, value in enumerate(data_src[col_name]):
        labels[position] = value
    return labels

def extract_output_tensor_multi(data_src,ai_col_name,model_name):
    """Copy two DataFrame columns into two float tensors of shape (rows, 1).

    Returns:
        ``(response, model)`` where ``response`` holds the values of
        ``ai_col_name`` and ``model`` holds the values of ``model_name``.
    """
    n_rows = data_src.shape[0]
    response = torch.zeros([n_rows, 1])
    model = torch.zeros([n_rows, 1])

    # Fill the response tensor first, then the model tensor.
    for target, source_col in ((response, ai_col_name), (model, model_name)):
        for idx, value in enumerate(data_src[source_col]):
            target[idx] = value
    return response, model

#takes in a df data_src with a token column (X) and a label column (Y) and returns a DataLoader
#the DataLoader serves the dataset in batches, so only one batch needs to be processed at a time
#default batch size is 20
def get_torch_data_loader(data_src,data_col_name,label_col_name,is_test,max_col,batch_size=20, shuffle=False):
    """Build a DataLoader over the padded token column of ``data_src``.

    For test data (``is_test`` truthy) the loader yields only the padded
    inputs; otherwise it yields ``(inputs, labels)`` pairs.
    """
    tensors = [build_padded_dataset(data_src,data_col_name,max_col)]
    if not is_test:
        # Training/validation data also carries the label column.
        tensors.append(extract_output_tensor(data_src,label_col_name))
    return DataLoader(TensorDataset(*tensors), batch_size=batch_size, shuffle=shuffle)

def build_tensor_dataset(data_src,data_col_name,label_col_name,is_test,data_format,max_col):
    """Build a TensorDataset from ``data_src`` in the requested input format.

    ``data_format`` selects the feature encoding: "bow" for bag-of-words
    counts, "padding" for zero-padded token sequences of width ``max_col``.
    Any other value leaves the features as an empty (0, 0) tensor.
    Labels are included unless ``is_test`` is truthy.
    """
    if data_format == "bow":
        features = build_freq_dataset_bow(data_src,data_col_name)
    elif data_format == "padding":
        features = build_padded_dataset(data_src,data_col_name,max_col)
    else:
        features = torch.empty(0,0)

    if is_test:
        return TensorDataset(features)
    return TensorDataset(features, extract_output_tensor(data_src,label_col_name))

def build_tensor_dataset_domain2(data_src,data_col_name,label_col_name,is_test,data_format,max_col):
    """Build a TensorDataset that also carries a one-hot "model" target.

    Features are encoded like in build_tensor_dataset ("bow" or "padding";
    anything else yields an empty (0, 0) tensor). For non-test data the
    dataset holds ``(features, labels, one_hot_model)`` where the third
    tensor is the one-hot encoding of the "model" column.
    """
    if data_format == "bow":
        features = build_freq_dataset_bow(data_src,data_col_name)
    elif data_format == "padding":
        features = build_padded_dataset(data_src,data_col_name,max_col)
    else:
        features = torch.empty(0,0)

    if is_test:
        return TensorDataset(features)

    labels, generator_ids = extract_output_tensor_multi(data_src,label_col_name,"model")
    # Rows without a model id (NaN) are mapped to 7, which the original
    # comment describes as the id for human-generated samples.
    generator_ids = torch.nan_to_num(generator_ids, nan=7.0)
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    one_hot = torch.tensor(encoder.fit(generator_ids).transform(generator_ids))
    return TensorDataset(features,labels,one_hot)

def df_to_tensor(data_src,col_name):
    """Stack a column of equal-length numeric sequences into a 2-D tensor.

    The previous implementation assigned rows into a ``torch.zeros(0, 0)``
    tensor, which raises for any non-empty column; the rows are now stacked
    into a new tensor instead.

    Args:
        data_src: DataFrame holding the samples.
        col_name: Name of the column whose cells are numeric sequences, all
            of the same length.

    Returns:
        A float tensor of shape ``(len(data_src), row_length)``; an empty
        ``(0, 0)`` tensor when the column has no rows.

    Raises:
        RuntimeError: If the rows do not all have the same length.
    """
    rows = [torch.tensor(row, dtype=torch.float32) for row in data_src[col_name]]
    if not rows:
        return torch.zeros(0, 0)
    return torch.stack(rows)



Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


### Data Preprocessing

In [2]:
from sklearn.model_selection import train_test_split

# NOTE(review): presumably the fraction reserved for validation; neither
# cv_test_size nor train_test_split is used in the visible code.
cv_test_size = 0.2
#Train Data
# Each file is parsed by load_data (one JSON document per line).

domain1_train = load_data('Holdout Data/domain1_train.json')
domain2_train = load_data('Holdout Data/domain2_train.json')
hold_out = load_data('Holdout Data/holdout.json')
hold_out2 = load_data('Holdout Data/holdout 2.json')
#Test Data
test_set = load_data('test_set.json')

#Dataframes
# domain2_df and domain2_multi_df are both built from the same parsed records
# (domain2_train); any Python objects stored in the records are shared by both.
domain1_df = l_df(domain1_train)
domain2_df = l_df(domain2_train)
domain2_multi_df = l_df(domain2_train)
holdout_df = l_df(hold_out)
holdout2_df = l_df(hold_out2)
test_df = l_df(test_set)


In [None]:
#Create holdout set (Run once)
# Draws a holdout sample from domain 2 and writes both the reduced training
# set and the holdout set back to disk as JSON Lines.
domain2_train = load_data('./Holdout Data/domain2_train.json')
domain2_df = l_df(domain2_train)

# Sample 71 or 72 rows without replacement for each "model" value 0-6
# (4 x 71 + 3 x 72 = 500 rows in total).
zero = domain2_df[domain2_df["model"].isin([0])].sample(n=71, replace=False)
one = domain2_df[domain2_df["model"].isin([1])].sample(n=71, replace=False)
two = domain2_df[domain2_df["model"].isin([2])].sample(n=71, replace=False)
three = domain2_df[domain2_df["model"].isin([3])].sample(n=72, replace=False)
four = domain2_df[domain2_df["model"].isin([4])].sample(n=71, replace=False)
five = domain2_df[domain2_df["model"].isin([5])].sample(n=72, replace=False)
six = domain2_df[domain2_df["model"].isin([6])].sample(n=72, replace=False)

# Remove the sampled rows from the training frame.
domain2_df = domain2_df.drop(zero.index)
domain2_df = domain2_df.drop(one.index)
domain2_df = domain2_df.drop(two.index)
domain2_df = domain2_df.drop(three.index)
domain2_df = domain2_df.drop(four.index)
domain2_df = domain2_df.drop(five.index)
domain2_df = domain2_df.drop(six.index)

# The holdout set is the concatenation of the per-model samples.
holdout_df = pd.concat([zero,one,two,three,four,five,six])

# NOTE(review): the outputs are written to 'domain2_train_afterholdout.json'
# and 'out.json', while other cells read 'domain2_train.json' and
# 'holdout.json' - confirm the files are renamed before those cells run.
domain2_df.to_json('./Holdout Data/domain2_train_afterholdout.json',orient='records',lines=True)
holdout_df.to_json('./Holdout Data/out.json',orient='records',lines=True)

print(domain2_df.shape)

(13900, 3)


In [None]:
#CREATE data loader for domain 2 - using a sampled approach
# Builds a DataLoader whose batches are drawn with per-sample weights that are
# inversely proportional to the size of each sample's class.
domain2_train = load_data('Holdout Data/domain2_train.json')
domain2_df = l_df(domain2_train)

#target 0:machine generated 1:human generated
max_col = 1075
numDataPoints = domain2_df.shape[0]
bs = 4000

minority_count = domain2_df[domain2_df["label"] == 1].shape[0] #count of human generated samples (label 1)
majority_count = domain2_df[domain2_df["label"] == 0].shape[0] #count of machine generated samples (label 0)

x = build_padded_dataset(domain2_df,"text",max_col) #pad data according to max_col
y = extract_output_tensor(domain2_df,"label")

# Convert the labels to a 1-D integer numpy array so they can index `weight`.
y = y.squeeze()
y = y.numpy()
y = y.astype(int)

# Index 0 holds the label-0 count, index 1 the label-1 count.
class_sample_count = np.array([majority_count,minority_count])

# Per-class weight = 1 / class size.
weight = 1. / class_sample_count


# Map every label to the weight of its class:
# label 0 -> 1 / majority_count
# label 1 -> 1 / minority_count
samples_weight = np.array([weight[t] for t in y])
samples_weight

samples_weight = torch.from_numpy(samples_weight)
samples_weight = samples_weight.double()

# The second argument (18500) is the number of samples drawn by the sampler.
sampler = torch.utils.data.sampler.WeightedRandomSampler(samples_weight, 18500)

# Back to an (n, 1) integer tensor for the dataset.
y = torch.from_numpy(y).long()
y = torch.reshape(y, (len(y), 1))

domain2_sampled_ds = torch.utils.data.TensorDataset(x, y)

train_loader = DataLoader(domain2_sampled_ds, batch_size=bs, num_workers=1, sampler=sampler)

# Print the label-0 / label-1 counts of every sampled batch.
# NOTE(review): the loop variable `data` rebinds the `data` name imported from
# torch.utils at the top of the notebook.
for i, (data, target) in enumerate(train_loader):
    print("batch index {}, 0/1: {}/{}".format(
        i,
        len(np.where(target.numpy() == 0)[0]),
        len(np.where(target.numpy() == 1)[0])))

#### TensorDatasets

In [3]:

max_col = 1075 #width every token sequence is padded to
#get a complete dataset of domain1 and domain2
# all_data_df = pd.concat([domain1_df, domain2_df]).sample(frac=1).reset_index(drop=True)
# all_data_ds = build_tensor_dataset(all_data_df,"text","label",False,"padding",max_col)

#domain only datasets
# domain1_ds = build_tensor_dataset(domain1_df,"text","label",False,"padding",max_col)
domain1_ds = build_tensor_dataset(domain1_df,"text","label",False,"padding",max_col)
# # print(domain2_df.head())
# domain2_ds = build_tensor_dataset(domain2_df,"text","label",False,"padding",max_col)

# # print(domain2_df.head())
# # domain2_1_ds = build_tensor_dataset(model1_df,"text","label",False,"padding",max_col)
# # domain2_2_ds = build_tensor_dataset(model2_df,"text","label",False,"padding",max_col)
# # domain2_3_ds = build_tensor_dataset(model3_df,"text","label",False,"padding",max_col)
# # domain2_4_ds = build_tensor_dataset(model4_df,"text","label",False,"padding",max_col)
# # domain2_5_ds = build_tensor_dataset(model5_df,"text","label",False,"padding",max_col)
# # domain2_6_ds = build_tensor_dataset(model6_df,"text","label",False,"padding",max_col)
# # domain2_7_ds = build_tensor_dataset(model7_df,"text","label",False,"padding",max_col)

#holdout datasets (padded inputs with labels)
holdout_ds = build_tensor_dataset(holdout_df,"text","label",False,"padding",max_col)
holdout2_ds = build_tensor_dataset(holdout2_df,"text","label",False,"padding",max_col)
#domain 2 dataset with 2150 human generated and 2150 machine (equally distributed among the 7 models)
# domain2_equal_parts_ds = build_tensor_dataset(domain2_AI_generated_balanced_df,"text","label",False,"padding",max_col)
# alldomain1_domain2equal_ds = build_tensor_dataset(alldomain1_domain2equal_df,"text","label",False,"padding",max_col)
#padded test inputs (a plain tensor, no labels)
test_data = build_padded_dataset(test_df,"text",max_col)
#domain 2 dataset carrying both the label and the one-hot "model" target
domain2__multi_ds = build_tensor_dataset_domain2(domain2_multi_df,"text","label",False,"padding",max_col)

### Train & Test

In [4]:
from numpy.core.numeric import outer
#takes two tensors (predictions and labels) and returns the percentage of accurate predictions
def binary_classification_accuracy(predicted,actual):
    """Return the percentage of rounded predictions that equal the labels.

    ``predicted`` is rounded to the nearest integer before the comparison;
    matches are counted along dimension 0 and divided by ``actual.size(0)``.
    """
    hits = torch.eq(predicted.detach().round(), actual)
    n_correct = hits.sum(dim=0).item()
    total = actual.size(0)
    return 100 * (n_correct / total)

#one-off predictions for the test dataset
#assumes model is moved back to cpu first
def test(model, new_data):
    model.eval()
    with torch.no_grad():
        out = model(new_data)  # Compute scores
        model.train()
        return out

#test using the validation set
def validate(model, criterion, test_loader,device):
    """Evaluate ``model`` on a validation loader.

    The model is switched to evaluation mode for the duration of the pass
    (so dropout and similar layers are disabled) and its previous mode is
    restored afterwards.

    Args:
        model: Module producing one probability per sample.
        criterion: Loss called as ``criterion(output, labels)``.
        test_loader: Iterable yielding ``(x, labels)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(accuracy_percent, mean_loss)`` where the mean is taken over the
        number of batches.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    test_loss = 0.
    test_preds, test_labels = [], []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x, labels in test_loader:
                x, labels = x.to(device), labels.to(device)
                out = model(x)  # Compute scores
                test_loss += criterion(out, labels).item()
                test_preds.append(out)
                test_labels.append(labels)
    finally:
        model.train(was_training)

    if not test_preds:
        raise ValueError("test_loader yielded no batches to validate on")

    test_preds = torch.cat(test_preds)
    test_labels = torch.cat(test_labels)

    test_accuracy = binary_classification_accuracy(test_preds,test_labels)
    mean_loss = test_loss/len(test_loader)

    print('[VALIDATE] Mean loss {:.4f} | Accuracy {:.4f}'.format(mean_loss, test_accuracy))

    return test_accuracy, mean_loss

def train_cv(train_dataset,model,optimizer,epochs,k_folds,batch_size,device):
    """Train ``model`` with k-fold cross validation and plot learning curves.

    For every fold the model is trained for up to ``epochs`` epochs on the
    fold's training indices and validated after each epoch on the held-out
    indices. Training of a fold stops early when the validation accuracy
    reaches THRESHOLD or when the validation loss has not improved for
    MAX_VALIDATION_DECREASE consecutive epochs. Three plots (train loss,
    validation accuracy, validation loss) are shown per fold.

    NOTE: the same ``model`` and ``optimizer`` objects are used for every
    fold, so each fold continues from the weights left by the previous one.

    Args:
        train_dataset: Indexable dataset yielding ``(x, labels)`` pairs.
        model: Module whose output is a probability (BCELoss is applied to it
            directly, so the model must apply the sigmoid itself).
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Maximum number of epochs per fold.
        k_folds: Number of cross-validation folds.
        batch_size: Batch size of the training and validation loaders.
        device: Device the batches are moved to.
    """
    # Initialize the k-fold cross validation
    kf = KFold(n_splits=k_folds, shuffle=True)

    # Loop through each fold
    for fold, (train_idx, test_idx) in enumerate(kf.split(train_dataset)):
        print(f"Fold {fold + 1}")
        print("-------")

        # Define the data loaders for the current fold
        train_loader = DataLoader(
            dataset=train_dataset,
            batch_size=batch_size,
            sampler=torch.utils.data.SubsetRandomSampler(train_idx),
        )
        test_loader = DataLoader(
            dataset=train_dataset,
            batch_size=batch_size,
            sampler=torch.utils.data.SubsetRandomSampler(test_idx),
        )

        # Train the model on the current fold
        THRESHOLD = 90
        LOG_INTERVAL = 250
        MAX_VALIDATION_DECREASE = 50
        # Consecutive epochs without a validation-loss improvement; starts at
        # zero so the patience limit is counted from the first bad epoch.
        validation_decrease_count = 0
        prev_val_meanloss = 999
        validation_accuracy, validation_loss = [], []
        start_time = time.time()
        # BCELoss expects probabilities; it does NOT apply a sigmoid itself.
        criterion = torch.nn.BCELoss()

        losses = []

        for epoch in range(epochs):
            epoch_loss = 0.
            for i, batch in enumerate(train_loader):  # Loop over elements in training set

                x, labels = batch
                x, labels = x.to(device), labels.to(device)
                out = model(x)
                train_acc = binary_classification_accuracy(out,labels)
                loss = criterion(out,labels)

                loss.backward()               # Backward pass (compute parameter gradients)
                optimizer.step()              # Update the parameters
                optimizer.zero_grad()         # Reset gradients to zero for next iteration

                epoch_loss += loss.item()

                if i % LOG_INTERVAL == 0:  # Log training stats
                    deltaT = time.time() - start_time
                    mean_loss = epoch_loss / (i+1)
                    losses.append(mean_loss)
                    print('[TRAIN] Epoch {} [{}/{}]| Mean loss {:.4f} | Train accuracy {:.5f} | Time {:.2f} s'.format(epoch,
                        i, len(train_loader), mean_loss, train_acc, deltaT))

            print('Epoch complete! Mean loss: {:.4f}'.format(epoch_loss/len(train_loader)))

            validation_acc, validate_mean_loss = validate(model, criterion, test_loader,device)

            # Record the metrics before any early exit so the plots also
            # contain the epoch that triggered the stop.
            validation_loss.append(validate_mean_loss)
            validation_accuracy.append(validation_acc)

            #stop if accuracy is greater than a threshold
            if validation_acc >= THRESHOLD:
              print("Required validation accuracy reached")
              break
            #stop if the mean validation loss fails to improve too many times in a row
            if validate_mean_loss >= prev_val_meanloss:
                validation_decrease_count += 1

                if validation_decrease_count >= MAX_VALIDATION_DECREASE:
                    print("Training stopped as validation accuracy is going down")
                    break
            else:
                validation_decrease_count = 0
                prev_val_meanloss = validate_mean_loss

        #plot loss graph
        plt.plot(losses, linestyle = 'dotted')
        plt.title("Train Loss")
        plt.show()

        plt.plot(validation_accuracy, linestyle = 'dotted')
        plt.title("Validation Accuracy")
        plt.show()

        plt.plot(validation_loss, linestyle = 'dotted')
        plt.title("Validation Loss")
        plt.show()


In [66]:
def validate_multi_output(model, criterion, criterion2, test_loader,device):
    """Evaluate a two-headed model on a validation loader.

    Only the first (human/AI) output is scored. The model is switched to
    evaluation mode for the pass and its previous mode is restored afterwards.
    The loss is accumulated as a Python float (``.item()``) so the returned
    mean loss is a plain number that can be compared and plotted regardless
    of the device used.

    Args:
        model: Module returning ``(label_probability, model_scores)``.
        criterion: Loss applied to the first output and the labels.
        criterion2: Accepted for interface compatibility; not used.
        test_loader: Iterable yielding ``(x, labels, models)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(accuracy_percent, mean_loss)`` where the mean is taken over the
        number of batches.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    test_loss = 0.
    test_preds, test_labels = [], []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x, labels, _models in test_loader:
                x, labels = x.to(device), labels.to(device)
                out, _pred_model = model(x)  # Compute scores
                test_loss += criterion(out, labels).item()
                test_preds.append(out)
                test_labels.append(labels)
    finally:
        model.train(was_training)

    if not test_preds:
        raise ValueError("test_loader yielded no batches to validate on")

    test_preds = torch.cat(test_preds)
    test_labels = torch.cat(test_labels)

    test_accuracy = binary_classification_accuracy(test_preds,test_labels)
    mean_loss = test_loss/len(test_loader)

    print('[VALIDATE] Mean loss {:.4f} | Accuracy {:.4f} '.format(mean_loss, test_accuracy))
    return test_accuracy, mean_loss

def train_cv_multi_output(train_dataset,model,optimizer,epochs,k_folds,batch_size,device):
    """Train a two-headed model with k-fold cross validation and plot curves.

    The model must return ``(label_probability, model_scores)``. The training
    loss is ``BCELoss(label) + 5 * CrossEntropyLoss(model)``. After each
    epoch the model is validated on the fold's held-out indices; training of
    a fold stops early when the validation accuracy reaches THRESHOLD or when
    the validation loss has not improved for MAX_VALIDATION_DECREASE
    consecutive epochs. Three plots are shown per fold.

    NOTE: the same ``model`` and ``optimizer`` objects are used for every
    fold, so each fold continues from the weights left by the previous one.

    Args:
        train_dataset: Indexable dataset yielding ``(x, labels, models)``.
        model: Two-headed module as described above.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Maximum number of epochs per fold.
        k_folds: Number of cross-validation folds.
        batch_size: Batch size of the training and validation loaders.
        device: Device the batches are moved to.
    """
    # Initialize the k-fold cross validation
    kf = KFold(n_splits=k_folds, shuffle=True)

    # Loop through each fold
    for fold, (train_idx, test_idx) in enumerate(kf.split(train_dataset)):
        print(f"Fold {fold + 1}")
        print("-------")

        # Define the data loaders for the current fold
        train_loader = DataLoader(
            dataset=train_dataset,
            batch_size=batch_size,
            sampler=torch.utils.data.SubsetRandomSampler(train_idx),
        )
        test_loader = DataLoader(
            dataset=train_dataset,
            batch_size=batch_size,
            sampler=torch.utils.data.SubsetRandomSampler(test_idx),
        )

        # Train the model on the current fold
        THRESHOLD = 95
        LOG_INTERVAL = 250
        MAX_VALIDATION_DECREASE = 30
        # Consecutive epochs without a validation-loss improvement.
        validation_decrease_count = 0
        prev_val_meanloss = 999

        validation_accuracy, validation_loss = [], []
        start_time = time.time()

        # BCELoss expects probabilities; it does NOT apply a sigmoid itself.
        criterion = torch.nn.BCELoss()
        # CrossEntropyLoss is applied to the raw scores of the model head.
        criterion_model = torch.nn.CrossEntropyLoss()
        losses = []

        for epoch in range(epochs):
            epoch_loss = 0.
            for i, batch in enumerate(train_loader):  # Loop over elements in training set

                x, labels, models = batch
                x, labels, models = x.to(device), labels.to(device), models.to(device)
                out, pred_models = model(x) #automatically calls the forward function of the model

                train_acc = binary_classification_accuracy(out,labels)
                loss = criterion(out,labels)
                model_loss = criterion_model(pred_models,models)

                # The auxiliary model-identification loss is weighted 5x.
                loss = loss + (model_loss*5)
                loss.backward()               # Backward pass (compute parameter gradients)
                optimizer.step()              # Update the parameters
                optimizer.zero_grad()         # Reset gradients to zero for next iteration

                epoch_loss += loss.item()

                if i % LOG_INTERVAL == 0:  # Log training stats
                    deltaT = time.time() - start_time
                    mean_loss = epoch_loss / (i+1)
                    losses.append(mean_loss)
                    print('[TRAIN] Epoch {} [{}/{}]| Mean loss {:.4f} | Train accuracy {:.5f} | Time {:.2f} s'.format(epoch,
                        i, len(train_loader), mean_loss, train_acc, deltaT))

            print('Epoch complete! Mean loss: {:.4f}'.format(epoch_loss/len(train_loader)))

            validation_acc, validate_mean_loss = validate_multi_output(model, criterion, criterion_model, test_loader,device)
            # The validation loss may arrive as a 0-dim tensor (possibly on
            # the GPU); convert it so it can be compared and plotted.
            validate_mean_loss = float(validate_mean_loss)

            # Record the metrics before any early exit so the plots also
            # contain the epoch that triggered the stop.
            validation_loss.append(validate_mean_loss)
            validation_accuracy.append(validation_acc)

            #stop if accuracy is greater than a threshold
            if validation_acc >= THRESHOLD:
              print("Required validation accuracy reached")
              break
            #stop if the mean validation loss fails to improve too many times in a row
            if validate_mean_loss >= prev_val_meanloss:
                validation_decrease_count += 1

                if validation_decrease_count >= MAX_VALIDATION_DECREASE:
                    print("Training stopped as validation accuracy is going down")
                    break
            else:
                validation_decrease_count = 0
                prev_val_meanloss = validate_mean_loss

        #plot loss graph
        plt.plot(losses, linestyle = 'dotted')
        plt.title("Train Loss")
        plt.show()

        plt.plot(validation_accuracy, linestyle = 'dotted')
        plt.title("Validation Accuracy")
        plt.show()

        plt.plot(validation_loss, linestyle = 'dotted')
        plt.title("Validation Loss")
        plt.show()

### Models

In [64]:
class DeepNN(nn.Module):
    """Embedding followed by three equal-width ReLU layers and a sigmoid head.

    Each token embedding is averaged down to a single value, so the dense
    layers operate on one feature per sequence position.

    Args:
        input_dim: Sequence length of the inputs (width of the dense layers).
        out_dim: Number of sigmoid outputs.
        vocab_size: Number of rows of the embedding table.
            NOTE(review): if the inputs come from build_padded_dataset, token
            ids are shifted by +1 - confirm vocab_size accounts for that.
        embedding_vector_size: Size of each token embedding.
    """

    def __init__(self, input_dim,out_dim,vocab_size,embedding_vector_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_vector_size)
        self.dropout = nn.Dropout(0.5) #dropout to avoid overfitting
        self.layer1 = nn.Linear(input_dim, input_dim)
        self.act1 = nn.ReLU()
        self.layer2 = nn.Linear(input_dim, input_dim)
        self.act2 = nn.ReLU()
        self.layer3 = nn.Linear(input_dim, input_dim)
        self.act3 = nn.ReLU()
        self.output = nn.Linear(input_dim, out_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, input_dim) batch of token ids to (batch, out_dim) probabilities."""
        # as_tensor avoids the copy-construct warning torch.tensor() emits
        # when it is handed an existing tensor.
        x = torch.as_tensor(x).to(torch.int64)
        x = self.embedding(x)           # (batch, seq, embedding)
        x = self.dropout(x)
        # Average each token's embedding to one value -> (batch, seq).
        bs = x.shape[0]
        x = torch.nn.functional.adaptive_avg_pool1d(x, 1).reshape(bs, -1)
        x = self.act1(self.layer1(x))
        x = self.act2(self.layer2(x))
        x = self.act3(self.layer3(x))
        x = self.sigmoid(self.output(x))
        return x

class GRUModel(nn.Module):
    """Single-layer GRU whose final hidden state feeds a sigmoid classifier.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the GRU hidden state.
        output_size: Number of sigmoid outputs.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(GRUModel, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_sequence):
        """Return sigmoid outputs for the final hidden state.

        Accepts an unbatched ``(seq, input_size)`` sequence, which yields a
        ``(1, output_size)`` result as before, or a batched
        ``(seq, batch, input_size)`` tensor, which yields
        ``(batch, output_size)``.
        """
        # apply GRU to full input sequence, and retain final hidden state
        _, hidden = self.gru(input_sequence)
        # Keep one row per sequence instead of flattening every sequence's
        # state into a single row, which only worked for a batch of one.
        last_hidden = hidden[-1].reshape(-1, self.gru.hidden_size)
        # couple final hidden state to the sigmoid classifier
        return self.sigmoid(self.h2o(last_hidden))

class LSTMModel(nn.Module):
    """Embedding + LSTM binary classifier with a sigmoid output.

    The classifier consumes the concatenation of the last two entries of the
    LSTM's final hidden state, i.e. the forward and backward states of the
    top layer when ``bidirectional`` is True.
    NOTE(review): with ``bidirectional=False`` the last two entries are not
    forward/backward states - confirm before using that configuration.

    Args:
        embedding_dim: Size of each token embedding.
        vocab_size: Number of distinct token ids; one extra embedding row is
            allocated on top of it.
        hidden_size: Size of the LSTM hidden state per direction.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Passed to ``nn.LSTM``.
        dropout: Dropout between LSTM layers, passed to ``nn.LSTM``.
    """

    def __init__(self, embedding_dim,vocab_size, hidden_size, num_layers, bidirectional,dropout):
        super(LSTMModel, self).__init__()

        self.embedding = nn.Embedding(vocab_size + 1,embedding_dim)

        self.lstm = nn.LSTM(embedding_dim,
                            hidden_size,
                            num_layers,
                            bidirectional = bidirectional,
                            dropout = dropout,
                            batch_first = True)
        # Dense layer to predict
        self.fc = nn.Linear(hidden_size * 2,1)
        # Prediction activation function
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, seq) batch of token ids to (batch, 1) probabilities."""
        # as_tensor avoids the copy-construct warning torch.tensor() emits
        # when it is handed an existing tensor.
        x = torch.as_tensor(x).to(torch.int64)
        embedded = self.embedding(x)
        _, (hidden_state, _) = self.lstm(embedded)

        # Concatenating the final forward and backward hidden states
        hidden = torch.cat((hidden_state[-2,:,:], hidden_state[-1,:,:]), dim = 1)

        dense_outputs = self.fc(hidden)

        # Final activation function (no debug printing inside forward: it ran
        # once per batch and flooded the training log).
        return self.sigmoid(dense_outputs)

#outputs two values, the model and the binary 0 or 1
class LSTMModel_MultiObjective(nn.Module):
    """Embedding + LSTM with two heads: human/AI probability and model scores.

    Both heads consume the concatenation of the last two entries of the
    LSTM's final hidden state (the forward and backward states of the top
    layer when ``bidirectional`` is True).

    Args:
        embedding_dim: Size of each token embedding.
        vocab_size: Number of distinct token ids; one extra embedding row is
            allocated on top of it.
        hidden_size: Size of the LSTM hidden state per direction.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Passed to ``nn.LSTM``.
        dropout: Dropout between LSTM layers, passed to ``nn.LSTM``.
        num_models: Number of classes of the model head. Defaults to 8, the
            value previously hard-coded.
    """

    def __init__(self, embedding_dim,vocab_size, hidden_size, num_layers, bidirectional,dropout,num_models=8):
        super(LSTMModel_MultiObjective, self).__init__()
        self.embedding = nn.Embedding(vocab_size + 1,embedding_dim)

        self.lstm = nn.LSTM(embedding_dim,
                            hidden_size,
                            num_layers,
                            bidirectional = bidirectional,
                            dropout = dropout,
                            batch_first = True)
        # Dense layers to predict
        self.fc = nn.Linear(hidden_size * 2,1)
        self.fc_models = nn.Linear(hidden_size * 2,num_models) #one raw score per model class
        # Prediction activation function
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``(human_ai_probability, model_scores)`` for a (batch, seq) id batch.

        The first element has shape (batch, 1) and is passed through a
        sigmoid; the second has shape (batch, num_models) and is returned as
        raw scores.
        """
        # as_tensor avoids the copy-construct warning torch.tensor() emits
        # when it is handed an existing tensor.
        x = torch.as_tensor(x).to(torch.int64)

        embedded = self.embedding(x)

        _, (hidden_state, _) = self.lstm(embedded)

        hidden = torch.cat((hidden_state[-2,:,:], hidden_state[-1,:,:]), dim = 1)

        dense_outputs = self.fc(hidden)
        model_out = self.fc_models(hidden)

        human_ai_output = self.sigmoid(dense_outputs)
        return human_ai_output, model_out

class LogisticRegressionModel(torch.nn.Module):
    """Logistic regression: one linear layer followed by a sigmoid."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return the sigmoid of the linear projection of ``x``."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

# Neural Network should have a sigmoid activation function if you are using BCELoss()
class LinearModel(nn.Module):
    """Two stacked linear layers followed by a sigmoid (for use with BCELoss).

    No non-linearity is applied between the two layers; ``self.relu`` is
    created but not used in ``forward``.

    Args:
        input_size: Number of input features.
        hidden_size: Width of the intermediate layer.
        num_classes: Number of sigmoid outputs.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super(LinearModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def get_weights(self):
        """Return the weight matrices of both layers as ``(fc1, fc2)``.

        The module has no ``weight`` attribute of its own, so the previous
        ``return self.weight`` always raised AttributeError.
        """
        return self.fc1.weight, self.fc2.weight

    def forward(self,x):
        """Map (batch, input_size) features to (batch, num_classes) probabilities."""
        out = self.fc1(x)
        out = torch.sigmoid(self.fc2(out)) #sigmoid as we use BCELoss
        return out

### Run

In [None]:
#Hyperparameters and device shared by the experiments configured in this cell
k_folds = 5
batch_size = 100 #1 for gru
num_epochs = 50
max_col = 1075 #found from checking max length of all df domain1, domain2 and test_df e.g domain2_df["text"].str.len().max()
embedding_vector_size = 300 #size of the embedding vector of each token
vocab_size = 5000
num_features = max_col
hidden_size = 64 #32 used for domain2
BIDIRECTION = True
DROPOUT = 0.2
NUM_LAYERS =  1

#NOTE(review): the remark below appears to describe max_col / num_features above
#set to the length of the longest sample

#use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## ******* Logistic Regression Model
# model = LogisticRegressionModel(max_col,1)
# model.to(device)
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
# # train_cv(domain2_equal_parts_ds,model,optimizer,num_epochs,k_folds,batch_size)
# train_cv(domain2_1_ds,model,optimizer,num_epochs,k_folds,batch_size,device)

# ## ******* Logistic Regression Model
# model = LogisticRegressionModel(max_col,1)
# model.to(device)
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
# # train_cv(domain2_equal_parts_ds,model,optimizer,num_epochs,k_folds,batch_size)
# train_cv(domain1_ds,model,optimizer,num_epochs,k_folds,batch_size,device)

## ******* GRU Model
# model = GRUModel(num_features, hidden_size, 1)
# model.to(device)
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
# #optimizer = optim.Adam(model.parameters(), lr=0.0001)
# train_cv(domain1_ds,model,optimizer,num_epochs,k_folds,batch_size)

## ******* LSTM Model - domain 1

# domain2_train = load_data('Holdout Data/domain2_train.json')
# domain2_df = l_df(domain2_train)
# domain2_1_ds = build_tensor_dataset(model1_df,"text","label",False,"padding",max_col)

#model = LSTMModel(embedding_vector_size,vocab_size, hidden_size, NUM_LAYERS,BIDIRECTION,DROPOUT)
# model = torch.load('Output/checkpoint2_model.pt')#
# optimizer =  torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
# model.to(device)
# train_cv(holdout2_ds,model,optimizer,num_epochs,k_folds,batch_size,device)
# domain2_ds
# domain2_equal_parts_ds
# domain2_3_ds

# ******* LSTM Model - multiobjective

# domain2_train = load_data('Holdout Data/domain2_train.json')
# domain2_df = l_df(domain2_train)
# domain2_1_ds = build_tensor_dataset(model1_df,"text","label",False,"padding",max_col)

# Build the two-headed LSTM, move it to the selected device and train it with
# k-fold cross validation on the domain 2 dataset that carries both targets.
model = LSTMModel_MultiObjective(embedding_vector_size,vocab_size, hidden_size, NUM_LAYERS,BIDIRECTION,DROPOUT)
# model = torch.load('Output/checkpoint2_model.pt')#
# SGD with momentum over all model parameters.
optimizer =  torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
model.to(device)
train_cv_multi_output(domain2__multi_ds,model,optimizer,num_epochs,k_folds,batch_size,device)
# domain2_ds
# domain2_equal_parts_ds
# domain2_3_ds

## ******* LSTM Model - domain 2

# num_epochs = 200

# #model = torch.load('Output/checkpoint1_domain1.pt')#
# model = LSTMModel(embedding_vector_size,vocab_size, hidden_size, NUM_LAYERS,BIDIRECTION,DROPOUT)
# model.to(device)
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
# # num_epochs = 200
# # batch_size = 5

# # model = LogisticRegressionModel(max_col,1)
# # model.to(device)
# # optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
# # # train_cv(domain2_equal_parts_ds,model,optimizer,num_epochs,k_folds,batch_size)
# # train_cv(domain1_ds,model,optimizer,num_epochs,k_folds,batch_size,device)


# # Randomly sample datasets from domain2 and pass for model training
# #reduce the num epochs as training will be for many datasets
# for i, (data, target) in enumerate(train_loader):
#     print("batch index {}, 0/1: {}/{}".format(
#         i,
#         len(np.where(target.numpy() == 0)[0]),
#         len(np.where(target.numpy() == 1)[0])))
#     new_ds = TensorDataset(data.float(),target.float())
#     train_cv(new_ds,model,optimizer,num_epochs,k_folds,batch_size,device)

#     del new_ds

## ********* LSTM Stacking Ensemble

# num_epochs = 5
# batch_size = 4

# model1 = torch.load('Output/domain1_model_fixedpadding.pt')
# model2 = torch.load('Output/domain2_model_fixedpadding.pt')
# model1.to(device)
# model2.to(device)

# for param in model1.parameters():
#     param.requires_grad_(False)

# for param in model2.parameters():
#     param.requires_grad_(False)

# model1.eval()
# model2.eval()

# embedding_vector_size = 128 #each token will have a 128 vector in the embedding layer

# model = torch.load('Output/meta_model_3_758.pt')#
# # model = EnsembleModel_2Models(model1,model2,embedding_vector_size * 2,1,hidden_size)
# # model = EnsembleModel_2Models_LogisticRegression(model1,model2)

# model.to(device)
# # for name, param in model.named_parameters():
# #     if param.requires_grad:
# #         print(name, param.data)
# optimizer =  torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)


# # ##train over a combined dataset
# train_cv(holdout2_ds,model,optimizer,num_epochs,k_folds,batch_size,device)
## ******* Deep FNN

# # train_cv(ds,model,optimizer,num_epochs,k_folds,batch_size)
# domain1_model = DeepNN(num_features, 1,vocab_size,embedding_vector_size)
# domain1_model.to(device)
# domain1_model_optimizer = optim.Adam(domain1_model.parameters(), lr=0.0001)

# train_cv(domain1_ds,domain1_model,domain1_model_optimizer,num_epochs,k_folds,batch_size)

# for param in domain1_model.parameters():
#     param.requires_grad_(False)


# Save the entire model object (not only its state_dict) to file.
# NOTE(review): torch.save on a whole module pickles it, so loading it back
# needs the same class definitions to be available - confirm this is intended.
torch.save(model, 'model_backup.pt')

# Test output: move the model to the CPU, run it over the test data and round
# the returned values so they can be written as integers (fmt="%d").
# NOTE(review): `test` and `test_data` are defined elsewhere in the notebook;
# presumably `test` returns a tensor of probabilities - verify.
model.to("cpu")
prediction = test(model,test_data).round()
np.savetxt('test_out.txt', prediction.cpu().numpy(),fmt="%d")

# Drop the global reference to the model; later cells must load their own.
del model


### Ensemble Models

In [7]:
class EnsembleModel_2Models_CombineOutput(nn.Module):
    """Ensemble that returns the row-wise maximum of two base models' outputs.

    Both base models receive a copy of the same input; their outputs are
    concatenated along dim 1 and the largest value in each row is returned.

    Args:
        modelA: first base model (callable on the input batch).
        modelB: second base model (callable on the input batch).
        input_dim: unused; kept so existing call sites keep working.
        output_dim: unused; kept so existing call sites keep working.
    """

    def __init__(self, modelA, modelB, input_dim, output_dim):
        super(EnsembleModel_2Models_CombineOutput, self).__init__()
        self.modelA = modelA
        self.modelB = modelB

    def forward(self, x):
        """Return a (batch, 1) tensor holding the max over both models' outputs."""
        #make prediction using each model
        x1 = self.modelA(x.clone())
        x2 = self.modelB(x.clone())

        #concatenate outputs horizontally, one column block per base model
        combined = torch.cat((x1, x2), dim=1)

        # torch.max with `dim` returns a (values, indices) pair; only the values
        # are needed. keepdim=True yields shape (batch, 1) for any batch size,
        # replacing the previous reshape that was hard-coded to 1000 rows.
        xt, _ = torch.max(combined, dim=1, keepdim=True)
        return xt
#stacking ensemble: a small feed-forward network learns how to combine the
#features produced by two base models into a single prediction
class EnsembleModel_2Models(nn.Module):
    """Feed-forward meta-learner on top of two base models.

    The `fc` and `sigmoid` attributes of both base models are replaced with
    `nn.Identity` (this mutates the models passed in). Each base model's
    output is flattened per sample, the two are concatenated, and the result
    goes through three ReLU hidden layers (150, 150, 70 units) followed by a
    sigmoid output unit.

    Args:
        modelA: first base model.
        modelB: second base model.
        input_dim: unused; kept for call-site compatibility.
        output_dim: unused; kept for call-site compatibility.
        hidden_size: the first linear layer expects `hidden_size * 4`
            concatenated input features.
    """

    def __init__(self, modelA, modelB, input_dim, output_dim,hidden_size):
        super().__init__()
        self.modelA = modelA
        self.modelB = modelB

        # Strip the final linear layer and sigmoid from each base model
        for base_model in (self.modelA, self.modelB):
            base_model.fc = nn.Identity()
            base_model.sigmoid = nn.Identity()

        # Meta-learner head; attribute names and creation order are kept so
        # parameter names and initialisation order stay the same.
        self.hidden = nn.Linear(hidden_size * 4, 150)
        self.hidden_act = nn.ReLU()
        self.hidden2 = nn.Linear(150, 150)
        self.hidden2_act = nn.ReLU()
        self.hidden3 = nn.Linear(150, 70)
        self.hidden3_act = nn.ReLU()
        self.output = nn.Linear(70, 1)
        self.output_act = nn.Sigmoid()

    def forward(self, x):
        """Run both base models on `x` and feed their joined output to the head."""
        per_model = []
        for base_model in (self.modelA, self.modelB):
            base_out = base_model(x.clone())
            # flatten everything after the batch dimension
            per_model.append(base_out.view(base_out.size(0), -1))

        # one row per sample, base-model outputs side by side
        outputs = torch.cat(per_model, dim=1)

        head = (
            self.hidden, self.hidden_act,
            self.hidden2, self.hidden2_act,
            self.hidden3, self.hidden3_act,
            self.output, self.output_act,
        )
        for layer in head:
            outputs = layer(outputs)
        return outputs

class EnsembleModel_2Models_LogisticRegression(nn.Module):
    """Logistic-regression meta-learner on top of two base models.

    The `fc` and `sigmoid` attributes of both base models are replaced with
    `nn.Identity` (this mutates the models passed in). Each base model's
    output is flattened per sample, the two are concatenated, and a single
    linear layer followed by a sigmoid produces the prediction.

    Args:
        modelA: first base model.
        modelB: second base model.
        hidden_size: the linear layer expects `hidden_size * 4` concatenated
            input features. When omitted, the module-level global
            `hidden_size` is used, which is what this class relied on before
            the parameter existed.

    Raises:
        ValueError: if `hidden_size` is not passed and no module-level
            `hidden_size` is defined.
    """

    def __init__(self, modelA, modelB, hidden_size=None):
        super(EnsembleModel_2Models_LogisticRegression, self).__init__()
        if hidden_size is None:
            # Backward-compatible fallback for existing two-argument callers.
            hidden_size = globals().get("hidden_size")
            if hidden_size is None:
                raise ValueError(
                    "hidden_size must be passed explicitly or defined as a "
                    "module-level variable"
                )
        self.modelA = modelA
        self.modelB = modelB
        # Strip the final linear layer and sigmoid from each base model
        self.modelA.fc = nn.Identity()
        self.modelA.sigmoid = nn.Identity()
        self.modelB.fc = nn.Identity()
        self.modelB.sigmoid = nn.Identity()
        self.output = nn.Linear(hidden_size * 4, 1)
        self.output_act = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid of a linear map over both models' joined outputs."""
        #make prediction using each model
        x1 = self.modelA(x.clone())
        x1 = x1.view(x1.size(0), -1)
        x2 = self.modelB(x.clone())
        x2 = x2.view(x2.size(0), -1)

        x = torch.cat((x1, x2), dim=1) #base-model outputs side by side
        outputs = self.output(x)
        outputs = self.output_act(outputs)

        return outputs

class EnsembleModel_3Models(nn.Module):
    """Averaging ensemble of three base models.

    The `sigmoid` attribute of each base model is replaced with `nn.Identity`
    (this mutates the models passed in). The three outputs are flattened per
    sample, averaged, and a single sigmoid is applied to the mean.

    Args:
        modelA: first base model.
        modelB: second base model.
        modelC: third base model.
        input_dim: input width of `self.classifier`.
        output_dim: output width of `self.classifier`.

    Note:
        `self.classifier` is created but not used by `forward`; it is kept so
        that the module's parameters and previously saved checkpoints keep
        the same layout.
    """

    def __init__(self, modelA, modelB, modelC, input_dim, output_dim):
        super(EnsembleModel_3Models, self).__init__()
        self.modelA = modelA
        self.modelB = modelB
        self.modelC = modelC
        # Remove each base model's sigmoid; it is applied once after averaging
        self.modelA.sigmoid = nn.Identity()
        self.modelB.sigmoid = nn.Identity()
        self.modelC.sigmoid = nn.Identity()
        # Create new classifier (currently unused by forward)
        self.classifier = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return sigmoid(mean of the three base-model outputs)."""
        #make prediction using each model
        x1 = self.modelA(x.clone())
        x1 = x1.view(x1.size(0), -1)
        x2 = self.modelB(x.clone())
        x2 = x2.view(x2.size(0), -1)
        x3 = self.modelC(x.clone())
        x3 = x3.view(x3.size(0), -1)

        # The concatenation of x1/x2/x3 that used to be built here was never
        # read, so it has been dropped.
        xt = torch.add(x1, x2) #sum base-model outputs
        xt = torch.add(xt, x3)
        xt = torch.div(xt, 3) #average out
        outputs = torch.sigmoid(xt)
        return outputs




### Load a model from file and test

In [None]:
# Load a previously saved model object onto the CPU.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
model1 = torch.load('Output/checkpoint3.pt',map_location=torch.device('cpu'))

# Freeze every parameter; this cell only runs inference.
for param in model1.parameters():
    param.requires_grad_(False)

model1.eval()
embedding_vector_size = 128 #each token will have a 128 vector in the embedding layer

# Predict with the model loaded above. The global `model` is deleted at the
# end of the training cell, so referencing it here would fail (or silently
# use a stale model).
prediction = test(model1,test_data).round()
np.savetxt('Output/test_out.txt', prediction.cpu().numpy(),fmt="%d")


In [None]:
# Test against a specific data source

max_col = 1075
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
model1 = torch.load('Output/checkpoint2_model.pt',map_location=torch.device('cpu'))

# Freeze every parameter; this cell only runs inference.
for param in model1.parameters():
    param.requires_grad_(False)

# Put the model in evaluation mode, as the cell that loads checkpoint3 does.
model1.eval()

dataloader = get_torch_data_loader(domain1_df,"text","label",False,max_col,batch_size=1000,shuffle=True)

# Report accuracy batch by batch. The batch is unpacked directly in the loop
# header so the name `data` (imported from torch.utils) is not rebound.
for x, labels in dataloader:
    prediction = test(model1,x).round()
    test_accuracy = binary_classification_accuracy(prediction,labels)

    print('[TEST] Accuracy {:.4f}'.format(test_accuracy))

del model1