#Preliminary Imports and Installations

In [None]:
!pip install transformers
!pip install datasets
!pip install nlpaug

import numpy as np
import pandas as pd
import math

import torch
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import ExponentialLR


from datasets import load_dataset

!cp /content/drive/MyDrive/UG_Project/Code/augment.py /content/
!cp /content/drive/MyDrive/UG_Project/Code/eda.py /content/
from eda import eda
from augment import *
import nltk
nltk.download('wordnet')

import pickle

import tensorflow as tf
import tensorboard as tb

import gc

from sklearn.utils import shuffle

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Run on the first CUDA GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")
print(device)

cuda:0


In [None]:
from transformers import BertModel, BertTokenizer

# Tokenizer and encoder are both loaded from the same pretrained checkpoint;
# the encoder is then moved to the compute device selected above.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
model = model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


#Loading Dataset

In [None]:
def load_data_into_df(dataset_name):
  """Load a dataset with `load_dataset` and return three of its splits as DataFrames.

  Parameters
  ----------
  dataset_name : name forwarded unchanged to `load_dataset`.

  Returns
  -------
  (train_df, test_df, unlabeled_df) : pandas DataFrames built from the
      'train', 'test' and 'unsupervised' splits respectively.

  Raises
  ------
  Whatever indexing the loaded dataset raises when one of the three splits is missing.
  """
  print("loading dataset...")
  splits = load_dataset(dataset_name)
  print("loading done")

  train_df, test_df, unlabeled_df = (
      pd.DataFrame(splits[split_name])
      for split_name in ('train', 'test', 'unsupervised')
  )
  return train_df, test_df, unlabeled_df

In [None]:
# Load the 'imdb' dataset's train / test / unsupervised splits as DataFrames
# and display the shape of the training frame.
train_df, test_df, unlabeled_df = load_data_into_df('imdb')
train_df.shape

loading dataset...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1916.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1054.0, style=ProgressStyle(description…


Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=84125825.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a. Subsequent calls will reuse this data.
loading done


(25000, 2)

In [None]:
test_df.shape

(25000, 2)

#Preprocessing
Computing embeddings for each sentence, together with the additional data needed to perform augmentation

In [None]:
# Number of augmented sentences to create for each original sentence.
augment_rate = 7
# Strength handed to EDAAugmenter below (presumably the fraction of words EDA perturbs — TODO confirm in augment.py).
augment_strength = 0.2

In [None]:
# Text augmenter used by augment_and_save; EDAAugmenter presumably comes from `from augment import *` — verify.
augmenter1 = EDAAugmenter(augment_strength)

In [None]:
def get_sentence_embeddings(sentences):
    """Embed a batch of sentences with the module-level BERT `model`.

    Parameters
    ----------
    sentences : list of str
        Sentences to embed; they are tokenized together with padding and truncation.

    Returns
    -------
    torch.Tensor
        One row per sentence: the first-token slice (`[:, 0]`) of the model's
        first output (the last hidden state for `BertModel`), computed without
        gradients and left on `device`.
    """
    # Send the inputs to the same device the model was moved to. The previous
    # hard-coded 'cuda' crashed on CPU-only runtimes even though `device`
    # already falls back to the CPU.
    encoded_input = tokenizer(
        sentences, padding=True, truncation=True, return_tensors='pt'
    ).to(device)

    # Inference only: no autograd graph is needed for feature extraction.
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Keep only the first token's vector of every sequence.
    return model_output[0][:, 0]

In [None]:
def augment_and_save(x_data, y_data, fname):
  """Augment sentences, embed them, factorise every group with SVD and pickle the result.

  For each sentence in `x_data`, `augment_rate` variants are produced with the
  module-level `augmenter1`. The sentence and its variants form one group of
  `augment_rate + 1` texts that is embedded with `get_sentence_embeddings`,
  mean-centred, and decomposed with a reduced SVD.

  The pickled object is the list
    [sentences, sentence embeddings, augmented sentence embeddings,
     mean group embedding, U, S, Vh, labels]
  where the augmented embeddings are stored flat (all groups concatenated,
  `augment_rate` consecutive rows per sentence).

  Parameters
  ----------
  x_data : sequence of str, the sentences of this partition.
  y_data : labels, stored unchanged as the last element of the pickle.
  fname  : path of the pickle file to write.

  Returns
  -------
  0 on success.

  Raises
  ------
  ValueError : if the augmenter does not return exactly `augment_rate`
      sentences; a different count would silently misalign every later group.
  """
  data_size = len(x_data)
  group_size = augment_rate + 1   # original sentence + its augmented variants

  # augmenting
  print("augmenting")
  augmented_sentences = []
  for i in range(data_size):
    sentence = x_data[i]
    variants = augmenter1(sentence, augment_rate)
    if len(variants) != augment_rate:
      raise ValueError(
          "augmenter returned %d sentences for item %d, expected %d"
          % (len(variants), i, augment_rate))
    # append/extend keeps this loop linear (list + list re-copied everything each time)
    augmented_sentences.append(sentence)
    augmented_sentences.extend(variants)

  # getting sentence embeddings, `step` texts at a time
  print("getting sentence embeddings")
  embeddings = []
  step = 8
  total_intervals = math.ceil(len(augmented_sentences)/step)
  slash_cnt = 0
  for i in range(total_intervals):
    # text progress bar: at most 50 bars over the whole run
    while slash_cnt < i*50/total_intervals:
      print("|", end="")
      slash_cnt += 1
    start = i*step
    end = min(start+step, len(augmented_sentences))
    embeddings.append(get_sentence_embeddings(augmented_sentences[start:end]))
  embeddings = torch.cat(embeddings)
  print()

  print("augmenting on embeddings")
  main_sentences_em = []
  aug_sentences_em = []
  embedding_means = []
  centered_groups = []
  for i in range(data_size):
    group = embeddings[i*group_size:(i+1)*group_size, :]
    main_sentences_em.append(group[0:1, :])
    # was group[1:8], which is only right when augment_rate == 7
    aug_sentences_em.append(group[1:group_size, :])
    mean = torch.mean(group, 0, True)
    centered_groups.append(group - mean)
    embedding_means.append(mean)

  ret = [
      x_data,
      torch.cat(main_sentences_em).cpu().detach().numpy(),
      torch.cat(aug_sentences_em).cpu().detach().numpy(),
      torch.cat(embedding_means).cpu().detach().numpy(),
  ]

  # performing SVD on all centred groups at once (batched, reduced form)
  print("performing svd")
  Us, Ss, Vs = torch.linalg.svd(torch.stack(centered_groups), full_matrices=False)

  ret.append(Us.cpu().detach().numpy())
  ret.append(Ss.cpu().detach().numpy())
  ret.append(Vs.cpu().detach().numpy())   # third factor of torch.linalg.svd is Vh
  ret.append(y_data)

  with open(fname, 'wb') as f:
    pickle.dump(ret, f)

  return 0

In [None]:
# preprocessing and saving to drive
# Preprocess the training split in 25 partitions of 1000 sentences and save each to Drive.
path = '/content/drive/MyDrive/UG_Project/imdb_processed_2/'
train_X = list(train_df['text'].values)
train_y = list(train_df['label'].values)
for i in range(25):
  fname = f"{path}train_{i}.pickle"
  print(f"starting {fname}")
  start, end = i*1000, (i + 1)*1000
  augment_and_save(train_X[start:end], train_y[start:end], fname)
  print(f"{fname} done")
  print("#"*50, end="\n\n")

  # release cached GPU memory before the next partition
  torch.cuda.empty_cache()

NameError: ignored

In [None]:
# preprocessing and saving to drive
# Preprocess the test split in 25 partitions of 1000 sentences and save each to Drive.
path = '/content/drive/MyDrive/UG_Project/imdb_processed_2/'
test_X = list(test_df['text'].values)
test_y = list(test_df['label'].values)
for i in range(25):
  fname = f"{path}test_{i}.pickle"
  print(f"starting {fname}")
  start, end = i*1000, (i + 1)*1000
  augment_and_save(test_X[start:end], test_y[start:end], fname)
  print(f"{fname} done")
  print("#"*100, end="\n\n")

  # release cached GPU memory before the next partition
  torch.cuda.empty_cache()

starting /content/drive/MyDrive/UG_Project/imdb_processed_2/test_20.pickle
augmenting
getting sentence embeddings
||||||||||||||||||||||||||||||||||||||||||||||||||
augmenting on embeddings
performing svd
/content/drive/MyDrive/UG_Project/imdb_processed_2/test_20.pickle done
####################################################################################################

starting /content/drive/MyDrive/UG_Project/imdb_processed_2/test_21.pickle
augmenting
getting sentence embeddings
||||||||||||||||||||||||||||||||||||||||||||||||||
augmenting on embeddings
performing svd
/content/drive/MyDrive/UG_Project/imdb_processed_2/test_21.pickle done
####################################################################################################

starting /content/drive/MyDrive/UG_Project/imdb_processed_2/test_22.pickle
augmenting
getting sentence embeddings
||||||||||||||||||||||||||||||||||||||||||||||||||
augmenting on embeddings
performing svd
/content/drive/MyDrive/UG_Project/imdb_

In [None]:
# preprocessing and saving to drive
# Preprocess the first 25 partitions (1000 sentences each) of the unlabeled split and save each to Drive.
path = '/content/drive/MyDrive/UG_Project/imdb_processed_2/'
u_X = list(unlabeled_df['text'].values)
u_y = list(unlabeled_df['label'].values)
for i in range(25):
  fname = f"{path}u_{i}.pickle"
  print(f"starting {fname}")
  start, end = i*1000, (i + 1)*1000
  augment_and_save(u_X[start:end], u_y[start:end], fname)
  print(f"{fname} done")
  print("#"*100, end="\n\n")

  # release cached GPU memory before the next partition
  torch.cuda.empty_cache()

starting /content/drive/MyDrive/UG_Project/imdb_processed_2/u_0.pickle
augmenting
getting sentence embeddings
||||||||||||||||||||||||||||||||||||||||||||||||||
augmenting on embeddings
performing svd
/content/drive/MyDrive/UG_Project/imdb_processed_2/u_0.pickle done
####################################################################################################

starting /content/drive/MyDrive/UG_Project/imdb_processed_2/u_1.pickle
augmenting
getting sentence embeddings
||||||||||||||||||||||||||||||||||||||||||||||||||
augmenting on embeddings
performing svd
/content/drive/MyDrive/UG_Project/imdb_processed_2/u_1.pickle done
####################################################################################################

starting /content/drive/MyDrive/UG_Project/imdb_processed_2/u_2.pickle
augmenting
getting sentence embeddings
||||||||||||||||||||||||||||||||||||||||||||||||||
augmenting on embeddings
performing svd
/content/drive/MyDrive/UG_Project/imdb_processed_2/u_2.pick

In [None]:
# Continue with unlabeled partitions 25..49; relies on `path`, `u_X` and `u_y` from the previous cell.
for i in range(25, 50):
  fname = f"{path}u_{i}.pickle"
  print(f"starting {fname}")
  start, end = i*1000, (i + 1)*1000
  augment_and_save(u_X[start:end], u_y[start:end], fname)
  print(f"{fname} done")
  print("#"*100, end="\n\n")

  # release cached GPU memory before the next partition
  torch.cuda.empty_cache()

starting /content/drive/MyDrive/UG_Project/imdb_processed_2/u_25.pickle
augmenting
getting sentence embeddings
||||||||||||||||||||||||||||||||||||||||||||||||||
augmenting on embeddings
performing svd
/content/drive/MyDrive/UG_Project/imdb_processed_2/u_25.pickle done
####################################################################################################

starting /content/drive/MyDrive/UG_Project/imdb_processed_2/u_26.pickle
augmenting
getting sentence embeddings
||||||||||||||||||||||||||||||||||||||||||||||||||
augmenting on embeddings
performing svd
/content/drive/MyDrive/UG_Project/imdb_processed_2/u_26.pickle done
####################################################################################################

starting /content/drive/MyDrive/UG_Project/imdb_processed_2/u_27.pickle
augmenting
getting sentence embeddings
||||||||||||||||||||||||||||||||||||||||||||||||||
augmenting on embeddings
performing svd
/content/drive/MyDrive/UG_Project/imdb_processed_2/u_2

#Loading From File

In [None]:
def _load_split(path, prefix, n_partitions, augment_rate):
  """Merge the pickles `<path><prefix>_<i>.pickle`, i in [0, n_partitions), into one dict of arrays."""
  sentences = []
  labels = []
  chunks = [[] for _ in range(6)]   # per-partition pieces of pickle entries 1..6

  for i in range(n_partitions):
    print(".", end="")
    fpath = path + prefix + "_" + str(i) + ".pickle"
    # NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
    with open(fpath, 'rb') as f:
      data = pickle.load(f)

    sentences.extend(data[0])
    for bucket, part in zip(chunks, data[1:7]):
      bucket.append(part)
    labels.extend(data[7])

  s_embeddings, a_s_embeddings, m_a_embeddings, Us, Ss, Vs = (
      np.concatenate(bucket) for bucket in chunks)

  # The augmented embeddings are stored flat, `augment_rate` consecutive rows
  # per sentence. Regroup them as (n_sentences, augment_rate, ...). The old
  # sliding window a[i:i+7] paired sentence i with rows i..i+6, i.e. mostly
  # with augmentations of *other* sentences.
  n_groups = a_s_embeddings.shape[0] // augment_rate
  a_s_embeddings = a_s_embeddings[:n_groups*augment_rate].reshape(
      (n_groups, augment_rate) + a_s_embeddings.shape[1:])

  return {
      'sentences': np.array(sentences).reshape(-1, 1),
      's_embeddings': s_embeddings,
      'a_s_embeddings': a_s_embeddings,
      'm_a_embeddings': m_a_embeddings,
      'Us': Us,
      'Ss': Ss,
      'Vs': Vs,
      'y': np.array(labels).reshape(-1, 1),
  }


def load_from_file(path, n_partitions=25, augment_rate=7):
  """Load the preprocessed train and test partitions written by `augment_and_save`.

  Each pickle holds
    [sentence, sentence embedding, aug_sentence_embeddings,
     mean augmented embedding, U, S, V, label]

  Parameters
  ----------
  path : directory prefix; file names are appended by plain string
      concatenation, so it must end with a path separator.
  n_partitions : number of `train_<i>.pickle` / `test_<i>.pickle` files to read.
  augment_rate : number of augmented embeddings stored per sentence.

  Returns
  -------
  (train_data, test_data) : dicts with keys 'sentences', 's_embeddings',
      'a_s_embeddings', 'm_a_embeddings', 'Us', 'Ss', 'Vs', 'y'.
      'sentences' and 'y' are column vectors; 'a_s_embeddings' has one
      group of `augment_rate` rows per sentence.
  """
  print("loading train data ", end="")
  train_data = _load_split(path, "train", n_partitions, augment_rate)
  print(" done")

  print("loading test data ", end="")
  test_data = _load_split(path, "test", n_partitions, augment_rate)
  print(" done")

  return train_data, test_data

In [None]:
def load_unlabeled_data(path, partition_no, augment_rate=7):
  """Load one preprocessed partition of the unlabeled split.

  Parameters
  ----------
  path : directory prefix; the file name `u_<partition_no>.pickle` is appended
      by plain string concatenation, so it must end with a path separator.
  partition_no : index of the partition file to read.
  augment_rate : number of augmented embeddings stored per sentence.

  Returns
  -------
  dict with keys 'sentences', 's_embeddings', 'a_s_embeddings',
  'm_a_embeddings', 'Us', 'Ss', 'Vs' (labels are not returned).
  'a_s_embeddings' has one group of `augment_rate` rows per sentence.
  """
  fpath = path+"u_"+str(partition_no)+".pickle"
  # NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
  with open(fpath, 'rb') as f:
    data = pickle.load(f)

  # The augmented embeddings are stored flat, `augment_rate` consecutive rows
  # per sentence. The old sliding window a[i:i+7] mixed in rows that belong to
  # neighbouring sentences; regroup by whole blocks instead.
  u_a_s_embeddings = np.array(data[2])
  n_groups = u_a_s_embeddings.shape[0] // augment_rate
  u_a_s_embeddings = u_a_s_embeddings[:n_groups*augment_rate].reshape(
      (n_groups, augment_rate) + u_a_s_embeddings.shape[1:])

  return {
      'sentences': np.array(data[0]).reshape(-1, 1),   # original sentences, as a column
      's_embeddings': np.array(data[1]),               # sentence embeddings
      'a_s_embeddings': u_a_s_embeddings,              # augmented sentence embeddings
      'm_a_embeddings': np.array(data[3]),             # mean augmented embeddings
      'Us': np.array(data[4]),                         # U of SVD
      'Ss': np.array(data[5]),                         # S of SVD
      'Vs': np.array(data[6]),                         # Vh of SVD
  }

In [None]:
def shuffle_arrays(arrays, set_seed=-1):
    """Apply one common random permutation to every array, in place, along axis 0.

    Parameters:
    -----------
    arrays : List of NumPy arrays, all with the same length along axis 0.
    set_seed : Seed to use when it is an int >= 0; any negative value draws a
        fresh seed from NumPy's global random state.
    """
    assert len({len(arr) for arr in arrays}) <= 1

    if set_seed < 0:
        seed = np.random.randint(0, 2**31 - 1)
    else:
        seed = set_seed

    # A fresh generator with the same seed per array reproduces the same
    # sequence of swaps, so all arrays end up in the same order.
    for arr in arrays:
        np.random.RandomState(seed).shuffle(arr)

In [None]:
def split_train_dev(data, train_ratio = 0.004):
  """Shuffle a data dict and split it row-wise into a train part and a dev part.

  All eight arrays of `data` are shuffled IN PLACE with one common
  permutation (so `data` itself is reordered), then the first
  `int(rows * train_ratio)` rows go to the train dict and the remaining rows
  to the dev dict.

  Parameters
  ----------
  data : dict with keys 'sentences', 's_embeddings', 'a_s_embeddings',
      'm_a_embeddings', 'Us', 'Ss', 'Vs', 'y'.
  train_ratio : fraction of rows assigned to the train part.

  Returns
  -------
  (train_data, dev_data) : dicts with the same keys as `data`.
  """
  sentences = data['sentences']
  s_embeddings = data['s_embeddings']
  a_s_embeddings = data['a_s_embeddings']
  m_a_embeddings = data['m_a_embeddings']
  Us = data['Us']
  Ss = data['Ss']
  Vs = data['Vs']
  y = data['y']

  # shuffling: same permutation for every array
  shuffle_arrays([sentences, s_embeddings, a_s_embeddings, m_a_embeddings, Us, Ss, Vs, y])

  train_rows = int(sentences.shape[0]*train_ratio)

  def take(rows):
    # Select the given row range from every array.
    return {
        'sentences': sentences[rows, :],
        's_embeddings': s_embeddings[rows, :],
        'a_s_embeddings': a_s_embeddings[rows, :],
        'm_a_embeddings': m_a_embeddings[rows, :],
        'Us': Us[rows, :, :],
        'Ss': Ss[rows, :],
        'Vs': Vs[rows, :, :],
        'y': y[rows, :],
    }

  return take(slice(None, train_rows)), take(slice(train_rows, None))

In [None]:
# Read back the preprocessed train and test partitions saved to Drive by the preprocessing cells.
all_train_data, test_data = load_from_file('/content/drive/MyDrive/UG_Project/imdb_processed_2/')

loading train data ......................... done
loading test data ......................... done


#Datasets and Dataloaders

In [None]:
class DoubleA_Dataset(torch.utils.data.IterableDataset):
  """Iterable dataset that yields (embedding, label) pairs, optionally augmented in SVD space.

  With probability `augment_prob` a sample is replaced by `augment(i)`,
  otherwise the stored sentence embedding is yielded unchanged.
  """

  def __init__(self, data_dic, augment_prob, augment_strength, augment_rate):
    # data_dic layout: sentence, sentence embedding, aug_sentence_embeddings,
    #                  mean augmented embedding, U, S, V, label
    self.sentences = data_dic['sentences']
    self.s_embeddings = data_dic['s_embeddings']
    self.a_s_embeddings = data_dic['a_s_embeddings']
    self.m_a_embeddings = data_dic['m_a_embeddings']
    self.Us = data_dic['Us']
    self.Ss = data_dic['Ss']
    self.Vs = data_dic['Vs']
    self.y = data_dic['y']

    self.augment_rate = augment_rate + 1   # original sentence + its augmentations
    self.augment_prob = augment_prob
    self.augment_strength = augment_strength
    self.data_len = self.sentences.shape[0]

  def augment(self, idx):
    """Return a float32 vector: the group mean of sample `idx` plus a noisy latent point mapped back through V."""
    group_mean = self.m_a_embeddings[idx:idx+1, :]
    V = self.Vs[idx, :, :]

    # rows of U @ diag(S): latent coordinates of the group's sentences
    latent = self.Us[idx, :, :].dot(np.diag(self.Ss[idx, :]))
    latent_mean = latent.mean(axis=0, keepdims=True)
    latent_std = latent.std(axis=0, keepdims=True)

    # Gaussian noise scaled per latent axis by its spread and by augment_strength
    noise = np.random.randn(1, latent_mean.shape[1])*latent_std*self.augment_strength
    augmented = group_mean + (latent_mean + noise).dot(V)
    return augmented.reshape(-1,).astype(np.float32)

  def __iter__(self):
    info = torch.utils.data.get_worker_info()
    if info is None:
      # single-process loading: iterate over everything
      first, last = 0, self.data_len
    else:
      # multi-process loading: each worker handles one contiguous slice
      chunk = int(math.ceil(self.data_len / float(info.num_workers)))
      first = info.id * chunk
      last = min(first + chunk, self.data_len)

    for i in range(first, last):
      if np.random.rand() < self.augment_prob:
        sample = self.augment(i)
      else:
        sample = self.s_embeddings[i]
      yield sample, self.y[i]

  def __len__(self):
    info = torch.utils.data.get_worker_info()
    if info is not None:
      return int(math.ceil(self.data_len / float(info.num_workers)))
    return self.data_len

In [None]:
class Mixup_Dataset(torch.utils.data.IterableDataset):
  """Iterable dataset that yields (embedding, label) pairs with optional mixup.

  With probability `augment_prob` a sample is replaced by a convex
  combination of itself and another randomly chosen sample; the label is mixed
  with the same coefficient, drawn from Beta(alphas[0], alphas[1]).

  Parameters
  ----------
  data_dic : dict providing NumPy arrays under 's_embeddings' and 'y'.
  augment_prob : probability of yielding a mixed sample instead of the original.
  alphas : pair of Beta distribution parameters.
  augment_rate : stored as `augment_rate + 1`; not otherwise used by this class.
  """

  def __init__(self, data_dic, augment_prob, alphas, augment_rate):
    self.data_len = data_dic['s_embeddings'].shape[0]

    # Shuffle private copies. The previous in-place shuffle reordered the
    # caller's 's_embeddings' and 'y' but none of the other arrays in the same
    # dict, leaving the shared dict internally misaligned for later users.
    order = np.random.permutation(self.data_len)
    self.s_embeddings = data_dic['s_embeddings'][order]   # sentence embeddings
    self.y = data_dic['y'][order]                         # labels

    self.augment_rate = augment_rate + 1
    self.augment_prob = augment_prob
    self.alphas = alphas

  def _pick_partner(self, idx):
    """Return a uniformly random index different from `idx` (or `idx` itself if it is the only sample)."""
    if self.data_len < 2:
      # The old rejection loop never terminated here; mix the sample with itself.
      return idx
    partner = np.random.randint(0, self.data_len - 1)
    # Skip over idx so that every other index is equally likely.
    return partner + 1 if partner >= idx else partner

  def _index_range(self):
    """Return the (start, stop) range this process should iterate over."""
    info = torch.utils.data.get_worker_info()
    if info is None:
      return 0, self.data_len
    per_worker = int(math.ceil(self.data_len / float(info.num_workers)))
    start = info.id * per_worker
    return start, min(start + per_worker, self.data_len)

  def augment(self, idx):
    """Return (mixed float32 embedding, mixed label) for sample `idx`."""
    partner = self._pick_partner(idx)
    lam = np.random.beta(self.alphas[0], self.alphas[1])

    mixed = self.s_embeddings[idx, :]*lam + self.s_embeddings[partner, :]*(1-lam)
    label = self.y[idx]*lam + self.y[partner]*(1-lam)
    return mixed.reshape(-1,).astype(np.float32), label

  def __iter__(self):
    start, stop = self._index_range()
    for i in range(start, stop):
      if np.random.rand() < self.augment_prob:
        yield self.augment(i)
      else:
        yield self.s_embeddings[i], self.y[i]

  def __len__(self):
    info = torch.utils.data.get_worker_info()
    if info is None:
      return self.data_len
    return int(math.ceil(self.data_len / float(info.num_workers)))

In [None]:
class Stitchup_Dataset(torch.utils.data.IterableDataset):
  """Iterable dataset that yields (embedding, label) pairs with optional stitch-up.

  With probability `augment_prob` a sample is replaced by a vector whose
  coordinates are taken element-wise either from the sample (probability
  `lam`) or from another randomly chosen sample; the label is the
  `lam`-weighted mix of both labels, `lam` ~ Beta(alphas[0], alphas[1]).

  Parameters
  ----------
  data_dic : dict providing NumPy arrays under 's_embeddings' and 'y'.
  augment_prob : probability of yielding a stitched sample instead of the original.
  alphas : pair of Beta distribution parameters.
  augment_rate : stored as `augment_rate + 1`; not otherwise used by this class.
  """

  def __init__(self, data_dic, augment_prob, alphas, augment_rate):
    self.data_len = data_dic['s_embeddings'].shape[0]

    # Shuffle private copies. The previous in-place shuffle reordered the
    # caller's 's_embeddings' and 'y' but none of the other arrays in the same
    # dict, leaving the shared dict internally misaligned for later users.
    order = np.random.permutation(self.data_len)
    self.s_embeddings = data_dic['s_embeddings'][order]   # sentence embeddings
    self.y = data_dic['y'][order]                         # labels

    self.augment_rate = augment_rate + 1
    self.augment_prob = augment_prob
    self.alphas = alphas

  def _pick_partner(self, idx):
    """Return a uniformly random index different from `idx` (or `idx` itself if it is the only sample)."""
    if self.data_len < 2:
      # The old rejection loop never terminated here; stitch the sample with itself.
      return idx
    partner = np.random.randint(0, self.data_len - 1)
    # Skip over idx so that every other index is equally likely.
    return partner + 1 if partner >= idx else partner

  def _index_range(self):
    """Return the (start, stop) range this process should iterate over."""
    info = torch.utils.data.get_worker_info()
    if info is None:
      return 0, self.data_len
    per_worker = int(math.ceil(self.data_len / float(info.num_workers)))
    start = info.id * per_worker
    return start, min(start + per_worker, self.data_len)

  def augment(self, idx):
    """Return (stitched float32 embedding, mixed label) for sample `idx`."""
    emb = self.s_embeddings[idx, :]
    partner = self._pick_partner(idx)
    emb2 = self.s_embeddings[partner, :]

    lam = np.random.beta(self.alphas[0], self.alphas[1])

    # One vectorised draw per coordinate: True keeps this sample's value,
    # False takes the partner's.
    keep = np.random.rand(emb.shape[-1]) < lam
    stitched = np.where(keep, emb, emb2)
    label = self.y[idx]*lam + self.y[partner]*(1-lam)
    return stitched.reshape(-1,).astype(np.float32), label

  def __iter__(self):
    start, stop = self._index_range()
    for i in range(start, stop):
      if np.random.rand() < self.augment_prob:
        yield self.augment(i)
      else:
        yield self.s_embeddings[i], self.y[i]

  def __len__(self):
    info = torch.utils.data.get_worker_info()
    if info is None:
      return self.data_len
    return int(math.ceil(self.data_len / float(info.num_workers)))

In [None]:
class EDAOnly_Dataset(torch.utils.data.IterableDataset):
  """Iterable dataset that yields (embedding, label) pairs using precomputed text augmentations.

  With probability `augment_prob` a sample is replaced by a uniformly random
  member of its group: the original sentence embedding or one of its
  `augment_rate` precomputed augmented embeddings.

  Parameters
  ----------
  data_dic : dict providing NumPy arrays under 's_embeddings',
      'a_s_embeddings' (one group of augmented rows per sentence) and 'y'.
  augment_prob : probability of yielding a group member instead of the original.
  augment_rate : number of augmented embeddings per sentence; stored as
      `augment_rate + 1` (the group size).
  """

  def __init__(self, data_dic, augment_prob, augment_rate):
    self.data_len = data_dic['s_embeddings'].shape[0]

    # Shuffle private copies. The previous in-place shuffle reordered three of
    # the caller's arrays but none of the other arrays in the same dict,
    # leaving the shared dict internally misaligned for later users.
    order = np.random.permutation(self.data_len)
    self.s_embeddings = data_dic['s_embeddings'][order]       # sentence embeddings
    self.a_s_embeddings = data_dic['a_s_embeddings'][order]   # augmented sentence embeddings
    self.y = data_dic['y'][order]                             # labels

    self.augment_rate = augment_rate + 1
    self.augment_prob = augment_prob

  def _index_range(self):
    """Return the (start, stop) range this process should iterate over."""
    info = torch.utils.data.get_worker_info()
    if info is None:
      return 0, self.data_len
    per_worker = int(math.ceil(self.data_len / float(info.num_workers)))
    start = info.id * per_worker
    return start, min(start + per_worker, self.data_len)

  def augment(self, idx):
    """Return (float32 embedding of a random group member, label) for sample `idx`."""
    # 0 selects the original sentence, k >= 1 the (k-1)-th augmented one;
    # indexing directly avoids concatenating the whole group for every sample.
    choice = np.random.randint(0, self.augment_rate)
    if choice == 0:
      picked = self.s_embeddings[idx, :]
    else:
      picked = self.a_s_embeddings[idx, choice - 1, :]
    return picked.reshape(-1,).astype(np.float32), self.y[idx]

  def __iter__(self):
    start, stop = self._index_range()
    for i in range(start, stop):
      if np.random.rand() < self.augment_prob:
        yield self.augment(i)
      else:
        yield self.s_embeddings[i], self.y[i]

  def __len__(self):
    info = torch.utils.data.get_worker_info()
    if info is None:
      return self.data_len
    return int(math.ceil(self.data_len / float(info.num_workers)))

#Classifier Model

In [None]:
class ClassifierModel(torch.nn.Module):
  """Feed-forward classifier over 768-dimensional embeddings.

  Architecture: Linear(768, h1) -> Dropout(0.2) -> ReLU -> Linear(h1, h2)
  -> Dropout(0.2) -> ReLU -> Linear(h2, output_dim). All linear layers use
  Xavier-uniform weights and zero biases.
  """

  def __init__(self, h1, h2, output_dim):
    super().__init__()

    self.linear1 = torch.nn.Linear(768, h1)
    self.dropout1 = torch.nn.Dropout(0.2)
    self.activation1 = torch.nn.ReLU()

    self.linear2 = torch.nn.Linear(h1, h2)
    self.dropout2 = torch.nn.Dropout(0.2)
    self.activation2 = torch.nn.ReLU()

    self.linear3 = torch.nn.Linear(h2, output_dim)

    # initial weights are produced by the same routine used for re-initialisation
    self.reset()

  def forward(self, embedding_batch):
    """Map `embedding_batch` of shape [batch_size, embedding_length] to [batch_size, output_dim]."""
    hidden = self.activation1(self.dropout1(self.linear1(embedding_batch)))
    hidden = self.activation2(self.dropout2(self.linear2(hidden)))
    return self.linear3(hidden)

  def reset(self):
    """Re-initialise every linear layer: Xavier-uniform weights, zero biases."""
    for layer in (self.linear1, self.linear2, self.linear3):
      torch.nn.init.xavier_uniform_(layer.weight)
      torch.nn.init.zeros_(layer.bias)

In [None]:
# Classifier with h1 = h2 = 250 hidden units and output_dim = 2, moved to the compute device;
# the bare `net` displays the module structure.
net = ClassifierModel(250, 250, 2).to(device)
net

ClassifierModel(
  (linear1): Linear(in_features=768, out_features=250, bias=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (activation1): ReLU()
  (linear2): Linear(in_features=250, out_features=250, bias=True)
  (dropout2): Dropout(p=0.2, inplace=False)
  (activation2): ReLU()
  (linear3): Linear(in_features=250, out_features=2, bias=True)
)

# Training

In [None]:
# Replace TensorFlow's `tf.io.gfile` with the stub bundled in TensorBoard.
# NOTE(review): presumably the usual workaround for SummaryWriter errors when
# both TensorFlow and TensorBoard are installed -- confirm it is still needed.
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile

In [None]:
def get_accuracy(loader, net, device=None):
  """Compute the classification accuracy of `net` over `loader`, in percent.

  The network is switched to evaluation mode for the duration of the call
  (so dropout is disabled) and its previous train/eval mode is restored
  afterwards.

  Args:
    loader: iterable of batches; each batch is indexable with the inputs at
      position 0 and the integer class labels at position 1. Labels may be
      shaped [batch] or [batch, 1].
    net: a torch.nn.Module returning logits of shape [batch, num_classes].
    device: device the batches are moved to. When None, the device of the
      network's first parameter is used; if the network has no parameters
      the batches are left where they are.

  Returns:
    Accuracy as a float in [0, 100]. Returns 0.0 when the loader yields no
    samples.
  """
  if device is None:
    first_param = next(net.parameters(), None)
    if first_param is not None:
      device = first_param.device

  was_training = net.training
  net.eval()  # disable dropout while measuring accuracy

  total = 0
  correct = 0
  try:
    with torch.no_grad():
      for data in loader:
        X, y = data[0], data[1]
        if device is not None:
          X, y = X.to(device), y.to(device)

        y_pred = net(X).argmax(dim=1)

        # Flatten both sides so [batch] and [batch, 1] labels compare
        # element-wise instead of broadcasting to [batch, batch].
        correct += (y_pred.reshape(-1) == y.reshape(-1)).sum().item()
        total += y.size(0)
  finally:
    net.train(was_training)

  if total == 0:
    return 0.0
  return correct*100.0/total

In [None]:
# Load the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard

In [None]:
# Open TensorBoard inline on the directory that the SummaryWriter runs are written under.
%tensorboard --logdir /content/drive/MyDrive/UG_Project/runs

In [None]:
# Force a garbage-collection pass; the displayed value is the number of
# unreachable objects the collector found.
gc.collect()

592

In [None]:
# Split the labeled training pool into train and dev parts.
# NOTE(review): 0.004 is presumably the fraction kept for training (the
# resulting train loader has only 7 batches of 16) -- confirm against
# split_train_dev.
train_data, dev_data = split_train_dev(all_train_data, 0.004)

In [None]:
# Release cached GPU memory before building the datasets for this run.
torch.cuda.empty_cache()

### no augment and doubleA
# Every split uses the same settings with augmentation switched off.
# `train_ds_for_test` is a second view of the training data that is used
# only to measure the final training accuracy.
train_ds_for_test = DoubleA_Dataset(
    train_data,
    augment_prob=0,
    augment_strength=0,
    augment_rate=7,
)
train_dataset = DoubleA_Dataset(
    train_data,
    augment_prob=0,
    augment_strength=0,
    augment_rate=7,
)
dev_dataset = DoubleA_Dataset(
    dev_data,
    augment_prob=0,
    augment_strength=0,
    augment_rate=7,
)
test_dataset = DoubleA_Dataset(
    test_data,
    augment_prob=0,
    augment_strength=0,
    augment_rate=7,
)

### EDA only
# train_dataset = EDAOnly_Dataset(train_data, augment_prob=1, augment_rate=7)
# dev_dataset = EDAOnly_Dataset(dev_data, augment_prob=0, augment_rate=7)
# test_dataset = EDAOnly_Dataset(test_data, augment_prob=0, augment_rate=7)

### Mix-up & Stitchup
# alphas = (0.1, 0.1)
# train_ds_for_test = Stitchup_Dataset(train_data, augment_prob=0,
#                               alphas=alphas, augment_rate=7) 
# train_dataset = Stitchup_Dataset(train_data, augment_prob=0.2,
#                               alphas=alphas, augment_rate=7)
# dev_dataset = Stitchup_Dataset(dev_data, augment_prob=0,
#                               alphas=alphas, augment_rate=7)
# test_dataset = Stitchup_Dataset(test_data, augment_prob=0,
#                               alphas=alphas, augment_rate=7)

# All loaders use batches of 16 and load in the main process.
train_for_test_loader = DataLoader(train_ds_for_test, batch_size=16, num_workers=0)
train_loader = DataLoader(train_dataset, batch_size=16, num_workers=0)
dev_loader = DataLoader(dev_dataset, batch_size=16, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=16, num_workers=0)

In [None]:
# Re-initialise the classifier in place (Xavier-uniform weights, zero biases)
# so this run does not start from previously trained parameters.
net.reset()

In [None]:
# Cross-entropy over the classifier's logits, optimised with Adam at lr=1e-4.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-4)

In [None]:
# TensorBoard writer for this run; the last path component names the run.
writer = SummaryWriter('/content/drive/MyDrive/UG_Project/runs/imdb_noaug_18')

In [None]:
# Pull a single batch from the training loader and move it to the device.
# The built-in next() is used because DataLoader iterators implement
# __next__; the legacy `.next()` method is not available on recent PyTorch
# releases and raises AttributeError there.
dataiter = iter(train_loader)
X, y = next(dataiter)
X = X.to(device)
y = y.to(device)

In [None]:
# Log the model graph to TensorBoard using the sample batch X as example
# input, then close the writer.
writer.add_graph(net, X)
writer.close()

In [None]:
# Supervised training loop.
#
# Each epoch makes one full pass over `train_loader`, then evaluates on the
# dev set once and logs the mean training loss of that epoch. The number of
# batches is counted rather than hard-coded, so the logging stays correct if
# the training split or batch size changes.
running_loss = 0.0
epochs = 100
# NOTE: despite its name, `best_epoch` holds the best dev *accuracy* seen so
# far (not an epoch index); the name is kept because later cells read it.
best_epoch = 0

for epoch in range(epochs):
  print("#"*100)
  print("epoch", epoch, ":")

  net.train()  # make sure dropout is active while optimising
  running_loss = 0.0
  batches_seen = 0

  for data in train_loader:
    # Labels are flattened to [batch] and cast to long for CrossEntropyLoss.
    inputs, labels = data[0].to(device), data[1].reshape(-1,).to(device).long()

    optimizer.zero_grad()

    outputs = net(inputs)

    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    running_loss += loss.item()
    batches_seen += 1

  if batches_seen == 0:
    # Nothing was trained on this epoch; there is nothing meaningful to log.
    continue

  # Evaluate with dropout disabled, then return to training mode.
  net.eval()
  cv_accuracy = get_accuracy(dev_loader, net)
  net.train()

  epoch_loss = running_loss/batches_seen
  # Index of the last optimisation step of this epoch.
  global_step = (epoch + 1)*batches_seen - 1

  writer.add_scalar('dev accuracy', cv_accuracy, global_step)
  writer.add_scalar('training loss', epoch_loss, global_step)

  print(f"running loss: {epoch_loss}")
  print(f"dev accuracy: {cv_accuracy}")
  best_epoch = max(best_epoch, cv_accuracy)
  running_loss = 0.0

print(f"Best epoch accuracy: {best_epoch}")
writer.close()

####################################################################################################
epoch 0 :
running loss: 0.7909621851784843
dev accuracy: 51.975903614457835
####################################################################################################
epoch 1 :
running loss: 0.6993134873253959
dev accuracy: 55.799196787148595
####################################################################################################
epoch 2 :
running loss: 0.6320540479251316
dev accuracy: 58.95983935742972
####################################################################################################
epoch 3 :
running loss: 0.6014797602381025
dev accuracy: 61.493975903614455
####################################################################################################
epoch 4 :
running loss: 0.5928954567228045
dev accuracy: 63.06425702811245
####################################################################################################
epoch 5 :
runnin

In [None]:
# Record the best dev accuracy seen during training (kept in `best_epoch`)
# as a summary scalar; no global step is given.
writer.add_scalar('best epoch cv accuracy', best_epoch)

In [None]:
# Final accuracy (percent) on the training data, measured through the
# dedicated `train_for_test_loader`, logged and displayed.
train_acc = get_accuracy(train_for_test_loader, net)
writer.add_scalar('final train accuracy', train_acc)
train_acc

100.0

In [None]:
# Final accuracy (percent) on the dev split, logged and displayed.
dev_acc = get_accuracy(dev_loader, net)
writer.add_scalar('final dev accuracy', dev_acc)
dev_acc

77.29718875502007

In [None]:
# Final accuracy (percent) on the held-out test set, logged and displayed.
test_acc = get_accuracy(test_loader, net)
writer.add_scalar('final test accuracy', test_acc)
test_acc

77.384

In [None]:
# Number of batches the training loader reports per epoch.
len(train_loader)

7

# FixMatch

In [None]:
# Directory passed to U_Dataset, from which the unlabeled partitions are loaded.
data_path = '/content/drive/MyDrive/UG_Project/imdb_processed_2/'
# Attributes of data on drive
partition_len = 1000   # samples iterated per partition
partition_cnt = 50     # number of partitions iterated

# FixMatch hyperparameters
lr=1e-3                # Adam learning rate
landa = 1.5            # weight of the unlabeled loss term in the total loss ("lambda")
thresh = 0.9           # confidence threshold a weak-view prediction must reach to be used as a pseudo-label
weak_s = 1             # noise multiplier for the weak augmentation
strong_s = 10          # noise multiplier for the strong augmentation
wd = 1e-3              # Adam weight decay

In [None]:
class U_Dataset(torch.utils.data.IterableDataset):
  """Streams (weak, strong) augmented embedding pairs for unlabeled data.

  The data is stored on disk in `partition_cnt` partitions. Partitions are
  loaded one at a time with `load_unlabeled_data(path, partition)` and
  `partition_len` samples are yielded from each. With several DataLoader
  workers, every worker gets a contiguous slice of the partitions.

  Args:
    path: location passed through to `load_unlabeled_data`.
    partition_cnt: number of partitions to iterate over.
    partition_len: number of samples yielded from each partition.
    weak_strength: multiplier of the noise used for the weak augmentation.
    strong_strength: multiplier of the noise used for the strong augmentation.
    augment_rate: stored (plus one) as `self.augment_rate`; not otherwise
      used by this class.
  """

  def __init__(self, path, partition_cnt, partition_len,
               weak_strength, strong_strength, augment_rate):
    self.path = path

    self.partition_len = partition_len  # Partition Size
    self.partition_cnt = partition_cnt  # Partition Count

    self.augment_rate = augment_rate + 1
    self.weak_s = weak_strength
    self.strong_s = strong_strength

    # Contents of the currently loaded partition (filled during iteration).
    self.sentences = None               # original sentences
    self.s_embeddings = None            # sentence embeddings
    self.a_s_embeddings = None          # Augmented sentence embeddings
    self.m_a_embeddings = None          # mean augmented embeddings
    self.Us = None                      # Us
    self.Ss = None                      # Ss
    self.Vs = None                      # Vs


  def augment(self, idx):
    """Build a weakly and a strongly augmented embedding for sample `idx`.

    The sample's latent rows are `U @ diag(S)`. Their column-wise mean is
    perturbed with Gaussian noise scaled by the column-wise standard
    deviation times the weak / strong strength, projected through `V`, and
    added to the sample's mean augmented embedding.
    NOTE(review): U, S, V look like per-sample SVD factors of the augmented
    embeddings -- confirm against the preprocessing code.

    Args:
      idx: index of the sample inside the currently loaded partition.

    Returns:
      Tuple `(weak, strong)` of 1-D float32 numpy arrays.
    """
    m_a_embedding = self.m_a_embeddings[idx:idx+1,:]

    U = self.Us[idx,:,:]
    S = np.diag(self.Ss[idx,:])
    V = self.Vs[idx,:,:]

    latent_embeddings = np.dot(U, S)

    m_l_embedding = latent_embeddings.mean(axis=0, keepdims=True)
    l_embedding_std = latent_embeddings.std(axis=0, keepdims=True)

    # Weak noise is drawn first, then strong noise (order affects the RNG stream).
    weak_noise = np.random.randn(1, m_l_embedding.shape[1])*l_embedding_std*self.weak_s
    strong_noise = np.random.randn(1, m_l_embedding.shape[1])*l_embedding_std*self.strong_s
    weak_aug_l_embedding = m_l_embedding + weak_noise
    strong_aug_l_embedding = m_l_embedding + strong_noise

    ret1 = m_a_embedding + np.dot(weak_aug_l_embedding, V)
    ret2 = m_a_embedding + np.dot(strong_aug_l_embedding, V)
    return ret1.reshape(-1,).astype(np.float32), ret2.reshape(-1,).astype(np.float32)


  def _partition_range(self):
    """Return the half-open `(start, end)` range of partitions for this process."""
    worker_info = torch.utils.data.get_worker_info()
    if worker_info is None:
      # Single-process loading: iterate over every partition.
      return 0, self.partition_cnt

    per_worker = int(math.ceil(self.partition_cnt / float(worker_info.num_workers)))
    iter_start = worker_info.id * per_worker
    iter_end = min(iter_start + per_worker, self.partition_cnt)
    return iter_start, iter_end


  def _load_partition(self, partition):
    """Load one partition from disk and expose its arrays on the instance."""
    data_dic = load_unlabeled_data(self.path, partition)

    self.sentences = data_dic['sentences']                    # original sentences
    self.s_embeddings = data_dic['s_embeddings']              # sentence embeddings
    self.a_s_embeddings = data_dic['a_s_embeddings']          # Augmented sentence embeddings
    self.m_a_embeddings = data_dic['m_a_embeddings']          # mean augmented embeddings
    self.Us = data_dic['Us']                                  # Us
    self.Ss = data_dic['Ss']                                  # Ss
    self.Vs = data_dic['Vs']                                  # Vs


  def __iter__(self):
    """Yield `augment(i)` for every sample of every partition owned by this process."""
    iter_start, iter_end = self._partition_range()
    for partition in range(iter_start, iter_end):
      self._load_partition(partition)
      for i in range(self.partition_len):
        yield self.augment(i)


  def __len__(self):
    """Number of samples this process yields in one pass.

    Derived from the partition layout; the class never defines a `data_len`
    attribute, so the count is computed instead of read from one.
    """
    iter_start, iter_end = self._partition_range()
    return max(0, iter_end - iter_start) * self.partition_len

In [None]:
# Re-split the labeled pool for the FixMatch experiment, this time with 0.04.
# NOTE(review): the second argument is presumably the fraction kept for
# training -- confirm against split_train_dev.
train_data, dev_data = split_train_dev(all_train_data, 0.04)

In [None]:
# Release cached GPU memory before building the datasets for this run.
torch.cuda.empty_cache()

### no augment and doubleA
# Labeled splits are built with augmentation switched off; the unlabeled
# stream produces (weak, strong) augmented pairs with the configured strengths.
train_ds_for_test = DoubleA_Dataset(
    train_data,
    augment_prob=0,
    augment_strength=0,
    augment_rate=7,
)
u_dataset = U_Dataset(
    data_path,
    partition_cnt,
    partition_len,
    weak_strength=weak_s,
    strong_strength=strong_s,
    augment_rate=7,
)
train_dataset = DoubleA_Dataset(
    train_data,
    augment_prob=0,
    augment_strength=0,
    augment_rate=7,
)
dev_dataset = DoubleA_Dataset(
    dev_data,
    augment_prob=0,
    augment_strength=0,
    augment_rate=7,
)
test_dataset = DoubleA_Dataset(
    test_data,
    augment_prob=0,
    augment_strength=0,
    augment_rate=7,
)

# Labeled training batches hold 10 samples and unlabeled batches 500;
# the evaluation loaders use batches of 16.
train_for_test_loader = DataLoader(train_ds_for_test, batch_size=16, num_workers=0)
train_loader = DataLoader(train_dataset, batch_size=10, num_workers=0)
u_loader = DataLoader(u_dataset, batch_size=500, num_workers=0)
dev_loader = DataLoader(dev_dataset, batch_size=16, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=16, num_workers=0)

In [None]:
# Re-initialise the classifier in place (Xavier-uniform weights, zero biases)
# so the FixMatch run does not start from previously trained parameters.
net.reset()

In [None]:
# Adam with the configured learning rate and weight decay; the learning rate
# is multiplied by 0.9 on every scheduler step.
optimizer = torch.optim.Adam(net.parameters(), lr=lr, weight_decay=wd)
scheduler = ExponentialLR(optimizer, 0.9)

In [None]:
# TensorBoard writer for the FixMatch run; the last path component names the run.
writer = SummaryWriter('/content/drive/MyDrive/UG_Project/runs/fixmach_test')

In [None]:
# FixMatch training loop.
#
# Every step combines a supervised cross-entropy loss on a labeled batch with
# a consistency loss on an unlabeled batch: the prediction on the weakly
# augmented view provides a pseudo-label, and the strongly augmented view is
# trained towards it wherever the weak-view confidence reaches `thresh`.
# An epoch ends as soon as either loader is exhausted. Dev accuracy and the
# mean training loss are logged once per epoch.
running_loss = 0.0
epochs = 30
# NOTE: despite its name, `best_epoch` holds the best dev *accuracy* seen so
# far (not an epoch index); the name is kept because later cells read it.
best_epoch = 0
i_iter = 0

for epoch in range(epochs):
  print("#"*100)
  print("epoch", epoch, ":")

  net.train()  # make sure dropout is active while optimising
  running_loss = 0.0
  i_iter = 0   # number of optimisation steps taken in this epoch

  for (x, y), (x1, x2) in zip(train_loader, u_loader):
    optimizer.zero_grad()

    # labeled part
    x,y = x.to(device), y.reshape(-1,).to(device).long()
    targets_labeled = net(x)
    # labeled loss
    L_labeled = F.cross_entropy(targets_labeled, y)

    # unlabeled part
    x1,x2 = x1.to(device), x2.to(device)

    # Pseudo-labels come from the weak view. The confidence is the softmax
    # probability of the predicted class, so that `thresh` is compared with
    # a value in [0, 1] rather than with an unbounded logit. No gradient is
    # needed through this branch.
    with torch.no_grad():
      weak_probs = F.softmax(net(x1), dim=1)
      confidence, pseudo_labels = torch.max(weak_probs, 1)
      mask = confidence.ge(thresh).float()

    targets2 = net(x2)
    # Average over the whole unlabeled batch; masked-out samples contribute 0.
    L_unlabeled = (F.cross_entropy(
        targets2, pseudo_labels, reduction='none')*mask).mean()

    loss = L_labeled + landa*L_unlabeled
    loss.backward()
    optimizer.step()

    running_loss += loss.item()
    i_iter += 1

  if i_iter > 0:
    # Evaluate with dropout disabled, then return to training mode.
    net.eval()
    cv_accuracy = get_accuracy(dev_loader, net)
    net.train()

    epoch_loss = running_loss/i_iter
    # Index of the last optimisation step of this epoch.
    global_step = (epoch + 1)*i_iter - 1

    writer.add_scalar('dev accuracy', cv_accuracy, global_step)
    writer.add_scalar('training loss', epoch_loss, global_step)

    print(f"running loss: {epoch_loss}")
    print(f"dev accuracy: {cv_accuracy}")
    best_epoch = max(best_epoch, cv_accuracy)
    running_loss = 0.0

  scheduler.step()

print(f"Best epoch accuracy: {best_epoch}")
writer.close()

####################################################################################################
epoch 0 :
running loss: 0.7617028474807739
dev accuracy: 63.95
####################################################################################################
epoch 1 :
running loss: 6.168008211255073
dev accuracy: 76.27916666666667
####################################################################################################
epoch 2 :
running loss: 4.67681587934494
dev accuracy: 80.78333333333333
####################################################################################################
epoch 3 :
running loss: 4.222019025683403
dev accuracy: 80.975
####################################################################################################
epoch 4 :
running loss: 4.040866807848215
dev accuracy: 80.05
####################################################################################################
epoch 5 :
running loss: 3.8505819723010064
dev accuracy: 82

In [None]:
# Record the best dev accuracy seen during FixMatch training (kept in
# `best_epoch`) as a summary scalar; no global step is given.
writer.add_scalar('best epoch cv accuracy', best_epoch)

In [None]:
# Final accuracy (percent) on the labeled training data, measured through the
# dedicated `train_for_test_loader`, logged and displayed.
train_acc = get_accuracy(train_for_test_loader, net)
writer.add_scalar('final train accuracy', train_acc)
train_acc

99.6

In [None]:
# Final accuracy (percent) on the dev split, logged and displayed.
dev_acc = get_accuracy(dev_loader, net)
writer.add_scalar('final dev accuracy', dev_acc)
dev_acc

83.3125

In [None]:
# Final accuracy (percent) on the held-out test set, logged and displayed.
test_acc = get_accuracy(test_loader, net)
writer.add_scalar('final test accuracy', test_acc)
test_acc

83.392