# Dependencies

In [11]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
import torch
from torch import nn
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler,TensorDataset
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import random
import gc
from pathlib import Path
from sentence_transformers import SentenceTransformer, util


# Constants

In [12]:
# Seed shared by every random number generator used in this notebook.
SEED = 2021
# Bin edges, in descending order, that map the raw score to a readability level
# from 1 to 12 (easy to hard): 13 edges delimit 12 intervals.
BINS = [float('inf'), 1.5, 1, 0.5, 0, -0.5, -1, -1.5, -2, -2.5, -3, -3.5, float('-inf')]
# Maximum number of tokens per text fed to the model (longer texts are truncated).
MAX_LENGTH = 256
# Names of the external corpora; each name resolves to '<name>.csv' and 'encoded-<name>.pt'.
CORPORA_LIST = ['simplewiki', 'wiki', 'bookcorpus']

# Every path below is anchored at the parent of the current working directory.
_PARENT_DIR = Path(os.getcwd()).parent
_ORIGINAL_DIR = os.path.join(_PARENT_DIR, 'data', 'training', 'original')
_MODELS_DIR = os.path.join(_PARENT_DIR, 'models')

# Original competition data and its train/validation split.
TRAIN_FILE_ORIG = os.path.join(_ORIGINAL_DIR, 'train.csv')
TRAIN_FILE_SPLIT = os.path.join(_ORIGINAL_DIR, 'train_split.csv')
VAL_FILE_SPLIT = os.path.join(_ORIGINAL_DIR, 'val_split.csv')

# Model checkpoints, one directory per training stage.
BASELINE_MODEL_DIR = os.path.join(_MODELS_DIR, 'base')
PRETRAIN_MODEL_DIR = os.path.join(_MODELS_DIR, 'pretrain')
FINETUNE_MODEL_DIR = os.path.join(_MODELS_DIR, 'finetune')
FINAL_MODEL_DIR = os.path.join(_MODELS_DIR, 'final')

# External corpora: precomputed sentence embeddings and the raw texts.
EMBEDDINGS_DIR = os.path.join(_PARENT_DIR, 'data', 'embeddings')
EXTENDED_DATA_DIR = os.path.join(_PARENT_DIR, 'data', 'training', 'extended')
# Destination of the pseudo-labelled, augmented training set.
TRAIN_FILE_EXTENDED = os.path.join(EXTENDED_DATA_DIR, 'train_augmented.csv')

# Utility Functions

In [14]:
def seed_everything(seed=2021):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only interpreters started after this point (e.g. subprocesses) see this
    # value; hash randomization of the running kernel is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; silently ignored
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Disable the cuDNN autotuner so it cannot pick different kernels between runs.
    torch.backends.cudnn.benchmark = False

# Use the notebook-wide SEED constant instead of repeating the literal value.
seed_everything(seed=SEED)

In [15]:
def search_similar_passages(original_texts, external_texts_list, top_k):
  """Retrieve, for every original text, its top_k most similar external passages.

  For each dataset name, precomputed embeddings are loaded from
  ``EMBEDDINGS_DIR/encoded-<name>.pt`` and the matching passages from the
  ``text`` column of ``EXTENDED_DATA_DIR/<name>.csv``. The original texts are
  encoded with the 'paraphrase-TinyBERT-L6-v2' sentence-transformer and
  matched against the pooled external embeddings.

  Args:
    original_texts: Texts to use as search queries.
    external_texts_list: Names of the external datasets to search in.
    top_k: Number of similar passages to retrieve per original text.

  Returns:
    A list with one entry per original text; each entry is the list of
    retrieved passages, in the order returned by ``util.semantic_search``.

  Raises:
    FileNotFoundError: If the embeddings or texts file of a dataset is missing.
    AssertionError: If the number of embeddings and passages diverge.
  """
  encoder = SentenceTransformer('paraphrase-TinyBERT-L6-v2')
  corpus_embeddings, corpus_texts = [], []
  for dataset in external_texts_list:
    embeddings_path = os.path.join(EMBEDDINGS_DIR, 'encoded-'+ dataset + '.pt')
    if not os.path.isfile(embeddings_path):
      raise FileNotFoundError(f'{dataset} embeddings could not be found.')
    corpus_embeddings.extend(torch.load(embeddings_path))

    texts_path = os.path.join(EXTENDED_DATA_DIR,dataset+'.csv')
    if not os.path.isfile(texts_path):
      raise FileNotFoundError(f'{dataset} texts could not be found.')
    corpus_texts.extend(pd.read_csv(texts_path).text.values)

    # Embeddings and passages are addressed by the same index, so they must stay aligned.
    assert len(corpus_embeddings) == len(corpus_texts)

  query_embeddings = encoder.encode(original_texts, convert_to_tensor=True)
  # One "hit" per query: a list of top_k entries, each holding a corpus_id
  # (the index into corpus_embeddings) and a similarity score, sorted by
  # descending score. A queries therefore yield A * top_k entries in total.
  hits = util.semantic_search(query_embeddings, corpus_embeddings, top_k=top_k, corpus_chunk_size=80000)
  # Map every corpus_id back to the passage it was computed from.
  return [[corpus_texts[match['corpus_id']] for match in hit] for hit in hits]


In [16]:
def zip_hits_scores(hits, scores, stdev):
  """Flatten the hits into (passage, score, stdev) triples.

  Every passage in ``hits[i]`` is paired with ``scores[i]`` and ``stdev[i]``,
  i.e. with the values belonging to the query that retrieved it.

  Args:
    hits: One list of passages per query.
    scores: One score per query.
    stdev: One standard deviation per query.

  Returns:
    A flat list of ``(passage, score, stdev)`` tuples, in query order.
  """
  return [
      (passage, scores[query_idx], stdev[query_idx])
      for query_idx, hit in enumerate(hits)
      for passage in hit
  ]

In [17]:
def filter_on_stdev(sentences, predictions, targets, stdev):
  """Keep only the texts whose prediction lies within one stdev of the target.

  A text is kept when ``abs(prediction - target) < stdev`` (strictly), where
  target and stdev are those of the original text it was retrieved for.

  Args:
    sentences: Candidate texts.
    predictions: Predicted score of each candidate text.
    targets: Ground-truth target associated with each candidate text.
    stdev: Standard deviation associated with each candidate text.

  Returns:
    A tuple ``(kept_sentences, kept_predictions)`` of two parallel lists.
  """
  kept = [
      (pred, sentences[idx])
      for idx, pred in enumerate(predictions)
      if abs(pred - targets[idx]) < stdev[idx]
  ]
  sents_filtered = [sentence for _, sentence in kept]
  pred_filtered = [pred for pred, _ in kept]
  return sents_filtered, pred_filtered

In [18]:
def chunks(lst, chunksize):
    """Lazily split ``lst`` into consecutive slices of at most ``chunksize`` items.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``chunksize``.
    """
    starts = range(0, len(lst), chunksize)
    yield from (lst[start:start + chunksize] for start in starts)

In [19]:
def predict(model_name_or_dir, data, batch_size=32):
  """Predict one score per text with a single-output sequence-classification model.

  Args:
    model_name_or_dir: Model name or checkpoint directory passed to
      ``from_pretrained``.
    data: Sliceable sequence of texts (it is split with ``chunks``).
    batch_size: Number of texts per forward pass.

  Returns:
    A flat list of floats, one prediction per text, in input order.
  """
  # Fall back to the CPU so the function also works on machines without a GPU.
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  config = AutoConfig.from_pretrained(model_name_or_dir, num_labels=1)
  # NOTE(review): relies on the private `_name_or_path` attribute of the config;
  # presumably it resolves to a location that ships the tokenizer files - confirm.
  tokenizer = AutoTokenizer.from_pretrained(config._name_or_path)
  model = AutoModelForSequenceClassification.from_pretrained(model_name_or_dir, config=config)
  model.to(device)
  model.eval()
  y_pred = []
  for batch in tqdm(chunks(data, batch_size)):
    encoded = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=MAX_LENGTH)
    model_inputs = {
        'input_ids': encoded['input_ids'].to(device),
        'attention_mask': encoded['attention_mask'].to(device)
    }
    with torch.no_grad():
      outputs = model(**model_inputs)
    # reshape(-1) always yields a 1-D array, so tolist() returns a list even for
    # a batch holding a single text; squeeze() would collapse that case to a
    # bare float and make extend() raise TypeError.
    batch_preds = outputs[0].detach().cpu().numpy().reshape(-1).tolist()
    y_pred.extend(batch_preds)
  del model
  return y_pred

In [20]:
def generate_augmented_dataset(original_dir, external_dataset_list, model_dir, out_dir, top_k=10):
  """Build the augmented training set by retrieval and pseudo-labelling.

  For every excerpt of the original dataset, the top_k most similar passages
  are retrieved from the external datasets, scored with the model found at
  ``model_dir``, and kept only when the predicted score lies within one
  ``standard_error`` of the target of the excerpt they were retrieved for.

  Args:
    original_dir: Path of the original training CSV; it must provide the
      columns ``excerpt``, ``target`` and ``standard_error``.
    external_dataset_list: Names of the external datasets to search in.
    model_dir: Model name or directory of the model used for pseudo-labelling.
    out_dir: Path of the CSV file to write (columns ``excerpt`` and ``target``).
    top_k: Number of similar passages retrieved per original excerpt.
  """
  torch.cuda.empty_cache()
  print("Load original dataset")
  original_train_df = pd.read_csv(original_dir)
  queries = list(map(str, original_train_df.excerpt.values))
  query_targets = list(map(float, original_train_df.target.values))
  query_stdevs = list(map(float, original_train_df.standard_error.values))

  print('Start to search similar passages')
  # For each passage in the original corpus, select top_k similar passages in the external corpus.
  hits = search_similar_passages(queries, external_dataset_list, top_k)
  # Repeat each query's target/stdev for every passage retrieved for that query.
  candidates = zip_hits_scores(hits, query_targets, query_stdevs)
  sentences = [passage for passage, _, _ in candidates]
  targets = [target for _, target, _ in candidates]
  stdevs = [deviation for _, _, deviation in candidates]
  torch.cuda.empty_cache()

  print('Predict target of the selected passages')
  predictions = predict(model_dir, sentences)

  print('Remove passages which predicted target is outside of the stdev(sandard deviation)')
  sents_filtered, preds_filtered = filter_on_stdev(sentences, predictions, targets, stdevs)
  augmented_df = pd.DataFrame({'excerpt': sents_filtered, 'target': preds_filtered})
  # The DataFrame index is written too, as a leading unnamed column.
  augmented_df.to_csv(out_dir)
  print(f'Selected passages are saved')
  torch.cuda.empty_cache()

# Pseudo-label the external texts with the baseline model to generate the augmented training dataset

In [21]:
# Retrieve similar passages from the external corpora, pseudo-label them with
# the baseline model and write the result to TRAIN_FILE_EXTENDED.
generate_augmented_dataset(
    original_dir=TRAIN_FILE_ORIG,
    external_dataset_list=CORPORA_LIST,
    model_dir=BASELINE_MODEL_DIR,
    out_dir=TRAIN_FILE_EXTENDED,
)

Load original dataset
Start to search similar passages
Predict target of the selected passages


886it [04:51,  3.04it/s]


Remove passages which predicted target is outside of the stdev(sandard deviation)
Selected passages are saved


In [22]:
# Reload the augmented dataset from disk to check its size and columns.
# The 'Unnamed: 0' column is the DataFrame index that was written along with the data.
augmented_df = pd.read_csv(TRAIN_FILE_EXTENDED)
augmented_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10097 entries, 0 to 10096
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  10097 non-null  int64  
 1   excerpt     10097 non-null  object 
 2   target      10097 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 236.8+ KB
