# Import Dependencies

In [1]:
from pathlib import Path
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch
from torch import Tensor, device
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler,TensorDataset
from tqdm import tqdm
import gc
gc.enable()
import urllib.request
import gzip
import json
from scipy.stats import truncnorm
import os

# Constants

In [2]:
# Bin edges (in descending order) used to map the raw score to a readability level from 1 to 12 (easy to hard)
BINS = [
    float('inf'),
    1.5, 1, 0.5, 0, -0.5, -1, -1.5, -2, -2.5, -3, -3.5,
    float('-inf'),
]
# The maximum length of the texts fed to the model
MAX_LENGTH = 256
CORPORA_LIST = ['simplewiki', 'wiki', 'bookcorpus']

# Every path below is anchored at the parent of the current working directory.
_PROJECT_ROOT = Path(os.getcwd()).parent


def _under_root(*parts):
    """Join ``parts`` onto the project root and return the result as a string."""
    return os.path.join(_PROJECT_ROOT, *parts)


# Training data files
TRAIN_FILE_ORIG = _under_root('data', "training", "original", "train.csv")
TRAIN_FILE_SPLIT = _under_root('data', "training", "original", "train_split.csv")
VAL_FILE_SPLIT = _under_root('data', "training", "original", "val_split.csv")
TRAIN_FILE_EXTENDED = _under_root('data', 'training', 'extended', 'train_augmented.csv')

# Model directories
BASELINE_MODEL_DIR = _under_root("models", "base")
PRETRAIN_MODEL_DIR = _under_root("models", "pretrain")
FINETUNE_MODEL_DIR = _under_root("models", "finetune")
FINAL_MODEL_DIR = _under_root("models", "final")

# Directories for the saved embeddings and the extended (augmentation) corpora
EMBEDDINGS_DIR = _under_root('data', 'embeddings')
EXTENDED_DATA_DIR = _under_root('data', 'training', 'extended')

# Functions

In [3]:
# Encode the texts into embeddings and save them to disk
def encode_and_save(sentences, out_dir, data_name, model_name='paraphrase-TinyBERT-L6-v2'):
    """Encode ``sentences`` with a SentenceTransformer model and save the result.

    Args:
        sentences: A single string or an iterable of strings (list, pandas
            Series, ...) to encode.
        out_dir: Directory the embeddings file is written to; it is created
            if it does not exist yet.
        data_name: Name of the corpus; the output file is named
            ``encoded-<data_name>.pt``.
        model_name: Name of the SentenceTransformer model to load.

    Returns:
        None. The embeddings are moved to the CPU and written with
        ``torch.save``.
    """
    # A plain list is the documented input type of ``SentenceTransformer.encode``;
    # materialising other iterables (e.g. a pandas Series) avoids depending on
    # how the encoder indexes them. A single string is passed through as is.
    if not isinstance(sentences, str):
        sentences = list(sentences)

    model = SentenceTransformer(model_name)
    encoded = model.encode(sentences, convert_to_tensor=True)
    encoded = encoded.detach().cpu()

    # exist_ok avoids the check-then-create race of isdir() + makedirs()
    os.makedirs(out_dir, exist_ok=True)
    out_file = os.path.join(out_dir, 'encoded-' + data_name + '.pt')
    with open(out_file, 'wb') as f:
        torch.save(encoded, f)
    print("The embeddings are saved.")

    # Drop the model and release cached GPU memory before the next corpus is processed
    del model
    torch.cuda.empty_cache()
  

In [4]:
# Split a long text into chunks. By default the chunk size is a random number between the average
# length minus 4 times the standard deviation of the length and the average length plus 4 times the
# standard deviation. (The average length is 173 and the std is 17, as calculated in the EDA notebook.)
def chunktext(s, chunksize=None, mean_len=173, std_len=17):
    """Split ``s`` into consecutive chunks of at most ``chunksize`` words.

    Args:
        s: The text to split; words are separated on whitespace.
        chunksize: Number of words per chunk. When ``None`` (the default) it
            is drawn once per call from a normal distribution with mean
            ``mean_len`` and standard deviation ``std_len``, truncated at
            4 standard deviations on either side.
        mean_len: Mean of the chunk-size distribution.
        std_len: Standard deviation of the chunk-size distribution.

    Returns:
        A list of strings, each holding up to ``chunksize`` words joined by a
        single space. Only the last chunk can be shorter. A text without any
        words yields ``['']`` so that ``DataFrame.explode`` still produces a
        string row for it instead of NaN.

    Raises:
        ValueError: If the chunk size is smaller than 1.
    """
    if chunksize is None:
        chunksize = int(truncnorm(-4, 4, loc=mean_len, scale=std_len).rvs())  # generate the size of the chunks
    if chunksize < 1:
        raise ValueError("chunksize must be a positive integer")

    words = s.split()
    if not words:
        return [""]
    # Stepping through the words (rather than looping len(words)//chunksize+1 times)
    # avoids a trailing empty chunk when the word count is an exact multiple of chunksize.
    return [" ".join(words[start:start + chunksize]) for start in range(0, len(words), chunksize)]

## Download and prepare Wiki Texts

In [None]:
# Load the dataset, then filter, clean and format the texts
wikitext_dataset = load_dataset('wikitext', 'wikitext-103-v1')
wikitext_train = wikitext_dataset['train']
# Keep only the texts with more than 100 and fewer than 200 words (each text is split once),
# and replace newlines with spaces
wikitext_train = [
    text.replace('\n', ' ')
    for text in wikitext_train['text']
    if 100 < len(text.split()) < 200
]
# Save the processed text as a csv file; the target directory may not exist on a fresh run
os.makedirs(EXTENDED_DATA_DIR, exist_ok=True)
wikitext_train_df = pd.DataFrame(wikitext_train, columns=['text'])
wikitext_train_df.to_csv(os.path.join(EXTENDED_DATA_DIR, 'wiki.csv'))
# Encode the text into embeddings and save them
encode_and_save(wikitext_train, EMBEDDINGS_DIR, 'wiki')
gc.collect()


## Download and prepare SimpleWiki Texts

In [6]:
simplewiki_savepath = os.path.join(Path(os.getcwd()).parent, "data", "download")
os.makedirs(simplewiki_savepath, exist_ok=True)
simplewiki_file = os.path.join(simplewiki_savepath, 'simplewiki-2020-11-01.jsonl.gz')
# Download the SimpleWiki data unless it is already cached. The archive is first written to a
# temporary name and renamed afterwards, so an interrupted download is never mistaken for a
# complete file on the next run.
if not os.path.exists(simplewiki_file):
    simplewiki_tmp_file = simplewiki_file + '.part'
    urllib.request.urlretrieve('https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/simplewiki-2020-11-01.jsonl.gz', simplewiki_tmp_file)
    os.replace(simplewiki_tmp_file, simplewiki_file)
# Read the gzipped JSON-lines file, keeping only the paragraphs with more than 100 and
# fewer than 200 words (snippets which are too short or too long are filtered out)
passages = []
with gzip.open(simplewiki_file, 'rt', encoding='utf8') as f:
    for line in f:
        data = json.loads(line.strip())
        passages.extend(p for p in data['paragraphs'] if 100 < len(p.split()) < 200)
simplewiki_df = pd.DataFrame(passages, columns=['text'])
# Save the processed texts as a csv file; the target directory may not exist on a fresh run
os.makedirs(EXTENDED_DATA_DIR, exist_ok=True)
simplewiki_df.to_csv(os.path.join(EXTENDED_DATA_DIR, 'simplewiki.csv'))
# Encode the text to embeddings and save them
encode_and_save(passages, EMBEDDINGS_DIR, 'simplewiki')
gc.collect()

The embeddings are saved.


0

## Download and prepare Bookcorpus dataset

In [None]:
# Load the dataset
books = load_dataset('bookcorpusopen')
books = books['train'].remove_columns('title')
books = pd.DataFrame(books['text'], columns=['text'])
# Work on a random 10% sample of the books
bookcorpus_df = books.sample(frac=0.1).reset_index(drop=True)
# Split each book into chunks and put every chunk on its own row
bookcorpus_df['text'] = bookcorpus_df['text'].map(chunktext)
bookcorpus_df = bookcorpus_df.explode('text')[['text']]
# Drop missing or empty chunks so that no blank text is saved or encoded, then renumber the rows
# so that row i of the csv file corresponds to row i of the saved embeddings
bookcorpus_df = bookcorpus_df[bookcorpus_df['text'].fillna('').str.strip() != '']
bookcorpus_df = bookcorpus_df.reset_index(drop=True)
# Save the processed texts as a csv file; the target directory may not exist on a fresh run
os.makedirs(EXTENDED_DATA_DIR, exist_ok=True)
bookcorpus_df.to_csv(os.path.join(EXTENDED_DATA_DIR, 'bookcorpus.csv'))
# Encode the text to embeddings and save them (passed as a plain list of strings)
encode_and_save(bookcorpus_df['text'].tolist(), EMBEDDINGS_DIR, 'bookcorpus')


## Take a look at a sample of the extended data and the corresponding embeddings

In [10]:
# Read the saved SimpleWiki texts back from disk and show the text in row 5
simplewiki_csv = os.path.join(EXTENDED_DATA_DIR, 'simplewiki.csv')
simplewiki_df = pd.read_csv(simplewiki_csv)
simplewiki_df.loc[5, 'text']

'Lion dances were probably newer. China has not had its own lions since the spread of people out of Africa into the rest of the world. The earliest lions in Chinese books were gifts to the Han emperor from Parthia and other people who lived along the Silk Road connecting Chinese and Roman businesses. There was lion dancing under the Tang and in Japan by the 8th century, but people still thought of it as a foreign dance used by Buddhists. Today, people talk about "Northern" and "Southern" kinds of lion dances. The special northern kind began under the Southern Song (12th–13th century). The special southern kind began in Guangdong later, maybe under the Ming (14th–17th century).'

In [9]:
# Load the saved SimpleWiki embeddings and take the tensor at index 5,
# i.e. the embedding corresponding to the text shown above
embeddings_file = os.path.join(EMBEDDINGS_DIR, "encoded-simplewiki.pt")
embedding = torch.load(embeddings_file)[5]
print("the demension of embedding is", embedding.shape)

the demension of embedding is torch.Size([768])
