1. Install the necessary packages: transformers, numpy, fasttext, nltk and torch.

In [1]:
!pip install transformers
!pip install numpy
!pip install fasttext
!pip install nltk
!pip install torch

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313501 sha256=df491140c7078361cef415d81cc88949d474d219dd3c22c1d04b143fad363f70
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a513fa6b79451473ceb7713017823c3
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Success

2. Import the necessary libraries.

In [2]:
import torch
import transformers
from transformers import BertTokenizer, BertForMaskedLM
import spacy
import fasttext
import fasttext.util
import nltk
from nltk import word_tokenize
from nltk import pos_tag
from numpy import dot

3. Load BERT and Fasttext

In [3]:
# NLTK data used later by pos_tag (tagger) and word_tokenize (punkt).
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')
# BERT tokenizer and masked-language-model head used by predict_masked_sent.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
# NOTE(review): `nlp` is not referenced by any other cell visible in this notebook — confirm it is still needed.
nlp = spacy.load('en_core_web_sm')
# Fetch the English fastText vectors, then load them for getSimilarity.
# NOTE(review): if_exists='ignore' presumably skips the download when the file is already present — confirm.
fasttext.util.download_model('en', if_exists='ignore')
ft = fasttext.load_model('cc.en.300.bin')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz



4. Predict Masked Tokens (credits Bill Yuchen Lin)

In [4]:
def predict_masked_sent(sentence, top_k=5):
  """Return the ``top_k`` highest-scoring tokens for the ``[MASK]`` slot in ``sentence``.

  Args:
    sentence: Text containing a literal ``[MASK]`` placeholder. When several
      placeholders are present, only the first one is predicted.
    top_k: Number of candidate tokens to return.

  Returns:
    A list of ``top_k`` token strings, ordered from highest to lowest score.

  Raises:
    ValueError: If no ``[MASK]`` token is found after tokenization.
  """
  text = "[CLS]" + sentence + "[SEP]"
  tokenized_text = tokenizer.tokenize(text)
  if "[MASK]" not in tokenized_text:
    raise ValueError("sentence must contain a [MASK] token: %r" % (sentence,))
  masked_index = tokenized_text.index("[MASK]")
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
  tokens_tensor = torch.tensor([indexed_tokens])
  with torch.no_grad():
    # First element of the model output: one score per vocabulary entry
    # for every input position.
    predictions = model(tokens_tensor)[0]

  # Only the ranking is returned, and softmax is monotonic, so the top-k can be
  # taken directly from the raw scores without computing probabilities.
  top_k_indices = torch.topk(predictions[0, masked_index], top_k, sorted=True).indices
  return tokenizer.convert_ids_to_tokens(top_k_indices.tolist())

In [5]:
# TODO: remove this cell before publishing. It is only a quick sanity check
# that predict_masked_sent fills the [MASK] slot with plausible words.
print(predict_masked_sent("Paris is the [MASK] of France", 10))

['capital', 'center', 'city', 'heart', 'centre', 'birthplace', 'metropolis', 'capitol', 'prefecture', 'capitals']


5. Set the number of candidate simplifications to request from BERT.

In [6]:
# Number of top-scoring BERT predictions requested for the masked word;
# passed as `top_k` to predict_masked_sent.
BERTCandidates = 100

6. Cosine similarity function

In [7]:
import numpy as np
import math

def getSimilarity(word1, word2):
  """Cosine similarity between the fastText vectors of two words.

  Args:
    word1: First word.
    word2: Second word.

  Returns:
    The cosine of the angle between the two word vectors, or 0 when a
    vector lookup raises ``KeyError`` or when either vector has zero norm
    (the cosine is undefined there and would otherwise come out as NaN).
  """
  try:
    wv1 = ft.get_word_vector(word1)
    wv2 = ft.get_word_vector(word2)
  except KeyError:
    return 0
  norm_product = math.sqrt(np.dot(wv1, wv1) * np.dot(wv2, wv2))
  if norm_product == 0:
    # Avoid a divide-by-zero that would silently produce NaN.
    return 0
  return np.dot(wv1, wv2) / norm_product

7. Sorting the candidates.

In [8]:
def sortCandidateList(candidateList):
  """Sort candidates by similarity score, highest first.

  Args:
    candidateList: List of ``[word, similarity]`` entries. It is sorted
      in place, as callers of the previous implementation observed.

  Returns:
    A new list holding the same entries in descending order of their
    similarity (index 1). Entries with equal scores keep their original
    relative order.
  """
  # The built-in sort replaces the hand-written O(n^2) exchange sort.
  candidateList.sort(key=lambda candidate: candidate[1], reverse=True)
  return list(candidateList)

8. Output format

9. PoS Tagging for verification of candidates.

In [9]:
def getPosTags(sentence):
  """Tokenize ``sentence`` and return its part-of-speech tags, one per token, in order."""
  tagged_tokens = pos_tag(word_tokenize(sentence))
  return [tag for _token, tag in tagged_tokens]

10. Find simplifications for an example sentence.

In [11]:
originalSentence = "A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be compulsory for those receiving public help."
complexWord = "compulsory"
maskedSentence = originalSentence.replace(complexWord, "[MASK]")
candidates = predict_masked_sent(maskedSentence, BERTCandidates)
print("Number of candidates generated by BERT = ", len(candidates))
candidateList = []
threshold = 0.4 #Threshold to prune candidates.
for candidate in candidates:
  # Skip the complex word itself explicitly. Dropping the top-ranked entry
  # after sorting would discard the best substitute whenever BERT does not
  # predict the original word.
  if candidate.lower() == complexWord.lower():
    continue
  similarity = getSimilarity(complexWord, candidate)
  if similarity >= threshold:
    candidateList.append([candidate, similarity])
candidateList = sortCandidateList(candidateList)
print("Number of candidates after pruning = ", len(candidateList))
# PoS tags of the original sentence are the reference that each generated
# sentence is compared against in the next cells.
originalPosTags = getPosTags(originalSentence)
maskedPosTags = getPosTags(maskedSentence)

Number of candidates generated by BERT =  100
Number of candidates after pruning =  4


In [12]:
def isListsEqual(list1, list2):
  """Return True when both sequences have the same length and equal items at every position.

  Sequences of different lengths are never equal. Previously a shorter
  ``list2`` raised ``IndexError`` and a longer ``list2`` sharing a prefix
  with ``list1`` was reported as equal.
  """
  if len(list1) != len(list2):
    return False
  return all(item1 == item2 for item1, item2 in zip(list1, list2))

In [13]:
#Final candidates
for candidate in candidateList:
  print(candidate)
  generatedSentence = maskedSentence.replace("[MASK]", candidate[0])
  generatedPosTags = getPosTags(generatedSentence)
  if isListsEqual(originalPosTags, generatedPosTags):
    print(candidate[0])
    print(generatedSentence)

['mandatory', np.float32(0.76685005)]
mandatory
A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be mandatory for those receiving public help.
['obligatory', np.float32(0.59822756)]
obligatory
A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be obligatory for those receiving public help.
['optional', np.float32(0.47102866)]
optional
A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be optional for those receiving public help.
['required', np.float32(0.44816893)]


11. Main code.

In [14]:
def simplifySentence(originalSentence, complexWord, threshold=0.4):
  """Replace ``complexWord`` in ``originalSentence`` with the best-ranked acceptable substitute.

  Candidates come from ``predict_masked_sent`` on the sentence with the
  complex word masked. They are kept when their ``getSimilarity`` score
  against the complex word is at least ``threshold``, ranked by that score,
  and accepted only if the rewritten sentence has the same PoS tag sequence
  as the original.

  Args:
    originalSentence: Sentence containing the complex word.
    complexWord: Word to replace.
    threshold: Minimum similarity for a candidate to be considered. Tune it
      on validation data.

  Returns:
    The sentence with the complex word replaced by the first accepted
    candidate, or ``originalSentence`` unchanged when the complex word is
    empty or absent, or when no candidate is accepted.
  """
  if not complexWord or complexWord not in originalSentence:
    # Nothing to mask: the predictor would fail for lack of a [MASK] token.
    return originalSentence
  maskedSentence = originalSentence.replace(complexWord, "[MASK]")
  candidates = predict_masked_sent(maskedSentence, BERTCandidates)
  loweredComplexWord = complexWord.lower()
  candidateList = []
  for candidate in candidates:
    # Skip the complex word itself explicitly. Dropping the top-ranked entry
    # after sorting would discard the best substitute whenever BERT does not
    # predict the original word.
    if candidate.lower() == loweredComplexWord:
      continue
    similarity = getSimilarity(complexWord, candidate)
    if similarity >= threshold:
      candidateList.append([candidate, similarity])
  candidateList = sortCandidateList(candidateList)
  originalPosTags = getPosTags(originalSentence)
  for candidate in candidateList:
    generatedSentence = maskedSentence.replace("[MASK]", candidate[0])
    if isListsEqual(originalPosTags, getPosTags(generatedSentence)):
      return generatedSentence
  return originalSentence

In [15]:
def getTextFromFile(filename):
  """Read a UTF-8 text file and return its lines, each keeping its trailing newline.

  Args:
    filename: Path of the file to read.

  Returns:
    A list with one string per line of the file.

  Raises:
    OSError: If the file cannot be opened (for example, it does not exist).
  """
  # The context manager closes the file even if reading raises.
  with open(filename, "r", encoding="utf-8") as f:
    return f.readlines()

In [17]:
# Simplify every (sentence, complex word) pair in the tab-separated trial file.
fileName = "trial_data.tsv"
lines = getTextFromFile(fileName)
for line in lines:
  # Split once. Column 0 is the sentence and column 1 the complex word.
  fields = line.rstrip("\n").split("\t")
  if len(fields) < 2:
    # Blank or malformed line: there is no complex word to simplify.
    continue
  sentence = fields[0]
  complexWord = fields[1]
  simplification = simplifySentence(sentence, complexWord)
  print(simplification)

FileNotFoundError: [Errno 2] No such file or directory: 'trial_data.tsv'