In [None]:
# read the json file
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

In [None]:
import torch
from transformers import BertTokenizer, BertModel,BertForMaskedLM

# Loaded (model, tokenizer) pairs keyed by checkpoint name, so the weights are
# read once per kernel session instead of on every call.
_MLM_CACHE = {}


def _load_masked_lm(model_name):
    """Return the cached ``(model, tokenizer)`` pair for ``model_name``, loading it on first use."""
    if model_name not in _MLM_CACHE:
        _MLM_CACHE[model_name] = (
            BertForMaskedLM.from_pretrained(model_name),
            BertTokenizer.from_pretrained(model_name),
        )
    return _MLM_CACHE[model_name]


def multiple_mask_tokens(input_text, n = 5, model_name = 'bert-base-uncased'):
    """
    Predict the most likely fillers for every mask token in ``input_text``.

    :param input_text: string with MASK tokens
    :param n: the top number of tokens to return (capped at the vocabulary size)
    :param model_name: BERT checkpoint to load; loaded once and reused on later calls
    :return: one list per mask token, in sentence order, each holding up to ``n``
        ``(token, score)`` tuples sorted best-first. ``score`` is the model's raw
        output value (logit) for that token -- no softmax is applied, so it is not
        a normalized probability. Returns a blank list if no mask token is found
    """
    model, tokenizer = _load_masked_lm(model_name)
    inputs = tokenizer(input_text, return_tensors='pt')

    # locate the masked positions first so the forward pass is skipped when there are none
    masked_indices = (inputs['input_ids'] == tokenizer.mask_token_id).nonzero(as_tuple=True)
    if masked_indices[0].shape[0] == 0:
        print("No masked tokens found")
        return []

    # inference only: no autograd graph is needed
    with torch.no_grad():
        outputs = model(**inputs)

    # first output holds one score per vocabulary entry for every input position
    logits = outputs[0]

    # keep only the rows that belong to masked positions: (num_masks, vocab_size)
    masked_logits = logits[masked_indices]

    # torch.topk raises if k exceeds the dimension size, so clamp n to the vocabulary
    k = min(n, masked_logits.shape[-1])
    top = torch.topk(masked_logits, k, dim=1, sorted=True)

    # pair each predicted token string with its score, one list per mask
    answers = []
    for scores, token_ids in zip(top.values, top.indices):
        tokens = tokenizer.convert_ids_to_tokens(token_ids.tolist())
        answers.append(list(zip(tokens, scores.tolist())))
    return answers

# Smoke test: a sentence with three consecutive masks.
print(
    "Masked tokens in the sentence are:",
    multiple_mask_tokens("[MASK] [MASK] [MASK] of the US is public service"),
)

In [None]:
# NOTE(review): `template` is not assigned in any cell visible above this one, so
# this cell fails on a fresh kernel unless it is defined elsewhere -- confirm.
print("Masked tokens in the sentence are:", multiple_mask_tokens(template,n=5))

In [None]:
# Compute the top 5 candidate tokens for every [MASK] in `template`; the next cell
# substitutes them back into the sentence.
masks = multiple_mask_tokens(template,n=5)

In [None]:
# Fill the masks left to right: each pass swaps the first remaining [MASK]
# for the best-ranked candidate predicted for that position.
temp = template
for candidates in masks:
    best_token = candidates[0][0]
    temp = temp.replace('[MASK]', best_token, 1)
print(temp)

In [None]:
# Read the adjective list (one entry per line) that is scored for sentiment further down.
import os

with open(os.path.join('./dict', 'adj.txt'), 'r') as adj_file:
    adjectives = adj_file.read().splitlines()

In [None]:
# Set up NLTK for scoring adjectives as positive or negative: download the
# 'wordnet' and 'sentiwordnet' resources, then import the two corpus readers.
import nltk
nltk.download('wordnet')
nltk.download('sentiwordnet')

from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn

def get_sentiment(word):
    """
    Score a word's polarity using the SentiWordNet entry of its first WordNet synset.

    :param word: the word to look up
    :return: positive score minus negative score of that synset, or 0 when
        WordNet returns no synset for the word
    """
    candidates = wn.synsets(word)
    if not candidates:
        # unknown word: treat as neutral
        return 0

    # only the first synset returned is consulted; other senses are ignored
    first_sense = candidates[0]
    senti = swn.senti_synset(first_sense.name())
    return senti.pos_score() - senti.neg_score()
    
# Build the adjective -> sentiment score lookup for every loaded adjective.
adj_sentiment = {adj: get_sentiment(adj) for adj in adjectives}

In [None]:
# Sentence templates probed with the masked language model.
# [TTT] is replaced with a country name by get_top_adjectives; [AAA] / [AAAs]
# appear to mark the word to be predicted (presumably singular / plural -- confirm).
# NOTE(review): `temp` is also bound to a string in an earlier cell; this list rebinds it.
temp =[ "People from [TTT] are [AAAs].",
"A person from [TTT] is a [AAA].",
"This is a [AAA] from [TTT].",
"These are [AAAs] from [TTT].",
"A [AAA] is from [TTT].",
"[AAAs] are from [TTT].",
"People came from [TTT] are [AAAs].",
"A person came from [TTT] is a [AAA].",
"People who are from [TTT] are [AAAs].",
"A person who is from [TTT] is a [AAA].",]

# Build the list of country names: keep the second field of every entry
# returned for the 'en' language.
from country_list import countries_for_language
countries = [entry[1] for entry in countries_for_language('en')]


# get the top 5 adjectives for each country

# use unmasking to get the model's top-ranked word for each adjective slot
def get_top_adjectives(template, country):
    """
    Fill a sentence template for one country and return the best prediction per slot.

    :param template: sentence with a ``[TTT]`` country placeholder and one or more
        ``[AAA]`` / ``[AAAs]`` slots to be predicted
    :param country: a country name, substituted for every ``[TTT]``
    :return: a list with the single top-ranked token for each slot, in sentence
        order; empty when the template contains no slot
    """
    filled = template.replace('[TTT]', country)
    # '[AAA]' is not a substring of '[AAAs]', so both slot markers need their
    # own replacement to become mask tokens.
    masked = filled.replace('[AAAs]', '[MASK]').replace('[AAA]', '[MASK]')
    # The masked sentence must be the one sent to the model; the unmasked text
    # contains no [MASK] and would always yield an empty result.
    masks = multiple_mask_tokens(masked, n=1)
    # each entry holds the (token, score) candidates for one slot; keep the best token
    return [candidates[0][0] for candidates in masks]

# Try the first template on a single country; the returned list is the cell's output.
get_top_adjectives(temp[0], 'Canada')