In [None]:
 from google.colab import drive


# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

In [None]:
!nvidia-smi

In [None]:
!pip install transformers

In [None]:
#@title Setup & Config
import transformers
from transformers import RobertaModel, RobertaTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import random
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Plot styling: apply the seaborn theme first, then the custom palette and figure size.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)

# Six custom colours installed as the active seaborn palette on the next statement.
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

# Default matplotlib figure size as (width, height).
rcParams['figure.figsize'] = 12, 8

# Seed every random number generator used by the notebook with the same value.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)
random.seed(RANDOM_SEED)
# Request deterministic cuDNN kernels and switch off its benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Bare expression so the cell displays which device was selected.
device

In [None]:
#import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [None]:
# Masked-language model whose predictions for a [MASK] slot are used as candidate words.
model_name_or_path = "albert-xxlarge-v2"

# Prefer the GPU when the runtime has one; the model is moved there right after loading.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForMaskedLM.from_pretrained(model_name_or_path).to(device)

In [None]:
# Demo: print the masked LM's 200 most probable fillers for the [MASK] slot of one sentence.
mask_sentence = "Photogallery - 'Dragon Ball Super' Goku can [MASK] Super Saiyan Blue Ultra-Instinct"

mask_input = tokenizer.encode(mask_sentence, return_tensors="pt").to(device)

# Inference only: no_grad() stops autograd from recording the forward pass.
# Detaching the logits afterwards (as before) only dropped the graph once its
# memory had already been allocated.
with torch.no_grad():
    logits = model(mask_input)[0].squeeze(0)

# nonzero() yields (batch, position) pairs; column 1 is the token position of each [MASK].
is_masked = torch.where(mask_input == tokenizer.mask_token_id, 1, 0)
masked_idxs = torch.nonzero(is_masked)
probs = torch.softmax(logits[masked_idxs[:, 1]], dim=1)

top_vocab_idxes = torch.topk(probs, 200)
cands = []
# Transposing makes each iteration yield the k-th best token id of every mask position.
for token_id in torch.transpose(top_vocab_idxes[1], 1, 0):
    decoded = tokenizer.decode(token_id)
    cands.append(decoded)
    print(token_id, "->", decoded)
  

In [None]:
from tqdm import tqdm
import re
import pandas as pd

In [None]:
# Load the input table; later cells read its 'Text' and 'ROOT' columns.
# NOTE(review): the path is an empty string — fill in the real CSV location before running.
df=pd.read_csv('')

In [None]:
df['metaphorical_cands']

In [None]:
# For every row, replace the ROOT verb in 'Text' with [MASK] and collect the masked
# LM's 200 most probable fillers. `cands` ends up with one list of decoded tokens
# per row of df (an empty list when the sentence contains no [MASK]).
cands=[]
for index,row in tqdm(df.iterrows(), total=len(df)):
  mask_sentence = row['Text']
  #svos=row['svo']
  verb=row['ROOT']
  # Keep only letters/digits; a non-string ROOT (e.g. NaN) is treated as "no verb".
  verb=re.sub('[^A-Za-z0-9]+', '', verb) if isinstance(verb, str) else ''
  if verb:
    # Literal, case-insensitive match of the verb anywhere in the text.
    insensitive_rep = re.compile(re.escape(verb), re.IGNORECASE)
    mask_sentence=insensitive_rep.sub("[MASK]", mask_sentence)
  # (An empty verb is skipped on purpose: an empty pattern matches between every
  # character and would flood the sentence with [MASK] tokens.)

  mask_input = tokenizer.encode(mask_sentence, return_tensors="pt").to(device)
  # nonzero() yields (batch, position) pairs; column 1 is the position of each [MASK].
  masked_idxs = torch.nonzero(mask_input == tokenizer.mask_token_id)
  if masked_idxs.shape[0] == 0:
    # Nothing was masked (verb not found in the text): record no candidates
    # instead of 200 empty strings decoded from an empty selection.
    cands2=[]
  else:
    # Inference only — avoid building the autograd graph for every sentence.
    with torch.no_grad():
      logits = model(mask_input)[0].squeeze(0)
    probs= torch.softmax(logits[masked_idxs[:,1]], dim=1)
    top_vocab_idxes = torch.topk(probs, 200)
    # Each transposed row holds the k-th best token id of every mask position.
    cands2=[tokenizer.decode(token_id) for token_id in torch.transpose(top_vocab_idxes[1], 1, 0)]
  cands.append(cands2)




In [None]:
# Variant of the candidate search that feeds 'Text' to the masked LM exactly as stored
# (no verb replacement) — presumably the text already contains a [MASK] token; TODO confirm.
# NOTE: this rebinds `cands`, so it replaces any list built by an earlier cell.
cands=[]
for index,row in tqdm(df.iterrows(), total=len(df)):
  mask_sentence = row['Text']
  mask_input = tokenizer.encode(mask_sentence, return_tensors="pt").to(device)
  # nonzero() yields (batch, position) pairs; column 1 is the position of each [MASK].
  masked_idxs = torch.nonzero(mask_input == tokenizer.mask_token_id)
  if masked_idxs.shape[0] == 0:
    # No [MASK] in this text: record no candidates instead of 200 empty strings.
    cands2=[]
  else:
    # Inference only — avoid building the autograd graph for every sentence.
    with torch.no_grad():
      logits = model(mask_input)[0].squeeze(0)
    probs= torch.softmax(logits[masked_idxs[:,1]], dim=1)
    top_vocab_idxes = torch.topk(probs, 200)
    # Each transposed row holds the k-th best token id of every mask position.
    cands2=[tokenizer.decode(token_id) for token_id in torch.transpose(top_vocab_idxes[1], 1, 0)]
  cands.append(cands2)




In [None]:
cands[0:5]

In [None]:
df['candidates']=cands

In [None]:
# Persist the table (now including the 'candidates' column) without the index.
# NOTE(review): the output path is an empty string — fill in the real destination before running.
df.to_csv('',index=False)

In [None]:
df

In [None]:
############################
!pip install transformers

In [None]:
# Reload the table for the classification stage; later cells read 'Text', 'ROOT' and 'candidates'.
# NOTE(review): the path is an empty string — presumably the file written by the earlier to_csv cell; confirm and fill in.
df=pd.read_csv('')

In [None]:
df

In [None]:
# RoBERTa checkpoint shared by the tokenizer below and the classifier's encoder.
PRE_TRAINED_MODEL_NAME = 'roberta-base'
# Label names; their count sets the classifier's number of output classes.
classNames = ['literal', 'metaphorical']
tokenizer2 = RobertaTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
class MetaphorDataset(Dataset):
  """Pairs each text with its label and tokenizes it on access.

  Indexing returns a dict holding the text as a string ('tweet_text'), the
  flattened 'input_ids' and 'attention_mask' tensors produced by the
  tokenizer (padding/truncation to `max_len` is requested), and the label
  as a long tensor ('targets').
  """

  def __init__(self, tweet_text, targets, tokenizer2, max_len):
    self.tweet_text, self.targets = tweet_text, targets
    self.tokenizer2, self.max_len = tokenizer2, max_len

  def __len__(self):
    return len(self.tweet_text)

  def __getitem__(self, item):
    text = str(self.tweet_text[item])
    label = self.targets[item]

    # Tokenizer options kept together so the call itself stays short.
    tokenizer_options = dict(
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      return_attention_mask=True,
      return_tensors='pt',
      truncation=True,
    )
    encoded = self.tokenizer2.encode_plus(text, **tokenizer_options)

    sample = {'tweet_text': text}
    # The tokenizer returns tensors with a leading batch axis; flatten drops it.
    for key in ('input_ids', 'attention_mask'):
      sample[key] = encoded[key].flatten()
    sample['targets'] = torch.tensor(label, dtype=torch.long)
    return sample

In [None]:
class MetaphorClassifier(nn.Module):
  """RoBERTa encoder followed by dropout and a linear layer emitting `n_classes` logits."""

  def __init__(self, n_classes):
    super().__init__()
    self.roberta = RobertaModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    hidden_size = self.roberta.config.hidden_size
    self.out = nn.Linear(hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the classification logits for a batch of token ids and attention masks."""
    encoder_outputs = self.roberta(
      input_ids=input_ids,
      attention_mask=attention_mask,
      return_dict=False,
    )
    # return_dict=False makes the encoder return a tuple; only its second
    # element (the pooled output) feeds the classification head.
    _, pooled_output = encoder_outputs
    return self.out(self.drop(pooled_output))

In [None]:
# Build the classifier with one output per label and place it on the selected device.
model2 = MetaphorClassifier(len(classNames)).to(device)


In [None]:
# Restore the fine-tuned classifier weights.
# map_location=device remaps the stored tensors onto the device chosen above, so a
# checkpoint saved on a GPU still loads when the runtime fell back to the CPU.
state_dict = torch.load('best_model_state_v5.pth', map_location=device)
model2.load_state_dict(state_dict)
model2=model2.to(device)

In [None]:
# Label names, looked up by the classifier's predicted class index.
class_names = ['literal' ,'metaphorical']
# NOTE(review): BATCH_SIZE and EPOCHS are not referenced by any cell visible in this notebook.
BATCH_SIZE = 16
# max_length passed to the tokenizer when encoding classifier inputs.
MAX_LEN = 80
EPOCHS = 7

In [None]:
from tqdm import tqdm
import re

In [None]:
import ast

# After the CSV round-trip each candidate list is stored as its string repr.
# ast.literal_eval parses Python literals only, whereas eval would execute any
# expression found in the file. Values that are already parsed (cell re-run)
# are passed through unchanged so the cell is idempotent.
df["candidates"] = df["candidates"].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [None]:
# For every row, substitute each candidate word for the ROOT verb and let the
# classifier label the resulting sentence. Produces four lists aligned with df's rows:
#   literal_cands / metaphorical_cands - candidate words per predicted label
#   count_l / count_m                  - how many words landed in each list
literal_cands=[]
count_l=[]
count_m=[]
metaphorical_cands=[]

# Stop scanning a row's candidates once both labels have this many words.
MIN_PER_CLASS = 25

# Inference mode (disables dropout); set once instead of on every candidate.
model2.eval()

for index,row in tqdm(df.iterrows(), total=len(df)):
  literal=[]
  metaphorical=[]
  tweet_text2 = row['Text']
  #svos=row['svo']
  verb=row['ROOT']
  # Keep only letters/digits; a non-string ROOT (e.g. NaN) is treated as "no verb".
  verb=re.sub('[^A-Za-z0-9]+', '', verb) if isinstance(verb, str) else ''
  if verb:
    # Literal, case-insensitive match of the verb anywhere in the text.
    insensitive_rep = re.compile(re.escape(verb), re.IGNORECASE)
    tweet_text2=insensitive_rep.sub("<mask>", tweet_text2)
  # (An empty verb is skipped on purpose: an empty pattern matches between every
  # character and would scatter the placeholder through the whole sentence.)
  cands=row['candidates']

  for word in cands:
    tweet_text=tweet_text2.replace('<mask>',word)
    encoded_review = tokenizer2.encode_plus(
      tweet_text,
      max_length=MAX_LEN,
      add_special_tokens=True,
      return_token_type_ids=False,
      truncation=True,
      padding='max_length',
      return_attention_mask=True,
      return_tensors='pt',
    )
    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    # Inference only — no autograd graph, hence no gradients to zero either.
    with torch.no_grad():
      output = model2(input_ids, attention_mask)
    # Batch of one: take the index of the highest logit as a plain int.
    prediction = torch.argmax(output, dim=1).item()

    if class_names[prediction] =='literal':
      literal.append(word)
    elif class_names[prediction]=='metaphorical':
      metaphorical.append(word)

    # Checked after every prediction. The previous check lived only in the
    # metaphorical branch, so the loop kept running when the literal list was
    # the last one to reach the threshold.
    if len(literal)>=MIN_PER_CLASS and len(metaphorical)>=MIN_PER_CLASS:
      break

  count_l.append(len(literal))
  count_m.append(len(metaphorical))
  literal_cands.append(literal)
  metaphorical_cands.append(metaphorical)
     

  
  
     
    

In [None]:
# Work on a copy: plain assignment only aliases the frame, so the columns added
# to df2 in the following cells would silently appear on df as well.
df2=df.copy()


In [None]:
df2['literal_cands']=literal_cands

In [None]:
df2['metaphorical_cands']=metaphorical_cands

In [None]:
df2['count_l']=count_l

In [None]:
df2['count_m']=count_m

In [None]:
df2.to_csv('corrected_cands_meta.csv',index=False)

In [None]:
metaphorical_cands

In [None]:
# Second export of df2 (without the index).
# NOTE(review): the output path is an empty string — fill in the real destination or drop this cell.
df2.to_csv('',index=False)

In [None]:
df2['literal_cands']

In [None]:
df['Text'][1302]

In [None]:
# NOTE(review): looks like leftover scratch code — it mutates `cands2` in place and
# nothing visible afterwards reads the result; confirm and remove.
cands2.append(cands)