In [1]:
import numpy as np
import pandas as pd

In [9]:
def _load_pickled_npy(path):
    """Load a ``.npy`` file, allowing pickled (object-dtype) contents."""
    # NOTE(review): allow_pickle=True unpickles the file, which can execute
    # arbitrary code; only point this at arrays written by our own runs.
    return np.load(path, allow_pickle=True)


# Ground-truth labels and predictions of the run "pious-flower-245".
labels_data = _load_pickled_npy('../npy_files/13Jun2023_without_multitask/test_labels_pious-flower-245.npy')
preds_data = _load_pickled_npy('../npy_files/13Jun2023_without_multitask/test_preds_pious-flower-245.npy')
# Predictions of the second run ("comic-star-62").
model2 = _load_pickled_npy('../npy_files/27Feb/test_preds_comic-star-62.npy')

In [10]:
def _flatten_first_output(batches):
    """Collect ``batch[0][j][0]`` for every item ``j`` of every batch into a 1-D array."""
    return np.array([item[0] for batch in batches for item in batch[0]])


def _binarize(scores, threshold=0.5):
    """Return a copy of ``scores`` with values >= threshold set to 1 and values < threshold set to 0."""
    hard = np.array(scores)
    hard[hard >= threshold] = 1
    hard[hard < threshold] = 0
    return hard


labels = _flatten_first_output(labels_data)
preds = _binarize(_flatten_first_output(preds_data))
print(labels.shape, preds.shape)

# Positions (into the flattened test set) where model 1 disagrees with the label.
mistakes_model1 = list(np.not_equal(labels, preds).nonzero()[0])

# Later cells read `mistakes_model2`, so it has to be defined here as well.
# NOTE(review): assumes `model2` uses the same nested batch layout and the same
# test-set ordering as `preds_data` — TODO confirm against the run that wrote it.
preds_model2 = _binarize(_flatten_first_output(model2))
if preds_model2.shape != labels.shape:
    raise ValueError(
        f"model2 predictions have shape {preds_model2.shape}, expected {labels.shape}; "
        "the two runs were probably not evaluated on the same test set"
    )
mistakes_model2 = list(np.not_equal(labels, preds_model2).nonzero()[0])

(472,) (472,)


In [4]:
# Indices that exactly one of the two models gets wrong.
_errors_model1 = set(mistakes_model1)
_errors_model2 = set(mistakes_model2)
# Wrong for model 1 but right for model 2 ...
mistakes_corrected_by_model2 = list(_errors_model1.difference(_errors_model2))
# ... and the other way round.
mistakes_corrected_by_model1 = list(_errors_model2.difference(_errors_model1))

In [6]:
# Test split plus per-video metadata (joined on `url`).
test_df = pd.read_csv('../data/with_aug_ttv/test.csv')
meta_df = pd.read_csv('../data/extra_data_trans.csv')

# Expose the BERT key-phrase columns under the names used further down.
meta_df = meta_df.assign(
    desc=meta_df['key_phrases_desc_bert'],
    transcript_size_increase_to_copy_stuff_easily=meta_df['key_phrases_transcript_bert'],
)

# NOTE(review): later cells index `test_df` by prediction position, so these left
# merges are assumed not to add rows (unique keys on the right) — TODO confirm.
test_df = (
    test_df
    .merge(meta_df, how='left', on='url')
    .drop(columns=[
        'transcript',
        'key_phrases_desc_long',
        'key_phrases_transcript_long',
        'key_phrases_desc_bert',
        'key_phrases_transcript_bert',
    ])
)

# Key phrases from the other comments of the same video, joined per (url, comment).
other_comments_data = pd.read_csv('../data/extra_data_other_comments.csv')
test_df = test_df.merge(other_comments_data, how='left', on=['url', 'comment'])

In [7]:
# Show (almost) full cell text instead of truncating long comments/transcripts.
pd.set_option('display.max_colwidth', 5000)

In [14]:
# Accuracy (%) of model 1: share of test items that are not in `mistakes_model1`.
# The test-set size is taken from `labels` instead of a hard-coded 472.
n_test = len(labels)
(n_test - len(mistakes_model1)) / n_test * 100

78.60169491525424

In [7]:
# Rows of the test set that model 1 misclassified, selected by position.
# NOTE(review): assumes row i of `test_df` is prediction i — TODO confirm.
mistakes_df = test_df.iloc[mistakes_model1]
# Written without the index column.
mistakes_df.to_csv('mistakes_best_model.csv', index=False)

In [None]:
# Writes the current `mistakes_df` (index column included) under a second name.
# NOTE(review): `mistakes_df` holds all model-1 mistakes at this point; the file
# name suggests a different subset was intended — confirm before relying on it.
mistakes_df.to_csv('./mistakes_not_corrected_by_video.csv')

In [8]:
import torch
from torch import nn
from transformers import LongformerModel, LongformerTokenizer
from transformers import BertTokenizer, BertModel

class LFEmbeddingModule(nn.Module):
    """Pretrained Longformer/BERT encoder with its tokenizer and an input-id builder.

    ``args`` is a dict; this class reads the keys ``model``, ``freeze_lf_layers``,
    ``max_len``, ``pad_metadata``, the ``add_*`` switches and the ``*_token_count``
    budgets. The embedding layer and the first ``freeze_lf_layers`` encoder layers
    have their parameters frozen.
    """

    def __init__(self, args, device):
        super().__init__()
        self.args = args
        checkpoint_name = self.args['model']
        # Any checkpoint name containing "longformer" uses the Longformer classes.
        if 'longformer' in checkpoint_name:
            encoder_cls, tokenizer_cls = LongformerModel, LongformerTokenizer
        else:
            encoder_cls, tokenizer_cls = BertModel, BertTokenizer
        self.lf_model = encoder_cls.from_pretrained(checkpoint_name, output_hidden_states=True).to(device)
        self.lf_tokenizer = tokenizer_cls.from_pretrained(checkpoint_name)
        self.device = device

        # Freeze the embeddings and the leading encoder layers.
        frozen_parts = [self.lf_model.embeddings]
        frozen_parts.extend(self.lf_model.encoder.layer[:self.args['freeze_lf_layers']])
        for part in frozen_parts:
            for weight in part.parameters():
                weight.requires_grad = False

    def get_embeddings(self, comments, titles, descriptions, transcripts, other_comments):
        """Tokenize each example and return a ``(batch, max_len)`` tensor of input ids.

        Despite the name, no encoder forward pass happens here. Per example the
        enabled segments (comment, title, description, transcript, other comments)
        are tokenized separately and concatenated; every segment after the first
        drops its leading token. The row is then cut to ``max_len`` and filled up
        with the tokenizer's pad id.
        """
        cfg = self.args
        total_budget = cfg['max_len']
        title_budget = cfg['title_token_count']
        desc_budget = cfg['desc_token_count']
        transcript_budget = cfg['transcript_token_count']
        other_budget = cfg['other_comments_token_count']
        # Metadata segments are optionally padded to their own budget.
        meta_padding = 'max_length' if cfg['pad_metadata'] else False

        batch_ids = []
        rows = zip(comments, titles, descriptions, transcripts, other_comments)
        for comment, title, desc, transcript, other_comment in rows:
            row_ids = []
            if cfg['add_comment']:
                row_ids = self.lf_tokenizer.encode_plus(
                    comment, max_length=total_budget, padding=False, truncation=True
                )['input_ids']

            metadata_segments = (
                ('add_title', title, title_budget),
                ('add_description', desc, desc_budget),
                ('add_transcription', transcript, transcript_budget),
                ('add_other_comments', other_comment, other_budget),
            )
            for switch, text, budget in metadata_segments:
                if not cfg[switch]:
                    continue
                segment_ids = self.lf_tokenizer.encode_plus(
                    text, max_length=budget, padding=meta_padding, truncation=True
                )['input_ids']
                # Only the first segment of a row keeps its leading token.
                row_ids.extend(segment_ids if len(row_ids) == 0 else segment_ids[1:])

            row_ids = row_ids[:total_budget]
            row_ids.extend([self.lf_tokenizer.pad_token_id] * (total_budget - len(row_ids)))
            batch_ids.append(row_ids)

        return torch.tensor(batch_ids).to(self.device)


In [9]:
# Configuration for the text-only ("without video") encoder.
args = dict(
    model='bert-large-cased',
    add_comment=True,
    max_len=512,
    add_title=False,
    title_token_count=40,
    add_description=True,
    desc_token_count=80,
    add_transcription=True,
    transcript_token_count=200,
    add_other_comments=True,
    other_comments_token_count=512,
    pad_metadata=True,
    freeze_lf_layers=23,
    multilabel=False,
    add_video=False,
)
device = torch.device('cpu')
lf_model1 = LFEmbeddingModule(args, device)
# Binary cross-entropy loss; not referenced by the cells shown in this notebook.
criterion = nn.BCELoss().to(device)

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Build the "with video" configuration as a NEW dict. `lf_model1.args` still
# points at the previous dict, so mutating it in place would silently flip
# `add_video` on the first model as well.
args = {**args, 'add_video': True}
device = torch.device('cpu')
lf_model2 = LFEmbeddingModule(args, device)
# Binary cross-entropy loss; not referenced by the cells shown in this notebook.
criterion = nn.BCELoss().to(device)

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
import os
def load_weights(lf_model, device, run_name, models_dir='../models'):
    """Load a saved encoder state dict into ``lf_model.lf_model`` and return ``lf_model``.

    Args:
        lf_model: object exposing the encoder as ``.lf_model``.
        device: passed to ``torch.load`` as ``map_location``.
        run_name: run identifier; the checkpoint file is
            ``lf_model_{run_name}.pth.tar``.
        models_dir: directory holding the checkpoints (defaults to the
            previously hard-coded ``../models``).

    Returns:
        The same ``lf_model`` object, with weights loaded in place.
    """
    lf_path = os.path.join(models_dir, f'lf_model_{run_name}.pth.tar')
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    lf_checkpoint = torch.load(lf_path, map_location=device)
    # The checkpoint is a dict whose 'state_dict' entry holds the encoder weights.
    lf_model.lf_model.load_state_dict(lf_checkpoint['state_dict'])
    return lf_model

In [12]:
# Restore the fine-tuned encoder weights for both variants.
# NOTE(review): model 1 loads run 'floating-snake-10', whereas the predictions
# loaded at the top of the notebook come from 'pious-flower-245' — confirm this
# is the intended checkpoint.
lf_model1 = load_weights(lf_model=lf_model1, device=device, run_name='floating-snake-10')
lf_model2 = load_weights(lf_model=lf_model2, device=device, run_name='comic-star-62')

In [13]:
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stopword corpus (skipped by NLTK when it is already up to date).
nltk.download('stopwords')
# English stopword list; only referenced from commented-out code in this notebook.
STOPWORDS = stopwords.words('english')

[nltk_data] Downloading package stopwords to /Users/chief-
[nltk_data]     blackhood/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
import numpy as np
# Defined once (the original cell assigned the identical value twice).
# NOTE(review): not referenced by the functions in this cell — presumably left
# over from the script this visualisation code was adapted from; confirm before removing.
latex_special_token = ["!@#$%^&*()"]

def generate(text_list, attention_list, latex_file, color='red', rescale_value = False):
	"""Write a standalone LaTeX document that shades every word by its score.

	Args:
		text_list: tokens, one per score (WordPiece continuations start with "##").
		attention_list: one numeric score per token.
		latex_file: path of the .tex file to (over)write.
		color: colour name used for the shading.
		rescale_value: when true, scores are first passed through ``rescale``.

	Raises:
		AssertionError: if the two lists differ in length.
	"""
	assert(len(text_list) == len(attention_list))
	if rescale_value:
		attention_list = rescale(attention_list)
	# Scores at or below the threshold are rendered without any shading.
	attention_list = [x if x > 0.0001 else 0 for x in attention_list]
	text_list = clean_word(text_list)

	# clean_word escapes "#", so a WordPiece continuation "##xyz" now starts with
	# the four characters \#\# . A raw string avoids the invalid "\#" escape.
	continuation_prefix = r'\#\#'
	merged_attention = []
	merged_text = []
	ind = 0
	while ind < len(attention_list):
		cur_word = [text_list[ind]]
		attention_score = attention_list[ind]
		# Glue continuation pieces onto the current word; the word keeps the
		# highest score among its pieces.
		while ind + 1 < len(attention_list) and text_list[ind + 1].startswith(continuation_prefix):
			cur_word.append(text_list[ind + 1][len(continuation_prefix):])
			attention_score = max(attention_score, attention_list[ind + 1])
			ind += 1
		ind += 1
		merged_attention.append(attention_score)
		merged_text.append("".join(cur_word))

	# Drop the tokenizer's structural tokens from the rendering.
	special_tokens = ('[CLS]', '[SEP]', '[PAD]')
	attention_list = []
	text_list = []
	for word, score in zip(merged_text, merged_attention):
		if word not in special_tokens:
			attention_list.append(score)
			text_list.append(word)

	# NOTE(review): the score is used as the percentage in "<color>!<score>";
	# xcolor presumably expects 0..100 there, so unscaled scores outside that
	# range may not compile — confirm or call with rescale_value=True.
	# The document declares UTF8 input, so the file is written as UTF-8.
	with open(latex_file, 'w', encoding='utf-8') as f:
		f.write(r'''\documentclass[varwidth]{standalone}
\special{papersize=210mm,297mm}
\usepackage{color}
\usepackage{tcolorbox}
\usepackage{CJK}
\usepackage{adjustbox}
\tcbset{width=0.9\textwidth,boxrule=0pt,colback=red,arc=0pt,auto outer arc,left=0pt,right=0pt,boxsep=5pt}
\begin{document}
\begin{CJK*}{UTF8}{gbsn}'''+'\n')
		string = r'''{\setlength{\fboxsep}{0pt}\colorbox{white!0}{\parbox{0.9\textwidth}{'''+"\n"
		for word, score in zip(text_list, attention_list):
			string += "\\colorbox{%s!%s}{"%(color, score)+"\\strut " + word+"}\n"
		string += "\n}}}"
		f.write(string+'\n')
		f.write(r'''\end{CJK*}
\end{document}''')

def rescale(input_list):
	"""Linearly map the values of ``input_list`` onto the range 0..100.

	The minimum becomes 0 and the maximum 100. Empty input returns ``[]``.
	Input whose values are all equal returns zeros, since there is nothing to
	differentiate (the plain formula would divide by zero and yield NaN).

	Returns:
		A (nested) Python list with the same shape as the input.
	"""
	the_array = np.asarray(input_list)
	if the_array.size == 0:
		return []
	the_min = np.min(the_array)
	value_range = np.max(the_array) - the_min
	if value_range == 0:
		return np.zeros(the_array.shape).tolist()
	return ((the_array - the_min) / value_range * 100).tolist()


def clean_word(word_list):
	"""Escape LaTeX-special characters in every word of ``word_list``.

	Each character is translated exactly once, so an escape that was just
	inserted is never escaped again. "#" becomes backslash-hash; ``generate``
	relies on that to recognise WordPiece "##" continuations.

	Returns:
		A new list of escaped words, in the same order.
	"""
	latex_escapes = {
		# "\\" is a line break and "\^" an accent command in LaTeX, so a literal
		# backslash and caret need these forms instead.
		"\\": r"\textbackslash{}",
		"^": r"\^{}",
		"%": r"\%",
		"&": r"\&",
		"#": r"\#",
		"_": r"\_",
		"{": r"\{",
		"}": r"\}",
		# "$" would open math mode and "~" is a non-breaking space.
		"$": r"\$",
		"~": r"\textasciitilde{}",
	}
	return ["".join(latex_escapes.get(ch, ch) for ch in word) for word in word_list]




In [59]:
# Inspect the text fields of one test example (row position 1).
sample_row = test_df.iloc[1]
for column in (
    'comment',
    'title',
    'desc',
    'transcript_size_increase_to_copy_stuff_easily',
    'key_phrases_other_comments',
):
    print(sample_row[column])

Add some thin sliced Salmon, maybe tuna or even beef and this would be wonderful.
Colorful Beautiful Vege Roll (Vegan)
ingredientsasparagusokraavocadooba japanese mint cucumberred beetsyellow beets kaiware sproutyamagobo zuke pickled burdock japanese leaf mint leaf kaiware pickled sproutyamagobo zuke pickled burdock cucumberred zuke japanese mint beetsyellow ingredientsasparagusokraavocadooba mint leaf ingredientsasparagusokraavocadooba japanese kaiware zuke pickled cucumberred leaf kaiware sproutyamagobo burdock beets sproutyamagobo pickled burdock cucumberred burdock cucumberred beetsyellow japanese mint leaf leaf kaiware sproutyamagobo zuke pickled burdock mint kaiware sproutyamagobo zuke cucumberred beetsyellow beetsyellow beets
nan
racist vote biden wallet youtube deleted ac transit better california hemorrhage footage barry zeal shown decades cohabitate violation share facebook help media doesn want help assassinating make joe unbelievable motherfucker earlobes chin new differenc

In [56]:
def _text_or_empty(value):
    """Return ``value`` if it is a string, otherwise ``""``.

    Missing CSV cells come back from pandas as float NaN, which the tokenizer
    cannot encode.
    """
    return value if isinstance(value, str) else ""


# The .tex files are written into this directory; create it if needed.
os.makedirs('./bert_attention', exist_ok=True)

# For every test item that model 1 got wrong and model 2 got right, render the
# per-token scores of both encoders to LaTeX.
# `id` shadows the builtin; the name is kept because later cells read it.
for id in mistakes_corrected_by_model2:
    row = test_df.iloc[id]
    label = row['label']
    comments = [row['comment']]
    titles = [_text_or_empty(row['title'])]
    descriptions = [_text_or_empty(row['desc'])]
    transcripts = [_text_or_empty(row['transcript_size_increase_to_copy_stuff_easily'])]
    other_comments = [_text_or_empty(row['key_phrases_other_comments'])]

    color = 'red'
    for vision_tag, module in (('without', lf_model1), ('with', lf_model2)):
        input_ids = module.get_embeddings(comments, titles, descriptions, transcripts, other_comments)
        # Inference only: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            attention = module.lf_model(input_ids)[-1]
        # NOTE(review): the encoder is created with output_hidden_states=True (not
        # output_attentions), so [-1] is presumably the hidden-states tuple and this
        # is a per-token sum over the last layer's features rather than attention
        # weights — confirm.
        attention = attention[-1].squeeze().sum(axis=1).tolist()
        input_id_list = input_ids[0].tolist() # Batch index 0
        words = module.lf_tokenizer.convert_ids_to_tokens(input_id_list)

        generate(words, attention, f"./bert_attention/attention_{vision_tag}_vision_{id}_{label}.tex", color, rescale_value=False)

In [47]:
# Re-tokenize the example left over from the last loop iteration with model 1.
input_ids = lf_model1.get_embeddings(
    comments=comments,
    titles=titles,
    descriptions=descriptions,
    transcripts=transcripts,
    other_comments=other_comments,
)

In [48]:
# Per-token scores of model 1 for the current `input_ids`: last element of the
# encoder output, last entry of that, summed over axis 1 after squeezing.
attention = lf_model1.lf_model(input_ids)[-1][-1].squeeze().sum(axis=1).tolist()

# Tokens of the first (only) example in the batch.
input_id_list = input_ids[0].tolist()
words = tokens = lf_model1.lf_tokenizer.convert_ids_to_tokens(input_id_list)
word_num = len(words)

# `id` is the index left over from the loop cell above.
color = 'red'
generate(words, attention, f"attention_without_vision_{id}.tex", color, rescale_value=False)

In [49]:
# Same rendering as the previous cell, using model 2's tokenizer and encoder.
input_ids = lf_model2.get_embeddings(
    comments=comments,
    titles=titles,
    descriptions=descriptions,
    transcripts=transcripts,
    other_comments=other_comments,
)
attention = lf_model2.lf_model(input_ids)[-1][-1].squeeze().sum(axis=1).tolist()

# Tokens of the first (only) example in the batch.
input_id_list = input_ids[0].tolist()
words = tokens = lf_model2.lf_tokenizer.convert_ids_to_tokens(input_id_list)
word_num = len(words)

# `id` is the index left over from the loop cell above.
color = 'red'
generate(words, attention, f"attention_with_vision_{id}.tex", color, rescale_value=False)