In [1]:
import transformers
import torch
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import sentence_transformers
import nltk

In [2]:
torch.cuda.device_count()

1

## Obtaining reconstructions

In [3]:
with open('amr-release-3.0-amrs-bolt.txt', "r") as f:
  data = f.read()

In [4]:
text_amr_pairs = data.split("::snt")[1:]

In [5]:
texts = []
amrs = []

for i in tqdm(range(len(text_amr_pairs))):
    try:
        texts.append(text_amr_pairs[i].split("\n#")[0].strip())
        amrs.append(text_amr_pairs[i].split("\n#")[1].split(".txt\n")[1].strip())
    except:
        pass

100%|██████████| 1327/1327 [00:00<00:00, 431092.98it/s]


In [6]:
name = "meta-llama/Meta-Llama-3-8B-Instruct"

In [7]:
device = "cuda"

model = transformers.AutoModelForCausalLM.from_pretrained(name,
                                                          load_in_8bit = True,
                                                          trust_remote_code = True,
                                                          torch_dtype = torch.bfloat16,
                                             device_map = device,
                                             cache_dir='/scratch/alpine/anra7539')

tokenizer = transformers.AutoTokenizer.from_pretrained(name, padding_side="left", truncation_side = "left")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
def reconstruct_text(amr, prompt):
    with torch.no_grad():
        input_text = f'''{prompt}\nAMR:{amr}\nText:'''
        input_tokens = tokenizer(input_text, return_tensors = "pt", truncation = True, max_length = 4096).to(device)
    
        outputs = model.generate(**input_tokens, max_new_tokens = 200, pad_token_id = tokenizer.eos_token_id)
    
        answer = tokenizer.decode(outputs[0], skip_special_tokens = True).split("Text:")[11].split("\n")[0].strip()
    return answer

In [9]:
few_shot_examples = np.random.randint(1000, size = 5)

In [10]:
examples = "\n".join([f"AMR:{amrs[i]}\nText:{texts[i]}" for i in few_shot_examples])

prompt = f'''Based on the 5 examples below, return the original text of the given AMR:\n\n{examples}'''

In [17]:
reconstructed_texts = []
for i in tqdm(amrs[300:]):
    reconstructed_texts.append(reconstruct_text(i, prompt))

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
 44%|████▍     | 450/1027 [3:18:59<4:15:09, 26.53s/it]


IndexError: list index out of range

In [None]:
result_df = pd.DataFrame({"original_text":texts[300:], "reconstructed_text":reconstructed_texts})
result_df.to_csv('/projects/anra7539/projects/representation_efficacy/reconstructed_from_amrs_qllama3/reconstructed_texts_300+.csv', index = False)

## Reconstruction results

In [2]:
files = os.listdir('/projects/anra7539/projects/representation_efficacy/reconstructed_from_amrs_qllama3/')
files.remove('.ipynb_checkpoints')
data = []
for file in files:
    data.append(pd.read_csv('/projects/anra7539/projects/representation_efficacy/reconstructed_from_amrs_qllama3/'+file))

In [3]:
full_reconstructions = pd.concat(data, ignore_index = True)[['original_text', 'reconstructed_text']]

### Average F1-score

In [5]:
def f1_score_strings(str1, str2):
    tokens1 = set(str1.lower().split())
    tokens2 = set(str2.lower().split())
    
    true_positives = len(tokens1 & tokens2)  
    false_positives = len(tokens1 - tokens2)  
    false_negatives = len(tokens2 - tokens1)  
    
    precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0
    
    if precision + recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0
    
    return f1

In [7]:
full_reconstructions['f1_scores'] = full_reconstructions.apply(lambda x: f1_score_strings(x['original_text'], x['reconstructed_text']), axis = 1)

In [8]:
np.mean(full_reconstructions.f1_scores)

0.41747350277633133

### Sentence Transformer embedding similarity

In [11]:
similarity_model = sentence_transformers.SentenceTransformer('all-MiniLM-L6-v2')


def sent_similarity(str1, str2):
    embedding1 = similarity_model.encode(str1)
    embedding2 = similarity_model.encode(str2)
    
    return sentence_transformers.util.cos_sim(embedding1, embedding2)



In [12]:
full_reconstructions['cosine_similarity'] = full_reconstructions.apply(lambda x: sent_similarity(x['original_text'], x['reconstructed_text']), axis = 1)

In [13]:
np.mean(full_reconstructions.cosine_similarity)

0.7805601693493311

### ROUGE scores

In [4]:
from rouge_score import rouge_scorer

In [5]:
def compute_rouge_scores(reference_text, generated_text):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_text, generated_text)
    return scores

In [12]:
full_reconstructions['rouge_scores'] = full_reconstructions.apply(lambda x: compute_rouge_scores(x['original_text'], x['reconstructed_text']), axis = 1)
full_reconstructions['rouge_1'] = full_reconstructions.rouge_scores.map(lambda x: x['rouge1'].fmeasure)
full_reconstructions['rouge_2'] = full_reconstructions.rouge_scores.map(lambda x: x['rouge2'].fmeasure)
full_reconstructions['rouge_l'] = full_reconstructions.rouge_scores.map(lambda x: x['rougeL'].fmeasure)

In [14]:
print(f"ROUGE-1 score = {np.mean(full_reconstructions.rouge_1)}")
print(f"ROUGE-2 score = {np.mean(full_reconstructions.rouge_2)}")
print(f"ROUGE-L score = {np.mean(full_reconstructions.rouge_l)}")

ROUGE-1 score = 0.5850877843608724
ROUGE-2 score = 0.23063268408945545
ROUGE-L score = 0.42714075456098805


## BLEU

In [5]:
from nltk.translate.bleu_score import sentence_bleu

In [6]:
full_reconstructions['bleu_scores'] = full_reconstructions.apply(lambda x: sentence_bleu([nltk.word_tokenize(x['original_text'].lower())], 
                                                                                         nltk.word_tokenize(x['reconstructed_text'].lower())), axis = 1)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [7]:
print(np.mean(full_reconstructions.bleu_scores))

0.08681297802999965
