In [1]:
import transformers
import torch
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import sentence_transformers
import nltk
import datasets

In [2]:
samsum_val = pd.read_csv("/projects/anra7539/projects/representation_efficacy/SamSum_val.csv")
samsum_val['only_amr'] = samsum_val.amr.map(lambda x: "\n".join(x.split("\n")[1:]))

In [3]:
name = "microsoft/Phi-3-mini-128k-instruct"
device = "cuda"

model = transformers.AutoModelForCausalLM.from_pretrained(name,
                                                          load_in_8bit = True,
                                                          trust_remote_code = True,
                                             device_map = device,
                                             cache_dir='/scratch/alpine/anra7539')

tokenizer = transformers.AutoTokenizer.from_pretrained(name, truncation_side = "left")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
def summarization(dialogue, amr, prompt):
    with torch.no_grad():
        input_text = f'''{prompt}\n\n### Conversation:\n{dialogue}\n\n### AMR:\n{amr}\n\n### Summary:'''
        input_tokens = tokenizer(input_text, return_tensors = "pt", truncation = True, 
                                 max_length = 8192).to(device)
        outputs = model.generate(**input_tokens, 
                                 max_new_tokens=64, 
                                 temperature=0.1, 
                                 top_p=0.9, 
                                 repetition_penalty=1.1, 
                                 pad_token_id=tokenizer.eos_token_id, do_sample=True)        
        
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True) 
        summary = full_response.split("### Summary:")[4].strip().split("\n")[0].strip()
        
        # summary = (
        #     full_response.split("### Summary:")[4]
        #     .split("\n")[1]
        #     .split("\n")[0]
        #     .strip()                 
        #     .rsplit(".", 1)[0] + "." 
        #     if "." in full_response.split("### Summary:")[4][:64] 
        #     else full_response.split("### Summary:")[4].split("\n")[1].split("\n")[0].strip()
        # )
    return summary

In [5]:
examples = samsum_val.loc[[142, 319, 252]]
example_text = "\n\n".join([f"### Conversation:\n{row['dialogue']}\n### AMR:\n{row['only_amr']}### Summary:\n{row['summary']}"
    for _, row in examples.iterrows()
])

In [6]:
prompt = f'''Summarize the following conversation in 1-2 sentences.
Ensure the summary captures the core intent, actions, and resolution.
Avoid examples, quotes, or minor details.
You may also use the provided Abstract Meaning Representation (AMR) structure of the conversation to your aid.

{example_text}'''

In [7]:
output_file = '/projects/anra7539/projects/representation_efficacy/samsum_amr_summarization_qphi3_3shot/generated_summaries.json'

if os.path.exists(output_file):
    with open(output_file, 'r') as f:
        try:
            existing_data = [json.loads(line) for line in f]
        except json.JSONDecodeError:
            existing_data = []
else:
    existing_data = []

In [8]:
processed_indices = {item['index'] for item in existing_data}

In [9]:
len(processed_indices)

0

In [10]:
with open(output_file, 'a') as f:
    for i in tqdm(range(len(samsum_val))):
        if i in processed_indices:
            continue 

        summary = summarization(samsum_val.dialogue[i], samsum_val.only_amr[i], prompt)
        
        result = {
            "index": i,
            "dialogue": samsum_val.dialogue[i],
            "summary": samsum_val.summary[i],
            "prediction": summary
        }

        f.write(json.dumps(result) + "\n")
        f.flush()

  0%|          | 0/818 [00:00<?, ?it/s]The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.
  0%|          | 0/818 [00:01<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.13 GiB. GPU 0 has a total capacity of 19.50 GiB of which 277.88 MiB is free. Process 1853209 has 16.93 GiB memory in use. Including non-PyTorch memory, this process has 19.18 GiB memory in use. Of the allocated memory 17.03 GiB is allocated by PyTorch, and 1.94 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Results

In [2]:
output_file = '/projects/anra7539/projects/representation_efficacy/samsum_amr_summarization_qphi3_3shot/generated_summaries.json'

with open(output_file, 'r') as f:
    data = [json.loads(line) for line in f]

summary_dataset = pd.DataFrame(data)

FileNotFoundError: [Errno 2] No such file or directory: '/projects/anra7539/projects/representation_efficacy/samsum_amr_summarization_qphi3_3shot/generated_summaries.json'

In [None]:
summary_dataset.shape

### Average F1-score

In [16]:
summary_dataset.head()

Unnamed: 0,index,dialogue,summary,prediction
0,0,"A: Hi Tom, are you busy tomorrow’s afternoon?\...",A will go to the animal shelter tomorrow to ge...,Tom agreed to go to the animal shelter with A ...
1,1,Emma: I’ve just fallen in love with this adven...,Emma and Rob love the advent calendar. Lauren ...,Emma wants to buy an Advent calendar for her k...
2,2,Jackie: Madison is pregnant\r\nJackie: but she...,Madison is pregnant but she doesn't want to ta...,Iggy thinks Madison might be worried about bei...
3,3,Marla: <file_photo>\r\nMarla: look what I foun...,Marla found a pair of boxers under her bed.,A mystery involving Marla's room and a pair of...
4,4,Robert: Hey give me the address of this music ...,Robert wants Fred to send him the address of t...,Fred gave Robert the address of the music shop...


In [17]:
def f1_score_strings(str1, str2):
    tokens1 = set(str1.lower().split())
    tokens2 = set(str2.lower().split())
    
    true_positives = len(tokens1 & tokens2)  
    false_positives = len(tokens1 - tokens2)  
    false_negatives = len(tokens2 - tokens1)  
    
    precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0
    
    if precision + recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0
    
    return f1

In [18]:
summary_dataset['f1_scores'] = summary_dataset.apply(lambda x: f1_score_strings(x['summary'], x['prediction']), axis = 1)

In [19]:
np.mean(summary_dataset.f1_scores)

0.35257828612956377

## Cosine similarity

In [20]:
similarity_model = sentence_transformers.SentenceTransformer('all-MiniLM-L6-v2')


def sent_similarity(str1, str2):
    embedding1 = similarity_model.encode(str1.lower())
    embedding2 = similarity_model.encode(str2.lower())
    
    return sentence_transformers.util.cos_sim(embedding1, embedding2)

In [21]:
summary_dataset['cosine_similarity'] = summary_dataset.apply(lambda x: sent_similarity(x['summary'], x['prediction']), axis = 1)

In [22]:
np.mean(summary_dataset.cosine_similarity)

0.7403522968292237

## ROUGE

In [20]:
from rouge_score import rouge_scorer

In [21]:
def compute_rouge_scores(reference_text, generated_text):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_text.lower(), generated_text.lower())
    return scores

In [22]:
summary_dataset['rouge_scores'] = summary_dataset.apply(lambda x: compute_rouge_scores(x['summary'], x['prediction']), axis = 1)
summary_dataset['rouge_1'] = summary_dataset.rouge_scores.map(lambda x: x['rouge1'].fmeasure)
summary_dataset['rouge_2'] = summary_dataset.rouge_scores.map(lambda x: x['rouge2'].fmeasure)
summary_dataset['rouge_l'] = summary_dataset.rouge_scores.map(lambda x: x['rougeL'].fmeasure)

In [23]:
print(f"ROUGE-1 score = {np.mean(summary_dataset.rouge_1)}")
print(f"ROUGE-2 score = {np.mean(summary_dataset.rouge_2)}")
print(f"ROUGE-L score = {np.mean(summary_dataset.rouge_l)}")

ROUGE-1 score = 0.2566198995545675
ROUGE-2 score = 0.0710760057483723
ROUGE-L score = 0.19505382890634784


## BLEU

In [24]:
from nltk.translate.bleu_score import sentence_bleu

In [25]:
summary_dataset['bleu_scores'] = summary_dataset.apply(lambda x: sentence_bleu([nltk.word_tokenize(x['summary'].lower())], 
                                                                                         nltk.word_tokenize(x['prediction'].lower())), axis = 1)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [26]:
print(np.mean(summary_dataset.bleu_scores))

0.0054886736670671675
