# Initialize Packages and Load Dataset

In [None]:
import warnings
warnings.filterwarnings('ignore')
import torch
torch.cuda.empty_cache()

In [None]:
from sklearn.model_selection import KFold
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets
from bert_score import score as bert_score
import datasets
import pandas as pd
import os
import logging
import nltk
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate
from random import sample
import random

In [None]:
from datasets import load_from_disk

dataset = load_from_disk('data/decomposed/decomposed_test')

# Check Test Dataset + Add Tokenizer Function

In [None]:
from typing import List, Dict

def tokenization_with_answer(examples, tokenizer):
    inputs = []
    
    task_prefix = "Given a query and a table, generate a summary that answers the query based on the information in the table: "

    for i, (query, table, summary) in enumerate(zip(examples['query'], examples['table'], examples['summary'])):
        flattened_table = flatten_table(table, i)
        input_text = f"{task_prefix} Table {flattened_table}. Query: {query}"

        inputs.append(input_text)
        
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True,padding='max_length')
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=512, truncation=True)
    
    model_inputs["labels"] = labels["input_ids"] 

    return model_inputs


def flatten_table(table: Dict, row_index: int) -> str:
    header = table.get('header', [])
    rows = table.get('rows', [])
    title = table.get('title', [])

    flattened_rows = []
    for i, row in enumerate(rows):
        row_text = f"Row {i}, " + ",".join([f"{col}:{val}" for col, val in zip(header, row)])
        flattened_rows.append("## "+row_text)

    flattened_table = f"Title: {' '.join(map(str, title))}" + " " + " ".join(flattened_rows)
    return flattened_table

In [None]:
def generate_predictions(examples, tokenizer, model):
    generated_texts = []
    for example in examples:
        
        # Intial tokenization
        input_text = f"query:  {example['query']} answer: {example['answers']} header: {' '.join(map(str, example['table'].get('header', [])))} rows: {' '.join(map(str, example['table'].get('rows', [])))} title: {' '.join(map(str, example['table'].get('title', [])))}"
        input_ids = tokenizer.encode(input_text, return_tensors="pt")
        
        # Generate text and decode
        output_sequences = model.generate(input_ids)
        generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
        
        # Add to list of generated text
        generated_texts.append(generated_text)
    
    return generated_texts

In [None]:
dataset

# Reduce it for testing
random_indices = random.sample(range(len(dataset)), 20)
dataset = dataset.select(random_indices)
print(dataset)

## Trainer Creation Function

In [None]:
def createTrainer(model, tokenzier):
    data_collator = DataCollatorForSeq2Seq(tokenzier, model)
    
    # Not needed, but trainer requires it even if not used
    train_args = Seq2SeqTrainingArguments(
        output_dir="./train_weights_t5",
        learning_rate=3e-5,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        num_train_epochs=1,
        evaluation_strategy="steps",
        save_strategy = "steps",
        eval_steps=200,
        save_steps=200,
        weight_decay=0.01,
        save_total_limit=5,
        warmup_ratio=0.05,
        load_best_model_at_end=True,
        predict_with_generate=True,
        overwrite_output_dir= True,
        gradient_accumulation_steps = 2
    )

    return Seq2SeqTrainer(
        model=model,
        args=train_args,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

# Load in models

In [None]:
from transformers import GPT2Tokenizer, GPT2Model
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import BartTokenizer, BartForConditionalGeneration

# # gpt2
# tokenizer_gpt2 = GPT2Tokenizer.from_pretrained("gpt2")
# model_gpt2 = GPT2Model.from_pretrained("gpt2")

# t5 small
tokenizer_t5 = T5Tokenizer.from_pretrained("t5-small")
model_t5 = T5ForConditionalGeneration.from_pretrained("t5-small")

# flan t5
tokenizer_flant5 = T5Tokenizer.from_pretrained("google/flan-t5-small")
model_flant5 = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")

# Bart
tokenizer_bart = BartTokenizer.from_pretrained("facebook/bart-base")
model_bart = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

In [None]:
models_and_tokenizers_with_names = [
    # ("GPT2", tokenizer_gpt2, model_gpt2),
    ("T5 Small", tokenizer_t5, model_t5),
    ("FLAN-T5 Small", tokenizer_flant5, model_flant5),
    ("BART Base", tokenizer_bart, model_bart)
]

# Make predictions using each Model on Test Data

In [None]:
model_predictions = {}

for name, tokenizer, model in models_and_tokenizers_with_names:
    print(f"Model: {name}")
    predictions = generate_predictions(dataset, tokenizer, model)
    model_predictions[name] = predictions

In [18]:
print(model_predictions)

{'T5 Small': ['Revelations', "'13', '13', '23', '1", 'Alan And Lia - Jay', "'Purdue', '52', '56',", 'True', "'2', '1', '0', '0'", 'Montgomery Renee Montgomery', "'13,837,259', '2,724,975 (", 'John Putch', 'True', 'True', 'True', '67, 67, 71, 75, 75, 75, 75, 75,', '121', "'22'] ['Kansas State Capitol', '5'", 'Miss New Hampshire', "'3', 'October 4, 1953', 'Philadelphia Eagle", "'40', '.512', '24–17',", '2015', "'1949', '30,866']"], 'FLAN-T5 Small': ['Revelations: The Revelations of Michael Hurst', '', 't a k e M e O u t ( U', 'a n f o r d', 't i s t i c s', 't e p h e n K i n g', 'o n e c t i c u t H', 'i n s u s u s e s', 'i s t o f U g l y B', 'o n f e r e a s o', 'w a t c h ( T v c h', 't a d i u m', 'a l C a n a d i e n', 'a t P u n j a b C r', '40 Corporate Woods', 's s N e w H a m p s', 'd u l e v e l a n', 's e a s o n s t a', 'o d e o C o n t e s', 'i s t o f u n i v'], 'BART Base': ['query:  Summarize the basic information of those episodes of Blood and Sand written', 'query:  S

### Choosing Best Answer

In [37]:
from rouge_score import rouge_scorer
from bert_score import score
import numpy as np

def select_best_guess(models_and_tokenizers_with_names, dataset, model_predictions, weights=(0.5, 0.5)):
    weight_for_rouge, weight_for_bert = weights
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    
    best_guesses = []

    for i, example in enumerate(dataset):
        best_score = -np.inf
        best_guess_info = {}
        target_answer = example['summary']
        
        for name, _, _ in models_and_tokenizers_with_names:
            prediction = model_predictions[name][i]
            rouge_scores = scorer.score(target_answer, prediction)
            rouge_score_avg = np.mean([rouge_scores['rouge1'].fmeasure, rouge_scores['rougeL'].fmeasure])

            _, _, bert_scores = score([prediction], [target_answer], lang="en", verbose=False)
            bert_score = bert_scores.mean().item()

            # Calculate combined score based on specified weights
            combined_score = (weight_for_rouge * rouge_score_avg) + (weight_for_bert * bert_score)

            if combined_score > best_score:
                best_score = combined_score
                best_guess_info = {
                    'model': name,
                    'best_guess': prediction,
                    'query': target_answer
                }

        best_guesses.append(best_guess_info)
    
    return best_guesses

In [38]:
best_guesses = select_best_guess(models_and_tokenizers_with_names, dataset, model_predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

In [39]:
for guess in best_guesses:
    print(guess)

{'model': 'BART Base', 'best_guess': 'query:  Summarize the basic information of those episodes of Blood and Sand written', 'query': 'Brent Fletcher wrote for four episodes of Blood and Sand: Legends, Great and Unfortunate Things, Party Favors and Revelations. Directed by Grady Hall, Jesse Warn, Chris Martin - Jones and Michael Hurst respectively. Party Favors and Revelations aired on March 26th and April 9th of 2010 with Production Code numbers Sps110 and Sps112. They respectively achieved 1.27 million and 1.29 million views in the U.S. Great and Unfortunate Things also aired during March 2010 with production code Sps107 and 0.97 million views in the US while Legends was shown in February 2010 with Production Code Sps103, achieving a view count of 0.86million in the US.'}
{'model': 'BART Base', 'best_guess': 'query:  Summarize the basic information of the game(s) between Buffalo', 'query': 'The Buffalo Bills faced the New England Patriots twice in their 1973 season. The first game was