In [None]:
from huggingface_hub import notebook_login

# Runs huggingface_hub's notebook_login helper (presumably an interactive Hub
# login prompt — confirm). The training arguments later in this notebook set
# push_to_hub=True.
notebook_login()

In [1]:
import pickle
import pandas as pd
import sys
import os
# Root of the project checkout. The default is the original author's local
# path; set the AQA_REPO_ROOT environment variable to run the notebook from a
# different machine without editing this cell.
REPO_ROOT = os.environ.get(
    'AQA_REPO_ROOT',
    '/Users/natalipeeva/Documents/GitHub/Automatic-Answering-of-City-Council-Questions/',
)
src_dir = os.path.join(REPO_ROOT, 'src')

# Make the project's `src` package importable. Guarded so that re-running the
# cell does not keep appending duplicate entries to sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)
from text_generation import preprocess_ranking_results

# Prepare data for mT5

In [None]:
# Short alias for the project helper module used by every step of this cell.
prr = preprocess_ranking_results

# Load the pickled TF-IDF ranking results and split out the top-k selections.
tfidf_results = prr.load_tfidf_results(
    '/content/drive/MyDrive/AQA-Data/tfidf_results1000.pickle'
)
top_1, top_5, top_10 = prr.get_top_results(tfidf_results)

# Build the content list from the top-1 results and the collected reference URLs.
urls = prr.load_reference_urls('/content/reference_urls_collected.csv')
content_list = prr.get_content_list(top_1, urls)

# Build the answer list from the ranking results and the questions file.
questions = prr.load_questions('/content/drive/MyDrive/AQA-Data/questions.csv')
answer_list = prr.get_answer_list(tfidf_results, questions)

# Combine contents and answers into a frame, then convert it to a dataset
# (later cells read its "text" and "summary" columns).
text_summary_df = prr.create_text_summary_df(content_list, answer_list)
text_summary = prr.convert_to_dataset(text_summary_df)

# Apply mT5

In [None]:
# Split into train/test partitions (20% test, fixed seed for reproducibility).
# The result is rebound to the same name, so on a second run of this cell
# `text_summary` is already the split object; only split while the object
# still exposes `train_test_split`, which keeps the cell safe to re-run.
if hasattr(text_summary, "train_test_split"):
    text_summary = text_summary.train_test_split(test_size=0.2, seed=42)

# Pre-process

In [None]:
# Pre-process
from transformers import AutoTokenizer

# Name of the pretrained checkpoint; reused below for the data collator and
# for loading the model weights in the training cell.
checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Task prefix that preprocess_function prepends to every input document.
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of examples for sequence-to-sequence training.

    Args:
        examples: Batch mapping with a "text" column (source documents) and a
            "summary" column (target texts).

    Returns:
        The tokenizer output for the prefixed documents, with the target token
        ids stored under the "labels" key.
    """
    # Every source document gets the module-level task prefix in front of it.
    prefixed_docs = list(map(lambda doc: prefix + doc, examples["text"]))
    encoded = tokenizer(prefixed_docs, max_length=1024, truncation=True)

    # Targets are tokenized separately and capped at 128 tokens.
    target_encoding = tokenizer(
        text_target=examples["summary"],
        max_length=128,
        truncation=True,
    )
    encoded["labels"] = target_encoding["input_ids"]

    return encoded

# Apply preprocess_function to the dataset in batches; the result carries the
# tokenized inputs together with the "labels" produced by that function.
tokenized_summaries = text_summary.map(preprocess_function, batched=True)

from transformers import DataCollatorForSeq2Seq

# NOTE(review): `model` receives the checkpoint name string here, not a loaded
# model object (the model is only instantiated in the training cell) — confirm
# this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

# Evaluate

In [None]:
# Evaluate

import evaluate

# ROUGE metric object from the `evaluate` package; referenced by the first
# compute_metrics definition below.
rouge = evaluate.load("rouge")

import numpy as np


def compute_metrics(eval_pred):
    """Decode predictions and labels, then report ROUGE and mean output length.

    NOTE: a second ``compute_metrics`` is defined further down in this cell and
    rebinds the name, so that later definition is the one the trainer receives.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays.

    Returns:
        dict of the values returned by ``rouge.compute`` plus ``"gen_len"``,
        each rounded to four decimals.
    """
    predictions, labels = eval_pred
    pad_id = tokenizer.pad_token_id

    pred_texts = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Label entries equal to -100 are swapped for the pad token id before decoding.
    decodable_labels = np.where(labels != -100, labels, pad_id)
    label_texts = tokenizer.batch_decode(decodable_labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=pred_texts,
        references=label_texts,
        use_stemmer=True,
    )

    # Length of a prediction = number of token ids that are not the pad id.
    lengths = []
    for pred in predictions:
        lengths.append(np.count_nonzero(pred != pad_id))
    result["gen_len"] = np.mean(lengths)

    rounded = {}
    for key, value in result.items():
        rounded[key] = round(value, 4)
    return rounded


import numpy as np
from bert_score import score as bert_score
from rouge_score import rouge_scorer

def compute_metrics(eval_pred):
    """Compute ROUGE, BERTScore and mean prediction length for an evaluation run.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. Entries
            equal to -100 are replaced by the pad token id before decoding.

    Returns:
        dict with plain-float values under the keys ``rouge1``, ``rouge2``,
        ``rougeL`` (mean F-measure over all prediction/reference pairs),
        ``bert_precision``, ``bert_recall``, ``bert_f1`` (mean BERTScore) and
        ``avg_prediction_len`` (mean number of non-pad prediction tokens).
    """
    predictions, labels = eval_pred
    pad_id = tokenizer.pad_token_id

    # -100 cannot be decoded by the tokenizer; map it to the pad id on both
    # arrays (the original did this for the labels only).
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Calculate ROUGE scores. RougeScorer.score works on ONE (target, prediction)
    # pair of strings, so score each pair and average instead of passing lists.
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
    pair_scores = [
        scorer.score(reference, prediction)
        for prediction, reference in zip(decoded_preds, decoded_labels)
    ]

    def _mean_fmeasure(rouge_type):
        """Average F-measure of one ROUGE type over all scored pairs."""
        if not pair_scores:
            return 0.0
        return float(np.mean([pair[rouge_type].fmeasure for pair in pair_scores]))

    # Calculate BERT scores. Uses lang='nl', matching the final evaluation cell
    # of this notebook (the previous 'en' was inconsistent with it).
    P, R, F1 = bert_score(decoded_preds, decoded_labels, lang='nl', verbose=False)

    # Calculate average prediction length (non-pad tokens per prediction)
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    avg_prediction_len = float(np.mean(prediction_lens))

    metrics = {
        'rouge1': _mean_fmeasure('rouge1'),
        'rouge2': _mean_fmeasure('rouge2'),
        'rougeL': _mean_fmeasure('rougeL'),
        'bert_precision': P.mean().item(),
        'bert_recall': R.mean().item(),
        'bert_f1': F1.mean().item(),
        'avg_prediction_len': avg_prediction_len
    }

    return metrics

# Train

In [None]:
# Train
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Start from the pretrained weights named by `checkpoint`.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Training hyper-parameters, gathered in one mapping so they can be inspected
# or tweaked before the arguments object is built.
training_options = dict(
    output_dir="qa_1000_samples",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=False,
    push_to_hub=True,
)
training_args = Seq2SeqTrainingArguments(**training_options)

# Wire the model, the tokenized splits, the collator and the metric function
# into the trainer.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_summaries["train"],
    eval_dataset=tokenized_summaries["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Seq2SeqTrainer(**trainer_options)

# Training and upload are left disabled in this notebook.
#trainer.train()
#trainer.push_to_hub()

# Inference

In [None]:
# Inference
from transformers import pipeline

# Summarization pipeline backed by the fine-tuned model published on the Hub.
summarizer = pipeline('summarization', model="natope/qa_1000_samples", device=0)

test_texts = text_summary['test']['text']

# Training inputs were built as `prefix + document` (see preprocess_function),
# so the same prefix is prepended here to keep inference inputs consistent
# with what the model was fine-tuned on.
summaries = [
    summarizer(prefix + text, no_repeat_ngram_size=2, min_length=50, max_length=200)
    for text in test_texts
]

# Each pipeline call returns a list with one dict; keep only the summary text.
generated_summaries = [output[0]['summary_text'] for output in summaries]

# Take exactly as many references as there are generated summaries, so the
# two lists stay aligned whatever the size of the test split (this replaces
# the previous hard-coded slice of 106).
gold_standard = text_summary['test']['summary'][:len(generated_summaries)]

# Calculate scores

In [None]:
# Calculate scores
from bert_score import score as bert_score
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import corpus_bleu
from sklearn.metrics import f1_score

from collections import Counter


def _token_f1(prediction, reference):
    """Whitespace-token overlap F1 between one prediction and one reference."""
    pred_tokens = prediction.split()
    ref_tokens = reference.split()
    # Multiset intersection counts each shared token at most as often as it
    # occurs in both texts.
    overlap = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)


# Convert non-string elements to strings
generated_summaries = [str(summary) for summary in generated_summaries]  # List of generated summaries
reference_summaries = [str(summary) for summary in gold_standard]  # List of reference summaries

# Every metric below pairs the two lists element by element, so fail early
# with a clear message instead of silently truncating or dividing by zero.
if not generated_summaries or len(generated_summaries) != len(reference_summaries):
    raise ValueError(
        "Expected the same non-zero number of generated and reference summaries, got "
        f"{len(generated_summaries)} generated and {len(reference_summaries)} references"
    )
pair_count = len(generated_summaries)

# Compute BERT scores
P, R, F1 = bert_score(generated_summaries, reference_summaries, lang='nl', verbose=False)

# Compute ROUGE scores (RougeScorer.score takes the target first, then the prediction)
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
rouge_scores = [
    scorer.score(reference_summary, generated_summary)
    for generated_summary, reference_summary in zip(generated_summaries, reference_summaries)
]

# Compute BLEU score
bleu_score = corpus_bleu([[ref.split()] for ref in reference_summaries], [gen.split() for gen in generated_summaries])

# Compute F1 score as the mean token-overlap F1 per summary pair. A
# classification F1 over whole summary strings would treat every distinct
# text as its own class label and only measure exact string matches.
f1 = sum(
    _token_f1(generated_summary, reference_summary)
    for generated_summary, reference_summary in zip(generated_summaries, reference_summaries)
) / pair_count

# Access the scores as needed
print("BERT Score - Precision:", P.mean().item())
print("BERT Score - Recall:", R.mean().item())
print("BERT Score - F1:", F1.mean().item())

avg_rouge1 = sum(score['rouge1'].fmeasure for score in rouge_scores) / pair_count
avg_rouge2 = sum(score['rouge2'].fmeasure for score in rouge_scores) / pair_count
avg_rougeL = sum(score['rougeL'].fmeasure for score in rouge_scores) / pair_count

print("ROUGE-1 (Average):", avg_rouge1)
print("ROUGE-2 (Average):", avg_rouge2)
print("ROUGE-L (Average):", avg_rougeL)

print("BLEU Score:", bleu_score)
print("F1 Score:", f1)