# Model Training and Evaluation Notebook

This notebook trains and evaluates the neural summarization models. It covers loading the data, training the language model or the summarizer, generating summaries, and evaluating them with ROUGE scores.

## Prerequisites
- Ensure the environment is set up with required packages (see environment.yml or requirements.txt).
- Data should be preprocessed and available in data/processed/.
- Models will be saved in outputs/models/.

## Required Steps to Run the Notebook

1. **Update Conda**: In the first code cell, uncomment the `conda update` line and run the cell to update conda to the latest version.
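2. **Create and Check Conda Environment**: In the second code cell, uncomment the `conda env create` line to create the conda environment (text-summarizer), then run the cell to check that the environment exists and is active. **Note:** You must select this environment as your Kernel in Jupyter after creating it.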
3. **Import Libraries**: Run the third code cell to import all necessary libraries and download NLTK data.
4. **Load Configurations**: Run the fourth cell to load dataset and hyperparameter configurations. Customize parameters as needed (e.g., batch size, epochs, mode).
5. **Build Dataloaders**: Run the fifth cell to build train, valid, and test dataloaders.
6. **Initialize Procedure**: Run the sixth cell to set up the training procedure with TensorBoard if enabled.
7. **Train the Model**: Run the seventh cell to train either the language model ('lm') or summarizer ('summ'). This may take time depending on epochs.
8. **Generate Summaries**: Run the eighth cell to generate summaries on the test set and save them to CSV.
9. **Evaluate**: Run the ninth cell to compute ROUGE scores against reference summaries.

Ensure the data is preprocessed before running. The environment.yml handles all installations, including pip dependencies.

In [None]:
# Update conda to the latest version (recommended to avoid installation issues).
# The update command is left commented out so that running this cell has no side effects;
# uncomment the following line and re-run the cell to actually perform the update.
# !conda update -n base -c defaults conda

# As written, this cell only prints a reminder.
print('Update conda if prompted, then proceed to create the environment.')

In [None]:
# Create Conda environment (text-summarizer)
# This will install all dependencies from environment.yml
# !conda env create -f ../environment.yml

# NOTE: You cannot activate a conda environment for the *current* notebook using '!conda activate'.
# You must restart the kernel and select 'text-summarizer' from the Jupyter Kernel dropdown menu.

import sys
import subprocess
from pathlib import Path

env_name = 'text-summarizer'


def _listed_env_names(listing):
    """Return the environment names found in the text output of `conda env list`.

    Each non-comment line contributes its first field (the environment name) and the
    last path component of its final field (the environment prefix), so environments
    that are listed by path only are still recognised by their directory name.
    """
    names = set()
    for line in listing.splitlines():
        fields = line.split()
        if not fields or fields[0].startswith('#'):
            continue
        names.add(fields[0])
        names.add(Path(fields[-1]).name)
    return names


# Check if environment exists.
# Names are compared exactly: a substring test would also accept e.g. "text-summarizer-old".
try:
    result_list = subprocess.run(['conda', 'env', 'list'], capture_output=True, text=True)
    conda_listing = result_list.stdout if result_list.returncode == 0 else ''
except OSError:
    # conda is not installed or not on PATH; report that instead of crashing the cell.
    result_list = None
    conda_listing = ''
    print('WARNING: Could not run "conda env list"; is conda installed and on PATH?')
env_exists = env_name in _listed_env_names(conda_listing)

# Check if currently running in the correct environment.
# Only the last component of the interpreter prefix is compared, so a parent directory
# that merely contains the environment name does not count as a match.
current_env_path = sys.prefix
is_correct_env = Path(current_env_path).name == env_name

if is_correct_env:
    print(f'SUCCESS: Currently running in the "{env_name}" environment.')
elif env_exists:
    print(f'WARNING: Environment "{env_name}" exists but is NOT active. Please switch the Jupyter Kernel to "{env_name}".')
else:
    print(f'ERROR: Environment "{env_name}" does not exist. Please run "!conda env create..." above.')

In [None]:
# Import necessary libraries
import pandas as pd
import time
import argparse
import re
import numpy as np
from rouge_score import rouge_scorer
import string
import nltk
from nltk.corpus import stopwords
import sys
sys.path.append('..')

# Personal libraries
from configs.config import DatasetConfig, HP
from data.DataLoader import build_dataloader
from utils.Errors import loss_estimation
from Procedures import Procedure
from model.lm import LanguageModel
from torch.utils.tensorboard import SummaryWriter
from configs.config import follow

# Download NLTK data if needed: the stop-word list and the POS tagger model used for ROUGE tokenization.
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
# Newer NLTK releases load the English tagger from 'averaged_perceptron_tagger_eng' instead of the
# legacy resource above; fetching both keeps nltk.pos_tag working regardless of the installed version.
nltk.download('averaged_perceptron_tagger_eng')

stop_words = set(stopwords.words('english'))

In [None]:
# Load configurations: start from the project defaults, then override selected fields below.
ds_config = DatasetConfig()
hp = HP()

# Customize parameters here
ds_config.batch_size = 32  # Batch size passed to every dataloader built from ds_config
hp.lm_epochs = 10  # Number of epochs for LM training
hp.summarizer_epochs = 10  # Number of epochs for summarizer training

# Model options
mode = 'summ'  # 'lm' for language model, 'summ' for summarizer
model_name = 'my_model'  # Name for saving the model; also used in the summaries CSV file name
lm_path = None  # Path to pretrained LM if training summarizer; forwarded unchanged to train_summarizer

# Override data paths to correct relative paths from notebooks/ directory
# NOTE(review): training reads 'train.mini.csv' while valid/test use the dated full files —
# presumably a reduced training subset for quick runs; confirm before a full training run.
ds_config.train_data = '../data/processed/train.mini.csv'
ds_config.valid_data = '../data/processed/20221204_amazon_reviews_valid.csv'
ds_config.test_data = '../data/processed/20221204_amazon_reviews_test.csv'

print('Configurations loaded.')

In [None]:
# Build dataloaders
def build_dataloaders(ds_config, dataset="train", vocab=None):
    """Build the dataloader for one split of the dataset.

    Args:
        ds_config: Dataset configuration. The attributes read here are the split path
            (``train_data``, ``valid_data`` or ``test_data``), ``vocab_size``,
            ``min_freq``, ``max_num_reviews``, ``max_len_rev``, ``pin_memory``,
            ``workers``, ``batch_size`` and ``device``.
        dataset: Split to load: ``"train"``, ``"valid"`` or ``"test"``.
        vocab: Vocabulary forwarded to ``build_dataloader`` for the valid and test
            splits. It is ignored for the train split, which always passes
            ``vocab=None`` and returns the vocabulary produced by ``build_dataloader``.

    Returns:
        ``(train_iter, vocab)`` for ``"train"``, ``valid_iter`` for ``"valid"`` and
        ``(test_iter, test_references)`` for ``"test"``.

    Raises:
        ValueError: If ``dataset`` is not one of the supported splits. Without this
            check an unknown split would return ``None`` and only fail later, when
            the caller unpacks the result.
    """
    if dataset not in ("train", "valid", "test"):
        raise ValueError(
            f"Unknown dataset split {dataset!r}; expected 'train', 'valid' or 'test'."
        )

    is_train = dataset == "train"
    # The test split caps the number of reviews at 15 regardless of the configured value.
    max_num_reviews = 15 if dataset == "test" else ds_config.max_num_reviews

    data_iter, built_vocab, references = build_dataloader(
        file_path=getattr(ds_config, f"{dataset}_data"),
        vocab_size=ds_config.vocab_size,
        vocab_min_freq=ds_config.min_freq,
        vocab=None if is_train else vocab,
        is_train=is_train,
        # Only the training batches are shuffled; evaluation order stays fixed.
        shuffle_batch=is_train,
        max_num_reviews=max_num_reviews,
        refs_path=None,
        max_len_rev=ds_config.max_len_rev,
        pin_memory=ds_config.pin_memory,
        num_workers=ds_config.workers,
        batch_size=ds_config.batch_size,
        preprocess=True,
        device=ds_config.device,
    )

    if is_train:
        return data_iter, built_vocab
    if dataset == "valid":
        return data_iter
    return data_iter, references

# Load data
# The train split is built first because it returns the vocabulary that the
# valid and test splits receive through the `vocab` argument.
train_iter, vocab = build_dataloaders(ds_config, dataset="train")
valid_iter = build_dataloaders(ds_config, dataset="valid", vocab=vocab)
test_iter, test_references = build_dataloaders(ds_config, dataset="test", vocab=vocab)

print('Dataloaders built successfully.')

In [None]:
# TensorBoard logging is opt-in through the `follow` settings; when it is off,
# the procedure is created without a writer.
writer = None
if follow["writer"]:
    comment = follow["Name"]
    writer = SummaryWriter(comment=comment)

# Hand the hyperparameters, dataset configuration, vocabulary and the
# train/valid iterators to the training procedure.
procedure = Procedure(
    hp,
    ds_config,
    vocab,
    writer=writer,
    train_iter=train_iter,
    valid_iter=valid_iter,
)

print('Procedure initialized.')

In [None]:
# Train the model selected by `mode`; any other value is reported and nothing is trained.
if mode not in ('lm', 'summ'):
    print('Invalid mode. Choose lm or summ.')
elif mode == 'lm':
    print('Training Language Model...')
    procedure.train_lm(
        model_name=model_name,
        tolerance=3,
        check_every=5,
    )
    print('Language Model training completed.')
else:
    print('Training Summarizer...')
    procedure.train_summarizer(
        model_name=model_name,
        lm_path=lm_path,
        tolerance=3,
        check_every=5,
    )
    print('Summarizer training completed.')

In [None]:
# Generate summaries for the test iterator with the model saved under `model_name`.
print('Generating summaries...')
summaries = procedure.generate_summaries(itr=test_iter, model_name=model_name)

# Save summaries
def save_summaries(summaries, output_path, model_name):
    """Write the generated summaries to a CSV file and return them as a DataFrame.

    Args:
        summaries: Iterable of entries where ``entry[0]`` is the product id and
            ``entry[1][0]`` is the summary text to store for that product.
        output_path: Destination CSV path. Missing parent directories are created.
        model_name: Model name recorded in the ``model_name`` column and used to
            build the ``model_path`` column.

    Returns:
        A DataFrame with the columns ``model_path``, ``model_name``, ``prod_id`` and
        ``summary``, one row per entry. The columns are present even when
        ``summaries`` is empty, so the written file always has a header row.
    """
    from pathlib import Path  # local import: the notebook's import cell does not provide pathlib

    columns = ["model_path", "model_name", "prod_id", "summary"]
    rows = [
        {
            "model_path": f"outputs/models/summ/{model_name}",
            "model_name": model_name,
            "prod_id": entry[0],
            "summary": entry[1][0],
        }
        for entry in summaries
    ]
    df = pd.DataFrame(rows, columns=columns)

    # to_csv does not create directories, so make sure the target folder exists first.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    return df

# Destination of the summaries CSV; the same path is read back by the evaluation cell.
# NOTE(review): this path is relative to the current directory ('./'), whereas the data
# paths in the configuration cell use '../' — confirm this is the intended location.
output_path = f'./outputs/summaries/{model_name}_summaries.csv'
summary_df = save_summaries(summaries, output_path, model_name)
print(f'Summaries saved to {output_path}')

In [None]:
# Evaluation with ROUGE scores
# POS tags whose tokens are dropped before ROUGE scoring.
_INVALID_POS = frozenset([
    "CC", "CD", "DT", "EX", "IN", "LS", "PDT", "POS", "PRP", "PRP$", "RP", "TO", "WDT", "WP", "WRB",
])

# Runs of ASCII punctuation (string.punctuation already includes the backslash) or the
# Unicode ellipsis. The ellipsis is written as an escape so the pattern cannot be
# corrupted by a file-encoding mix-up.
_PUNCT_RE = re.compile(f"[{re.escape(string.punctuation)}\u2026]+")


def tokenize(sentence):
    """Split a sentence into the content tokens used for ROUGE scoring.

    Punctuation runs are replaced by a space, the text is split on whitespace and
    POS-tagged with ``nltk.pos_tag``, and tokens are dropped when their lowercase form
    is in ``stop_words`` or their tag is one of the excluded POS tags.

    Args:
        sentence: Text to tokenize.

    Returns:
        List of the remaining tokens, in their original order and casing.
    """
    cleaned = _PUNCT_RE.sub(" ", sentence)
    tagged = nltk.pos_tag(cleaned.split())
    return [
        tok
        for tok, pos in tagged
        if tok.lower() not in stop_words and pos not in _INVALID_POS
    ]

# Scorer for ROUGE-1, ROUGE-2 and ROUGE-L, shared by get_scores below.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
# Reference summaries; get_scores joins them on "prod_id" and reads "summ_1".."summ_3".
# NOTE(review): path is relative to the current directory — confirm it resolves from notebooks/.
refs = pd.read_csv("./eval/reference_summaries.csv")

def get_scores(path, refs):
    """Compute mean ROUGE scores of generated summaries against reference summaries.

    For every product, the generated summary is scored against each of its three
    references; the three results are averaged per product and the per-product
    averages are then averaged over all products.

    Args:
        path: CSV file with at least the columns ``prod_id`` and ``summary``.
        refs: DataFrame with the columns ``prod_id``, ``summ_1``, ``summ_2`` and
            ``summ_3``. Only products present in both inputs are scored.

    Returns:
        Dict keyed by ``"rouge1"``, ``"rouge2"`` and ``"rougeL"``; each value is a dict
        with ``"precision"``, ``"recall"`` and ``"fscore"`` as floats rounded to
        5 decimals.

    Raises:
        ValueError: If no product id is shared between the summaries and ``refs``.
    """
    metrics = ("rouge1", "rouge2", "rougeL")
    ref_columns = ("summ_1", "summ_2", "summ_3")

    df = pd.read_csv(path)
    df = pd.merge(df, refs, how="inner", on="prod_id")
    if df.empty:
        raise ValueError(
            f"No product ids in common between {path!r} and the reference summaries."
        )
    # A missing generated summary is scored as an empty string.
    df["summary"] = df["summary"].fillna("")

    all_scores = []
    # Keep the first row per product, matching a lookup of the first match per prod_id.
    for _, row in df.drop_duplicates(subset="prod_id").iterrows():
        # Tokenize the generated summary once; it is the same for all three references.
        generated = " ".join(tokenize(row["summary"]))

        prod_scores = []
        for column in ref_columns:
            reference = " ".join(tokenize(row[column]))
            # RougeScorer.score takes (target, prediction): the reference comes first.
            # With the arguments reversed, precision and recall are swapped.
            scores = scorer.score(reference, generated)
            prod_scores.append([
                [scores[m].precision, scores[m].recall, scores[m].fmeasure]
                for m in metrics
            ])

        # Average over the three references -> array of shape (metric, measure).
        all_scores.append(np.array(prod_scores).mean(0))

    # Average over products.
    mean_scores = np.array(all_scores).mean(0)

    return {
        metric: {
            "precision": round(float(mean_scores[i][0]), 5),
            "recall": round(float(mean_scores[i][1]), 5),
            "fscore": round(float(mean_scores[i][2]), 5),
        }
        for i, metric in enumerate(metrics)
    }

# Compute scores for the summaries CSV written by the generation cell, using the loaded references.
scores = get_scores(output_path, refs)
print('ROUGE Scores:', scores)