In [1]:
!ls

baseline_results	 test_data.parquet   trained_model
nlp_project.ipynb	 test.ipynb	     trained_model_t5-small
results			 test.py	     trained_tokenizer_t5-small
summary_predictions.csv  train_data.parquet  val_data.parquet


In [2]:
import pandas as pd

In [3]:
# load parquet files from drive
train_data = pd.read_parquet('train_data.parquet')
val_data = pd.read_parquet('val_data.parquet')
test_data = pd.read_parquet('test_data.parquet')

In [4]:
import torch
print(torch.cuda.is_available())  # Should return True if GPU is available
print(torch.cuda.device_count())  # Number of GPUs available
print(torch.cuda.get_device_name(0))  # Name of the first GPU

True
2
NVIDIA A100-PCIE-40GB


In [5]:
from datasets import Dataset

# Convert Pandas DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
test_dataset = Dataset.from_pandas(test_data)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Shuffle and select the same random rows each time
small_train_dataset = train_dataset.shuffle(seed=42).select(range(20000))
small_val_dataset = val_dataset.shuffle(seed=42).select(range(4000))
small_test_dataset = test_dataset.shuffle(seed=42).select(range(4000))

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer

# Load BART model and tokenizer
model_name = "facebook/bart-large-xsum"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)



In [8]:
from datasets import Dataset

def preprocess_data(batch):
    inputs = tokenizer(batch['document'], max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
    outputs = tokenizer(batch['summary'], max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    inputs['labels'] = outputs['input_ids']
    return inputs

# Preprocess training data using Hugging Face Dataset map function
tokenized_train = small_train_dataset.map(preprocess_data, batched=True)
tokenized_val = small_val_dataset.map(preprocess_data, batched=True)
tokenized_test = small_test_dataset.map(preprocess_data, batched=True)

Map: 100%|██████████| 20000/20000 [00:38<00:00, 521.96 examples/s]
Map: 100%|██████████| 4000/4000 [00:08<00:00, 488.00 examples/s]
Map: 100%|██████████| 4000/4000 [00:08<00:00, 487.13 examples/s]


In [9]:
print(model.device)
model = model.to('cuda')
print(model.device)

cpu
cuda:0


In [11]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results", # Specifies the directory where model checkpoints, logs, and outputs will be saved during training. Useful for resuming training later or for deployment.
    evaluation_strategy="epoch", #Indicates when evaluation should be performed. epoch: Evaluates at the end of every training epoch. steps: Evaluates every eval_steps (e.g., every 500 steps). no: Skips evaluation
    learning_rate=5e-5, # Sets the learning rate for the optimizer. 5e-5 is a common default for fine-tuning transformer models.
    per_device_train_batch_size=16, # The batch size for training on each device (e.g., per GPU or TPU core). If using 2 GPUs, the effective batch size becomes 2 x num_gpus.
    per_device_eval_batch_size=16, # The batch size for evaluation, handled similarly to training batch size.
    num_train_epochs=3, # The number of complete passes (epochs) through the entire training dataset.
    save_steps=20000, # Saves a checkpoint of the model every 10,000 steps. This is useful for resuming training after interruptions.
    save_total_limit=2, # Limits the number of saved checkpoints. The oldest checkpoints are deleted once the limit is reached.
    fp16=True,  # Enable mixed precision for faster training
    remove_unused_columns=True,
    gradient_accumulation_steps=2,  # Effective batch size = per_device_train_batch_size * gradient_accumulation_steps
    dataloader_num_workers=32,  # Adjust based on available CPU cores
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
0,No log,0.374203
2,0.320000,0.448444




TrainOutput(global_step=936, training_loss=0.24481615245851696, metrics={'train_runtime': 3557.4509, 'train_samples_per_second': 16.866, 'train_steps_per_second': 0.263, 'total_flos': 1.2981823408491725e+17, 'train_loss': 0.24481615245851696, 'epoch': 2.9952})

In [12]:
  
# Save model and tokenizer
model.save_pretrained("./trained_model_bart_large")
tokenizer.save_pretrained("./trained_tokenizer_bart_large")

('./trained_tokenizer_bart_large/tokenizer_config.json',
 './trained_tokenizer_bart_large/special_tokens_map.json',
 './trained_tokenizer_bart_large/vocab.json',
 './trained_tokenizer_bart_large/merges.txt',
 './trained_tokenizer_bart_large/added_tokens.json')

In [13]:
from evaluate import load

metric = load("rouge")

def evaluate_summaries(model, tokenizer, data):
    summaries = []
    for sample in data:
        inputs = tokenizer(sample['document'], return_tensors="pt", truncation=True, max_length=1024)

        # Move inputs to the same device as the model
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        output = model.generate(inputs['input_ids'], max_length=128, num_beams=4, early_stopping=True)
        summaries.append(tokenizer.decode(output[0], skip_special_tokens=True))
    return summaries

# Get predictions
test_summaries = evaluate_summaries(model, tokenizer, small_test_dataset)

# Compute ROUGE
results = metric.compute(predictions=test_summaries, references=small_test_dataset['summary'])
print("ROUGE Scores:", results)

ROUGE Scores: {'rouge1': 0.4207632431386984, 'rouge2': 0.19567476657665828, 'rougeL': 0.3402484112193753, 'rougeLsum': 0.3399655224916929}


In [15]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Specify the path to your saved model directory
model_path = "./saved_model"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("./trained_tokenizer_bart_large")

# Load the model
model = AutoModelForSeq2SeqLM.from_pretrained("./trained_model_bart_large")


In [17]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("./trained_tokenizer_t5-small")

# Load the model
model = AutoModelForSeq2SeqLM.from_pretrained("./trained_model_t5-small")

# Generate predictions for the first 10 rows
predictions = []
for i in range(10):
    # Get the input document
    input_text = tokenized_test[i]["document"]  # Replace "document" with your actual column name

    # Tokenize the input
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Generate the summary
    summary_ids = model.generate(inputs["input_ids"], max_length=55, min_length=5, length_penalty=2.0, num_beams=4)
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Save the generated summary
    predictions.append(generated_summary)

# Retrieve actual documents and reference summaries
actual_documents = [tokenized_val[i]["document"] for i in range(10)]  # Replace "document" with your actual column name
reference_summaries = [tokenized_val[i]["summary"] for i in range(10)]  # Replace "summary" with your actual column name


# Create a DataFrame for better visualization
results_df = pd.DataFrame({
    "Actual Document": actual_documents,
    "Reference Summary": reference_summaries,
    "Generated Summary": predictions
})


from IPython.display import display
display(results_df)


results_df.to_csv("summary_predictions_with_t5-small.csv", index=False)


Unnamed: 0,Actual Document,Reference Summary,Generated Summary
0,"Patrick Joseph Connors, 59, his son Patrick De...",Three family members have been jailed for forc...,A woman who died after a minibus was hit by a ...
1,The visitors were closing in on three points t...,Championship leaders Hibernian twice came from...,A new manor in Dunham Massey has been transfor...
2,"Kamiyah Mobley, who was abducted in July 1998,...",A girl stolen as a newborn from a hospital in ...,Walt Disney World has installed a lighthouse t...
3,The Global Slavery Index 2013 says India has t...,Nearly 30 million people around the world are ...,Northern Ireland is hoping to sell beef and ch...
4,The duo impressed against New Zealand last wee...,Northern Ireland boss Michael O'Neill looks se...,Blackpool have signed Blackpool midfielder Dav...
5,Seven of the group at Pembroke Comprehensive S...,A group of pupils have been treated at a Pembr...,Scotland head coach Mark Strachan says he will...
6,Ms Deacon will succeed Ian McKay when he stand...,Former Scottish health minister Susan Deacon i...,A man has been jailed for four-and-a-half year...
7,El Nacional in the Dominican Republic has now ...,Actor Alec Baldwin's impression on Saturday Ni...,A man has been charged with attempted murder a...
8,Official documents obtained by Reuters news ag...,The US government is concerned it could be imp...,Wakefield Dragons midfielder Aiton has been di...
9,16 May 2016 Last updated at 08:42 BST\nMerafie...,Footage has been released showing the demoliti...,Coventry City Council has met with the club's ...


In [16]:
# Predict summaries for the first 10 documents
predictions = []
documents = tokenized_test["document"][:10]  # Replace "document" with your dataset's input column name

for doc in documents:
    # Tokenize the input document
    inputs = tokenizer(doc, return_tensors="pt", max_length=1024, truncation=True)

    # Generate the summary
    summary_ids = model.generate(inputs["input_ids"], max_length=128, min_length=30, length_penalty=2.0, num_beams=4)
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Store the generated summary
    predictions.append(generated_summary)


# Retrieve actual documents and reference summaries
actual_documents = [tokenized_val[i]["document"] for i in range(10)]  # Replace "document" with your actual column name
reference_summaries = [tokenized_val[i]["summary"] for i in range(10)]  # Replace "summary" with your actual column name


# Create a DataFrame for better visualization
results_df = pd.DataFrame({
    "Actual Document": actual_documents,
    "Reference Summary": reference_summaries,
    "Generated Summary": predictions
})


from IPython.display import display
display(results_df)


results_df.to_csv("summary_predictions_with_bart_large.csv", index=False)


Unnamed: 0,Actual Document,Reference Summary,Generated Summary
0,"Patrick Joseph Connors, 59, his son Patrick De...",Three family members have been jailed for forc...,A woman who was on her way to her hen party wh...
1,The visitors were closing in on three points t...,Championship leaders Hibernian twice came from...,"A 1,000-year-old hall has reopened to the publ..."
2,"Kamiyah Mobley, who was abducted in July 1998,...",A girl stolen as a newborn from a hospital in ...,A lighthouse has been erected at Walt Disney W...
3,The Global Slavery Index 2013 says India has t...,Nearly 30 million people around the world are ...,The agriculture minister has said she hopes to...
4,The duo impressed against New Zealand last wee...,Northern Ireland boss Michael O'Neill looks se...,Blackpool have signed Leicester City defender ...
5,Seven of the group at Pembroke Comprehensive S...,A group of pupils have been treated at a Pembr...,"Gordon Strachan insists there is ""no getting a..."
6,Ms Deacon will succeed Ian McKay when he stand...,Former Scottish health minister Susan Deacon i...,A man jailed for filming a sex offence in Dund...
7,El Nacional in the Dominican Republic has now ...,Actor Alec Baldwin's impression on Saturday Ni...,A man from Ireland has appeared in court in co...
8,Official documents obtained by Reuters news ag...,The US government is concerned it could be imp...,Newport Gwent Dragons half-back Brett Ferres h...
9,16 May 2016 Last updated at 08:42 BST\nMerafie...,Footage has been released showing the demoliti...,Coventry City Football Club has asked the city...
