In [1]:
import zipfile
import os

zip_path = "/content/tokenized_data1.zip"  # Google Colab path
extract_path = "/content/tokenized_data1"      # Destination folder

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# Optional: list files to confirm
os.listdir(extract_path)


['tokenized data']

In [18]:
from transformers import AutoTokenizer, BartForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_from_disk
import torch, gc

# Clean memory
gc.collect()
torch.cuda.empty_cache()

from datasets import load_from_disk, DatasetDict
import os

# === Load each split individually ===
train_dataset = load_from_disk("/content/tokenized_data1/tokenized_data/train")
val_dataset = load_from_disk("/content/tokenized_data1/tokenized_data/validation")
test_dataset = load_from_disk("/content/tokenized_data1/tokenized_data/test")

# === Combine into a single DatasetDict ===
dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
})


# Load BART model and tokenizer
model_name = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.model_max_length = 1024  # Lower max length to fit T4 GPU

model = BartForConditionalGeneration.from_pretrained(model_name)

# Optional: truncate inputs for safety
def truncate_inputs(example):
    example["input_ids"] = example["input_ids"][:1024]
    example["attention_mask"] = example["attention_mask"][:1024]
    return example

dataset = dataset.map(truncate_inputs)

# Define training args
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,  # T4 can handle 2 usually
    gradient_accumulation_steps=2,
    num_train_epochs=2,
    learning_rate=3e-5,
    eval_strategy="steps",
    save_strategy="steps",
    logging_steps=50,
    save_steps=200,
    eval_steps=200,
    save_total_limit=2,
    fp16=True,  # T4 supports mixed precision
    report_to="none"
)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Map:   0%|          | 0/5169 [00:00<?, ? examples/s]

Map:   0%|          | 0/861 [00:00<?, ? examples/s]

Map:   0%|          | 0/862 [00:00<?, ? examples/s]

  trainer = Trainer(


In [19]:
trainer.train()

Step,Training Loss,Validation Loss
200,0.8678,0.605289
400,0.6204,0.526158
600,0.6815,0.493915
800,0.6327,0.470914
1000,0.5472,0.460769
1200,0.5474,0.44637
1400,0.4848,0.443054
1600,0.4797,0.432368
1800,0.4704,0.430104
2000,0.5976,0.423552




TrainOutput(global_step=2586, training_loss=0.7009126592401562, metrics={'train_runtime': 1690.3765, 'train_samples_per_second': 6.116, 'train_steps_per_second': 1.53, 'total_flos': 6303455505285120.0, 'train_loss': 0.7009126592401562, 'epoch': 2.0})

In [20]:
# Save model and tokenizer to a directory
save_directory = "./trained_bart_model"

model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)


('./trained_bart_model/tokenizer_config.json',
 './trained_bart_model/special_tokens_map.json',
 './trained_bart_model/vocab.json',
 './trained_bart_model/merges.txt',
 './trained_bart_model/added_tokens.json',
 './trained_bart_model/tokenizer.json')

In [27]:
pip install transformers datasets rouge_score




In [28]:
from transformers import BartTokenizer, BartForConditionalGeneration
from datasets import load_metric
from torch.utils.data import DataLoader
import torch

# Load model and tokenizer
model_path = "/content/trained_bart_model"
tokenizer = BartTokenizer.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path)
model.eval()

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Sample evaluation data
# Replace this with your actual dataset
eval_texts = [
    "The Eiffel Tower is located in Paris.",
    "Machine learning enables computers to learn from data."
]
eval_summaries = [
    "Eiffel Tower is in Paris.",
    "ML lets computers learn from data."
]

# Tokenize evaluation inputs
inputs = tokenizer(eval_texts, return_tensors="pt", padding=True, truncation=True).to(device)

# Generate predictions
with torch.no_grad():
    summaries_ids = model.generate(
        inputs["input_ids"],
        num_beams=4,
        max_length=50,
        early_stopping=True
    )

# Decode predictions
predictions = tokenizer.batch_decode(summaries_ids, skip_special_tokens=True)
references = [[ref] for ref in eval_summaries]

# Compute ROUGE score
rouge = load_metric("rouge")
results = rouge.compute(predictions=predictions, references=eval_summaries)

# Print results
print("Generated Summaries:")
for i, (inp, pred) in enumerate(zip(eval_texts, predictions)):
    print(f"\nInput {i+1}: {inp}")
    print(f"Predicted: {pred}")
    print(f"Reference: {eval_summaries[i]}")

print("\nROUGE Scores:")
for key in results:
    print(f"{key}: {results[key].mid.fmeasure:.4f}")


  rouge = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Generated Summaries:

Input 1: The Eiffel Tower is located in Paris.
Predicted: The Eiffel Tower is located in Paris.
Reference: Eiffel Tower is in Paris.

Input 2: Machine learning enables computers to learn from data.
Predicted: Machine Learning enables computers to learn from data.
Reference: ML lets computers learn from data.

ROUGE Scores:
rouge1: 0.7024
rouge2: 0.4667
rougeL: 0.7024
rougeLsum: 0.7024


In [32]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

# Load your saved model and tokenizer
model_path = "/content/trained_bart_model"
tokenizer = BartTokenizer.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path)

# Set model to evaluation mode and move to device
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 🔸 Your custom input
custom_input = "I Please close the voting. Announce the results. Nine Ice nine Ice Council Bill 153 has been ordered publish Madam Secretary, put the next item on our screen, which is one I believe is 161. I move that council bill 161 be held in committee and brought back to the floor of council on Monday, March 20"

# Tokenize input
inputs = tokenizer([custom_input], return_tensors="pt", truncation=True, padding=True).to(device)

# Generate output
with torch.no_grad():
    output_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        num_beams=4,
        max_length=50,
        early_stopping=True
    )

# Decode and print result
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("\n🧾 Custom Input:")
print(custom_input)

print("\n📌 Model Output:")
print(generated_text)



🧾 Custom Input:
I Please close the voting. Announce the results. Nine Ice nine Ice Council Bill 153 has been ordered publish Madam Secretary, put the next item on our screen, which is one I believe is 161. I move that council bill 161 be held in committee and brought back to the floor of council on Monday, March 20

📌 Model Output:
A bill for an ordinance approving a proposed Agreement between the City and County of Denver and the Colorado Department of Parks and Recreation to provide for the execution of a proposed Intergovernmental Agreement between Denver and Colorado Parks & Recreation. Approves a contract


In [33]:
!zip -r /content/trained_bart_model.zip /content/trained_bart_model


  adding: content/trained_bart_model/ (stored 0%)
  adding: content/trained_bart_model/merges.txt (deflated 53%)
  adding: content/trained_bart_model/model.safetensors (deflated 7%)
  adding: content/trained_bart_model/vocab.json (deflated 59%)
  adding: content/trained_bart_model/generation_config.json (deflated 47%)
  adding: content/trained_bart_model/tokenizer.json (deflated 82%)
  adding: content/trained_bart_model/config.json (deflated 64%)
  adding: content/trained_bart_model/tokenizer_config.json (deflated 76%)
  adding: content/trained_bart_model/special_tokens_map.json (deflated 52%)
