In [None]:
!pip install datasets transformers torch accelerate rouge


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rouge-1.0.1-py3-none-any.whl (13 kB)
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from datasets import load_dataset
from transformers import BartTokenizerFast, BartForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import DataCollatorForSeq2Seq
import torch
from sklearn.model_selection import train_test_split
from google.colab import drive
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support, accuracy_score
from rouge import Rouge

# Google Drive is mounted first because checkpoints and the final model are written there
drive.mount('/content/drive')

# Step 1: Load the CNN/DailyMail corpus (config "3.0.0"), cached under ./data
print("Loading dataset...")
dataset = load_dataset(
    "cnn_dailymail",
    "3.0.0",
    cache_dir="./data",
    keep_in_memory=False,
)

# Step 2: Keep a fixed, seeded sample of 2000 training articles so a run stays short
print("Reducing dataset size for faster training...")
shuffled_train = dataset['train'].shuffle(seed=42)
small_dataset = shuffled_train.select(range(2000))

# Hold out 20% of the sample for validation (80/20 split, same seed)
small_dataset = small_dataset.train_test_split(test_size=0.2, seed=42)

# Re-wrap the two splits so the held-out part is exposed under the key 'validation'
from datasets import DatasetDict
train_data, validation_data = small_dataset['train'], small_dataset['test']
small_dataset = DatasetDict({'train': train_data, 'validation': validation_data})

# Step 3: Load the BART-base checkpoint together with its fast tokenizer
print("Loading tokenizer and model...")
checkpoint_name = "facebook/bart-base"
tokenizer = BartTokenizerFast.from_pretrained(checkpoint_name)
model = BartForConditionalGeneration.from_pretrained(checkpoint_name)

# Run on CUDA when it is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Step 4: Define Preprocessing Function
def preprocess_function(examples):
    """Tokenize one batch of articles and their reference summaries.

    Args:
        examples: A batch mapping with an 'article' column (source texts) and a
            'highlights' column (reference summaries), as passed by a batched
            ``Dataset.map`` call.

    Returns:
        The tokenizer's encoding of the articles (truncated to 256 tokens) with
        an extra "labels" entry holding the token ids of the summaries
        (truncated to 64 tokens).
    """
    model_inputs = tokenizer(examples['article'], max_length=256, truncation=True)

    # Pass the summaries as `text_target` only. Supplying them as the positional
    # source text as well tokenizes them twice and makes the code read the
    # source-mode ids instead of the target-mode ids.
    labels = tokenizer(text_target=examples['highlights'], max_length=64, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Step 5: Tokenize both splits in batches
print("Preprocessing dataset...")
map_options = dict(
    batched=True,
    # Only the raw 'article' text is dropped; 'highlights' and 'id' are kept
    remove_columns=["article"],
    # Spread the tokenization over 4 worker processes
    num_proc=4,
)
encoded_dataset = small_dataset.map(preprocess_function, **map_options)

# Print the surviving column names of the training split as a sanity check
print(encoded_dataset['train'].column_names)


# Step 6: Collator that assembles batches for the seq2seq model
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Step 7: Define Training Arguments (GPU Optimized)
# Schedule arithmetic (single device): 1600 training examples / (batch size 2 x
# 4 accumulation steps) = 200 optimizer steps per epoch, i.e. 400 steps for the
# 2 epochs. The evaluation, checkpoint and warmup intervals therefore have to
# stay below 400; at 500 no evaluation or checkpoint would ever be triggered and
# the learning-rate warmup would never complete.
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/My Drive/BartResults",  # Save checkpoints to Google Drive
    eval_strategy="steps",  # replaces the deprecated `evaluation_strategy` (transformers >= 4.41)
    eval_steps=100,  # Evaluate 4 times over the 400-step run
    save_strategy="steps",
    save_steps=100,  # Must line up with eval_steps for load_best_model_at_end
    save_total_limit=2,  # Keep only 2 checkpoints on disk
    logging_dir="./logs",
    logging_steps=100,
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # Effective batch size of 8
    num_train_epochs=2,
    warmup_ratio=0.1,  # Warm up over the first 10% of the steps actually run
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # Mixed precision needs CUDA; stay in fp32 on CPU
    dataloader_num_workers=4,
    report_to="none",  # Disable external logging tools
    load_best_model_at_end=True,  # Restore the checkpoint with the lowest eval_loss
    metric_for_best_model="eval_loss",
    predict_with_generate=True,  # Use generate() when predicting
    generation_max_length=64,  # Same cap as the 64-token training labels
)

# Step 8: Initialize Trainer
print("Initializing trainer...")
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation'],
    # `tokenizer=` is deprecated on the trainer and triggers a FutureWarning;
    # `processing_class=` is its replacement (transformers >= 4.46).
    processing_class=tokenizer,
    data_collator=data_collator,
)

# Step 9: Train the Model (GPU)
print("Starting training on GPU...")
trainer.train()

# Step 10: Write the model weights and the tokenizer files to one Drive folder
print("Saving model and tokenizer...")
export_dir = "/content/drive/My Drive/TEXT_SUMMARIZER"
for artifact in (model, tokenizer):
    artifact.save_pretrained(export_dir)

# Step 11: Read both back from that folder, rebinding `model` and `tokenizer`.
# Note: `trainer` was constructed before this rebinding and is not updated here.
print("Loading model and tokenizer for inference...")
model = BartForConditionalGeneration.from_pretrained(export_dir)
tokenizer = BartTokenizerFast.from_pretrained(export_dir)

# Place the reloaded model on CUDA when available, otherwise on the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model.to(device)

# Step 12: Evaluate the Model (Testing)
print("Evaluating the model...")

# Generate summaries for the validation split with an explicit length budget.
# NOTE(review): without `max_length` generation falls back to the model's
# default generation length (presumably 20 tokens for facebook/bart-base -
# confirm), which cuts summaries short and depresses ROUGE recall.
prediction_output = trainer.predict(
    encoded_dataset['validation'],
    max_length=64,  # same cap as the 64-token training labels
    num_beams=4,
)
generated_ids = prediction_output.predictions

# The trainer can pad the stacked predictions with -100, which is not a valid
# token id for decoding; map it to the pad token so it is skipped as a special token.
generated_ids[generated_ids == -100] = tokenizer.pad_token_id
predictions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

# Reference summaries: the raw 'highlights' text kept in the encoded dataset
labels = encoded_dataset['validation']['highlights']

# Calculate ROUGE scores averaged over all prediction/reference pairs
rouge = Rouge()
rouge_scores = rouge.get_scores(predictions, labels, avg=True)
print("ROUGE Scores:", rouge_scores)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading dataset...
Reducing dataset size for faster training...
Loading tokenizer and model...
Preprocessing dataset...
['highlights', 'id', 'input_ids', 'attention_mask', 'labels']
Initializing trainer...
Starting training on GPU...


  trainer = Seq2SeqTrainer(


Step,Training Loss,Validation Loss




Saving model and tokenizer...
Loading model and tokenizer for inference...
Evaluating the model...




ROUGE Scores: {'rouge-1': {'r': 0.13740199227296748, 'p': 0.4278936035369865, 'f': 0.2029979893966867}, 'rouge-2': {'r': 0.0495917574596946, 'p': 0.1827475857475858, 'f': 0.07598784677924506}, 'rouge-l': {'r': 0.12945787874375497, 'p': 0.4040246236606535, 'f': 0.19133293041486327}}


In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

# Reload the fine-tuned summarizer that was saved to Google Drive
model_path = '/content/drive/My Drive/TEXT_SUMMARIZER'
model = BartForConditionalGeneration.from_pretrained(model_path)
tokenizer = BartTokenizer.from_pretrained(model_path)

# Pick CUDA when present, otherwise the CPU, and place the model there
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Ask the user for the text to summarize
inp = input("Enter the Text to summarize: ")

# Encode the text (truncated to at most 1024 tokens) as PyTorch tensors on the model's device
inputs = tokenizer(inp, truncation=True, max_length=1024, return_tensors="pt")
inputs = inputs.to(device)

# Beam-search generation of a 30-150 token summary
summary_ids = model.generate(
    inputs['input_ids'],
    min_length=30,
    max_length=150,
    num_beams=4,
    length_penalty=2.0,
    early_stopping=True,
)

# Turn the first generated sequence back into text, dropping special tokens
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("Summary:", summary)


Enter the Text to summarize: New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 2010 marriage license application, according to court documents. Prosecutors said the marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. Aft

In [None]:
from transformers import pipeline

# Baseline: summarize the same article with the off-the-shelf facebook/bart-large-cnn checkpoint.
import torch  # imported here so this cell also runs on its own in a fresh kernel

# Without an explicit `device` the pipeline keeps the model on the CPU even when
# a GPU is present; use GPU 0 when CUDA is available and the CPU (-1) otherwise.
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=0 if torch.cuda.is_available() else -1,
)

ARTICLE = """ New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York.
A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.
Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other.
In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage.
Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the
2010 marriage license application, according to court documents.
Prosecutors said the marriages were part of an immigration scam.
On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further.
After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective
Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.
All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say.
Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages.
Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted.
The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s
Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali.
Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force.
If convicted, Barrientos faces up to four years in prison.  Her next court appearance is scheduled for May 18.
"""
# Deterministic (no sampling) summary between 30 and 130 tokens long
print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'summary_text': 'Liana Barrientos, 39, is charged with two counts of "offering a false instrument for filing in the first degree" In total, she has been married 10 times, with nine of her marriages occurring between 1999 and 2002. She is believed to still be married to four men.'}]
