<a href="https://colab.research.google.com/github/180030814-GnaneshwarReddy/INFO-Group-9/blob/main/Group_09_Text_summerization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install scikit-learn
!pip install matplotlib
!pip install pandas
!pip install rouge_score
!pip install numpy
!pip install nltk

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=d4219bd5d238920c203693c17ee97ba6bd234bef0a2dca230060e4bb8a075712
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from transformers import T5Tokenizer, T5ForConditionalGeneration, PegasusTokenizer, PegasusForConditionalGeneration, BartTokenizer, BartForConditionalGeneration
from sklearn.feature_extraction.text import TfidfVectorizer
from rouge_score import rouge_scorer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Load the datasets from Google Colab (ensure you've uploaded them)
# NOTE(review): train_file and final_combined_file are the same path, so that
# CSV is read twice below — confirm whether a different file was intended.
train_file = '/content/Combined_Train1000 (2).csv'
generated_file = '/content/Generated_Train1000 (2).csv'
final_combined_file = '/content/Combined_Train1000 (2).csv'

# Load the CSV files
# NOTE(review): train_df and generated_df are not used again in the visible
# part of this cell; only final_combined_df feeds the split below.
train_df = pd.read_csv(train_file)
generated_df = pd.read_csv(generated_file)
final_combined_df = pd.read_csv(final_combined_file)

# Reduce dataset size for faster tests (limit to first 500 rows)
final_combined_df = final_combined_df.head(500)

# Assuming 'article' as input text and 'highlights' as the target text for summarization
X = final_combined_df['article']
y = final_combined_df['highlights']

# Split the data into training and testing sets
# 80/20 split; random_state is fixed so the same rows land in the test set on
# every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Transformer-based Model Summarization (T5, PEGASUS, BART) ---
# Function for summarization
def summarize_model(model, tokenizer, texts, max_len=512, summary_len=150):
    """Summarize a batch of texts with a seq2seq model and its tokenizer.

    Args:
        model: Object exposing ``generate(input_ids, **kwargs)``.
        tokenizer: Callable that tokenizes ``texts`` into a mapping containing
            ``input_ids`` (and normally ``attention_mask``), and that exposes
            ``decode``.
        texts: Text or list of texts to summarize in a single batch.
        max_len: Inputs are truncated to this many tokens.
        summary_len: ``max_length`` passed to ``generate``.

    Returns:
        A list with one decoded summary per generated sequence, or an empty
        list if tokenization, generation or decoding raised (the error is
        printed, not re-raised, so a failing model does not stop the run).
    """
    try:
        inputs = tokenizer(texts, return_tensors="pt", max_length=max_len, truncation=True, padding="longest")
        # The batch is padded to the longest text, so the attention mask must
        # be forwarded; without it the model attends to the pad tokens of the
        # shorter texts.
        summaries = model.generate(
            inputs['input_ids'],
            attention_mask=inputs.get('attention_mask'),
            max_length=summary_len,
            num_beams=2,
            early_stopping=True,
        )
        return [tokenizer.decode(s, skip_special_tokens=True) for s in summaries]
    except Exception as e:
        print(f"Error during summarization: {e}")
        return []

# --- T5 Model ---
# Bind the result list up front: the evaluation step tests `summaries_t5`, and
# without this a load/generation failure below would leave the name undefined.
summaries_t5 = []
try:
    t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')
    t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')

    # Define a summarize function for T5
    def summarize_t5(text_list):
        """Return one T5 summary per text in ``text_list``.

        Each text is prefixed with "summarize: ", encoded with truncation,
        and decoded from the first sequence returned by ``generate``
        (2 beams, ``max_length=50``).
        """
        summaries = []
        for text in text_list:
            input_ids = t5_tokenizer.encode(f"summarize: {text}", return_tensors="pt", truncation=True)
            outputs = t5_model.generate(input_ids, max_length=50, num_beams=2, early_stopping=True)
            summary = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
            summaries.append(summary)
        return summaries

    # Summarize the first 5 texts from the test set
    summaries_t5 = summarize_t5(X_test[:5].tolist())

    print("\n--- T5 Summaries ---")
    for idx, (input_text, summary) in enumerate(zip(X_test[:5], summaries_t5), start=1):
        print(f"Original Text {idx}: {input_text}")
        print(f"T5 Summary {idx}: {summary}\n")
except Exception as e:
    # Best-effort: report the failure and let the other models still run.
    print(f"T5 Model Error: {e}")

# --- PEGASUS Model ---
# Bind the result list up front: the evaluation step tests `summaries_pegasus`,
# and without this a load/generation failure below would leave it undefined.
summaries_pegasus = []
try:
    pegasus_tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')
    pegasus_model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')

    def summarize_pegasus(text_list):
        """Return one PEGASUS summary per text in ``text_list``.

        Each text is tokenized on its own with truncation, generated with the
        model's default generation settings, and decoded from the first
        returned sequence.
        """
        summaries = []
        for text in text_list:
            inputs = pegasus_tokenizer(text, truncation=True, padding='longest', return_tensors="pt")
            # Unpacking passes every tokenizer output (not just input_ids)
            # through to generate.
            outputs = pegasus_model.generate(**inputs)
            summary = pegasus_tokenizer.decode(outputs[0], skip_special_tokens=True)
            summaries.append(summary)
        return summaries

    # Summarize the first 5 texts from the test set
    summaries_pegasus = summarize_pegasus(X_test[:5].tolist())

    print("\n--- PEGASUS Summaries ---")
    for idx, (input_text, summary) in enumerate(zip(X_test[:5], summaries_pegasus), start=1):
        print(f"Original Text {idx}: {input_text}")
        print(f"PEGASUS Summary {idx}: {summary}\n")

except Exception as e:
    # Best-effort: report the failure and let the other models still run.
    print(f"PEGASUS Model Error: {e}")


# --- BART Model ---
# Bind the result list up front: the evaluation step tests `summaries_bart`,
# and without this a load/generation failure below would leave it undefined.
summaries_bart = []
try:
    # Load the BART model and tokenizer
    bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

    def summarize_bart(texts):
        """Return one BART summary per text in ``texts``.

        Each text is encoded (truncated to 1024 tokens), generated with
        4 beams, ``min_length=30``, ``max_length=130`` and
        ``length_penalty=2.0``, and decoded from the first returned sequence.
        """
        summaries = []
        for text in texts:
            # No "summarize: " task prefix here: that is a T5 convention, and
            # BART would treat it as part of the article text.
            inputs = bart_tokenizer.encode(text, return_tensors="pt", max_length=1024, truncation=True)
            summary_ids = bart_model.generate(inputs, max_length=130, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
            summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
            summaries.append(summary)
        return summaries

    # Summarize the first 5 texts from the test set
    summaries_bart = summarize_bart(X_test[:5].tolist())

    print("\n--- BART Summaries ---")
    for idx, (input_text, summary) in enumerate(zip(X_test[:5], summaries_bart), start=1):
        print(f"Original Text {idx}: {input_text}")
        print(f"BART Summary {idx}: {summary}\n")

except Exception as e:
    # Best-effort: report the failure and let the evaluation still run.
    print(f"BART Model Error: {e}")


# --- Evaluation Functions ---
def evaluate_summaries_rouge(references, predictions):
    """Score each prediction against its reference with ROUGE-1, ROUGE-2 and ROUGE-L.

    References and predictions are paired positionally with ``zip``, so any
    surplus items in the longer sequence are ignored. Stemming is enabled.

    Returns:
        A list holding the result of ``RougeScorer.score`` for each pair, in
        input order.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    per_sample = []
    for target, candidate in zip(references, predictions):
        per_sample.append(scorer.score(target, candidate))
    return per_sample

def evaluate_summaries_bleu(references, predictions):
    """Compute a smoothed sentence-level BLEU score for each prediction.

    Both texts are tokenized by whitespace; each prediction is scored against
    its single positionally paired reference.

    Returns:
        A list of BLEU scores, one per (reference, prediction) pair.
    """
    # method1 smoothing keeps short outputs or missing n-grams from collapsing
    # the score.
    smoothing = SmoothingFunction().method1
    return [
        sentence_bleu([target.split()], candidate.split(), smoothing_function=smoothing)
        for target, candidate in zip(references, predictions)
    ]

# Prepare the ground-truth references
# Same five test rows that the model sections summarize (X_test[:5]).
references = y_test[:5].tolist()

# --- Evaluate Transformer Models ---
def evaluate_summaries(summaries_by_model=None):
    """Print per-sample ROUGE scores and the BLEU scores for each model.

    Args:
        summaries_by_model: Optional mapping of display label to a list of
            generated summaries. When omitted, the module-level
            ``summaries_t5``, ``summaries_pegasus`` and ``summaries_bart``
            are used under the labels "T5", "PEGASUS" and "BART".

    Models whose summary list is missing or empty are skipped. Scores are
    computed against the module-level ``references``.
    """
    if summaries_by_model is None:
        # Look the lists up by name rather than referencing them directly: a
        # model section that failed before assigning its list would otherwise
        # raise NameError here and abort the whole evaluation.
        cell_globals = globals()
        summaries_by_model = {
            "T5": cell_globals.get("summaries_t5"),
            "PEGASUS": cell_globals.get("summaries_pegasus"),
            "BART": cell_globals.get("summaries_bart"),
        }

    for label, predictions in summaries_by_model.items():
        if not predictions:
            continue
        rouge_scores = evaluate_summaries_rouge(references, predictions)
        bleu_scores = evaluate_summaries_bleu(references, predictions)
        print(f"\nROUGE Scores for {label}:")
        for sample_no, score in enumerate(rouge_scores, start=1):
            print(f"Sample {sample_no}: {score}")
        print(f"BLEU Scores for {label}:", bleu_scores)

evaluate_summaries()




--- T5 Summaries ---
Original Text 1: Doctors have urged medicine manufacturers to make sweet-tasting Calpol less appealing to children . With 12 million bottles sold each year, Calpol is a medicine cabinet staple in most homes, soothing childhood aches and pains thanks to its main ingredient – paracetamol. But now a leading doctor has warned that children could be at risk of accidental overdose from Calpol and other brightly coloured drugs that ‘look more like milkshake than medicines’. Doctors say that Calpol’s syrup-sweet strawberry taste and pink colouring also makes it ‘almost irresistible’ to some youngsters. ‘Some children go to alarming lengths to get their hands on it while their parents’ backs are turned,’ said one. Health professionals have now urged Calpol’s manufacturers to make the medicine less appealing to youngsters to prevent accidental overdose of paracetamol, which can cause serious liver damage and, in rare cases, prove fatal. The plea comes as figures obtained by

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- PEGASUS Summaries ---
Original Text 1: Doctors have urged medicine manufacturers to make sweet-tasting Calpol less appealing to children . With 12 million bottles sold each year, Calpol is a medicine cabinet staple in most homes, soothing childhood aches and pains thanks to its main ingredient – paracetamol. But now a leading doctor has warned that children could be at risk of accidental overdose from Calpol and other brightly coloured drugs that ‘look more like milkshake than medicines’. Doctors say that Calpol’s syrup-sweet strawberry taste and pink colouring also makes it ‘almost irresistible’ to some youngsters. ‘Some children go to alarming lengths to get their hands on it while their parents’ backs are turned,’ said one. Health professionals have now urged Calpol’s manufacturers to make the medicine less appealing to youngsters to prevent accidental overdose of paracetamol, which can cause serious liver damage and, in rare cases, prove fatal. The plea comes as figures obtain