In [None]:
!pip install -q transformers sentencepiece tqdm pandas spacy geopy python-dotenv kaggle
!pip install -q torch
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m93.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
!pip install kaggle




In [1]:
from google.colab import files
files.upload()


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"rosiepark","key":"52727c8eb54666c8b450d9f97b632b51"}'}

In [2]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [3]:
!kaggle datasets download -d gowrishankarp/newspaper-text-summarization-cnn-dailymail


Dataset URL: https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail
License(s): CC0-1.0
Downloading newspaper-text-summarization-cnn-dailymail.zip to /content
 98% 495M/503M [00:01<00:00, 374MB/s]
100% 503M/503M [00:01<00:00, 470MB/s]


In [4]:
!unzip newspaper-text-summarization-cnn-dailymail.zip


Archive:  newspaper-text-summarization-cnn-dailymail.zip
  inflating: cnn_dailymail/test.csv  
  inflating: cnn_dailymail/train.csv  
  inflating: cnn_dailymail/validation.csv  


In [5]:
import pandas as pd
df = pd.read_csv("/content/cnn_dailymail/train.csv")
df.head()

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [6]:
import pandas as pd
df = pd.read_csv("/content/cnn_dailymail/train.csv")  #preprocessing
print("Columns available in the DataFrame:", df.columns)
df = df.dropna(subset=['article'])
df['article'] = df['article'].astype(str).str.strip()
df['article'] = df['article'].apply(lambda x: x[:2000])
df = df.reset_index(drop=True)
df.to_csv("/content/train_preprocessed.csv", index=False)

print("Preprocessing complete. Shape:", df.shape)

Columns available in the DataFrame: Index(['id', 'article', 'highlights'], dtype='object')
Preprocessing complete. Shape: (287113, 3)


In [7]:
df2=pd.read_csv('/content/train_preprocessed.csv') #viewing preprocessed data
df2.head()

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [8]:
%%bash
cat > summarization_geo_pipeline.py <<'PY'


import argparse
import json
import os
import shelve
from typing import List, Dict, Optional

import pandas as pd
from tqdm import tqdm
from transformers import pipeline
import spacy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter



# Summarizer class

class Summarizer:
    def __init__(self, model_name: str):
        self.summarizer = pipeline("summarization", model=model_name)

    def summarize_batch(self, texts: List[str], max_length: int = 150, batch_size: int = 8) -> List[str]:
        results = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i : i + batch_size]
            try:
                outs = self.summarizer(batch, max_length=max_length, truncation=True)
                results.extend([o['summary_text'] for o in outs])
            except Exception as e:
                print(f"Error summarizing batch {i}: {e}")
                results.extend([""] * len(batch))
        return results



# GeoExtractor class

class GeoExtractor:
    def __init__(self, cache_path: str = "geo_cache"):
        self.nlp = spacy.load("en_core_web_sm")
        self.geolocator = Nominatim(user_agent="geo_pipeline")
        self.geocode = RateLimiter(self.geolocator.geocode, min_delay_seconds=1)
        self.cache = shelve.open(cache_path)

    def extract_places(self, text: str) -> List[str]:
        doc = self.nlp(text)
        return [ent.text for ent in doc.ents if ent.label_ in ["GPE", "LOC"]]

    def geocode_place(self, place_name: str) -> Optional[Dict]:
        key = place_name.lower().strip()
        if key in self.cache:
            return self.cache[key]
        try:
            loc = self.geocode(place_name)
            result = {"name": place_name, "lat": loc.latitude, "lon": loc.longitude} if loc else None
        except Exception as e:
            print(f"Geocoding error for {place_name}: {e}")
            result = None
        self.cache[key] = result
        self.cache.sync()
        return result

    def geocode_list(self, places: List[str]) -> List[Dict]:
        results = []
        for p in places:
            res = self.geocode_place(p)
            if res:
                results.append(res)
        return results

    def close(self):
        self.cache.close()



# Utility

def load_data(input_csv: str, text_column: str, sample: Optional[int] = None):
    df = pd.read_csv(input_csv)
    if sample:
        df = df.sample(n=sample, random_state=42).reset_index(drop=True)
    return df



# Pipeline

def run_pipeline(
    input_csv: str,
    output_csv: str,
    text_column: str = "article",
    title_column: Optional[str] = "title",
    model_name: str = "facebook/bart-large-cnn",
    batch_size: int = 8,
    max_length: int = 150,
    sample: Optional[int] = None,
):
    df = load_data(input_csv, text_column=text_column, sample=sample)
    summarizer = Summarizer(model_name=model_name)
    geo = GeoExtractor()

    texts = df[text_column].fillna("").tolist()
    titles = df[title_column].fillna("") if title_column and title_column in df.columns else [""] * len(df)
    ids = df['id'].tolist() if 'id' in df.columns else list(range(len(df)))

    summaries = summarizer.summarize_batch(texts, max_length=max_length, batch_size=batch_size)

    out_rows = []
    for i, doc_id in enumerate(tqdm(ids, desc="Processing docs")):
        text = texts[i]
        title = titles[i] if isinstance(titles, list) else titles.iloc[i]
        summary = summaries[i] if i < len(summaries) else ''
        places = geo.extract_places(text + '\n' + summary + '\n' + title)
        geocoded = geo.geocode_list(places)
        primary = geocoded[0] if geocoded else None

        out = {
            "id": doc_id,
            "title": title,
            "original_text": text,
            "summary": summary,
            "places_found": json.dumps(places, ensure_ascii=False),
            "geocoded_all": json.dumps(geocoded, ensure_ascii=False),
            "primary_lat": primary['lat'] if primary else None,
            "primary_lon": primary['lon'] if primary else None,
            "primary_place": primary['name'] if primary else None,
        }
        out_rows.append(out)

    out_df = pd.DataFrame(out_rows)
    out_df.to_csv(output_csv, index=False)

    geo.close()
    print(f"Saved results to {output_csv}")



# CLI

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True, help="Input CSV path")
    parser.add_argument("--output", required=True, help="Output CSV path")
    parser.add_argument("--text-column", default="article", help="Column name for article text")
    parser.add_argument("--title-column", default="title", help="Column name for title (optional)")
    parser.add_argument("--model", default="facebook/bart-large-cnn", help="HuggingFace seq2seq model")
    parser.add_argument("--batch-size", type=int, default=8)
    parser.add_argument("--max-length", type=int, default=150)
    parser.add_argument("--sample", type=int, default=None, help="Sample N rows for quick testing")
    args = parser.parse_args()

    run_pipeline(
        args.input,
        args.output,
        text_column=args.text_column,
        title_column=args.title_column,
        model_name=args.model,
        batch_size=args.batch_size,
        max_length=args.max_length,
        sample=args.sample,
    )
PY


In [9]:
!python summarization_geo_pipeline.py \
  --input /content/train_preprocessed.csv \
  --output /content/out_sample1000.csv \
  --sample 1000 \
  --model "facebook/bart-large-cnn" \
  --batch-size 2 \
  --max-length 200

2025-09-26 15:14:50.801144: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758899691.042914    1202 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758899691.111085    1202 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1758899691.600925    1202 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1758899691.600962    1202 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1758899691.600966    1202 computation_placer.cc:177] computation placer alr

In [10]:
!ls /content


cnn_dailymail					sample_data
geo_cache.db					summarization_geo_pipeline.py
newspaper-text-summarization-cnn-dailymail.zip	train_preprocessed.csv
out_sample1000.csv


In [11]:
!pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=38130f52283f84e05f7a131487e1562de722c9db75b4db30eef281ade77f36fc
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [12]:
!pip install -q evaluate rouge-score

import evaluate
import pandas as pd

# ---- CONFIG ----
INPUT_CSV = "/content/out_sample1000.csv"  # our generated summaries CSV
REFERENCE_COL = "original_text"           # ground-truth summaries
PREDICTION_COL = "summary"                # model-generated summaries


# Load CSV
df = pd.read_csv(INPUT_CSV)

# Clean and remove empty rows
df[REFERENCE_COL] = df[REFERENCE_COL].astype(str).str.strip()
df[PREDICTION_COL] = df[PREDICTION_COL].astype(str).str.strip()
df = df[(df[REFERENCE_COL] != "") & (df[PREDICTION_COL] != "")].reset_index(drop=True)

if len(df) == 0:
    raise ValueError("No rows left with both reference and prediction.")

references = df[REFERENCE_COL].tolist()
predictions = df[PREDICTION_COL].tolist()

rouge = evaluate.load("rouge")


results = rouge.compute(predictions=predictions, references=references, use_stemmer=True)


from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)

r1_p, r1_r, r1_f = [], [], []
r2_p, r2_r, r2_f = [], [], []
rL_p, rL_r, rL_f = [], [], []

for ref, pred in zip(references, predictions):
    sc = scorer.score(ref, pred)
    r1_p.append(sc['rouge1'].precision)
    r1_r.append(sc['rouge1'].recall)
    r1_f.append(sc['rouge1'].fmeasure)
    r2_p.append(sc['rouge2'].precision)
    r2_r.append(sc['rouge2'].recall)
    r2_f.append(sc['rouge2'].fmeasure)
    rL_p.append(sc['rougeL'].precision)
    rL_r.append(sc['rougeL'].recall)
    rL_f.append(sc['rougeL'].fmeasure)

n = len(df)
print("\nAverage ROUGE scores (percentages):")
print(f"ROUGE-1 → Precision: {sum(r1_p)/n*100:.2f}%, Recall: {sum(r1_r)/n*100:.2f}%, F1: {sum(r1_f)/n*100:.2f}%")
print(f"ROUGE-2 → Precision: {sum(r2_p)/n*100:.2f}%, Recall: {sum(r2_r)/n*100:.2f}%, F1: {sum(r2_f)/n*100:.2f}%")
print(f"ROUGE-L → Precision: {sum(rL_p)/n*100:.2f}%, Recall: {sum(rL_r)/n*100:.2f}%, F1: {sum(rL_f)/n*100:.2f}%")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]


Average ROUGE scores (percentages):
ROUGE-1 → Precision: 98.88%, Recall: 16.93%, F1: 28.72%
ROUGE-2 → Precision: 89.33%, Recall: 15.14%, F1: 25.73%
ROUGE-L → Precision: 90.29%, Recall: 15.50%, F1: 26.29%


In [13]:

!pip install bert-score



Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [14]:
import pandas as pd
from bert_score import score

# caluclating bert score!
df_pred = pd.read_csv("/content/out_sample1000.csv")       # Generated summaries
df_ref = pd.read_csv("/content/train_preprocessed.csv")   # Reference summaries


n = min(len(df_pred), len(df_ref))
cands = df_pred['summary'].astype(str).head(n).tolist()
refs = df_ref['highlights'].astype(str).head(n).tolist()


P, R, F1 = score(cands, refs, lang="en", rescale_with_baseline=False)


print(f"BERTScore → Precision: {P.mean().item()*100:.2f}%, "
      f"Recall: {R.mean().item()*100:.2f}%, "
      f"F1: {F1.mean().item()*100:.2f}%")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore → Precision: 81.24%, Recall: 80.84%, F1: 81.03%


In [15]:
import pandas as pd


input_csv = "/content/out_sample1000.csv"
output_csv = "/content/out_sample1000_final1.csv"


df = pd.read_csv(input_csv)

#dropping title column at last , if user wants a title column he can skip this cell and next cell(title col was included in order to provide customization to the user needs)!!
drop_cols = ["title"]
df = df.drop(columns=[c for c in drop_cols if c in df.columns])


df = df.dropna(how="all")


if "summary" in df.columns and "original_text" in df.columns:
    df = df.dropna(subset=["summary", "original_text"])

df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.to_csv(output_csv, index=False)

print(f"Cleaned CSV saved to {output_csv}")
print("Final shape:", df.shape)
print("Columns left:", df.columns.tolist())


Cleaned CSV saved to /content/out_sample1000_final1.csv
Final shape: (1000, 8)
Columns left: ['id', 'original_text', 'summary', 'places_found', 'geocoded_all', 'primary_lat', 'primary_lon', 'primary_place']


In [16]:
df=pd.read_csv('/content/out_sample1000_final1.csv') #viewing final csv(after removing title coulmn)
df.head()

Unnamed: 0,id,original_text,summary,places_found,geocoded_all,primary_lat,primary_lon,primary_place
0,ed0fed726929c1eeabe6c390e47128dbb7d7a055,By . Mia De Graaf . Britons flocked to beaches...,Temperatures soared to 17C in Brighton and Dor...,"[""Brighton"", ""Brighton beach"", ""the south coas...","[{""name"": ""Brighton"", ""lat"": 50.8214626, ""lon""...",50.821463,-0.140056,Brighton
1,023cd84001b33aed4ff0f3f5ecb0fdd2151cf543,A couple who weighed a combined 32st were sham...,"Margaret Gibson, 37, and her husband, James, 4...",[],[],,,
2,6a70a0d8d3ed365fe1df6d35f1587a8b9b298618,Video footage shows the heart stopping moment ...,A 17-year-old boy was bitten on the hand by a ...,"[""south coast"", ""Sydney"", ""Manly beach"", ""Moll...","[{""name"": ""south coast"", ""lat"": 33.7040161, ""l...",33.704016,-117.880437,south coast
3,b37204c13ea38b511265e41ac69fb12acfb63f85,"Istanbul, Turkey (CNN) -- About 250 people rac...","NEW: ""They just came to the border post and wa...","[""Istanbul"", ""Turkey"", ""Turkey"", ""Hatay"", ""Syr...","[{""name"": ""Istanbul"", ""lat"": 41.006381, ""lon"":...",41.006381,28.975872,Istanbul
4,c24e5805afd5145bc48410e876db91d44a06be5e,By . Daily Mail Reporter . PUBLISHED: . 12:53 ...,The Aurora Australis had been cracking through...,"[""Antarctic sea"", ""Antarctic sea"", ""Australia""...","[{""name"": ""Antarctic sea"", ""lat"": -49.1969648,...",-49.196965,70.228956,Antarctic sea


In [17]:
import gradio as gr
from transformers import pipeline
import spacy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import shelve



# Summarizer class
class Summarizer:
    def __init__(self, model_name: str = "facebook/bart-large-cnn"):
        self.summarizer = pipeline("summarization", model=model_name)

    def summarize_text(self, text: str, max_length: int = 150) -> str:
        try:
            out = self.summarizer(text, max_length=max_length, truncation=True)
            return out[0]["summary_text"]
        except Exception as e:
            print(f"Summarization error: {e}")
            return ""


# GeoExtractor class
class GeoExtractor:
    def __init__(self, cache_path: str = "geo_cache"):
        self.nlp = spacy.load("en_core_web_sm")
        self.geolocator = Nominatim(user_agent="geo_pipeline")
        self.geocode = RateLimiter(self.geolocator.geocode, min_delay_seconds=1)
        self.cache = shelve.open(cache_path)

    def extract_places(self, text: str):
        doc = self.nlp(text)
        return [ent.text for ent in doc.ents if ent.label_ in ["GPE", "LOC"]]

    def geocode_place(self, place_name: str):
        key = place_name.lower().strip()
        if key in self.cache:
            return self.cache[key]
        try:
            loc = self.geocode(place_name)
            result = {"name": place_name, "lat": loc.latitude, "lon": loc.longitude} if loc else None
        except Exception as e:
            print(f"Geocoding error for {place_name}: {e}")
            result = None
        self.cache[key] = result
        self.cache.sync()
        return result

    def geocode_list(self, places):
        results = []
        for p in places:
            res = self.geocode_place(p)
            if res:
                results.append(res)
        return results

    def close(self):
        self.cache.close()


# Pipeline function for Gradio

def pipeline_model(text: str):
    summarizer = Summarizer()
    geo = GeoExtractor()

    # Summarize
    summary = summarizer.summarize_text(text)

    # Extract places
    places = geo.extract_places(text + "\n" + summary)

    # Geocode
    geocoded = geo.geocode_list(places)
    geo.close()

    # Formatting
    locations = [p["name"] for p in geocoded]
    latlongs = [f"{p['lat']}, {p['lon']}" for p in geocoded]

    return summary, ", ".join(locations), "; ".join(latlongs)



# Gradio Interface

demo = gr.Interface(
    fn=pipeline_model,
    inputs=gr.Textbox(
        label="Enter Text",
        lines=12,
        placeholder="Paste your text here..."
    ),
    outputs=[
        gr.Textbox(label="Summarized Text", lines=8),
        gr.Textbox(label="Location Names", lines=4),
        gr.Textbox(label="Latitudes & Longitudes", lines=4)
    ],
    title="Text Summarizer & Location Extractor",
    description="Enter text and get summary, location names, and their latitude-longitude values."
)

if __name__ == "__main__":
    demo.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e69599b08cd76071b6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
