In [1]:
# Mount Google Drive at /content/drive; the training CSV is read from a path
# under this mount point later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Install Libraries

In [2]:
!pip install transformers datasets torch newspaper3k lxml_html_clean
!pip install nltk

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting lxml_html_clean
  Downloading lxml_html_clean-0.4.2-py3-none-any.whl.metadata (2.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)


### Import Libraries

In [3]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
import pandas as pd
import nltk
from newspaper import Article
from nltk.tokenize import sent_tokenize
import numpy as np
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# sent_tokenize needs the Punkt sentence models. Older NLTK releases read them
# from the "punkt" package, while recent releases (3.9+) look for "punkt_tab"
# and raise LookupError if only "punkt" is present, so fetch both.
# nltk.download() reports an unknown package name without raising, so asking
# for "punkt_tab" on an older NLTK is harmless.
for punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(punkt_resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Load & Preprocess Your Data

In [4]:
# Read the training CSV and give its three columns fixed names.
df = pd.read_csv("/content/drive/MyDrive/NLP/Project/Dataset/train.csv")
df.columns = ['class_id', 'title', 'description']

# Numeric class ids in the CSV -> human-readable category names.
category_map = {1: 'World', 2: 'Sports', 3: 'Business', 4: 'Sci/Tech'}

# Build the model input (title and description joined by ". ", with missing
# values treated as empty strings) and the readable label in one step.
df = df.assign(
    text=lambda frame: frame['title'].fillna('') + ". " + frame['description'].fillna(''),
    label=lambda frame: frame['class_id'].map(category_map),
)

# Integer-encode the category names; the fitted encoder is kept so that
# predictions can be mapped back to names at inference time.
label_encoder = LabelEncoder()
df['label_id'] = label_encoder.fit_transform(df['label'])

# Only the text and its integer label are handed to the BERT pipeline.
dataset = Dataset.from_pandas(df[['text', 'label_id']])


### Tokenize Text for BERT

In [5]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize a batch of examples for BERT.

    Reads the batch's "text" column and returns the tokenizer output, with
    every example truncated and padded to exactly 512 tokens.
    """
    encoding_options = dict(padding="max_length", truncation=True, max_length=512)
    return tokenizer(examples["text"], **encoding_options)


# Tokenize in batches, then expose the target column under the name "labels".
tokenized_dataset = (
    dataset
    .map(tokenize_function, batched=True)
    .rename_column("label_id", "labels")
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

### Train BERT Classifier

In [None]:
# Fresh 4-way classification head on top of the pretrained encoder.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)

# Hold out 10% of the examples for evaluation. Evaluating on the very data the
# model is trained on makes the per-epoch validation loss meaningless as a
# measure of generalisation; the fixed seed keeps the split reproducible.
train_eval_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

training_args = TrainingArguments(
    output_dir="/content/bert_news",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=1,
    # Disables W&B and every other logging integration; replaces the
    # deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_eval_split["train"],
    eval_dataset=train_eval_split["test"],
)

trainer.train()


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss


### Save and Load BERT Model

In [None]:
# Write the fine-tuned weights and the tokenizer files into one directory.
model_dir = "bert_news_classifier"
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# Reload both from that directory, replacing the in-memory objects.
tokenizer = BertTokenizer.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)


### Prediction Pipeline for URL

In [None]:
def fetch_article(url):
    """Download and parse the article at ``url``.

    Returns:
        A ``(title, text)`` tuple taken from the parsed article.
    """
    page = Article(url)
    page.download()
    page.parse()
    headline, body = page.title, page.text
    return headline, body

def summarize(text, top_n=2, max_words=40):
    """Build a short extractive summary of ``text``.

    Sentences are scored by running PageRank over a graph whose edge weights
    are the TF-IDF cosine similarities between sentences. The best-scoring
    sentences are then chosen, subject to the limits below, and returned in
    their original document order.

    Args:
        text: The article body.
        top_n: Maximum number of sentences in the summary.
        max_words: Word budget for the summary (whitespace-separated words).

    Returns:
        The summary as a single space-joined string. If ``text`` has at most
        ``top_n`` sentences they are all returned unchanged (the word budget
        is not applied in that case). If no sentence fits the budget, the
        best-scoring sentence is returned truncated to ``max_words`` words.
    """
    sentences = sent_tokenize(text)
    if len(sentences) <= top_n:
        return ' '.join(sentences)

    tfidf = TfidfVectorizer().fit_transform(sentences)
    sim_matrix = cosine_similarity(tfidf)
    scores = nx.pagerank(nx.from_numpy_array(sim_matrix))

    # Sentence indices from best to worst score; ties go to the earlier
    # sentence so the result is deterministic.
    by_rank = sorted(range(len(sentences)), key=lambda i: (-scores[i], i))

    # Walk the sentences in *rank* order so the scores actually decide which
    # ones are kept (the previous version iterated in document order, which
    # made the ranking irrelevant).
    chosen = []
    total_words = 0
    for idx in by_rank:
        if len(chosen) >= top_n or total_words >= max_words:
            break
        word_count = len(sentences[idx].split())
        if total_words + word_count <= max_words:
            chosen.append(idx)
            total_words += word_count

    if not chosen and top_n > 0:
        # Every sentence is longer than the budget: rather than return an
        # empty summary, fall back to the best sentence cut to the budget.
        return ' '.join(sentences[by_rank[0]].split()[:max(max_words, 0)])

    # Present the selected sentences in the order they appear in the text.
    return ' '.join(sentences[i] for i in sorted(chosen))

def infer_news_bert(url):
    """Classify and summarise the news article at ``url``.

    The article is fetched with ``fetch_article``, its title and body are
    joined with ". " and classified by the module-level ``model`` /
    ``tokenizer``, and the body alone is passed to ``summarize``.

    Args:
        url: Address of the article to process.

    Returns:
        A dict with keys ``"title"``, ``"category"`` (the label name decoded
        by ``label_encoder``) and ``"summary"``.
    """
    title, text = fetch_article(url)
    full_text = title + ". " + text

    # Put the inputs on whatever device the model currently lives on, so the
    # function works for a model on the GPU as well as one on the CPU.
    device = next(model.parameters()).device
    encoded = tokenizer(full_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only: switch off dropout and skip gradient bookkeeping.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax is monotonic, so the arg-max of the logits is the predicted class.
    predicted = torch.argmax(outputs.logits, dim=-1).item()
    label = label_encoder.inverse_transform([predicted])[0]

    return {
        "title": title,
        "category": label,
        "summary": summarize(text)
    }


### Run the Model

In [None]:
# End-to-end check on one live article: fetch, classify, summarise, print.
url = "https://timesofindia.indiatimes.com/india/g7-summit-pm-modi-calls-for-global-action-on-terrorism-highlights-smuggling-and-repression-top-quotes/articleshow/121920189.cms"
result = infer_news_bert(url)

# (heading, result key) pairs, printed in this order.
report_layout = (
    ("📰 Title:\n", 'title'),
    ("\n📂 Predicted Category:", 'category'),
    ("\n📝 Summary:\n", 'summary'),
)
for heading, key in report_layout:
    print(heading, result[key])


### Save Model & Tokenizer

In [None]:
# Write the model and tokenizer files to the directory that gets zipped below.
export_dir = "/content/bert_news_classifier"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

### Zip the Saved Model

In [None]:
# Recursively archive the bert_news_classifier directory into
# bert_news_classifier.zip. Both paths are relative, so they resolve against
# the notebook's current working directory.
!zip -r bert_news_classifier.zip bert_news_classifier

### Download the Zip

In [None]:
# Colab helper that downloads the archive through the browser.
# The relative path resolves against the current working directory.
from google.colab import files
files.download("bert_news_classifier.zip")