In [None]:
# Mount Google Drive at /content/drive so that files stored under MyDrive
# (the training CSV is read from there later in this notebook) are accessible.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 📌 Step 1: Install Required Libraries
!pip install newspaper3k scikit-learn nltk networkx joblib
!pip install lxml_html_clean
!pip install newspaper

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collec

In [None]:
# 📌 Step 2: Import All Libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
import string
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from joblib import dump, load
from newspaper import Article
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Fetch the NLTK resources this notebook relies on: the Punkt tokenizer data
# ('punkt' and 'punkt_tab') used by word_tokenize / sent_tokenize, and the
# stop-word corpus used by stopwords.words('english').
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# 📌 Step 3: Preprocessing
# English stop words and a Porter stemmer, created once and shared by clean_text().
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


def clean_text(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    The input is coerced to ``str`` and lower-cased, split with NLTK's
    ``word_tokenize``, reduced to purely alphabetic tokens that are not in
    ``stop_words``, and every surviving token is stemmed with ``stemmer``.

    Args:
        text: Any value; it is converted with ``str()`` before processing.

    Returns:
        The stemmed tokens joined by single spaces (empty string if none survive).
    """
    words = word_tokenize(str(text).lower())
    stems = [
        stemmer.stem(word)
        for word in words
        if word.isalpha() and word not in stop_words
    ]
    return ' '.join(stems)


In [None]:
# 📌 Step 4: Load the News Dataset
# 📍 The CSV on Google Drive must contain exactly three columns, in this order:
#    a numeric class id (1-4), the article title and the article description.
#    (This looks like the AG News train.csv layout - confirm against your copy.)
DATASET_PATH = "/content/drive/MyDrive/NLP/Project/Dataset/train.csv"

df = pd.read_csv(DATASET_PATH)
# Rename for convenience
df.columns = ['class_id', 'title', 'description']

# Map class IDs to labels
category_map = {
    1: 'World',
    2: 'Sports',
    3: 'Business',
    4: 'Sci/Tech'
}
df['category'] = df['class_id'].map(category_map)

# Series.map() yields NaN for ids that are missing from category_map. Fail here
# with a clear message instead of letting NaN labels reach the classifier.
unmapped_ids = df.loc[df['category'].isna(), 'class_id'].unique().tolist()
if unmapped_ids:
    raise ValueError(f"Dataset contains class ids without a label: {unmapped_ids}")

# Combine title and description (missing values become empty strings), then
# build the stemmed, stop-word-free text the classifier is trained on.
df['text'] = df['title'].fillna('') + ". " + df['description'].fillna('')
df['cleaned'] = df['text'].apply(clean_text)

df[['category', 'text']].head()


Unnamed: 0,category,text
0,Business,Wall St. Bears Claw Back Into the Black (Reute...
1,Business,Carlyle Looks Toward Commercial Aerospace (Reu...
2,Business,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,Business,Iraq Halts Oil Exports from Main Southern Pipe...
4,Business,"Oil prices soar to all-time record, posing new..."


In [None]:
# Preview the first rows, including the derived 'category', 'text' and 'cleaned' columns.
df.head()

Unnamed: 0,class_id,title,description,category,text,cleaned
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",Business,Wall St. Bears Claw Back Into the Black (Reute...,wall bear claw back black reuter reuter wall s...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,Business,Carlyle Looks Toward Commercial Aerospace (Reu...,carlyl look toward commerci aerospac reuter re...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,Business,Oil and Economy Cloud Stocks' Outlook (Reuters...,oil economi cloud stock outlook reuter reuter ...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,Business,Iraq Halts Oil Exports from Main Southern Pipe...,iraq halt oil export main southern pipelin reu...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...",Business,"Oil prices soar to all-time record, posing new...",oil price soar record pose new menac us econom...


In [None]:
# 📌 Step 5: Train a Classifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Hold out 20% of the cleaned articles for evaluation; the fixed seed keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned'],
    df['category'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features (vocabulary capped at 5000 terms) feeding a multinomial
# Naive Bayes classifier.
NB_Model = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(max_features=5000)),
        ('nb', MultinomialNB()),
    ]
)

# fit() returns the fitted pipeline, so prediction can be chained onto it.
y_pred = NB_Model.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

# Persist the fitted pipeline for the inference step; dump() returns the list
# of files it wrote, which the cell displays.
dump(NB_Model, "news_classifier.joblib")


              precision    recall  f1-score   support

    Business       0.85      0.86      0.85      5911
    Sci/Tech       0.88      0.86      0.87      6075
      Sports       0.94      0.97      0.96      6058
       World       0.91      0.88      0.90      5956

    accuracy                           0.89     24000
   macro avg       0.89      0.89      0.89     24000
weighted avg       0.89      0.89      0.89     24000



['news_classifier.joblib']

In [None]:
# 📌 Step 6: Extractive Summarization (PageRank over sentence similarity)
def summarize(text, top_n=2, max_words=40):
    """Build an extractive summary from the highest-ranked sentences of ``text``.

    Sentences are scored by running PageRank on a graph whose edge weights are
    the cosine similarities between the sentences' TF-IDF vectors. Sentences are
    then taken in descending score order while they fit into the word budget,
    and the chosen ones are emitted in their original document order.

    Args:
        text: The document to summarise.
        top_n: Maximum number of sentences in the summary.
        max_words: Maximum total number of whitespace-separated words across the
            selected sentences. A sentence that would exceed the budget is
            skipped in favour of the next-ranked one.

    Returns:
        The selected sentences joined by single spaces. If ``text`` has at most
        ``top_n`` sentences they are all returned as-is (``max_words`` is not
        applied in that case). The result is an empty string when ``text`` has
        no sentences or when no sentence fits into ``max_words``.
    """
    sentences = sent_tokenize(text)
    if len(sentences) <= top_n:
        return ' '.join(sentences[:top_n])

    try:
        tfidf = TfidfVectorizer().fit_transform(sentences)
    except ValueError:
        # TfidfVectorizer raises ValueError when no sentence yields a usable
        # token (empty vocabulary). Without scores, fall back to equal ranks,
        # which makes the selection below prefer the leading sentences.
        scores = {i: 0.0 for i in range(len(sentences))}
    else:
        sim_matrix = cosine_similarity(tfidf)
        nx_graph = nx.from_numpy_array(sim_matrix)
        scores = nx.pagerank(nx_graph)

    # Best-scored sentence first. The sort is stable, so equally scored
    # sentences keep their document order.
    by_score = sorted(range(len(sentences)), key=lambda i: scores[i], reverse=True)

    chosen = []
    total_words = 0
    for idx in by_score:
        word_count = len(sentences[idx].split())
        if total_words + word_count <= max_words:
            chosen.append(idx)
            total_words += word_count
        if len(chosen) >= top_n or total_words >= max_words:
            break

    # Restore the natural reading order of the selected sentences.
    return ' '.join(sentences[i] for i in sorted(chosen))


In [None]:
# 📌 Step 7: Scraper + Inference Pipeline
def fetch_article(url):
    """Download and parse the page at ``url`` with newspaper's ``Article``.

    Args:
        url: Address of the news article to retrieve.

    Returns:
        A ``(title, text)`` tuple read from the parsed article.
    """
    page = Article(url)
    page.download()
    page.parse()
    headline, body = page.title, page.text
    return headline, body

def infer_news_category_and_summary(url):
    """Classify and summarise the news article found at ``url``.

    The saved pipeline is loaded from ``news_classifier.joblib``, the article is
    fetched with ``fetch_article``, the title and body are cleaned with
    ``clean_text`` and classified, and the raw body is summarised with
    ``summarize``.

    Args:
        url: Address of the news article.

    Returns:
        A dict with the keys ``"title"``, ``"category"`` and ``"summary"``.
    """
    # joblib.load unpickles the file, so only load model files you trust.
    model = load("news_classifier.joblib")
    headline, body = fetch_article(url)

    model_input = clean_text(headline + ". " + body)
    predictions = model.predict([model_input])

    return dict(
        title=headline,
        category=predictions[0],
        summary=summarize(body),
    )


In [None]:
# 📌 Step 8: Run the Pipeline on Live News URL
# Example article to run through the full fetch -> classify -> summarise pipeline.
url = "https://www.thehindu.com/news/international/israel-iran-conflict-live-updates-missile-strike-khamenei-trump-june-21-2025/article69720103.ece"
result = infer_news_category_and_summary(url)

# Show the three fields of the returned dict.
print("📰 Title:\n", result['title'])
print("\n📂 Predicted Category:", result['category'])
print("\n📝 Summary:\n", result['summary'])


📰 Title:
 Israel-Iran conflict highlights Netanyahu says Israel ‘moved closer to goals’ in Iran after U.S. bombing

📂 Predicted Category: World

📝 Summary:
 “Everybody heard those names for years as they built this horribly destructive enterprise. “Our objective was the destruction of Iran’s nuclear enrichment capacity and a stop to the nuclear threat posed by the world’s number one state sponsor of terror.


### Downloading the Trained Model

In [None]:
# from joblib import dump, load
# dump(NB_Model, "news_classifier.joblib")

In [None]:
# from google.colab import files
# files.download("news_classifier.joblib")

### Deploying Using Gradio

In [None]:
#!pip install gradio

### UI 1

In [None]:
# import gradio as gr

# def classify_and_summarize(url):
#     result = infer_news_category_and_summary(url)
#     return result['title'], result['category'], result['summary']

# gr.Interface(
#     fn=classify_and_summarize,
#     inputs=gr.Textbox(label="Enter News URL"),
#     outputs=[
#         gr.Textbox(label="Title"),
#         gr.Textbox(label="Predicted Category"),
#         gr.Textbox(label="Summary")
#     ],
#     title="📰 News Classifier & Summarizer"
# ).launch(share=True)
