In [1]:
# ===============================
# NEWS CLASSIFICATION & CLUSTERING
# (Supervised + Unsupervised)
# ===============================

import os
import re
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.cluster import KMeans
import joblib
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = " ".join([w for w in text.split() if w not in STOPWORDS])
    return text
def train_supervised(json_path, text_col, label_col, out_dir="./artifacts"):
    os.makedirs(out_dir, exist_ok=True)
    df = pd.read_json(json_path, lines=True)
    print("Dataset Loaded:", df.shape)
    if "short_description" in df.columns:
        df[text_col] = (df[text_col].astype(str) + " " + df["short_description"].astype(str))

    df[text_col] = df[text_col].astype(str).apply(clean_text)
    X_train, X_test, y_train, y_test = train_test_split(
        df[text_col], df[label_col], test_size=0.2, random_state=42
    )
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    clf = LogisticRegression(max_iter=200)
    clf.fit(X_train_vec, y_train)
    y_pred = clf.predict(X_test_vec)
    acc = accuracy_score(y_test, y_pred)
    print("\n Supervised Classification Results:")
    print("Accuracy:", acc)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    joblib.dump(clf, os.path.join(out_dir, "supervised_model.pkl"))
    joblib.dump(vectorizer, os.path.join(out_dir, "tfidf.pkl"))

    return clf, vectorizer
def scrape_bbc(max_articles=50):
    base_url = "https://www.bbc.com/news"
    r = requests.get(base_url)
    soup = BeautifulSoup(r.text, "lxml")

    links = [a["href"] for a in soup.select("a") if a.get("href", "").startswith("/news")]
    links = list(set(["https://www.bbc.com" + l for l in links]))[:max_articles]

    articles = []
    for url in links:
        try:
            res = requests.get(url, timeout=5)
            sp = BeautifulSoup(res.text, "lxml")
            title = sp.find("h1")
            paras = sp.find_all("p")
            if not title or not paras:
                continue
            text = " ".join([p.get_text() for p in paras])
            articles.append({"url": url, "title": title.get_text(), "text": clean_text(text)})
        except:
            continue

    df = pd.DataFrame(articles)
    print(f"✅ Scraped {len(df)} BBC articles")
    return df
def build_topic_clusters(df, out_dir="./artifacts", n_clusters=8):
    os.makedirs(out_dir, exist_ok=True)
    vectorizer = TfidfVectorizer(max_features=5000)
    X = vectorizer.fit_transform(df["text"])
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    df["cluster"] = kmeans.fit_predict(X)

    joblib.dump(kmeans, os.path.join(out_dir, "cluster_model.pkl"))
    joblib.dump(vectorizer, os.path.join(out_dir, "cluster_tfidf.pkl"))
    df.to_csv(os.path.join(out_dir, "bbc_clusters.csv"), index=False)
    return df
def predict_both(text, out_dir="./artifacts"):
    text_clean = clean_text(text)
    clf = joblib.load(os.path.join(out_dir, "supervised_model.pkl"))
    tfidf = joblib.load(os.path.join(out_dir, "tfidf.pkl"))
    vec = tfidf.transform([text_clean])
    supervised_pred = clf.predict(vec)[0]
    kmeans = joblib.load(os.path.join(out_dir, "cluster_model.pkl"))
    tfidf2 = joblib.load(os.path.join(out_dir, "cluster_tfidf.pkl"))
    vec2 = tfidf2.transform([text_clean])
    cluster_pred = kmeans.predict(vec2)[0]

    return {"category": supervised_pred, "cluster": int(cluster_pred)}
if __name__ == "__main__":
    clf, vec = train_supervised(
        json_path="News_Category_Dataset_v3.json",   # <-- UPDATED FILE NAME
        text_col="headline",
        label_col="category",
        out_dir="./artifacts"
    )
    df_bbc = scrape_bbc(max_articles=20)
    clusters = build_topic_clusters(df_bbc, out_dir="./artifacts", n_clusters=6)
    print(clusters.head())
    result = predict_both(
        "The finance minister announced a new tax policy impacting banks and traders.",
        out_dir="./artifacts"
    )
    print("\nPrediction:\n", result)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\BRU\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Dataset Loaded: (209527, 6)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



✅ Supervised Classification Results:
Accuracy: 0.5734023767479597

Classification Report:
                 precision    recall  f1-score   support

          ARTS       0.34      0.16      0.21       293
ARTS & CULTURE       0.32      0.11      0.16       275
  BLACK VOICES       0.45      0.28      0.35       889
      BUSINESS       0.47      0.44      0.45      1216
       COLLEGE       0.42      0.29      0.35       202
        COMEDY       0.53      0.38      0.44      1022
         CRIME       0.51      0.51      0.51       713
CULTURE & ARTS       0.57      0.22      0.32       202
       DIVORCE       0.79      0.65      0.71       664
     EDUCATION       0.46      0.28      0.35       209
 ENTERTAINMENT       0.52      0.74      0.61      3419
   ENVIRONMENT       0.63      0.18      0.28       313
         FIFTY       0.27      0.05      0.08       263
  FOOD & DRINK       0.58      0.69      0.63      1270
     GOOD NEWS       0.46      0.13      0.21       270
         GR