### MOUNT GOOGLE DRIVE

In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### INSTALL REQUIRED LIBRARIES

In [20]:
!pip install newspaper3k scikit-learn nltk networkx joblib
!pip install lxml_html_clean
!pip install newspaper

Collecting newspaper
  Downloading newspaper-0.1.0.7.tar.gz (176 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.9/176.9 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


### IMPORT LIBRARIES

In [21]:
import requests
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from joblib import dump, load
from newspaper import Article
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### TEXT PREPROCESSING

In [22]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    text = str(text).lower()
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t.isalpha()]
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [stemmer.stem(t) for t in tokens]
    return ' '.join(tokens)

### LOAD FILTERED DATASET

In [23]:
df = pd.read_csv('/content/drive/MyDrive/NLP/Project/Dataset/Filtered_News_Category.csv')
df = df[['text', 'category']]
df.dropna(inplace=True)
df['cleaned'] = df['text'].apply(clean_text)

### TRAIN CLASSIFIER

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['cleaned'], df['category'], test_size=0.2, random_state=42)

model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('nb', MultinomialNB())
])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))
dump(model, "news_classifier_huffpost.joblib")

               precision    recall  f1-score   support

         ARTS       0.94      0.05      0.10       307
     BUSINESS       0.70      0.31      0.43      1234
       COMEDY       0.70      0.24      0.35      1064
        CRIME       0.77      0.54      0.63       726
    EDUCATION       1.00      0.03      0.06       192
ENTERTAINMENT       0.68      0.81      0.74      3494
  ENVIRONMENT       0.82      0.12      0.21       291
 FOOD & DRINK       0.78      0.73      0.76      1224
HOME & LIVING       0.89      0.58      0.70       961
     POLITICS       0.72      0.94      0.81      7184
      SCIENCE       0.87      0.23      0.36       445
       SPORTS       0.83      0.53      0.65      1017
         TECH       0.80      0.21      0.33       404
       TRAVEL       0.70      0.78      0.74      1893
     WELLNESS       0.67      0.88      0.76      3519
   WORLD NEWS       0.78      0.27      0.41       661

     accuracy                           0.71     24616
    macr

['news_classifier_huffpost.joblib']

### SUMMARIZER FUNCTION

In [25]:
def summarize(text, top_n=2, max_words=40):
    sentences = sent_tokenize(text)
    if len(sentences) <= top_n:
        return ' '.join(sentences[:top_n])

    tfidf = TfidfVectorizer().fit_transform(sentences)
    sim_matrix = cosine_similarity(tfidf)
    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)

    ranked = sorted(((scores[i], s, i) for i, s in enumerate(sentences)), reverse=True)
    summary = []
    total_words = 0

    for _, sentence, idx in sorted(ranked[:len(sentences)], key=lambda x: x[2]):
        word_count = len(sentence.split())
        if total_words + word_count <= max_words:
            summary.append(sentence)
            total_words += word_count
        if len(summary) >= top_n or total_words >= max_words:
            break

    return ' '.join(summary)


### FETCH + INFER

In [26]:
def fetch_article(url):
    article = Article(url)
    article.download()
    article.parse()
    return article.title, article.text

def infer_news_category_and_summary(url):
    clf = load("news_classifier_huffpost.joblib")
    title, text = fetch_article(url)
    cleaned = clean_text(title + ". " + text)
    category = clf.predict([cleaned])[0]
    summary = summarize(text)
    return {
        "title": title,
        "category": category,
        "summary": summary
    }

### SAMPLE URL

In [32]:
url = "https://timesofindia.indiatimes.com/sports/cricket/ind-vs-eng-live-score-today-cricket-match-india-national-cricket-team-vs-england-cricket-team-1st-test-match-day-3-scorecard/liveblog/122003050.cms"
result = infer_news_category_and_summary(url)

print("📰 Title:\n", result['title'])
print("\n📂 Predicted Category:", result['category'])
print("\n📝 Summary:\n", result['summary'])

📰 Title:
 IND vs ENG Live Score, 1st Test Match Day 3: Jasprit Bumrah and co eye for early breakthrough

📂 Predicted Category: TRAVEL

📝 Summary:
 The next ODI World Cup is set to take place in South Africa, Zimbabwe, and Namibia. By then, Kohli will be 38 years old, and Rohit will be approaching 40.
