In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Load dataset
df = pd.read_csv('/content/data_news.csv')

# Show first few rows
df.head()

Unnamed: 0,category,headline,links,short_description,keywords
0,WELLNESS,143 Miles in 35 Days: Lessons Learned,https://www.huffingtonpost.com/entry/running-l...,Resting is part of training. I've confirmed wh...,running-lessons
1,WELLNESS,Talking to Yourself: Crazy or Crazy Helpful?,https://www.huffingtonpost.com/entry/talking-t...,Think of talking to yourself as a tool to coac...,talking-to-yourself-crazy
2,WELLNESS,Crenezumab: Trial Will Gauge Whether Alzheimer...,https://www.huffingtonpost.com/entry/crenezuma...,The clock is ticking for the United States to ...,crenezumab-alzheimers-disease-drug
3,WELLNESS,"Oh, What a Difference She Made",https://www.huffingtonpost.com/entry/meaningfu...,"If you want to be busy, keep trying to be perf...",meaningful-life
4,WELLNESS,Green Superfoods,https://www.huffingtonpost.com/entry/green-sup...,"First, the bad news: Soda bread, corned beef a...",green-superfoods


In [4]:
# Combine text fields (headline + short_description + keywords)
df['text'] = df['headline'].fillna('') + ' ' + df['short_description'].fillna('') + ' ' + df['keywords'].fillna('')

In [5]:
# Drop unused columns
df = df[['category', 'text']]

In [6]:
# Drop missing or empty values
df.dropna(inplace=True)
df = df[df['text'].str.strip() != '']

In [7]:
# Text Cleaning Function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [8]:
# Apply cleaning
df['clean_text'] = df['text'].apply(clean_text)

In [9]:
# Stopwords and Lemmatization
stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [10]:
def preprocess(text):
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop]
    return ' '.join(words)

In [11]:
df['processed_text'] = df['clean_text'].apply(preprocess)

In [12]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['processed_text'])

# Target
y = df['category']

In [13]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "SVM": LinearSVC()
}

# Training and Evaluation
for name, model in models.items():
    print(f"\n{name}")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, preds))
    print("Classification Report:\n", classification_report(y_test, preds))


Logistic Regression
Accuracy: 0.7987
Classification Report:
                 precision    recall  f1-score   support

      BUSINESS       0.73      0.78      0.76       955
 ENTERTAINMENT       0.77      0.78      0.77       985
  FOOD & DRINK       0.85      0.82      0.84      1021
     PARENTING       0.78      0.76      0.77      1030
      POLITICS       0.79      0.74      0.77      1034
        SPORTS       0.86      0.89      0.88       995
STYLE & BEAUTY       0.86      0.85      0.85       986
        TRAVEL       0.83      0.80      0.82      1008
      WELLNESS       0.73      0.76      0.74      1009
    WORLD NEWS       0.79      0.81      0.80       977

      accuracy                           0.80     10000
     macro avg       0.80      0.80      0.80     10000
  weighted avg       0.80      0.80      0.80     10000


Naive Bayes
Accuracy: 0.7831
Classification Report:
                 precision    recall  f1-score   support

      BUSINESS       0.71      0.73     

In [20]:
# Save trained models
trained_models = {
    "Logistic Regression": models["Logistic Regression"],
    "Naive Bayes": models["Naive Bayes"],
    "SVM": models["SVM"]
}

# Prediction Function
def predict_category(user_input, chosen_model):
    # Clean and preprocess input
    cleaned = clean_text(user_input)
    processed = preprocess(cleaned)

    # Transform with TF-IDF
    vectorized = tfidf.transform([processed])

    # Predict
    model = trained_models.get(chosen_model)
    if model:
        prediction = model.predict(vectorized)
        print(f"\nPredicted Category: {prediction[0]}")
    else:
        print("Invalid model choice.")

# Single input from user
print("\nEnter a news article to classify:")
user_text = input("Enter text (headline/description/keywords): ")

print("\nChoose a model for prediction:")
print("1. Logistic Regression")
print("2. Naive Bayes")
print("3. SVM")

choice = input("Enter 1, 2, or 3: ")

model_map = {
    "1": "Logistic Regression",
    "2": "Naive Bayes",
    "3": "SVM"
}

selected_model = model_map.get(choice)

if selected_model:
    predict_category(user_text, selected_model)
else:
    print("Invalid model selection.")



Enter a news article to classify:
Enter text (headline/description/keywords): Amazon reports record-breaking quarterly revenue driven by strong e-commerce sales and growth in its cloud computing division, AWS. Analysts project continued expansion as the company invests heavily in logistics and AI infrastructure.

Choose a model for prediction:
1. Logistic Regression
2. Naive Bayes
3. SVM
Enter 1, 2, or 3: 3

Predicted Category: BUSINESS
