MSE-2   ________

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [6]:
# Read the news-article dataset into a DataFrame.
# NOTE(review): the path is absolute (looks like a Colab "/content" upload);
# confirm the file exists there before running on another machine.
df = pd.read_csv(filepath_or_buffer="/content/news_articles.csv")

In [18]:
# Build the model input by joining title and keywords with a space.
# A missing value in either column would turn the whole concatenation into
# NaN, and TfidfVectorizer refuses NaN documents, so missing fields are
# treated as empty strings before joining.
df["text"] = df["title"].fillna("") + " " + df["keywords"].fillna("")

# Give each distinct category an integer id, numbered in order of first
# appearance in the data, and keep the inverse mapping so predicted ids can
# be turned back into category names later.
categories = df["category"].unique()
category_to_id = {cat: idx for idx, cat in enumerate(categories)}
id_to_category = {idx: cat for cat, idx in category_to_id.items()}
df["label"] = df["category"].map(category_to_id)

In [19]:
# Hold out 30% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in both partitions, and the fixed seed makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.3,
    random_state=42,
    stratify=df["label"],
)

# Learn the vocabulary and IDF weights from the training texts only, then
# encode the held-out texts with that same fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [20]:
# Fit a logistic-regression classifier on the TF-IDF training matrix.
# fit() returns the estimator itself, so the fitted model is bound directly.
model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Label ids and their matching category names, in the same (insertion) order,
# for use when the classification report is produced.
labels = [*category_to_id.values()]
target_names = [*category_to_id.keys()]

# Predicted label ids for the held-out test texts.
y_pred = model.predict(X_test_vec)

In [21]:
# Predict on new example
def predict_category(title, keywords):
    """Return the category name predicted for a single article.

    The title and keywords are joined with one space (the same layout used
    for the training text), encoded with the module-level ``vectorizer``,
    classified with the module-level ``model``, and the predicted label id
    is translated back to a name through ``id_to_category``.

    Args:
        title: Article title string.
        keywords: Keyword string for the article.

    Returns:
        The category name mapped to the predicted label id.
    """
    features = vectorizer.transform([" ".join((title, keywords))])
    label_id = model.predict(features)[0]
    return id_to_category[label_id]

# Per-class precision/recall/F1 on the held-out set; zero_division=0 reports
# 0 instead of warning when a class has no predicted samples.
print("Classification Report:\n")
print(
    classification_report(
        y_test,
        y_pred,
        labels=labels,
        target_names=target_names,
        zero_division=0,
    )
)

# Classify one hand-written article as a quick sanity check.
example_title = "Big Tech Invests in New Data Centers"
example_keywords = "cloud, investment, technology"
print("Predicted category:", predict_category(example_title, example_keywords))

Classification Report:

              precision    recall  f1-score   support

        tech       1.00      1.00      1.00         1
      sports       0.50      1.00      0.67         1
    business       0.00      0.00      0.00         1

    accuracy                           0.67         3
   macro avg       0.50      0.67      0.56         3
weighted avg       0.50      0.67      0.56         3

Predicted category: tech
