#PROJECT 2: News Headline Classification using SVM

#Step 1: Install dependencies

In [1]:
!pip install scikit-learn
!pip install pandas



#Step 2: Import libraries

In [2]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# Step 3: Load a news dataset
Using scikit-learn's "20 Newsgroups" dataset (easy & built-in)

In [3]:
print("Loading dataset...")

categories = ['talk.politics.misc', 'sci.electronics', 'rec.sport.baseball', 'rec.autos']

data = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)

print("Sample news headline:")
print(data.data[0][:300])

Loading dataset...
Sample news headline:
From: bowmanj@csn.org (Jerry Bowman)
Subject: Re: Help!!!(With Buick Regal)
Nntp-Posting-Host: fred.colorado.edu
Organization: University of Colorado Boulder, OCS
Lines: 36

In article <C5wyKp.L98@ulowell.ulowell.edu> gvahe@cs.ulowell.edu (Gerry Vahe) writes:
>
>
>HHHEEELLLPPP!!!!
>
>---I'm Going cr


# Step 4: Create a DataFrame for easier handling

In [5]:
df = pd.DataFrame({"text": data.data, "target": data.target})
print("\nDataset shape:", df.shape)
df.head()


Dataset shape: (3743, 2)


Unnamed: 0,text,target
0,From: bowmanj@csn.org (Jerry Bowman)\nSubject:...,0
1,From: ssave@ole.cdac.com (The Devil Reincarnat...,0
2,From: rscharfy@magnus.acs.ohio-state.edu (Ryan...,3
3,From: rbeckham@bnr.ca (Rick Beckham)\nSubject:...,3
4,From: eczcaw@mips.nott.ac.uk (A.Wainwright)\nS...,3


# Step 5: Split into training & testing

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["target"], test_size=0.2, random_state=42
)

# Step 6: Convert text into TF-IDF vectors

In [7]:
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)
print("\nTF-IDF vector shape:", X_train_tfidf.shape)


TF-IDF vector shape: (2994, 5000)


# Step 7: Build SVM model

In [10]:
model = LinearSVC()
print("\nTraining the SVM model...")
model.fit(X_train_tfidf, y_train)


Training the SVM model...


# Step 8: Evaluate model

In [11]:
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print("\nModel Accuracy:", accuracy)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Model Accuracy: 0.9719626168224299

Classification Report:

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       191
           1       0.99      0.99      0.99       200
           2       0.96      0.96      0.96       201
           3       0.99      0.97      0.98       157

    accuracy                           0.97       749
   macro avg       0.97      0.97      0.97       749
weighted avg       0.97      0.97      0.97       749



# Step 9: Test your own headlines

In [14]:
sample_headlines = [
    "New 3D rendering engine improves computer graphics performance",
    "Apple releases new Mac hardware update with better processors",
    "The Yankees win in extra innings during last night’s baseball game",
    "Toyota announces new hybrid car with improved mileage",
    "NASA prepares mission to explore the surface of Mars",
    "Government announces new policy changes affecting citizens",
    "Doctors discover new method to treat viral infection",
    "Church leaders gather for annual Christian conference",
    "Researchers build new circuit board with faster switching speed",
    "Debate rises over modern religious practices across the world"
]

sample_tfidf = tfidf.transform(sample_headlines)
predictions = model.predict(sample_tfidf)

print("\nCustom Predictions:")
for i, headline in enumerate(sample_headlines):
    print(f"{headline} ---> {categories[predictions[i]]}")



Custom Predictions:
New 3D rendering engine improves computer graphics performance ---> talk.politics.misc
Apple releases new Mac hardware update with better processors ---> rec.sport.baseball
The Yankees win in extra innings during last night’s baseball game ---> sci.electronics
Toyota announces new hybrid car with improved mileage ---> talk.politics.misc
NASA prepares mission to explore the surface of Mars ---> rec.sport.baseball
Government announces new policy changes affecting citizens ---> rec.autos
Doctors discover new method to treat viral infection ---> rec.sport.baseball
Church leaders gather for annual Christian conference ---> rec.sport.baseball
Researchers build new circuit board with faster switching speed ---> rec.sport.baseball
Debate rises over modern religious practices across the world ---> rec.autos
