In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("hgultekin/bbcnewsarchive")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\garga\.cache\kagglehub\datasets\hgultekin\bbcnewsarchive\versions\1


In [2]:
import pandas as pd

df=pd.read_csv('bbc-news-data.csv',sep='\t')
df.head()

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...


In [3]:
df.isnull().sum()

category    0
filename    0
title       0
content     0
dtype: int64

In [4]:
df.describe()

Unnamed: 0,category,filename,title,content
count,2225,2225,2225,2225
unique,5,511,2096,2092
top,sport,001.txt,Troubled Marsh under SEC scrutiny,Music and film fans will be able to control t...
freq,511,5,2,2


In [5]:
import re

def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [6]:
# Apply cleaning
df['Clean_Text'] = df['content'].apply(clean_text)
print(df['Clean_Text'].head())

0    quarterly profits at us media giant timewarner...
1    the dollar has hit its highest level against t...
2    the owners of embattled russian oil giant yuko...
3    british airways has blamed high fuel prices fo...
4    shares in uk drinks and food firm allied domec...
Name: Clean_Text, dtype: object


In [7]:
df.head()

Unnamed: 0,category,filename,title,content,Clean_Text
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,quarterly profits at us media giant timewarner...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,the dollar has hit its highest level against t...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,the owners of embattled russian oil giant yuko...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...,british airways has blamed high fuel prices fo...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...,shares in uk drinks and food firm allied domec...


In [8]:
df=df.drop(['filename','title','content'],axis=1)

In [9]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab')

df['Tokens'] = df['Clean_Text'].apply(word_tokenize)
print(df['Tokens'].head())


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\garga\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


0    [quarterly, profits, at, us, media, giant, tim...
1    [the, dollar, has, hit, its, highest, level, a...
2    [the, owners, of, embattled, russian, oil, gia...
3    [british, airways, has, blamed, high, fuel, pr...
4    [shares, in, uk, drinks, and, food, firm, alli...
Name: Tokens, dtype: object


In [10]:
df.head()

Unnamed: 0,category,Clean_Text,Tokens
0,business,quarterly profits at us media giant timewarner...,"[quarterly, profits, at, us, media, giant, tim..."
1,business,the dollar has hit its highest level against t...,"[the, dollar, has, hit, its, highest, level, a..."
2,business,the owners of embattled russian oil giant yuko...,"[the, owners, of, embattled, russian, oil, gia..."
3,business,british airways has blamed high fuel prices fo...,"[british, airways, has, blamed, high, fuel, pr..."
4,business,shares in uk drinks and food firm allied domec...,"[shares, in, uk, drinks, and, food, firm, alli..."


In [11]:
df=df.drop('Clean_Text',axis=1)

In [12]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

df['Tokens'] = df['Tokens'].apply(lambda x: [word for word in x if word not in stop_words])
print(df['Tokens'].head())


0    [quarterly, profits, us, media, giant, timewar...
1    [dollar, hit, highest, level, euro, almost, th...
2    [owners, embattled, russian, oil, giant, yukos...
3    [british, airways, blamed, high, fuel, prices,...
4    [shares, uk, drinks, food, firm, allied, domec...
Name: Tokens, dtype: object


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\garga\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
df.head()

Unnamed: 0,category,Tokens
0,business,"[quarterly, profits, us, media, giant, timewar..."
1,business,"[dollar, hit, highest, level, euro, almost, th..."
2,business,"[owners, embattled, russian, oil, giant, yukos..."
3,business,"[british, airways, blamed, high, fuel, prices,..."
4,business,"[shares, uk, drinks, food, firm, allied, domec..."


In [14]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Choose either stemming or lemmatization
df['Tokens_Stemmed'] = df['Tokens'].apply(lambda x: [stemmer.stem(word) for word in x])
df['Tokens_Lemmatized'] = df['Tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

print(df['Tokens_Stemmed'].head())
print(df['Tokens_Lemmatized'].head())


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\garga\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0    [quarterli, profit, us, media, giant, timewarn...
1    [dollar, hit, highest, level, euro, almost, th...
2    [owner, embattl, russian, oil, giant, yuko, as...
3    [british, airway, blame, high, fuel, price, dr...
4    [share, uk, drink, food, firm, alli, domecq, r...
Name: Tokens_Stemmed, dtype: object
0    [quarterly, profit, u, medium, giant, timewarn...
1    [dollar, hit, highest, level, euro, almost, th...
2    [owner, embattled, russian, oil, giant, yukos,...
3    [british, airway, blamed, high, fuel, price, d...
4    [share, uk, drink, food, firm, allied, domecq,...
Name: Tokens_Lemmatized, dtype: object


In [15]:
df.head()

Unnamed: 0,category,Tokens,Tokens_Stemmed,Tokens_Lemmatized
0,business,"[quarterly, profits, us, media, giant, timewar...","[quarterli, profit, us, media, giant, timewarn...","[quarterly, profit, u, medium, giant, timewarn..."
1,business,"[dollar, hit, highest, level, euro, almost, th...","[dollar, hit, highest, level, euro, almost, th...","[dollar, hit, highest, level, euro, almost, th..."
2,business,"[owners, embattled, russian, oil, giant, yukos...","[owner, embattl, russian, oil, giant, yuko, as...","[owner, embattled, russian, oil, giant, yukos,..."
3,business,"[british, airways, blamed, high, fuel, prices,...","[british, airway, blame, high, fuel, price, dr...","[british, airway, blamed, high, fuel, price, d..."
4,business,"[shares, uk, drinks, food, firm, allied, domec...","[share, uk, drink, food, firm, alli, domecq, r...","[share, uk, drink, food, firm, allied, domecq,..."


In [16]:
df['Processed_Text'] = df['Tokens_Lemmatized'].apply(lambda x: ' '.join(x))
print(df['Processed_Text'].head())

0    quarterly profit u medium giant timewarner jum...
1    dollar hit highest level euro almost three mon...
2    owner embattled russian oil giant yukos ask bu...
3    british airway blamed high fuel price drop pro...
4    share uk drink food firm allied domecq risen s...
Name: Processed_Text, dtype: object


In [17]:
df.head()

Unnamed: 0,category,Tokens,Tokens_Stemmed,Tokens_Lemmatized,Processed_Text
0,business,"[quarterly, profits, us, media, giant, timewar...","[quarterli, profit, us, media, giant, timewarn...","[quarterly, profit, u, medium, giant, timewarn...",quarterly profit u medium giant timewarner jum...
1,business,"[dollar, hit, highest, level, euro, almost, th...","[dollar, hit, highest, level, euro, almost, th...","[dollar, hit, highest, level, euro, almost, th...",dollar hit highest level euro almost three mon...
2,business,"[owners, embattled, russian, oil, giant, yukos...","[owner, embattl, russian, oil, giant, yuko, as...","[owner, embattled, russian, oil, giant, yukos,...",owner embattled russian oil giant yukos ask bu...
3,business,"[british, airways, blamed, high, fuel, prices,...","[british, airway, blame, high, fuel, price, dr...","[british, airway, blamed, high, fuel, price, d...",british airway blamed high fuel price drop pro...
4,business,"[shares, uk, drinks, food, firm, allied, domec...","[share, uk, drink, food, firm, alli, domecq, r...","[share, uk, drink, food, firm, allied, domecq,...",share uk drink food firm allied domecq risen s...


In [18]:
df=df.drop('Tokens',axis=1)

In [19]:

from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # limit vocab to top 5000 words

# Fit & transform the processed text
X = vectorizer.fit_transform(df['Processed_Text'])

print("Shape of TF-IDF matrix:", X.shape)  # (num_articles, num_features)


Shape of TF-IDF matrix: (2225, 5000)


In [20]:
y = df['category']  # assuming your CSV has a 'Category' column
print(y.value_counts())


category
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64


In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [22]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Initialize and train
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9842696629213483
               precision    recall  f1-score   support

     business       0.98      0.97      0.98       102
entertainment       1.00      1.00      1.00        77
     politics       0.96      0.98      0.97        84
        sport       1.00      1.00      1.00       102
         tech       0.97      0.97      0.97        80

     accuracy                           0.98       445
    macro avg       0.98      0.98      0.98       445
 weighted avg       0.98      0.98      0.98       445



In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

best_model = None
best_score = 0
best_name = ""
best_params = None

# Define models + parameter grids
models = {
    "Logistic Regression": {
        "model": LogisticRegression(max_iter=1000),
        "params": {
            "C": [0.01, 0.1, 1, 10],
            "solver": ["liblinear", "lbfgs"]
        }
    },
    "Naive Bayes": {
        "model": MultinomialNB(),
        "params": {
            "alpha": [0.1, 0.5, 1.0]
        }
    },
    "SVM": {
        "model": SVC(),
        "params": {
            "C": [0.1, 1, 10],
            "kernel": ["linear", "rbf"],
            "gamma": ["scale", "auto"]
        }
    },
    "Random Forest": {
        "model": RandomForestClassifier(),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5],
            "min_samples_leaf": [1, 2]
        }
    }
}

for name, mp in models.items():
    print(f"Running GridSearchCV for {name}...")
    grid = GridSearchCV(mp["model"], mp["params"], cv=5, scoring="accuracy", n_jobs=-1)
    grid.fit(X_train, y_train)
    
    score = grid.best_estimator_.score(X_test, y_test)
    if score > best_score:
        best_model = grid.best_estimator_
        best_score = score
        best_name = name
        best_params = grid.best_params_

print("🏆 Best Model:", best_name)
print("🔧 Best Parameters:", best_params)
print("📊 Test Accuracy:", best_score)


Running GridSearchCV for Logistic Regression...




Running GridSearchCV for Naive Bayes...
Running GridSearchCV for SVM...
Running GridSearchCV for Random Forest...
🏆 Best Model: Logistic Regression
🔧 Best Parameters: {'C': 10, 'solver': 'liblinear'}
📊 Test Accuracy: 0.9865168539325843


In [24]:
import joblib

# Retrain best model on full training data
final_model = best_model
final_model.fit(X_train, y_train)

# Save model
joblib.dump(final_model, "best_news_classifier.pkl")
print("✅ Final model saved as best_news_classifier.pkl")


✅ Final model saved as best_news_classifier.pkl




In [25]:
df.head()

Unnamed: 0,category,Tokens_Stemmed,Tokens_Lemmatized,Processed_Text
0,business,"[quarterli, profit, us, media, giant, timewarn...","[quarterly, profit, u, medium, giant, timewarn...",quarterly profit u medium giant timewarner jum...
1,business,"[dollar, hit, highest, level, euro, almost, th...","[dollar, hit, highest, level, euro, almost, th...",dollar hit highest level euro almost three mon...
2,business,"[owner, embattl, russian, oil, giant, yuko, as...","[owner, embattled, russian, oil, giant, yukos,...",owner embattled russian oil giant yukos ask bu...
3,business,"[british, airway, blame, high, fuel, price, dr...","[british, airway, blamed, high, fuel, price, d...",british airway blamed high fuel price drop pro...
4,business,"[share, uk, drink, food, firm, alli, domecq, r...","[share, uk, drink, food, firm, allied, domecq,...",share uk drink food firm allied domecq risen s...


In [26]:
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def summarize_text(processed_text, top_n=3):
    """
    Extractive summarization using TF-IDF + cosine similarity.
    Takes an article, splits into sentences, ranks by importance,
    and returns top_n sentences as summary.
    """
    
    # 1️⃣ Split into sentences
    sentences = sent_tokenize(processed_text)
    
    # If short article, return full text
    if len(sentences) <= top_n:
        return processed_text
    
    # 2️⃣ TF-IDF per sentence (local vectorizer, not the classifier one)
    tfidf = TfidfVectorizer(stop_words='english')
    sentence_vectors = tfidf.fit_transform(sentences)
    
    # 3️⃣ Cosine similarity between sentences
    sim_matrix = cosine_similarity(sentence_vectors)
    
    # 4️⃣ Score sentences by sum of similarities
    scores = sim_matrix.sum(axis=1)
    
    # 5️⃣ Rank sentences and pick top_n
    ranked_sentences = [sentences[i] for i in np.argsort(-scores)]
    
    return " ".join(ranked_sentences[:top_n])


In [27]:
article = df['Processed_Text'][0]

# Categorization
X_input = vectorizer.transform([article])
predicted_category = final_model.predict(X_input)[0]
print("Category:", predicted_category)

# Summarization
summary = summarize_text(article, top_n=2)
print("Summary:\n", summary)


Category: business
Summary:
 quarterly profit u medium giant timewarner jumped bn three month december yearearlier firm one biggest investor google benefited sale highspeed internet connection higher advert sale timewarner said fourth quarter sale rose bn bn profit buoyed oneoff gain offset profit dip warner bros less user aol time warner said friday owns searchengine google internet business aol mixed fortune lost subscriber fourth quarter profit lower preceding three quarter however company said aols underlying profit exceptional item rose back stronger internet advertising revenue hope increase subscriber offering online service free timewarner internet customer try sign aols existing customer highspeed broadband timewarner also restate result following probe u security exchange commission sec close concluding time warner fourth quarter profit slightly better analyst expectation film division saw profit slump helped boxoffice flop alexander catwoman sharp contrast yearearlier third 

In [28]:
# Save TF-IDF vectorizer used for training the classifier
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']