# Import Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load Dataset and Check the Class Balance

In [2]:
# Read the cleaned news corpus from the CSV file next to the notebook
df = pd.read_csv('News_Data.csv')

# Class balance check: number of articles per category, followed by a
# preview of the first five rows (rendered as the cell's output)
category_counts = df['NewsCategory'].value_counts()
print(category_counts)
df.head(5)

NewsCategory
politics                  367
education                 364
technology                359
entertainment             352
crime_and_law             352
sports                    347
health_and_medicine       347
business_and_economics    338
Name: count, dtype: int64


Unnamed: 0,ID,NewsContent,NewsCategory
0,1,dragged by cops for questioning for being in a...,sports
1,2,prasidh krishna sai sudharsan sparkle as gujar...,sports
2,3,with the ball not coming on the wheels came of...,sports
3,4,gujarat titans choice to pick a blacksoil slug...,sports
4,5,the timeout at the end of the eight over of th...,sports


# TF-IDF Technique

In [3]:
# Vectorizer configuration:
#   max_features=5000      -> keep only the 5000 highest-ranked terms
#   stop_words='english'   -> drop common English stopwords ("the", "is", ...)
#   ngram_range=(1, 2)     -> use single words and two-word phrases as features
tfidf = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2))

# NOTE(review): the vectorizer is fitted on the whole NewsContent column before
# the train/test split below, so vocabulary and IDF weights are learned from
# test documents too. Consider fitting on the training split only.
news_text = df['NewsContent']
X = tfidf.fit_transform(news_text)  # sparse document-term matrix

# Target labels, one category per article
y = df['NewsCategory']

# Report (number of documents, number of features)
print("TF-IDF Matrix Shape:", X.shape)

TF-IDF Matrix Shape: (2826, 5000)


# Train And Test Split

In [4]:
# Hold out 30% of the samples for testing. Stratifying on y keeps the category
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Build Multiple Models

## Support Vector Machine

In [5]:
# Support vector classifier with a linear kernel, trained on the TF-IDF features
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X=X_train, y=y_train)

## Naive Bayes Classifier

In [6]:
# Multinomial Naive Bayes with default settings, trained on the TF-IDF features
nb_model = MultinomialNB()
nb_model.fit(X=X_train, y=y_train)

## Logistic Regression

In [7]:
# Logistic regression; max_iter is raised to 1000 to give the solver more
# iterations on the 5000-feature TF-IDF matrix
logreg_model = LogisticRegression(max_iter=1000, random_state=42)
logreg_model.fit(X=X_train, y=y_train)

# Make Predictions

In [8]:
# Predict the held-out test set with each fitted classifier, in the same
# order as the names on the left-hand side
svm_pred, nb_pred, logreg_pred = (
    classifier.predict(X_test)
    for classifier in (svm_model, nb_model, logreg_model)
)

# Evaluate the Models

In [9]:
def evaluate_model(model_name, y_true, y_pred):
    """Print a short evaluation report for one classifier.

    The report contains accuracy, then precision, recall and F1 (each
    averaged with ``average='weighted'``) as percentages with two decimals,
    followed by the confusion matrix.

    Args:
        model_name: Label shown in the report's header line.
        y_true: Ground-truth class labels.
        y_pred: Labels predicted by the model for the same samples.

    Returns:
        None. The report is written to standard output.
    """
    print(f"Evaluation for {model_name}:")

    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")

    # The remaining metrics share the same call signature, so compute and
    # print them one at a time in report order.
    weighted_metrics = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    for label, scorer in weighted_metrics:
        score = scorer(y_true, y_pred, average='weighted')
        print(f"{label}: {score * 100:.2f}%")

    matrix = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:\n", matrix)

## SVM-Evaluation

In [10]:
# Report test-set metrics for the linear SVM
evaluate_model(model_name='SVM', y_true=y_test, y_pred=svm_pred)

Evaluation for SVM:
Accuracy: 90.45%
Precision: 90.56%
Recall: 90.45%
F1 Score: 90.48%
Confusion Matrix:
 [[93  1  1  1  0  2  1  2]
 [ 0 96  2  0  1  6  0  1]
 [ 5  3 89  2  0  2  2  6]
 [ 0  1  2 98  0  1  2  2]
 [ 1  0  2  1 96  0  0  4]
 [ 1  5  5  1  0 98  0  0]
 [ 0  0  2  1  0  2 98  1]
 [ 1  2  5  0  0  0  1 99]]


## Naive-Bayes Evaluation

In [11]:
# Report test-set metrics for the Multinomial Naive Bayes model
evaluate_model(model_name='Naive Bayes', y_true=y_test, y_pred=nb_pred)

Evaluation for Naive Bayes:
Accuracy: 91.16%
Precision: 91.19%
Recall: 91.16%
F1 Score: 91.16%
Confusion Matrix:
 [[ 89   0   4   1   0   3   1   3]
 [  0  97   3   0   0   5   0   1]
 [  8   1  90   2   0   3   2   3]
 [  0   1   0 101   1   2   0   1]
 [  1   0   1   1  98   0   1   2]
 [  2   3   5   1   0  99   0   0]
 [  0   1   2   1   1   0  98   1]
 [  1   2   3   0   1   0   0 101]]


## Logistic Regression Evaluation

In [12]:
# Report test-set metrics for the logistic regression model
evaluate_model(model_name='Logistic Regression', y_true=y_test, y_pred=logreg_pred)

Evaluation for Logistic Regression:
Accuracy: 90.09%
Precision: 90.26%
Recall: 90.09%
F1 Score: 90.10%
Confusion Matrix:
 [[ 91   1   2   1   0   3   0   3]
 [  0  95   3   1   0   6   0   1]
 [  8   1  86   1   0   3   2   8]
 [  0   4   0  95   0   3   1   3]
 [  1   0   0   0  99   0   1   3]
 [  2   3   5   1   1  98   0   0]
 [  0   0   2   1   0   1  98   2]
 [  1   1   3   0   1   0   0 102]]


# Testing with Today's News

## Enter Today's News

In [13]:
# Ask how many articles will be pasted, then read them one at a time.
# A non-numeric count makes int() raise ValueError.
Input_count = int(input('Enter the input count --> '))
new_news_list = []
for recur in range(Input_count):
    news = input('\nPaste the content here: ')
    print('\n')
    # Strip double quotes from the pasted text. str.replace returns a NEW
    # string (strings are immutable), so its result must be kept; the
    # previous code discarded it and stored the unmodified text. The call is
    # a no-op when the text has no double quotes, so no branch is needed.
    new_news_list.append(news.replace('"', ''))

Enter the input count -->  1

Paste the content here:  India-U.K. goods trade surged 60% in eight years; imports nearly doubled Electrical machinery, N-reactors, boilers & machinery, mineral fuels & oils, pearls, precious & semi-precious stones, pharma make up nearly half of what India exports to the U.K.; machinery, engineering goods to see strong growth with FTA in place






## Classifying..

In [15]:
# Classify the pasted articles with the Naive Bayes model.
if new_news_list:
    # Re-use the already fitted vectorizer (transform only, no re-fit) so the
    # new texts are mapped onto the same feature columns the model was
    # trained on.
    new_news_tfidf = tfidf.transform(new_news_list)

    predicted_categories = nb_model.predict(new_news_tfidf)

    # Number the articles from 1 and print each one under its predicted label.
    for i, (news, category) in enumerate(zip(new_news_list, predicted_categories), start=1):
        print(f"News {i} talks about \"{category.upper()}\"\n")
        print(news, "\n")
        print('*********************************************************************************************************************')
else:
    # With an input count of 0 the list is empty; scikit-learn's input
    # validation rejects a batch with zero samples, so skip prediction.
    print("No news articles were entered, so there is nothing to classify.")

News 1 talks about "BUSINESS_AND_ECONOMICS"

India-U.K. goods trade surged 60% in eight years; imports nearly doubled Electrical machinery, N-reactors, boilers & machinery, mineral fuels & oils, pearls, precious & semi-precious stones, pharma make up nearly half of what India exports to the U.K.; machinery, engineering goods to see strong growth with FTA in place 

*********************************************************************************************************************
