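**CLASSIFICATION OF ARTICLES**

Classifies news headlines by `NewsType` (business or sports in this dataset) using TF-IDF features and a Multinomial Naive Bayes classifier.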

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report


In [None]:
# Location and encoding of the raw articles dataset, named so they are easy to change.
ARTICLES_CSV_PATH = '/content/Articles.csv'
ARTICLES_CSV_ENCODING = 'latin1'  # NOTE(review): presumably the file is not valid UTF-8 — confirm

# Read the dataset into a DataFrame.
df = pd.read_csv(ARTICLES_CSV_PATH, encoding=ARTICLES_CSV_ENCODING)


In [None]:
# Summarise the loaded frame: columns, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2692 entries, 0 to 2691
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Article   2692 non-null   object
 1   Date      2692 non-null   object
 2   Heading   2692 non-null   object
 3   NewsType  2692 non-null   object
dtypes: object(4)
memory usage: 84.2+ KB


In [None]:
# Preview the first ten rows.
df.head(10)

Unnamed: 0,Article,Date,Heading,NewsType
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,business
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,business
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,business
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,business
4,NEW YORK: US oil prices Monday slipped below $...,1/6/2015,us oil prices slip below 50 a barr,business
5,New York: Oil prices tumbled Tuesday to fresh ...,1/7/2015,oil hits new 5.5 year lows as saudis defend,business
6,KARACHI: Strong bulls on Friday pulled the ben...,1/9/2015,bullish kse jumps over 33000 psychological bar...,business
7,"Singapore: Oil fell further in Asia Monday, wi...",1/12/2015,oil falls further in asian trad,business
8,KARACHI: Wholesale market rates for sugar drop...,1/13/2015,sugar prices drop to rs 49.80 in sind,business
9,SYDNEY: Oil prices fell 1 percent on Wednesday...,1/14/2015,oil extends losses as world bank cuts growth for,business


In [None]:
# Drop every row that has a missing value in any column.
# `df` is re-bound to the returned frame instead of being mutated with
# inplace=True: the result is the same for the cells that follow, but the
# step reads as a plain assignment and can be chained with other methods.
df = df.dropna()


In [None]:
# Model inputs: the headline text is the feature, the news category is the label.
x, y = df['Heading'], df['NewsType']


In [None]:
# TF-IDF settings collected in one place so they can be inspected or tuned.
tfidf_options = {
    'max_features': 10000,        # cap on the vocabulary size
    'ngram_range': (1, 2),        # single words and two-word phrases
    'lowercase': True,            # fold text to lowercase before tokenising
    'strip_accents': 'unicode',   # remove accents from characters
}
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)

In [None]:
# Fit the TF-IDF vectorizer on the headlines in `x` and turn them into a
# feature matrix in one step.
# NOTE(review): the vectorizer is fitted on all of `x`, before the train/test
# split below, so the held-out headlines also shape the vocabulary and IDF
# weights. Consider splitting the raw text first and fitting on the training
# portion only (e.g. with a Pipeline) to get a cleaner test estimate.
X = tfidf_vectorizer.fit_transform(x)


In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Multinomial Naive Bayes classifier; alpha is its additive smoothing parameter.
model = MultinomialNB(alpha=0.1)


In [None]:
# Train the classifier on the training split only.
model.fit(X_train, y_train)


In [None]:
# 5-fold cross-validated accuracy over the full feature matrix (train and test rows).
# cross_val_score fits clones of `model`, so the estimator trained above is left as it was.
cv_scores = cross_val_score(model, X, y, scoring='accuracy', cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

Cross-validation scores: [0.96660482 0.96660482 0.9535316  0.94052045 0.92936803]
Mean cross-validation score: 0.9513259443689609


In [None]:
# Predict a category for every row of the held-out test split.
y_pred = model.predict(X_test)


In [None]:
# Share of held-out rows whose predicted category matches the true one.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Test set accuracy:", accuracy)


Test set accuracy: 0.974025974025974


In [None]:
# Both summaries are given the classifier's own class ordering. Without an
# explicit `labels`, confusion_matrix derives its rows/columns from the labels
# present in y_test/y_pred, which could differ from the report's rows if a
# class were absent from the test split.
conf_matrix = confusion_matrix(y_test, y_pred, labels=model.classes_)
print("Confusion matrix:\n", conf_matrix)

# Per-class precision, recall and F1 in the same class order as the matrix.
classification_report_str = classification_report(y_test, y_pred, labels=model.classes_)
print("\nClassification Report:\n", classification_report_str)

Confusion matrix:
 [[256   6]
 [  8 269]]

Classification Report:
               precision    recall  f1-score   support

    business       0.97      0.98      0.97       262
      sports       0.98      0.97      0.97       277

    accuracy                           0.97       539
   macro avg       0.97      0.97      0.97       539
weighted avg       0.97      0.97      0.97       539



In [None]:
def predict_user_input(headline=None, vectorizer=None, classifier=None):
    """Classify one news headline, print the predicted category and return it.

    Called with no arguments it behaves as an interactive prompt; the optional
    arguments allow the same logic to be reused non-interactively.

    Args:
        headline: Text to classify. When None (the default) the user is
            prompted for it.
        vectorizer: Fitted object exposing ``transform``. Defaults to the
            notebook-level ``tfidf_vectorizer``.
        classifier: Fitted object exposing ``predict``. Defaults to the
            notebook-level ``model``.

    Returns:
        The predicted category, or None when the headline is blank.
    """
    if headline is None:
        headline = input("Enter a news headline: ")

    if not headline.strip():
        # A blank headline contains no tokens, so a prediction would not
        # reflect any text; report that instead of printing a category.
        print("No headline entered; nothing to classify.")
        return None

    # Fall back to the notebook globals only when no override is supplied.
    if vectorizer is None:
        vectorizer = tfidf_vectorizer
    if classifier is None:
        classifier = model

    # transform() expects an iterable of documents, hence the one-item list.
    headline_features = vectorizer.transform([headline])
    predicted_category = classifier.predict(headline_features)[0]
    print("Predicted category:", predicted_category)
    return predicted_category

# Interactive demo: prompt for a headline and show its predicted category.
# The trailing semicolon suppresses any echoed return value, so the cell
# output is limited to what the function prints.
predict_user_input();

Enter a news headline: Stock market hits record high amid positive economic indicators
Predicted category: business
