In [1]:
import pandas as pd
import numpy as np

### Loading the data

In [2]:
# Sample headline that every trained classifier below is asked to categorise.
userInput = (
    'Apple WWDC 2022 highlights: M2 chip with neural engine, '
    'search updates, APIs and more'
)
# To classify your own text instead, replace the assignment above with:
# userInput = input("Enter a Text: ")

In [3]:
# Alternative tab-separated dataset that could be loaded instead of the local file:
# data = pd.read_csv("https://raw.githubusercontent.com/amankharwal/Website-data/master/bbc-news-data.csv", sep='\t')

# Read the headlines from the local CSV and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='NewsDump.csv')
print(data.head(n=5))

                                               title       category
0  ED attaches properties of designer Ritu Beri’s...       Business
1  Disha Vakani won't return to Taarak Mehta Ka O...  Entertainment
2  Bitcoin price rises over $30,000, other crypto...     Technology
3  Real Madrid extend Luka Modric's contract unti...         Sports
4  Mysuru nun alleges sexual harassment, says lif...          Crime


In [4]:
# Number of missing values per column (a non-zero count would need handling before vectorising).
data.isna().sum()

title       0
category    0
dtype: int64

In [5]:
# Count how many headlines fall into each category, most frequent first.
category_column = data["category"]
category_column.value_counts()

Business         175
Technology        98
Entertainment     80
Sports            43
World News        40
Crime              7
Name: category, dtype: int64

### Data pre-processing

In [6]:
# Keep only the two columns the model needs, in case the file contains others.
data = data.loc[:, ["title", "category"]]

# x holds the headline texts, y the category labels, both as NumPy arrays.
x, y = (np.array(data[column]) for column in ("title", "category"))

In [7]:
# CountVectorizer is used to transform a given text into a vector on the basis of the frequency (count) of each word that occurs in the entire text.

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

X = cv.fit_transform(x)

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Multinomial Naive Bayes

### Train the model & Predict the outcome

In [9]:
# Train a Multinomial Naive Bayes news classifier on the word-count features.
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training can be chained.
classifier_MNB = MultinomialNB().fit(X_train, y_train)
classifier_MNB

MultinomialNB()

In [10]:
# Encode the sample headline with the fitted vectorizer, then classify it with Naive Bayes.
sample_counts = cv.transform([userInput])
inputTrans = sample_counts.toarray()
output = classifier_MNB.predict(inputTrans)

print(output)

['Technology']


In [11]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Evaluate Naive Bayes on the held-out split.
y_pred = classifier_MNB.predict(X_test)

# Rows are true categories, columns are predicted categories.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Fraction of test headlines classified correctly (shown as the cell result).
accuracy_score(y_true=y_test, y_pred=y_pred)

[[42  0  2  1  3  1]
 [ 4  0  1  0  0  0]
 [ 7  0 13  0  4  1]
 [ 4  0  2  6  1  1]
 [ 5  0  4  1 25  3]
 [ 4  0  3  1  2  6]]


0.6258503401360545

## Random Forest Classification

In [12]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 10 entropy-criterion trees; the fixed seed keeps runs repeatable.
rfc_settings = {'n_estimators': 10, 'criterion': 'entropy', 'random_state': 0}
classifier_RFC = RandomForestClassifier(**rfc_settings)
classifier_RFC.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

In [13]:
# Classify the sample headline with the random forest.
inputTrans = cv.transform(raw_documents=[userInput]).toarray()
output = classifier_RFC.predict(X=inputTrans)

print(output)

['Technology']


In [14]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Evaluate the random forest on the held-out split.
y_pred = classifier_RFC.predict(X_test)

# Rows are true categories, columns are predicted categories.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Fraction of test headlines classified correctly (shown as the cell result).
accuracy_score(y_true=y_test, y_pred=y_pred)

[[48  0  0  0  1  0]
 [ 5  0  0  0  0  0]
 [19  0  5  1  0  0]
 [13  0  0  1  0  0]
 [31  0  2  0  5  0]
 [14  0  0  0  1  1]]


0.40816326530612246

## Kernel SVM model

In [15]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel, trained on the word-count features.
classifier_KSVM = SVC(
    kernel='rbf',
    random_state=0,
)
classifier_KSVM.fit(X=X_train, y=y_train)

SVC(random_state=0)

In [16]:
# Optional: prompt for a different headline before predicting.
# userInput = input("Enter a Text: ")

# Classify the sample headline with the kernel SVM.
sample_counts = cv.transform([userInput])
inputTrans = sample_counts.toarray()
output = classifier_KSVM.predict(inputTrans)

print(output)

['Technology']


In [17]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Evaluate the kernel SVM on the held-out split.
y_pred = classifier_KSVM.predict(X_test)

# Rows are true categories, columns are predicted categories.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Fraction of test headlines classified correctly (shown as the cell result).
accuracy_score(y_true=y_test, y_pred=y_pred)

[[48  0  0  0  1  0]
 [ 3  0  0  0  2  0]
 [19  0  3  0  3  0]
 [12  0  0  0  2  0]
 [25  0  0  0 13  0]
 [12  0  0  0  4  0]]


0.43537414965986393

## Decision Tree Classification

In [18]:
from sklearn.tree import DecisionTreeClassifier

# Single entropy-criterion decision tree; the fixed seed keeps runs repeatable.
dtc_settings = {'criterion': 'entropy', 'random_state': 0}
classifier_DTC = DecisionTreeClassifier(**dtc_settings)
classifier_DTC.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [19]:
# Classify the sample headline with the decision tree.
inputTrans = cv.transform(raw_documents=[userInput]).toarray()
output = classifier_DTC.predict(X=inputTrans)

print(output)

['Technology']


In [20]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Evaluate the decision tree on the held-out split.
y_pred = classifier_DTC.predict(X_test)

# Rows are true categories, columns are predicted categories.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Fraction of test headlines classified correctly (shown as the cell result).
accuracy_score(y_true=y_test, y_pred=y_pred)

[[46  0  2  1  0  0]
 [ 2  0  0  3  0  0]
 [15  0  5  3  1  1]
 [ 8  0  1  4  1  0]
 [24  0  1  1 11  1]
 [11  0  2  1  1  1]]


0.4557823129251701