In [None]:
# Tutorial URL: https://www.analyticsvidhya.com/blog/2021/12/text-classification-of-news-articles/
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.svm import SVC, LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score

In [2]:
# Load the BBC News training set from a path relative to the notebook's working
# directory. Later cells read its "Text" and "Category" columns.
df = pd.read_csv("./dataset/BBC News Train.csv")

In [3]:
# Encode each distinct category label as an integer code (codes are assigned
# in order of first appearance) and keep it next to the original label.
df["CategoryId"] = pd.factorize(df["Category"])[0]
df["CategoryId"]

0       0
1       0
2       0
3       1
4       0
       ..
1485    4
1486    4
1487    0
1488    1
1489    1
Name: CategoryId, Length: 1490, dtype: int64

In [4]:
# Matches one markup tag: '<', at least one character that is not an angle
# bracket, then '>'. A lone '<' or '>' in running text is left untouched.
_TAG_PATTERN = re.compile(r'<[^<>]+>')

def remove_tags(text):
  """Strip HTML/XML-style tags from ``text``.

  The previous implementation compiled an empty pattern, which matches the
  empty string everywhere and therefore returned the input unchanged.

  Args:
    text: The raw string to clean.

  Returns:
    ``text`` with every ``<...>`` tag removed; the content between tags is kept.
  """
  return _TAG_PATTERN.sub('', text)

def special_char(text):
  """Replace every non-alphanumeric character in ``text`` with a single space.

  Alphanumeric characters (as judged by ``str.isalnum``) are kept as-is, so the
  result always has the same length as the input.
  """
  return ''.join(ch if ch.isalnum() else ' ' for ch in text)

def convert_lower(text):
  """Return a lower-cased copy of ``text``."""
  lowered = text.lower()
  return lowered

def remove_stopwords(text):
  """Tokenize ``text`` and drop English stop words.

  Returns a list of tokens (not a string), in their original order; the
  comparison against the stop-word list is case-sensitive.
  """
  english_stops = set(stopwords.words('english'))
  kept_tokens = []
  for token in word_tokenize(text):
    if token not in english_stops:
      kept_tokens.append(token)
  return kept_tokens

def lemmatize_word(text):
  """Lemmatize each token and join the results into one space-separated string.

  Despite the parameter name, ``text`` is iterated token by token, so it is
  expected to be a sequence of words such as the list returned by
  ``remove_stopwords``.
  """
  lemmatizer = WordNetLemmatizer()
  lemmas = map(lemmatizer.lemmatize, text)
  return " ".join(lemmas)

In [5]:
# Apply the cleaning steps in order. Each step consumes the previous step's
# output: remove_stopwords turns the string into a token list, and
# lemmatize_word joins the tokens back into a string.
for step in (remove_tags, special_char, convert_lower, remove_stopwords, lemmatize_word):
  df["Text"] = df["Text"].apply(step)
df["Text"]

0       worldcom ex bos launch defence lawyer defendin...
1       german business confidence slide german busine...
2       bbc poll indicates economic gloom citizen majo...
3       lifestyle governs mobile choice faster better ...
4       enron boss 168m payout eighteen former enron d...
                              ...                        
1485    double eviction big brother model caprice holb...
1486    dj double act revamp chart show dj duo jk joel...
1487    weak dollar hit reuters revenue medium group r...
1488    apple ipod family expands market apple expande...
1489    santy worm make unwelcome visit thousand websi...
Name: Text, Length: 1490, dtype: object

In [6]:
# Bag-of-words features with the vocabulary capped at 5000 terms, densified to
# a NumPy array; the integer category codes are the labels.
# NOTE(review): the vocabulary is fitted on every row before the train/test
# split performed in the next cell, so held-out documents influence which
# terms are kept.
cv = CountVectorizer(max_features=5000)
x = cv.fit_transform(df["Text"]).toarray()
y = df["CategoryId"].to_numpy()
print(x.shape, y.shape, sep="\n")

(1490, 5000)
(1490,)


In [7]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the shuffled
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
  x,
  y,
  test_size=0.3,
  shuffle=True,
  random_state=42,
)
print(len(x_train), len(x_test), sep="\n")

1043
447


In [8]:
perform_list = [] #a list of model and accuracy dicts

# Maps each supported model name to a zero-argument factory, so every call to
# run_model gets a fresh, unfitted estimator. The misspelled
# "Support Vector Classifer" key is kept because existing callers use it; the
# correctly spelled name is accepted as an alias.
_MODEL_FACTORIES = {
  "Logistic Regression": lambda: LogisticRegression(),
  "Random Forest": lambda: RandomForestClassifier(n_estimators=100, criterion="entropy", random_state=10),
  "Multinomial Naive Bayes": lambda: MultinomialNB(alpha=1.0, fit_prior=True),
  "Support Vector Classifer": lambda: SVC(),
  "Support Vector Classifier": lambda: SVC(),
  "Decision Tree Classifier": lambda: DecisionTreeClassifier(),
  "K Nearest Neighbour": lambda: KNeighborsClassifier(n_neighbors=10, metric="minkowski", p=4),
  "Gaussian Naive Bayes": lambda: GaussianNB(),
}

def run_model(x_train, x_test, y_train, y_test, model_name, est_c=None, est_pnlty=None):
  """Fit one classifier one-vs-rest, print its test metrics and record them.

  Args:
    x_train, y_train: Training features and labels passed to ``fit``.
    x_test, y_test: Held-out features and labels used for scoring.
    model_name: One of the keys of ``_MODEL_FACTORIES``.
    est_c, est_pnlty: Accepted for backward compatibility; currently unused.

  Returns:
    None. The metrics are printed and a summary dict is appended to the
    module-level ``perform_list``.

  Raises:
    ValueError: If ``model_name`` is not a supported model. Previously an
      unknown name silently wrapped an empty string in OneVsRestClassifier
      and failed later with an unrelated error.
  """
  try:
    make_estimator = _MODEL_FACTORIES[model_name]
  except KeyError:
    raise ValueError(
      f"Unknown model_name {model_name!r}; expected one of {sorted(_MODEL_FACTORIES)}"
    ) from None

  oneVsRest = OneVsRestClassifier(make_estimator())
  oneVsRest.fit(x_train, y_train)
  y_pred = oneVsRest.predict(x_test)

  # Accuracy as a percentage, rounded to two decimals.
  accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)

  # NOTE(review): with average="micro" on single-label multiclass predictions,
  # precision, recall and F1 all coincide with accuracy; average="macro" would
  # be more informative, but it would change the reported numbers.
  precision, recall, f1score, _ = score(y_test, y_pred, average="micro")
  print(f'Test Accuracy Score of Basic {model_name}: % {accuracy}')
  print(f'Precision : {precision}')
  print(f'Recall : {recall}')
  print(f'F1-score : {f1score}')

  # Record the rounded metrics for the comparison table built later.
  perform_list.append({
    'Model': model_name,
    'Test Accuracy': accuracy,
    'Precision': round(precision, 2),
    'Recall': round(recall, 2),
    'F1': round(f1score, 2),
  })

In [9]:
# Baseline: logistic regression with default settings, wrapped one-vs-rest.
run_model(
  x_train, x_test, y_train, y_test,
  model_name="Logistic Regression",
  est_c=None,
  est_pnlty=None,
)

Test Accuracy Score of Basic Logistic Regression: % 96.64
Precision : 0.9664429530201343
Recall : 0.9664429530201343
F1-score : 0.9664429530201343


In [10]:
# Random forest: 100 entropy-criterion trees, seeded inside run_model.
run_model(
  x_train, x_test, y_train, y_test,
  model_name="Random Forest",
  est_c=None,
  est_pnlty=None,
)

Test Accuracy Score of Basic Random Forest: % 97.99
Precision : 0.9798657718120806
Recall : 0.9798657718120806
F1-score : 0.9798657718120806


In [11]:
# Multinomial naive Bayes on the raw term counts.
run_model(
  x_train, x_test, y_train, y_test,
  model_name="Multinomial Naive Bayes",
  est_c=None,
  est_pnlty=None,
)

Test Accuracy Score of Basic Multinomial Naive Bayes: % 97.76
Precision : 0.9776286353467561
Recall : 0.9776286353467561
F1-score : 0.9776286353467561


In [12]:
# Support vector classifier with default settings. The model name keeps the
# spelling that run_model matches on.
run_model(
  x_train, x_test, y_train, y_test,
  model_name="Support Vector Classifer",
  est_c=None,
  est_pnlty=None,
)

Test Accuracy Score of Basic Support Vector Classifer: % 95.97
Precision : 0.959731543624161
Recall : 0.959731543624161
F1-score : 0.959731543624161


In [13]:
# Single decision tree with default settings.
run_model(
  x_train, x_test, y_train, y_test,
  model_name="Decision Tree Classifier",
  est_c=None,
  est_pnlty=None,
)

Test Accuracy Score of Basic Decision Tree Classifier: % 81.21
Precision : 0.8120805369127517
Recall : 0.8120805369127517
F1-score : 0.8120805369127517


In [14]:
# k-nearest neighbours: 10 neighbours, Minkowski distance with p=4.
run_model(
  x_train, x_test, y_train, y_test,
  model_name="K Nearest Neighbour",
  est_c=None,
  est_pnlty=None,
)

Test Accuracy Score of Basic K Nearest Neighbour: % 71.59
Precision : 0.7158836689038032
Recall : 0.7158836689038032
F1-score : 0.7158836689038032


In [15]:
# Gaussian naive Bayes on the dense count matrix.
run_model(
  x_train, x_test, y_train, y_test,
  model_name="Gaussian Naive Bayes",
  est_c=None,
  est_pnlty=None,
)

Test Accuracy Score of Basic Gaussian Naive Bayes: % 78.52
Precision : 0.785234899328859
Recall : 0.785234899328859
F1-score : 0.7852348993288589


In [16]:
# One row per run_model call, in call order. Re-running a model cell appends a
# further row because perform_list is only ever appended to.
model_performance = pd.DataFrame(data=perform_list)
model_performance

Unnamed: 0,Model,Test Accuracy,Precision,Recall,F1
0,Logistic Regression,96.64,0.97,0.97,0.97
1,Random Forest,97.99,0.98,0.98,0.98
2,Multinomial Naive Bayes,97.76,0.98,0.98,0.98
3,Support Vector Classifer,95.97,0.96,0.96,0.96
4,Decision Tree Classifier,81.21,0.81,0.81,0.81
5,K Nearest Neighbour,71.59,0.72,0.72,0.72
6,Gaussian Naive Bayes,78.52,0.79,0.79,0.79


In [17]:
# Pick the row with the highest test accuracy; on a tie the first such row
# (earliest model run) wins.
best_model = model_performance.loc[model_performance["Test Accuracy"].idxmax()]
print("~Best model")
print("Model:", best_model["Model"])
print("Accuracy:", best_model["Test Accuracy"])

~Best model
Model: Random Forest
Accuracy: 97.99


In [18]:
# Refit a standalone random forest for ad-hoc predictions.
# NOTE(review): this is not the exact model scored in the comparison table;
# run_model used random_state=10 and wrapped the forest in OneVsRestClassifier.
classifier = RandomForestClassifier(n_estimators=100, criterion="entropy", random_state=42)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
y_pred

array([4, 2, 2, 1, 1, 0, 1, 4, 0, 1, 2, 4, 0, 3, 1, 0, 0, 3, 2, 0, 1, 4,
       3, 0, 3, 1, 0, 3, 1, 0, 3, 0, 0, 2, 3, 1, 4, 0, 4, 0, 3, 0, 2, 0,
       3, 0, 2, 1, 2, 0, 3, 3, 0, 3, 2, 4, 4, 2, 0, 4, 1, 0, 4, 1, 1, 1,
       0, 1, 2, 2, 2, 3, 3, 3, 4, 4, 0, 2, 0, 0, 0, 3, 3, 3, 0, 4, 3, 0,
       0, 1, 4, 2, 2, 1, 0, 2, 1, 1, 4, 2, 4, 2, 4, 3, 1, 3, 2, 3, 0, 1,
       1, 0, 1, 4, 1, 0, 2, 0, 4, 0, 2, 3, 0, 2, 0, 0, 1, 1, 2, 4, 0, 3,
       4, 3, 0, 2, 0, 4, 0, 4, 0, 4, 1, 3, 4, 2, 0, 1, 2, 2, 3, 0, 1, 3,
       4, 1, 4, 3, 1, 3, 3, 2, 0, 1, 2, 2, 1, 3, 3, 1, 2, 1, 1, 4, 0, 1,
       4, 3, 0, 0, 3, 3, 1, 4, 4, 0, 1, 2, 4, 2, 3, 3, 2, 1, 3, 3, 2, 3,
       3, 0, 4, 2, 3, 4, 0, 3, 1, 1, 0, 0, 3, 1, 0, 4, 3, 2, 2, 3, 1, 2,
       3, 1, 0, 0, 4, 3, 0, 2, 2, 2, 2, 3, 3, 0, 0, 1, 0, 3, 3, 2, 2, 4,
       4, 4, 0, 0, 4, 4, 3, 4, 0, 2, 1, 3, 2, 1, 4, 3, 1, 0, 0, 2, 4, 1,
       1, 3, 3, 0, 2, 4, 3, 3, 1, 3, 3, 3, 3, 3, 0, 2, 4, 0, 0, 2, 0, 0,
       0, 1, 2, 4, 0, 0, 0, 0, 2, 4, 1, 4, 3, 0, 2,

In [19]:
# Classify an unseen snippet. The text must go through the same cleaning steps
# as the training data before vectorizing; without lemmatization, inflected
# words such as plurals would not match the lemmatized vocabulary held by `cv`.
sample_text = "Hour ago, I contemplated retirement for a lot of reasons. I felt like people were not sensitive enough to my injuries. I felt like a lot of people were backed, why not me? I have done no less. I have won a lot of games for the team, and I am not feeling backed, said Ashwin"
for step in (remove_tags, special_char, convert_lower, remove_stopwords, lemmatize_word):
  sample_text = step(sample_text)

x_pred = cv.transform([sample_text])
y_pred = classifier.predict(x_pred)
# Map the predicted integer code back to its label via the first training row
# that carries that code.
print("Predicted Category:", df.loc[df["CategoryId"] == y_pred[0], "Category"].iloc[0].title())

Predicted Category: Sport
