# Text Classification - BBC News Example

## 1. Environment Preparation

In [1]:
# Install and import packages

#!pip install pandas numpy openpyxl matplotlib seaborn scikit-learn yellowbrick nltk

import pandas as pd # working with data
import numpy as np # working with arrays
import matplotlib.pyplot as plt # data visualization
import seaborn as sb # data visualization

from sklearn.model_selection import train_test_split, KFold, cross_val_score # data spliting and k-cross validation

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from yellowbrick.style import set_palette
from yellowbrick.classifier import ClassificationReport, ROCAUC, ConfusionMatrix, ClassPredictionError

# NLP packages
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('punkt')

import pickle # mode saving

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# Helper functions
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    en_stopwords = stopwords.words('english')
    # Remove punctuation, numbers, and special characters
    text = ''.join([c for c in text if c.isalpha() or c.isspace()])
    # Convert text to lowercase
    text = text.lower()
    # Tokenize the text into words
    words = nltk.word_tokenize(text)
    # Remove stop words
    words = [word for word in words if word not in en_stopwords]
    # Perform pos_tag to include nouns (NN, NNP, NNS) and verbs (VB, VBD, VBG, VBN, VBP, VBZ)
    words = [tagged_word[0] for tagged_word in pos_tag(words) if tagged_word[1].startswith('NN') or tagged_word[1].startswith('V') ]
    # Perform lemmatization
    words = [lemmatizer.lemmatize(word) for word in words]
    # Remove stop words
    words = [word for word in words if word not in en_stopwords]
    # Remove words with length less than 2
    words = [word for word in words if len(word)>1]
    # Join the processed words back into a single string
    processed_text = ' '.join(words)
    return processed_text

## 2. Data Exploration

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Load data into Panda's Dataframe
news_df = pd.read_excel('/content/drive/MyDrive/CAC_Workshop/CAC-TextMiningWorkshop-June2023/BBCNewsDataset.xlsx')

# Print information about the Panda Dataframe, including the number of records, number of columns, and data types
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1440 entries, 0 to 1439
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      1440 non-null   object
 1   Category  1440 non-null   object
dtypes: object(2)
memory usage: 22.6+ KB


In [6]:
# Display the normalized counts of unique values of the label column
pd.DataFrame(news_df['Category'].value_counts(sort=True, normalize=True))

Unnamed: 0,Category
sport,0.2375
business,0.232639
politics,0.184722
entertainment,0.182639
tech,0.1625


In [7]:
# Specify features of interest
text_feature = 'Text'
processed_text_feature = text_feature + '_processed'
label = 'Category'

news_df.head(10)

Unnamed: 0,Text,Category
0,worldcom ex-boss launches defence lawyers defe...,business
1,german business confidence slides german busin...,business
2,bbc poll indicates economic gloom citizens in ...,business
3,lifestyle governs mobile choice faster bett...,tech
4,enron bosses in $168m payout eighteen former e...,business
5,howard truanted to play snooker conservative...,politics
6,wales silent on grand slam talk rhys williams ...,sport
7,french honour for director parker british film...,entertainment
8,car giant hit by mercedes slump a slump in pro...,business
9,fockers fuel festive film chart comedy meet th...,entertainment


## 3. Data Preparation

In [8]:
# Preprocess text
news_df[processed_text_feature] = news_df[text_feature].apply(lambda x: preprocess_text(x))
news_df.head(10)

Unnamed: 0,Text,Category,Text_processed
0,worldcom ex-boss launches defence lawyers defe...,business,worldcom exboss launch defence lawyer defendin...
1,german business confidence slides german busin...,business,business confidence business confidence fell k...
2,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates gloom citizen majority nati...
3,lifestyle governs mobile choice faster bett...,tech,governs choice hardware going help phone firm ...
4,enron bosses in $168m payout eighteen former e...,business,enron boss payout enron director agreed settle...
5,howard truanted to play snooker conservative...,politics,truanted play snooker leader michael admitted ...
6,wales silent on grand slam talk rhys williams ...,sport,wale slam talk rhys williams say wale thinking...
7,french honour for director parker british film...,entertainment,honour director parker film director sir alan ...
8,car giant hit by mercedes slump a slump in pro...,business,car giant hit mercedes slump profitability lux...
9,fockers fuel festive film chart comedy meet th...,entertainment,fockers fuel film chart comedy meet fockers to...


## 4. Train and Test Data Splits

In [9]:
# Select input features and target variable
X = news_df[processed_text_feature]  # Input features
y = news_df[label]  # Target variable

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Split the training set into training (80%) and validation (20%) sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=5)

print ("Train dataset: {0}{1}".format(X_train.shape, y_train.shape))
print ("Validation dataset: {0}{1}".format(X_val.shape, y_val.shape))
print ("Test dataset: {0}{1}".format(X_test.shape, y_test.shape))

Train dataset: (921,)(921,)
Validation dataset: (231,)(231,)
Test dataset: (288,)(288,)


## 5. Models Training and Evaluation

In [10]:
# Define models
model_names = ['LogisticRegression', 'KNN', 'BernoulliNB', 'RandomForest']
models = []

models.append((LogisticRegression()))
models.append((KNeighborsClassifier(n_neighbors=3)))
models.append((BernoulliNB()))
models.append((RandomForestClassifier(n_estimators=10)))

In [11]:
# Run K-Cross validation to build the models and report some evaluation metrics to find the model with the highest performance
kfold = KFold(n_splits=10)
pipelines = []
accuracy_scores = {}

for i in range(0, len(models)):
  pipeline = Pipeline([('tfidf', TfidfVectorizer()), ('classifier', models[i])])

  cv_result = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='accuracy')
  print ('-'*40)
  print ('{0}[Internal Validation Accuracy Scores]: {1}, {2}'.format(model_names[i], cv_result, round(cv_result.mean(), 5)))

  # Fit your model on the traning dataset
  pipeline.fit(X_train, y_train)
  pipelines.append(pipeline)

  prediction = pipeline.predict(X_val)
  acc_score = accuracy_score(y_val, prediction)
  print ('{0}[External Validation Accuracy Score]: {1}'.format(model_names[i], round(acc_score, 5)))

  prediction = pipeline.predict(X_test)
  acc_score = accuracy_score(y_test, prediction)
  accuracy_scores[model_names[i]] = acc_score
  print ('{0}[Test Accuracy Score]: {1}'.format(model_names[i], round(acc_score, 5)))
  print ('{0}[Test Confusion Matrix]:\n{1}'.format(model_names[i], confusion_matrix(y_test, prediction)))
  print ('{0}[Test Classification Matrix]:\n{1}'.format(model_names[i], classification_report(y_test, prediction)))

print(f"accuracy_scores: {accuracy_scores}")

----------------------------------------
LogisticRegression[Internal Validation Accuracy Scores]: [0.97849462 0.93478261 0.92391304 0.94565217 0.97826087 0.95652174
 0.9673913  0.9673913  0.98913043 0.94565217], 0.95872
LogisticRegression[External Validation Accuracy Score]: 0.92641
LogisticRegression[Test Accuracy Score]: 0.95139
LogisticRegression[Test Confusion Matrix]:
[[62  0  2  0  1]
 [ 0 58  1  0  0]
 [ 4  0 48  0  1]
 [ 0  0  0 66  0]
 [ 1  2  0  2 40]]
LogisticRegression[Test Classification Matrix]:
               precision    recall  f1-score   support

     business       0.93      0.95      0.94        65
entertainment       0.97      0.98      0.97        59
     politics       0.94      0.91      0.92        53
        sport       0.97      1.00      0.99        66
         tech       0.95      0.89      0.92        45

     accuracy                           0.95       288
    macro avg       0.95      0.95      0.95       288
 weighted avg       0.95      0.95      0.9

## 6. Best Model Selection

In [12]:
# Keep the best model
bestModel_idx = 0
bestModel = pipelines[bestModel_idx]
prediction = bestModel.predict(X_test)
print ('{0}[Test Confusion Matrix]:\n{1}'.format(model_names[bestModel_idx], confusion_matrix(y_test, prediction)))
print ('{0}[Test Classification Matrix]:\n{1}'.format(model_names[bestModel_idx], classification_report(y_test, prediction)))

LogisticRegression[Test Confusion Matrix]:
[[62  0  2  0  1]
 [ 0 58  1  0  0]
 [ 4  0 48  0  1]
 [ 0  0  0 66  0]
 [ 1  2  0  2 40]]
LogisticRegression[Test Classification Matrix]:
               precision    recall  f1-score   support

     business       0.93      0.95      0.94        65
entertainment       0.97      0.98      0.97        59
     politics       0.94      0.91      0.92        53
        sport       0.97      1.00      0.99        66
         tech       0.95      0.89      0.92        45

     accuracy                           0.95       288
    macro avg       0.95      0.95      0.95       288
 weighted avg       0.95      0.95      0.95       288

