# Preprocessing Phase

In [1]:
import nltk

nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /Users/victor/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/victor/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/victor/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/victor/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/victor/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to
[nltk_data]

True

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string as st
import re
from nltk import PorterStemmer, WordNetLemmatizer

# Input data files are available in the read-only "./input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('./input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# Read the data. Here it is already in .csv format.
train_data = pd.read_csv('dataset/BBC News Train.csv')
train_data.head(10)

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business
5,1582,howard truanted to play snooker conservative...,politics
6,651,wales silent on grand slam talk rhys williams ...,sport
7,1797,french honour for director parker british film...,entertainment
8,2034,car giant hit by mercedes slump a slump in pro...,business
9,1866,fockers fuel festive film chart comedy meet th...,entertainment


In [4]:
# Read the data. Here it is already in .csv format.
test_data = pd.read_csv('dataset/BBC News Test.csv')
test_data.head(10)

Unnamed: 0,ArticleId,Text
0,1018,qpr keeper day heads for preston queens park r...
1,1319,software watching while you work software that...
2,1138,d arcy injury adds to ireland woe gordon d arc...
3,459,india s reliance family feud heats up the ongo...
4,1020,boro suffer morrison injury blow middlesbrough...
5,51,lewsey puzzle over disallowed try england s jo...
6,2025,blair blasts tory spending plans tony blair ha...
7,1479,former ni minister scott dies former northern ...
8,27,career honour for actor dicaprio actor leonard...
9,397,tsunami to hit sri lanka banks sri lanka s b...


In [5]:
train_data.shape

(1490, 3)

In [6]:
test_data.shape

(735, 2)

# Text cleaning and processing steps
* Remove punctuations
* Convert text to tokens
* Remove tokens of length less than or equal to 3
* Remove stopwords using NLTK corpus stopwords list to match
* Apply stemming
* Apply lemmatization
* Convert words to feature vectors

In [7]:
# Remove all punctuations from the text

def remove_punct(text):
    return ("".join([ch for ch in text if ch not in st.punctuation]))

In [8]:
train_data['removed_punc'] = train_data['Text'].apply(lambda x: remove_punct(x))
train_data.head()

Unnamed: 0,ArticleId,Text,Category,removed_punc
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom exboss launches defence lawyers defen...
1,154,german business confidence slides german busin...,business,german business confidence slides german busin...
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizens in ...
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster bett...
4,917,enron bosses in $168m payout eighteen former e...,business,enron bosses in 168m payout eighteen former en...


In [9]:
test_data['removed_punc'] = test_data['Text'].apply(lambda x: remove_punct(x))
test_data.head()

Unnamed: 0,ArticleId,Text,removed_punc
0,1018,qpr keeper day heads for preston queens park r...,qpr keeper day heads for preston queens park r...
1,1319,software watching while you work software that...,software watching while you work software that...
2,1138,d arcy injury adds to ireland woe gordon d arc...,d arcy injury adds to ireland woe gordon d arc...
3,459,india s reliance family feud heats up the ongo...,india s reliance family feud heats up the ongo...
4,1020,boro suffer morrison injury blow middlesbrough...,boro suffer morrison injury blow middlesbrough...


In [10]:
''' Convert text to lower case tokens. Here, split() is applied on white-spaces. But, it could be applied
    on special characters, tabs or any other string based on which text is to be seperated into tokens.
'''
def tokenize(text):
    text = re.split('\s+' ,text)
    return [x.lower() for x in text]

In [11]:
train_data['tokens'] = train_data['removed_punc'].apply(lambda msg : tokenize(msg))
train_data.head()

Unnamed: 0,ArticleId,Text,Category,removed_punc,tokens
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom exboss launches defence lawyers defen...,"[worldcom, exboss, launches, defence, lawyers,..."
1,154,german business confidence slides german busin...,business,german business confidence slides german busin...,"[german, business, confidence, slides, german,..."
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizens in ...,"[bbc, poll, indicates, economic, gloom, citize..."
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster bett...,"[lifestyle, governs, mobile, choice, faster, b..."
4,917,enron bosses in $168m payout eighteen former e...,business,enron bosses in 168m payout eighteen former en...,"[enron, bosses, in, 168m, payout, eighteen, fo..."


In [12]:
test_data['tokens'] = test_data['removed_punc'].apply(lambda msg : tokenize(msg))
test_data.head()

Unnamed: 0,ArticleId,Text,removed_punc,tokens
0,1018,qpr keeper day heads for preston queens park r...,qpr keeper day heads for preston queens park r...,"[qpr, keeper, day, heads, for, preston, queens..."
1,1319,software watching while you work software that...,software watching while you work software that...,"[software, watching, while, you, work, softwar..."
2,1138,d arcy injury adds to ireland woe gordon d arc...,d arcy injury adds to ireland woe gordon d arc...,"[d, arcy, injury, adds, to, ireland, woe, gord..."
3,459,india s reliance family feud heats up the ongo...,india s reliance family feud heats up the ongo...,"[india, s, reliance, family, feud, heats, up, ..."
4,1020,boro suffer morrison injury blow middlesbrough...,boro suffer morrison injury blow middlesbrough...,"[boro, suffer, morrison, injury, blow, middles..."


In [13]:
# Remove tokens of length less than 3

def remove_small_words(text):
    return [x for x in text if len(x) > 3 ]

In [14]:
train_data['larger_tokens'] = train_data['tokens'].apply(lambda x : remove_small_words(x))
train_data.head()

Unnamed: 0,ArticleId,Text,Category,removed_punc,tokens,larger_tokens
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom exboss launches defence lawyers defen...,"[worldcom, exboss, launches, defence, lawyers,...","[worldcom, exboss, launches, defence, lawyers,..."
1,154,german business confidence slides german busin...,business,german business confidence slides german busin...,"[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,..."
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizens in ...,"[bbc, poll, indicates, economic, gloom, citize...","[poll, indicates, economic, gloom, citizens, m..."
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster bett...,"[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b..."
4,917,enron bosses in $168m payout eighteen former e...,business,enron bosses in 168m payout eighteen former en...,"[enron, bosses, in, 168m, payout, eighteen, fo...","[enron, bosses, 168m, payout, eighteen, former..."


In [15]:
test_data['larger_tokens'] = test_data['tokens'].apply(lambda x : remove_small_words(x))
test_data.head()

Unnamed: 0,ArticleId,Text,removed_punc,tokens,larger_tokens
0,1018,qpr keeper day heads for preston queens park r...,qpr keeper day heads for preston queens park r...,"[qpr, keeper, day, heads, for, preston, queens...","[keeper, heads, preston, queens, park, rangers..."
1,1319,software watching while you work software that...,software watching while you work software that...,"[software, watching, while, you, work, softwar...","[software, watching, while, work, software, th..."
2,1138,d arcy injury adds to ireland woe gordon d arc...,d arcy injury adds to ireland woe gordon d arc...,"[d, arcy, injury, adds, to, ireland, woe, gord...","[arcy, injury, adds, ireland, gordon, arcy, be..."
3,459,india s reliance family feud heats up the ongo...,india s reliance family feud heats up the ongo...,"[india, s, reliance, family, feud, heats, up, ...","[india, reliance, family, feud, heats, ongoing..."
4,1020,boro suffer morrison injury blow middlesbrough...,boro suffer morrison injury blow middlesbrough...,"[boro, suffer, morrison, injury, blow, middles...","[boro, suffer, morrison, injury, blow, middles..."


In [16]:
''' Remove stopwords. Here, NLTK corpus list is used for a match. However, a customized user-defined 
    list could be created and used to limit the matches in input text. 
'''
def remove_stopwords(text):
    return [word for word in text if word not in nltk.corpus.stopwords.words('english')]

In [17]:
train_data['clean_tokens'] = train_data['larger_tokens'].apply(lambda x : remove_stopwords(x))
train_data.head()

Unnamed: 0,ArticleId,Text,Category,removed_punc,tokens,larger_tokens,clean_tokens
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom exboss launches defence lawyers defen...,"[worldcom, exboss, launches, defence, lawyers,...","[worldcom, exboss, launches, defence, lawyers,...","[worldcom, exboss, launches, defence, lawyers,..."
1,154,german business confidence slides german busin...,business,german business confidence slides german busin...,"[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,..."
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizens in ...,"[bbc, poll, indicates, economic, gloom, citize...","[poll, indicates, economic, gloom, citizens, m...","[poll, indicates, economic, gloom, citizens, m..."
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster bett...,"[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b..."
4,917,enron bosses in $168m payout eighteen former e...,business,enron bosses in 168m payout eighteen former en...,"[enron, bosses, in, 168m, payout, eighteen, fo...","[enron, bosses, 168m, payout, eighteen, former...","[enron, bosses, 168m, payout, eighteen, former..."


In [18]:
test_data['clean_tokens'] = test_data['larger_tokens'].apply(lambda x : remove_stopwords(x))
test_data.head()

Unnamed: 0,ArticleId,Text,removed_punc,tokens,larger_tokens,clean_tokens
0,1018,qpr keeper day heads for preston queens park r...,qpr keeper day heads for preston queens park r...,"[qpr, keeper, day, heads, for, preston, queens...","[keeper, heads, preston, queens, park, rangers...","[keeper, heads, preston, queens, park, rangers..."
1,1319,software watching while you work software that...,software watching while you work software that...,"[software, watching, while, you, work, softwar...","[software, watching, while, work, software, th...","[software, watching, work, software, monitor, ..."
2,1138,d arcy injury adds to ireland woe gordon d arc...,d arcy injury adds to ireland woe gordon d arc...,"[d, arcy, injury, adds, to, ireland, woe, gord...","[arcy, injury, adds, ireland, gordon, arcy, be...","[arcy, injury, adds, ireland, gordon, arcy, ru..."
3,459,india s reliance family feud heats up the ongo...,india s reliance family feud heats up the ongo...,"[india, s, reliance, family, feud, heats, up, ...","[india, reliance, family, feud, heats, ongoing...","[india, reliance, family, feud, heats, ongoing..."
4,1020,boro suffer morrison injury blow middlesbrough...,boro suffer morrison injury blow middlesbrough...,"[boro, suffer, morrison, injury, blow, middles...","[boro, suffer, morrison, injury, blow, middles...","[boro, suffer, morrison, injury, blow, middles..."


### Lemmatization converts word to it's dictionary base form. This process takes language grammar and vocabulary into consideration while conversion. Hence, it is different from Stemming in that it does not merely truncate the suffixes to get the root word.


In [19]:
# Apply lemmatization on tokens
def lemmatize(text):
    word_net = WordNetLemmatizer()
    return [word_net.lemmatize(word) for word in text]

In [20]:
train_data['lemma_words'] = train_data['clean_tokens'].apply(lambda x : lemmatize(x))
train_data.head()

Unnamed: 0,ArticleId,Text,Category,removed_punc,tokens,larger_tokens,clean_tokens,lemma_words
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom exboss launches defence lawyers defen...,"[worldcom, exboss, launches, defence, lawyers,...","[worldcom, exboss, launches, defence, lawyers,...","[worldcom, exboss, launches, defence, lawyers,...","[worldcom, exboss, launch, defence, lawyer, de..."
1,154,german business confidence slides german busin...,business,german business confidence slides german busin...,"[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,...","[german, business, confidence, slide, german, ..."
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizens in ...,"[bbc, poll, indicates, economic, gloom, citize...","[poll, indicates, economic, gloom, citizens, m...","[poll, indicates, economic, gloom, citizens, m...","[poll, indicates, economic, gloom, citizen, ma..."
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster bett...,"[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b..."
4,917,enron bosses in $168m payout eighteen former e...,business,enron bosses in 168m payout eighteen former en...,"[enron, bosses, in, 168m, payout, eighteen, fo...","[enron, bosses, 168m, payout, eighteen, former...","[enron, bosses, 168m, payout, eighteen, former...","[enron, boss, 168m, payout, eighteen, former, ..."


In [21]:
test_data['lemma_words'] = test_data['clean_tokens'].apply(lambda x : lemmatize(x))
test_data.head()

Unnamed: 0,ArticleId,Text,removed_punc,tokens,larger_tokens,clean_tokens,lemma_words
0,1018,qpr keeper day heads for preston queens park r...,qpr keeper day heads for preston queens park r...,"[qpr, keeper, day, heads, for, preston, queens...","[keeper, heads, preston, queens, park, rangers...","[keeper, heads, preston, queens, park, rangers...","[keeper, head, preston, queen, park, ranger, k..."
1,1319,software watching while you work software that...,software watching while you work software that...,"[software, watching, while, you, work, softwar...","[software, watching, while, work, software, th...","[software, watching, work, software, monitor, ...","[software, watching, work, software, monitor, ..."
2,1138,d arcy injury adds to ireland woe gordon d arc...,d arcy injury adds to ireland woe gordon d arc...,"[d, arcy, injury, adds, to, ireland, woe, gord...","[arcy, injury, adds, ireland, gordon, arcy, be...","[arcy, injury, adds, ireland, gordon, arcy, ru...","[arcy, injury, add, ireland, gordon, arcy, rul..."
3,459,india s reliance family feud heats up the ongo...,india s reliance family feud heats up the ongo...,"[india, s, reliance, family, feud, heats, up, ...","[india, reliance, family, feud, heats, ongoing...","[india, reliance, family, feud, heats, ongoing...","[india, reliance, family, feud, heat, ongoing,..."
4,1020,boro suffer morrison injury blow middlesbrough...,boro suffer morrison injury blow middlesbrough...,"[boro, suffer, morrison, injury, blow, middles...","[boro, suffer, morrison, injury, blow, middles...","[boro, suffer, morrison, injury, blow, middles...","[boro, suffer, morrison, injury, blow, middles..."


In [22]:
# Create sentences to get clean text as input for vectors

def return_sentences(tokens):
    return " ".join([word for word in tokens])

In [23]:
train_data['clean_text'] = train_data['lemma_words'].apply(lambda x : return_sentences(x))
train_data.head()


Unnamed: 0,ArticleId,Text,Category,removed_punc,tokens,larger_tokens,clean_tokens,lemma_words,clean_text
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom exboss launches defence lawyers defen...,"[worldcom, exboss, launches, defence, lawyers,...","[worldcom, exboss, launches, defence, lawyers,...","[worldcom, exboss, launches, defence, lawyers,...","[worldcom, exboss, launch, defence, lawyer, de...",worldcom exboss launch defence lawyer defendin...
1,154,german business confidence slides german busin...,business,german business confidence slides german busin...,"[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,...","[german, business, confidence, slide, german, ...",german business confidence slide german busine...
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizens in ...,"[bbc, poll, indicates, economic, gloom, citize...","[poll, indicates, economic, gloom, citizens, m...","[poll, indicates, economic, gloom, citizens, m...","[poll, indicates, economic, gloom, citizen, ma...",poll indicates economic gloom citizen majority...
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster bett...,"[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b...",lifestyle governs mobile choice faster better ...
4,917,enron bosses in $168m payout eighteen former e...,business,enron bosses in 168m payout eighteen former en...,"[enron, bosses, in, 168m, payout, eighteen, fo...","[enron, bosses, 168m, payout, eighteen, former...","[enron, bosses, 168m, payout, eighteen, former...","[enron, boss, 168m, payout, eighteen, former, ...",enron boss 168m payout eighteen former enron d...


In [24]:
test_data['clean_text'] = test_data['lemma_words'].apply(lambda x : return_sentences(x))
test_data.head()

Unnamed: 0,ArticleId,Text,removed_punc,tokens,larger_tokens,clean_tokens,lemma_words,clean_text
0,1018,qpr keeper day heads for preston queens park r...,qpr keeper day heads for preston queens park r...,"[qpr, keeper, day, heads, for, preston, queens...","[keeper, heads, preston, queens, park, rangers...","[keeper, heads, preston, queens, park, rangers...","[keeper, head, preston, queen, park, ranger, k...",keeper head preston queen park ranger keeper c...
1,1319,software watching while you work software that...,software watching while you work software that...,"[software, watching, while, you, work, softwar...","[software, watching, while, work, software, th...","[software, watching, work, software, monitor, ...","[software, watching, work, software, monitor, ...",software watching work software monitor every ...
2,1138,d arcy injury adds to ireland woe gordon d arc...,d arcy injury adds to ireland woe gordon d arc...,"[d, arcy, injury, adds, to, ireland, woe, gord...","[arcy, injury, adds, ireland, gordon, arcy, be...","[arcy, injury, adds, ireland, gordon, arcy, ru...","[arcy, injury, add, ireland, gordon, arcy, rul...",arcy injury add ireland gordon arcy ruled irel...
3,459,india s reliance family feud heats up the ongo...,india s reliance family feud heats up the ongo...,"[india, s, reliance, family, feud, heats, up, ...","[india, reliance, family, feud, heats, ongoing...","[india, reliance, family, feud, heats, ongoing...","[india, reliance, family, feud, heat, ongoing,...",india reliance family feud heat ongoing public...
4,1020,boro suffer morrison injury blow middlesbrough...,boro suffer morrison injury blow middlesbrough...,"[boro, suffer, morrison, injury, blow, middles...","[boro, suffer, morrison, injury, blow, middles...","[boro, suffer, morrison, injury, blow, middles...","[boro, suffer, morrison, injury, blow, middles...",boro suffer morrison injury blow middlesbrough...


# Model and Evaluation Phase

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [26]:
#X_train, X_test, y_train, y_test = train_test_split(data['text'], data['category'], test_size=0.2, random_state=42)
X_train = train_data.pop("clean_text")
X_test = test_data.pop("clean_text")
y_train = train_data.pop("Category")
y_test = pd.read_csv('dataset/BBC News Sample Solution.csv')
y_test.pop("ArticleId")

0      1018
1      1319
2      1138
3       459
4      1020
       ... 
730    1923
731     373
732    1704
733     206
734     471
Name: ArticleId, Length: 735, dtype: int64

In [27]:
#vectorizer = CountVectorizer()
#X_train_vec = vectorizer.fit_transform(X_train)
#X_test_vec = vectorizer.transform(X_test)

# OR

vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [28]:
def train_and_evaluate(classifier, X_train_vec, y_train, X_test_vec, y_test):
    classifier.fit(X_train_vec, y_train)
    y_pred = classifier.predict(X_test_vec)
    accuracy = accuracy_score(y_test, y_pred)
    print(classifier.__class__.__name__)
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\n")

In [29]:
classifiers = [
    MultinomialNB(),
    LogisticRegression(),
    LinearSVC(),
    RandomForestClassifier(),
    KNeighborsClassifier()
]

for classifier in classifiers:
    train_and_evaluate(classifier, X_train_vec, y_train, X_test_vec, y_test)

MultinomialNB
Accuracy: 0.19319727891156463
Classification Report:
                precision    recall  f1-score   support

     business       0.21      0.25      0.23       147
entertainment       0.21      0.15      0.17       147
     politics       0.17      0.17      0.17       147
        sport       0.20      0.23      0.22       147
         tech       0.18      0.16      0.17       147

     accuracy                           0.19       735
    macro avg       0.19      0.19      0.19       735
 weighted avg       0.19      0.19      0.19       735

Confusion Matrix:
 [[37 19 25 38 28]
 [37 22 34 30 24]
 [32 24 25 37 29]
 [40 17 25 34 31]
 [33 23 38 29 24]]


LogisticRegression
Accuracy: 0.19319727891156463
Classification Report:
                precision    recall  f1-score   support

     business       0.21      0.25      0.23       147
entertainment       0.21      0.16      0.18       147
     politics       0.16      0.16      0.16       147
        sport       0.20    