# **Assignment 2: NLP - News Article Classification**

## **Importing Libraries**

In [63]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import spacy

## **Dataset**

In [28]:
df = pd.read_csv('BBC News.csv')
df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.0+ KB


In [30]:
df.describe(exclude=np.number)

Unnamed: 0,Text,Category
count,1490,1490
unique,1440,5
top,microsoft seeking spyware trojan microsoft is ...,sport
freq,2,346


In [31]:
df['Category'].value_counts()

sport            346
business         336
politics         274
entertainment    273
tech             261
Name: Category, dtype: int64

In [32]:
df['ArticleId'].nunique()

1490

In [33]:
df.drop('ArticleId', axis=1, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      1490 non-null   object
 1   Category  1490 non-null   object
dtypes: object(2)
memory usage: 23.4+ KB


## **Preprocessing**

In [34]:
def blankRows(data):
    blank = []
    for idx, text, label in data.itertuples():
        if text.isspace():
            blank.append(idx)
    return blank

In [35]:
blankRows(df)

[]

No rows with blank or whitespace only articles in the dataset.

In [36]:
nlp = spacy.load('en_core_web_sm')

In [37]:
df['Text'][0]



In [38]:
nlp(df['Text'][0])



In [39]:
# remove stop words and lemmatize the text
def preprocessText(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if not(token.is_stop or token.is_punct or token.is_space):
            filtered_tokens.append(token.lemma_)
    return " ".join(filtered_tokens)

In [40]:
preprocessText(df['Text'][0])



In [41]:
df['preprocessed_text'] = df['Text'].apply(preprocessText)

In [42]:
df.head()

Unnamed: 0,Text,Category,preprocessed_text
0,worldcom ex-boss launches defence lawyers defe...,business,worldcom ex boss launch defence lawyer defend ...
1,german business confidence slides german busin...,business,german business confidence slide german busine...
2,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicate economic gloom citizen major...
3,lifestyle governs mobile choice faster bett...,tech,lifestyle govern mobile choice fast well funky...
4,enron bosses in $168m payout eighteen former e...,business,enron boss $ 168 m payout eighteen enron direc...


## **Dependent and Independent**

In [43]:
X = df['preprocessed_text']
y = df['Category']

In [44]:
# label encoding of dependent value (Categories)
le = LabelEncoder()
y = le.fit_transform(y)

In [45]:
y

array([0, 0, 0, ..., 0, 4, 4])

## **Splitting of Dataset**

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [47]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1192,), (298,), (1192,), (298,))

## **Model Building**

### **Mutlinomial Naive Bayes**

In [48]:
model1 = Pipeline([('c_vectorizer', CountVectorizer(ngram_range=(1, 2))), 
                   ('bayes_model', MultinomialNB())
                   ])

In [49]:
model1.fit(X_train, y_train)
model1_pred = model1.predict(X_test)

In [50]:
accuracy_score(y_test, model1_pred)

0.9798657718120806

In [54]:
print(classification_report(model1_pred, y_test, target_names=df['Category'].unique()))

               precision    recall  f1-score   support

     business       0.97      0.99      0.98        74
         tech       0.96      1.00      0.98        44
     politics       0.98      0.95      0.96        58
        sport       1.00      1.00      1.00        63
entertainment       0.98      0.97      0.97        59

     accuracy                           0.98       298
    macro avg       0.98      0.98      0.98       298
 weighted avg       0.98      0.98      0.98       298



### **Decision Tree**

In [57]:
model2 = Pipeline([('c_vectorizer', CountVectorizer(ngram_range=(1, 2))), 
                   ('bayes_model', DecisionTreeClassifier())
                   ])

In [58]:
model2.fit(X_train, y_train)
model2_pred = model2.predict(X_test)

In [59]:
accuracy_score(y_test, model2_pred)

0.7986577181208053

In [60]:
print(classification_report(model2_pred, y_test, target_names=df['Category'].unique()))

               precision    recall  f1-score   support

     business       0.84      0.74      0.79        85
         tech       0.72      0.75      0.73        44
     politics       0.77      0.77      0.77        56
        sport       0.94      0.88      0.91        67
entertainment       0.69      0.87      0.77        46

     accuracy                           0.80       298
    macro avg       0.79      0.80      0.79       298
 weighted avg       0.81      0.80      0.80       298



### **Random Forest Classifier**

In [65]:
model3 = Pipeline([('c_vectorizer', CountVectorizer(ngram_range=(1, 2))), 
                   ('bayes_model', RandomForestClassifier(n_estimators=200))
                   ])

In [66]:
model3.fit(X_train, y_train)
model3_pred = model3.predict(X_test)

In [67]:
accuracy_score(y_test, model3_pred)

0.9630872483221476

In [68]:
print(classification_report(model3_pred, y_test, target_names=df['Category'].unique()))

               precision    recall  f1-score   support

     business       0.99      0.94      0.96        79
         tech       0.98      0.94      0.96        48
     politics       0.96      0.96      0.96        56
        sport       1.00      0.98      0.99        64
entertainment       0.88      1.00      0.94        51

     accuracy                           0.96       298
    macro avg       0.96      0.96      0.96       298
 weighted avg       0.97      0.96      0.96       298



#### Among the three models, Mutinomial Naive Bayes gives the best accuracy of 98%.