## News Articles Classification using NLP

In [1]:
## import dependencies

import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

## spaCy model

In [4]:
import spacy
# Load the 'en_core_web_sm' pipeline once; preprocess() below calls this
# shared `nlp` object on every article it is given.
nlp = spacy.load('en_core_web_sm')

In [5]:
# Read the dataset from 'BBC News.csv' (the path is relative to the working
# directory) and preview the first rows.
data = pd.read_csv('BBC News.csv')
data.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [6]:
data.isna().sum()

ArticleId    0
Text         0
Category     0
dtype: int64

In [7]:
data.Category.value_counts()

sport            346
business         336
politics         274
entertainment    273
tech             261
Name: Category, dtype: int64

In [8]:
data.shape

(1490, 3)

In [9]:
data.ArticleId.nunique()

1490

In [10]:
# ArticleId is distinct for every row (see the nunique check above), so it
# carries no signal for classification; keep only the text and its label.
df = data.drop(columns='ArticleId')

### Preprocessing using spaCy

In [11]:
def is_whitespace(data, text_column=None):
    """Return the index labels of rows whose text is empty or only whitespace.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the article text.
    text_column : label, optional
        Column to inspect. Defaults to the first column of ``data``, which is
        the column the earlier row-tuple unpacking treated as the text.

    Returns
    -------
    list
        Index labels of the blank rows, in frame order.

    Raises
    ------
    AttributeError
        If a value in the inspected column is not a string (e.g. NaN).
    """
    if text_column is None:
        # A frame without any columns has no text to inspect.
        if data.shape[1] == 0:
            return []
        # Read a single column rather than unpacking whole row tuples, so the
        # check keeps working when the frame does not have exactly two
        # columns (e.g. once `processed_text` has been added).
        texts = data.iloc[:, 0]
    else:
        texts = data[text_column]

    # str.isspace() is False for "", so strip() is used to catch empty
    # strings as well as whitespace-only ones.
    return [idx for idx, text in texts.items() if not text.strip()]

In [12]:
is_whitespace(df)

[]

In [13]:
 # remove stop words and lemmatize the text

def preprocess(text):
    """Return `text` as a space-joined string of lemmas.

    The text is run through the module-level spaCy pipeline `nlp`; tokens
    flagged as stop words, punctuation or whitespace are left out, and each
    remaining token contributes its `lemma_`.
    """
    lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct or tok.is_space)
    ]
    return " ".join(lemmas)

In [14]:
# Run every article through preprocess() and store the cleaned text in a new
# column next to the raw text.
df['processed_text'] = df['Text'].map(preprocess)

In [15]:
df.head()

Unnamed: 0,Text,Category,processed_text
0,worldcom ex-boss launches defence lawyers defe...,business,worldcom ex boss launch defence lawyer defend ...
1,german business confidence slides german busin...,business,german business confidence slide german busine...
2,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicate economic gloom citizen major...
3,lifestyle governs mobile choice faster bett...,tech,lifestyle govern mobile choice fast well funky...
4,enron bosses in $168m payout eighteen former e...,business,enron boss $ 168 m payout eighteen enron direc...


In [16]:
df.Text[0]



In [17]:
df.processed_text[0]



#### Label Encoding and Data Splitting

In [18]:
# Encode the category names as integer class ids for the classifier.
le = LabelEncoder()
y = le.fit_transform(df.Category)
# fit() returns the encoder itself, so `cat_fit` is simply another name for
# the fitted `le`; it is kept so the name remains available.
cat_fit = le

In [19]:
# Hold out 20% of the articles for evaluation; the fixed random_state keeps
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df.processed_text,
    y,
    test_size=0.2,
    random_state=42,
)

## Data Modelling

### CountVectorizer

In [20]:
# Bag-of-words pipeline: unigram and bigram counts feeding a multinomial
# Naive Bayes classifier.
model1 = Pipeline(steps=[
    ('c_vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('bayes_model', MultinomialNB()),
])

In [21]:
model1.fit(X_train, y_train)

Pipeline(steps=[('c_vectorizer', CountVectorizer(ngram_range=(1, 2))),
                ('bayes_model', MultinomialNB())])

In [22]:
model1_pred = model1.predict(X_test)

In [23]:
print(f'Accuracy score of count_vectorizer based model: {accuracy_score(y_test, model1_pred):.2f}')

Accuracy score of count_vectorizer based model: 0.98


### TF-IDF Vectorizer

In [24]:
# TF-IDF pipeline: TF-IDF weighted features feeding a multinomial Naive Bayes
# classifier.
# NOTE(review): the vectorizer is left at its default n-gram range, whereas
# model1 uses ngram_range=(1, 2), so the two accuracy scores differ in more
# than just the weighting scheme.
model2 = Pipeline(steps=[
    ('t_vector', TfidfVectorizer()),
    ('bayes_model_2', MultinomialNB()),
])

In [25]:
model2.fit(X_train, y_train)

Pipeline(steps=[('t_vector', TfidfVectorizer()),
                ('bayes_model_2', MultinomialNB())])

In [26]:
model2_pred = model2.predict(X_test)

In [27]:
print(f'Accuracy score of tfidf based model: {accuracy_score(y_test, model2_pred):.2f}')

Accuracy score of tfidf based model: 0.96
