In [60]:
from sklearn.feature_extraction.text import CountVectorizer
import spacy

nlp = spacy.load('en_core_web_sm')

In [61]:


v = CountVectorizer(ngram_range=(2,3))
v.fit(['Thor is looking for a job'])
print(v.vocabulary_)

{'thor is': 5, 'is looking': 1, 'looking for': 3, 'for job': 0, 'thor is looking': 6, 'is looking for': 2, 'looking for job': 4}


In [62]:
corpus = [
    'Thor is eating pizza',
    'Loki is tall',
    'Loki ate pizza'
]

Creating a preprocessing function - Removing stopwords and Punctuations

In [63]:
doc = nlp('I is went to marketing the market yesterday, while purchasing food, i fell.')
for token in doc:
    if token.is_stop or token.is_punct:
        continue #ignore the stopwords and punctuations
    print(token.lemma_)

go
market
market
yesterday
purchase
food
fall


In [64]:
def preprocess(text):
    doc = nlp(text) #split the words in the text
    
    processed_text = []
    
    for token in doc:
        if token.is_stop or token.is_punct:
            continue #ignore the stopwords and punctuations
        
        processed_text.append(token.lemma_)
        
    return ' '.join(processed_text)
    

In [65]:
preprocess(corpus[1])

'Loki tall'

In [66]:
processed_corpus = [preprocess(i) for i in corpus]
processed_corpus

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [67]:
v = CountVectorizer(ngram_range=(1,2))
v.fit(processed_corpus)
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [68]:
test = v.transform(['I eat pizza'])
test.toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

### **CLASSIFYING NEWS CATEGORIES**

In [69]:
import pandas as pd

df = pd.read_json('News_Category_Dataset_v3.json', lines=True)
df = df[['headline', 'category']]

In [70]:
df.head()

Unnamed: 0,headline,category
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY
3,The Funniest Tweets From Parents This Week (Se...,PARENTING
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS


In [71]:
politics = df[df['category'] == 'POLITICS'].sample(3000)
sports = df[df['category'] == 'SPORTS'].sample(3000)
business = df[df['category'] == 'BUSINESS'].sample(3000)
crime = df[df['category'] == 'CRIME'].sample(3000)

data = pd.concat([politics, sports, business, crime], axis=0)

In [72]:
data.category.value_counts()

POLITICS    3000
SPORTS      3000
BUSINESS    3000
CRIME       3000
Name: category, dtype: int64

In [73]:
# Encoding the category column

target = {'POLITICS':0, 'CRIME':1, 'SPORTS':2, 'BUSINESS':3}
data['category_num'] = data.category.map(target)

## Training the model

In [74]:
from sklearn.model_selection import train_test_split

x = data['headline']
y = data['category_num']

x_train, x_test, y_train, y_test = train_test_split(x, 
                                                    y, 
                                                    test_size=0.2, 
                                                    random_state=42,
                                                    stratify = data.category_num)

In [75]:
y_train.value_counts()

0    2400
2    2400
3    2400
1    2400
Name: category_num, dtype: int64

## Creating a pipeline for the model building

In [76]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [77]:
def pipe(model):
    clf = Pipeline([
        ('vector', CountVectorizer()),
        ('classifier', model)
    ])
    
    clf.fit(x_train, y_train)
    
    pred = clf.predict(x_test)
    
    return print(classification_report(y_test, pred))

In [78]:
# NAIVE BAYES CLASSIFICATION
pipe(MultinomialNB())

              precision    recall  f1-score   support

           0       0.77      0.85      0.81       600
           1       0.86      0.84      0.85       600
           2       0.90      0.84      0.87       600
           3       0.82      0.81      0.82       600

    accuracy                           0.84      2400
   macro avg       0.84      0.84      0.84      2400
weighted avg       0.84      0.84      0.84      2400



In [79]:
# RANDOM FOREST CLASSIFICATION

pipe(RandomForestClassifier())

              precision    recall  f1-score   support

           0       0.84      0.69      0.76       600
           1       0.80      0.81      0.80       600
           2       0.90      0.67      0.77       600
           3       0.60      0.86      0.71       600

    accuracy                           0.76      2400
   macro avg       0.79      0.76      0.76      2400
weighted avg       0.79      0.76      0.76      2400



In [80]:
# DECISION TREE CLASSIFICATION

pipe(DecisionTreeClassifier())

              precision    recall  f1-score   support

           0       0.73      0.66      0.70       600
           1       0.75      0.72      0.74       600
           2       0.75      0.65      0.70       600
           3       0.57      0.72      0.64       600

    accuracy                           0.69      2400
   macro avg       0.70      0.69      0.69      2400
weighted avg       0.70      0.69      0.69      2400



### **EXPERIMENTING THE MODEL ON PREPROCESSED TEXT**

In [81]:
data['processed_text'] = df.headline.apply(preprocess)

In [None]:
data.head()

In [None]:
x = data['preprocessd_text']
y = data['category_num']

x_train, x_test, y_train, y_test = train_test_split(x, 
                                                    y, 
                                                    test_size=0.2, 
                                                    random_state=42,
                                                    stratify = data.category_num)

In [None]:
def pipe(model):
    clf = Pipeline([
        ('vector', CountVectorizer()),
        ('classifier', model)
    ])
    
    clf.fit(x_train, y_train)
    
    pred = clf.predict(x_test)
    
    return print(classification_report(y_test, pred))

In [None]:
# NAIVE BAYES CLASSIFICATION
pipe(MultinomialNB())