In [51]:
from sklearn.feature_extraction.text import CountVectorizer


In [52]:
# Default CountVectorizer: unigram vocabulary only.
# fit() returns the vectorizer itself, so construction and fitting can be chained.
cv = CountVectorizer().fit(["Thor Hathodawala is looking for a job"])
cv.vocabulary_

{'thor': 5, 'hathodawala': 1, 'is': 2, 'looking': 4, 'for': 0, 'job': 3}

In [53]:
# ngram_range=(2, 2): the vocabulary contains bigrams only.
cv = CountVectorizer(ngram_range=(2, 2)).fit(["Thor Hathodawala is looking for a job"])
cv.vocabulary_

{'thor hathodawala': 4,
 'hathodawala is': 1,
 'is looking': 2,
 'looking for': 3,
 'for job': 0}

In [54]:
# ngram_range=(1, 2): the vocabulary contains both unigrams and bigrams.
cv = CountVectorizer(ngram_range=(1, 2)).fit(["Thor Hathodawala is looking for a job"])
cv.vocabulary_

{'thor': 9,
 'hathodawala': 2,
 'is': 4,
 'looking': 7,
 'for': 0,
 'job': 6,
 'thor hathodawala': 10,
 'hathodawala is': 3,
 'is looking': 5,
 'looking for': 8,
 'for job': 1}

In [55]:
# cv = CountVectorizer(ngram_range=(1,2,3))
# cv.fit(["Thor Hathodawala is looking for a job"])
# cv.vocabulary_

# # ERROR: ngram_range must be a 2-tuple (min_n, max_n); a 3-tuple is rejected

In [56]:
# ngram_range=(1, 3): unigrams, bigrams and trigrams all enter the vocabulary.
cv = CountVectorizer(ngram_range=(1, 3)).fit(["Thor Hathodawala is looking for a job"])
cv.vocabulary_

{'thor': 12,
 'hathodawala': 2,
 'is': 5,
 'looking': 9,
 'for': 0,
 'job': 8,
 'thor hathodawala': 13,
 'hathodawala is': 3,
 'is looking': 6,
 'looking for': 10,
 'for job': 1,
 'thor hathodawala is': 14,
 'hathodawala is looking': 4,
 'is looking for': 7,
 'looking for job': 11}

In [57]:
# cv = CountVectorizer(ngram_range=(3,1))
# cv.fit(["Thor Hathodawala is looking for a job"])
# cv.vocabulary_

# # ERROR: ngram_range=(3,1) is rejected because min_n must not exceed max_n
# # THE RANGE MUST BE GIVEN AS (MIN, MAX)

In [58]:
# Three-sentence toy corpus used to demonstrate preprocessing and n-gram features.
t_group = ["Thor ate pizza", "Loki is tall", "Loki is eating pizza"]

In [59]:
import spacy
# Load the "en_core_web_sm" spaCy pipeline once; preprocess() below reuses this object.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Return `text` with stop words and punctuation dropped and the rest lemmatized.

    The text is run through the module-level spaCy pipeline ``nlp``. Every
    token that is neither a stop word nor punctuation contributes its
    ``lemma_``, and the lemmas are joined with single spaces.
    """
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(lemmas)

In [60]:
# Spot-check preprocess() on the first toy sentence.
preprocess(t_group[0])

'thor eat pizza'

In [61]:
# Spot-check preprocess() on the second toy sentence.
preprocess(t_group[1])

'Loki tall'

In [62]:
# Spot-check preprocess() on the third toy sentence.
preprocess(t_group[2])

'Loki eat pizza'

In [63]:
# Apply preprocess() to every sentence of the toy corpus, keeping the original order.
t_group_processed = list(map(preprocess, t_group))
t_group_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [64]:
# Learn a unigram + bigram vocabulary from the preprocessed toy corpus.
cv = CountVectorizer(ngram_range=(1, 2)).fit(t_group_processed)
cv.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [65]:
# Vectorize a sentence with the fitted vocabulary; the bare result displays as a sparse-matrix summary.
cv.transform(["Thor eat pizza"])

<1x9 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [66]:
# Convert the sparse result to a dense array to read the per-feature counts.
cv.transform(["Thor eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [67]:
# "hulk" is not in the fitted vocabulary, so only the 'eat', 'eat pizza' and 'pizza' features are counted.
cv.transform(["Hulk eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

In [87]:
import pandas as pd

# Load the news dataset (columns: text, category) from a JSON file in the working directory.
df = pd.read_json("news_dataset.json")
df

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME
...,...,...
12690,Coach Shakes Hands Of Imaginary Players After ...,SPORTS
12691,This Minivan-Sized Sea Sponge Is Thought To Be...,SCIENCE
12692,RECAP: Dramatic Eclipse Photos Don't miss the ...,SCIENCE
12693,Richard Sherman Wants To Talk About Police Sho...,SPORTS


In [69]:
# Class distribution of the loaded data; SCIENCE is the smallest class.
df['category'].value_counts()

category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

In [70]:
# Per-class cap applied to the three larger categories to reduce class imbalance.
m_samples = 1500

# Draw a fixed-seed random sample of m_samples rows from each large class.
df_business = df.loc[df["category"] == "BUSINESS"].sample(n=m_samples, random_state=10)
df_sports = df.loc[df["category"] == "SPORTS"].sample(n=m_samples, random_state=10)
df_crime = df.loc[df["category"] == "CRIME"].sample(n=m_samples, random_state=10)

# SCIENCE has fewer rows than the cap, so it is kept whole (no sampling).
df_science = df.loc[df["category"] == "SCIENCE"]

In [71]:
# Stack the per-class frames row-wise into one balanced frame (rebinds `df`).
balanced_parts = [df_business, df_sports, df_crime, df_science]
df = pd.concat(balanced_parts, axis=0)
df['category'].value_counts()

category
BUSINESS    1500
SPORTS      1500
CRIME       1500
SCIENCE     1381
Name: count, dtype: int64

In [72]:
# Integer target for each category name; stored alongside the text column.
label_by_category = {'BUSINESS': 0, 'SPORTS': 1, 'CRIME': 2, 'SCIENCE': 3}
df['category_label'] = df['category'].map(label_by_category)

In [73]:
# Inspect the balanced frame, now including the category_label column.
df

Unnamed: 0,text,category,category_label
4528,Impossible Goals and My Quest to Lose 175 Poun...,BUSINESS,0
3265,One Way Chipotle Has Completely Revolutionized...,BUSINESS,0
727,Because Sexual Abuse Is The Old Normal Don’t l...,BUSINESS,0
10317,The Secret To Building A Successful Business T...,BUSINESS,0
8127,Nike Is The Latest Company To Ramp Up Parental...,BUSINESS,0
...,...,...,...
12632,New Continent Zealandia Is Discovered Underwat...,SCIENCE,3
12644,Smartphone Lovers More Likely To Forget Things...,SCIENCE,3
12669,Ambitious Test On Tap For Real-Life 'Flying Sa...,SCIENCE,3
12691,This Minivan-Sized Sea Sponge Is Thought To Be...,SCIENCE,3


In [74]:
from sklearn.model_selection import train_test_split

# Features are the raw text; targets are the integer category labels.
x, y = df['text'], df['category_label']

# 80/20 split with a fixed seed; stratify=y keeps class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=10,
    stratify=y,
)

In [76]:
# Per-class counts in the training split (the split was stratified on y).
y_train.value_counts()

category_label
2    1200
0    1200
1    1200
3    1104
Name: count, dtype: int64

In [77]:
# Per-class counts in the test split (the split was stratified on y).
y_test.value_counts()

category_label
1    300
2    300
0    300
3    277
Name: count, dtype: int64

In [78]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

USING NORMAL BAG OF WORDS

In [81]:
# Baseline: unigram bag of words feeding a multinomial Naive Bayes classifier.
pipe1 = Pipeline(steps=[
    ("vectorizer", CountVectorizer()),
    ("Naive_mod", MultinomialNB()),
]).fit(X_train, y_train)

# Evaluate on the held-out split: per-class report, then overall accuracy.
y_pred = pipe1.predict(X_test)
c_report = classification_report(y_test, y_pred)
print(c_report)
print(pipe1.score(X_test, y_test))

              precision    recall  f1-score   support

           0       0.80      0.87      0.84       300
           1       0.88      0.86      0.87       300
           2       0.87      0.89      0.88       300
           3       0.93      0.83      0.88       277

    accuracy                           0.87      1177
   macro avg       0.87      0.87      0.87      1177
weighted avg       0.87      0.87      0.87      1177

0.8657604078164826


USING BAG OF WORDS WITH N-GRAMS (1,2)

In [82]:
# Same classifier, but the vectorizer emits unigrams and bigrams.
pipe2 = Pipeline(steps=[
    ("N_GRAM_1_2_vectorizer", CountVectorizer(ngram_range=(1, 2))),
    ("Naive_mod", MultinomialNB()),
]).fit(X_train, y_train)

# Evaluate on the held-out split: per-class report, then overall accuracy.
y_pred = pipe2.predict(X_test)
c_report = classification_report(y_test, y_pred)
print(c_report)
print(pipe2.score(X_test, y_test))

# Slight drop in performance compared with the unigram-only pipeline (pipe1).

              precision    recall  f1-score   support

           0       0.70      0.92      0.79       300
           1       0.89      0.82      0.85       300
           2       0.88      0.85      0.87       300
           3       0.95      0.76      0.85       277

    accuracy                           0.84      1177
   macro avg       0.86      0.84      0.84      1177
weighted avg       0.86      0.84      0.84      1177

0.8385726423109601


USING BAG OF WORDS WITH N-GRAMS (1,3)

In [83]:
# Same classifier, with unigrams, bigrams and trigrams as features.
pipe3 = Pipeline(steps=[
    ("N_GRAM_1_3_vectorizer", CountVectorizer(ngram_range=(1, 3))),
    ("Naive_mod", MultinomialNB()),
]).fit(X_train, y_train)

# Evaluate on the held-out split: per-class report, then overall accuracy.
y_pred = pipe3.predict(X_test)
c_report = classification_report(y_test, y_pred)
print(c_report)
print(pipe3.score(X_test, y_test))

# On this data the plain (unigram) bag of words scores higher than either n-gram variant.

              precision    recall  f1-score   support

           0       0.69      0.93      0.79       300
           1       0.91      0.80      0.85       300
           2       0.89      0.85      0.87       300
           3       0.95      0.75      0.84       277

    accuracy                           0.84      1177
   macro avg       0.86      0.84      0.84      1177
weighted avg       0.86      0.84      0.84      1177

0.836873406966865


APPLYING PREPROCESSING LIKE LEMMATIZATION, REMOVING PUNCTUATION, REMOVING STOP WORDS

In [85]:
# Lemmatize and strip stop words / punctuation, storing the result in a NEW
# column. Keeping df['text'] untouched makes this cell safe to re-run:
# overwriting it would feed already-processed text back through preprocess()
# on the next run and the raw text would be lost.
df['text_processed'] = df['text'].apply(preprocess)

x = df['text_processed']
y = df['category_label']

# Same split parameters as the raw-text experiments, so the scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2,random_state=10,stratify=y)


# Unigram bag of words + multinomial Naive Bayes, trained on the preprocessed text.
pipe_process = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('Naive_mod',MultinomialNB())
])

pipe_process.fit(X_train,y_train)

y_pred = pipe_process.predict(X_test)

# Per-class report followed by overall accuracy on the held-out split.
c_report = classification_report(y_test,y_pred)
print(c_report)
print(pipe_process.score(X_test,y_test))

              precision    recall  f1-score   support

           0       0.87      0.87      0.87       300
           1       0.90      0.88      0.89       300
           2       0.85      0.94      0.89       300
           3       0.92      0.85      0.89       277

    accuracy                           0.88      1177
   macro avg       0.89      0.88      0.88      1177
weighted avg       0.89      0.88      0.88      1177

0.8844519966015293


In [86]:
import pickle

# Category name -> integer label, saved next to the model so predictions can be decoded.
t_get = {
    'BUSINESS': 0,
    'SPORTS': 1,
    'CRIME': 2,
    'SCIENCE': 3,
}

# NOTE: pipe_process was fit on text that had been passed through preprocess();
# preprocess() itself is not part of the pickled tuple.
with open("news_classifier.pkl", "wb") as model_file:
    pickle.dump((pipe_process, t_get), model_file)