In [65]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn a vocabulary of 1-, 2- and 3-word n-grams from one example sentence.
# `fit` returns the estimator itself, so building and fitting can be chained.
vectorizer = CountVectorizer(ngram_range=(1, 3)).fit(
    ["Thor Hatodwala is looking for a job"]
)
# n-gram -> column index. Note that the one-letter word "a" is absent from the
# learned vocabulary, which is why "for job" appears as a bigram.
vectorizer.vocabulary_


{'thor': 12,
 'hatodwala': 2,
 'is': 5,
 'looking': 9,
 'for': 0,
 'job': 8,
 'thor hatodwala': 13,
 'hatodwala is': 3,
 'is looking': 6,
 'looking for': 10,
 'for job': 1,
 'thor hatodwala is': 14,
 'hatodwala is looking': 4,
 'is looking for': 7,
 'looking for job': 11}

In [66]:
# Toy corpus: three short sentences used to demonstrate preprocessing and n-grams.
values = ["Aravinth ate pizza", "Jithu is tall", "Sugitha is eating pizza"]

In [67]:
# Preprocess the sample sentences: drop stop words and punctuation, then lemmatize.

import spacy
# Load the "en_core_web_sm" spaCy pipeline once; preprocess() reuses this object.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Return `text` reduced to the lemmas of its meaningful tokens.

    The text is parsed with the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are discarded; the lemma of every
    remaining token is kept, in order, and joined with single spaces.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

# Quick check on one sentence: "ate" is expected to come back as its lemma "eat".
preprocess("Aravinth ate pizza")

'aravinth eat pizza'

In [68]:
# Run every sample sentence through preprocess(), keeping the original order.
values_preprocessed = list(map(preprocess, values))
values_preprocessed

['aravinth eat pizza', 'Jithu tall', 'Sugitha eat pizza']

In [69]:
# Refit on the cleaned sentences, this time keeping only unigrams and bigrams.
# `fit` returns the estimator, so the fitted vectorizer is assigned directly.
vectorizer = CountVectorizer(ngram_range=(1, 2)).fit(values_preprocessed)
# n-gram -> column index of the resulting bag-of-n-grams representation.
vectorizer.vocabulary_


{'aravinth': 0,
 'eat': 2,
 'pizza': 6,
 'aravinth eat': 1,
 'eat pizza': 3,
 'jithu': 4,
 'tall': 9,
 'jithu tall': 5,
 'sugitha': 7,
 'sugitha eat': 8}

In [70]:
# Turn an unseen sentence into a count vector using the fitted vocabulary.
# "blabla" was never seen during fit, so it maps to no column; only the columns
# for "eat", "eat pizza" and "pizza" end up non-zero.
tranformed_vectorizer = vectorizer.transform(
    ["Blabla eat pizza"]
)
# transform() yields a sparse matrix; densify it for display.
tranformed_vectorizer.toarray()


array([[0, 0, 1, 1, 0, 0, 1, 0, 0, 0]])

In [71]:
# news classification

import pandas as panda

# Load the news dataset (one row per article: text + category).
# The path is a raw string: in a normal literal "\P" and "\e" are invalid escape
# sequences, which Python reports as a SyntaxWarning and plans to reject in a
# future release. The r-prefix yields the exact same path without the warning.
# NOTE(review): confirm the file really is "ews_dataset.json" and not
# "news_dataset.json" — adjust the name here if so.
dataframe = panda.read_json(r"E:\Project Learnings\ews_dataset.json")

print(dataframe.shape)

dataframe.head()

(12695, 2)


  dataframe = panda.read_json("E:\Project Learnings\ews_dataset.json")


Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [72]:
# Inspect how many articles each category has; the classes are not equally
# sized, so the data is balanced before training.
dataframe["category"].value_counts()

category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

In [73]:
# Undersample every category down to the size of the rarest one so all classes
# are equally represented. The size is derived from the data rather than
# hard-coded, so the cell stays correct if the dataset changes.
min_samples = int(dataframe.category.value_counts().min())


def _undersample(category):
    """Return `min_samples` rows of `dataframe` belonging to `category`.

    Sampling uses a fixed `random_state` so re-running the cell selects the
    same rows every time.
    """
    return dataframe[dataframe.category == category].sample(min_samples, random_state=1)


dataframe_business = _undersample("BUSINESS")
dataframe_sports = _undersample("SPORTS")
dataframe_crime = _undersample("CRIME")
dataframe_science = _undersample("SCIENCE")

In [74]:
# Stack the four equally sized per-category samples into one balanced frame.
dataframe_balanced = panda.concat(
    [
        dataframe_business,
        dataframe_sports,
        dataframe_crime,
        dataframe_science,
    ]
)

# Every category should now report the same number of rows.
dataframe_balanced["category"].value_counts()



category
BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: count, dtype: int64

In [81]:
# Introduce a numeric label column: each category name is mapped to an integer
# that serves as the classification target.
dataframe_balanced["category_number"] = dataframe_balanced["category"].map(
    dict(BUSINESS=0, SPORTS=1, CRIME=2, SCIENCE=3)
)

dataframe_balanced.head()


Unnamed: 0,text,category,category_number
9625,Taking Your Startup Public Is Fraught With Neg...,BUSINESS,0
7958,Women in Business: Q&A with Donna Josephson Ch...,BUSINESS,0
691,Ladies From the Shark Tank Excellent advice fr...,BUSINESS,0
4905,Farmer Forced To Dump Insane Amount Of Gorgeou...,BUSINESS,0
8899,Why You Should Care About The Backlash Against...,BUSINESS,0


In [87]:
# Build the ML model: start by splitting the balanced data into train and test sets

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the label keeps the four
# classes equally represented in both splits; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dataframe_balanced["text"],
    dataframe_balanced["category_number"],
    test_size=0.2,
    random_state=1,
    stratify=dataframe_balanced["category_number"],
)



In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Baseline: bag-of-words (unigram counts) fed into a multinomial Naive Bayes model.
classifier = Pipeline(
    steps=[
        ("vectorizer", CountVectorizer()),
        ("MultinomialNB", MultinomialNB()),
    ]
)

# fit() returns the fitted pipeline, so training and prediction can be chained.
y_prediction = classifier.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the held-out test set.
print(classification_report(y_test, y_prediction))


              precision    recall  f1-score   support

           0       0.75      0.88      0.81       276
           1       0.93      0.85      0.89       277
           2       0.92      0.88      0.90       276
           3       0.89      0.85      0.87       276

    accuracy                           0.87      1105
   macro avg       0.87      0.87      0.87      1105
weighted avg       0.87      0.87      0.87      1105



In [92]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Same model, but the vectorizer now emits unigrams and bigrams so that short
# word sequences become features as well.
classifier = Pipeline(
    steps=[
        ("vectorizer", CountVectorizer(ngram_range=(1, 2))),
        ("MultinomialNB", MultinomialNB()),
    ]
)

# fit() returns the fitted pipeline, so training and prediction can be chained.
y_prediction = classifier.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the held-out test set.
print(classification_report(y_test, y_prediction))


              precision    recall  f1-score   support

           0       0.69      0.91      0.78       276
           1       0.93      0.82      0.87       277
           2       0.92      0.84      0.88       276
           3       0.91      0.80      0.85       276

    accuracy                           0.84      1105
   macro avg       0.86      0.84      0.84      1105
weighted avg       0.86      0.84      0.84      1105



In [98]:

# Peek at the first ten held-out texts.
X_test.head(10)

3797     Were Dinosaurs Cold-Blooded Or Warm-Blooded? O...
2420                 What's Really Going On With Twitter? 
4681     Donald Trump Reportedly Wants An Ex-Goldman Ex...
5117     Snow Volleyball In The Olympics? Pyeongchang E...
7020     The Classic 'Masculine' Business Model Works B...
10274    ETs Could Reach Out To Us With Interstellar La...
3611     Search Underway After Texas Woman Falls Off Ca...
8822     The Golden State Warriors Just Splashed Their ...
7195     Family Of Four Found Dead In Michigan Home "We...
7712     Juggling All the Balls All talk last week revo...
Name: text, dtype: object

In [99]:
# True labels for the same ten held-out texts.
y_test.head(10)

3797     3
2420     0
4681     0
5117     1
7020     0
10274    3
3611     2
8822     1
7195     2
7712     1
Name: category_number, dtype: int64

In [101]:
# Predicted labels for the first ten test rows, to compare by eye with y_test.
y_prediction[:10]

array([3, 0, 0, 1, 0, 3, 2, 1, 2, 1])

In [103]:
# Preprocessing: store a cleaned copy of every article (stop words and
# punctuation removed, remaining tokens lemmatized) next to the raw text.
dataframe_balanced["preprocessed_text"] = dataframe_balanced["text"].apply(preprocess)

dataframe_balanced.head()


Unnamed: 0,text,category,category_number,preprocessed_text
9625,Taking Your Startup Public Is Fraught With Neg...,BUSINESS,0,take Startup Public Fraught negative old day e...
7958,Women in Business: Q&A with Donna Josephson Ch...,BUSINESS,0,woman business Q&A Donna Josephson Chief Marke...
691,Ladies From the Shark Tank Excellent advice fr...,BUSINESS,0,lady Shark Tank Excellent advice lady continue...
4905,Farmer Forced To Dump Insane Amount Of Gorgeou...,BUSINESS,0,Farmer force dump Insane Gorgeous Cherries che...
8899,Why You Should Care About The Backlash Against...,BUSINESS,0,care backlash Holiday Creep term holiday creep...


In [104]:
# Redo the split on the preprocessed text. Test size, seed and stratification
# are the same as for the raw-text split, only the feature column differs.
X_train, X_test, y_train, y_test = train_test_split(
    dataframe_balanced["preprocessed_text"],
    dataframe_balanced["category_number"],
    test_size=0.2,
    random_state=1,
    stratify=dataframe_balanced["category_number"],
)


In [105]:

# Unigram + bigram Naive Bayes again, this time trained on the preprocessed text.
classifier = Pipeline(
    steps=[
        ("vectorizer", CountVectorizer(ngram_range=(1, 2))),
        ("MultinomialNB", MultinomialNB()),
    ]
)

# fit() returns the fitted pipeline, so training and prediction can be chained.
y_prediction = classifier.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the held-out test set.
print(classification_report(y_test, y_prediction))


              precision    recall  f1-score   support

           0       0.79      0.86      0.82       276
           1       0.93      0.88      0.90       277
           2       0.89      0.92      0.91       276
           3       0.90      0.85      0.87       276

    accuracy                           0.88      1105
   macro avg       0.88      0.88      0.88      1105
weighted avg       0.88      0.88      0.88      1105

