In [182]:
from sklearn.feature_extraction.text import CountVectorizer

In [183]:
X = ['Build your profile to connect with students and professionals around the world']

V = CountVectorizer(ngram_range = (2, 2))

V.fit(X)

print(V.vocabulary_)

{'build your': 2, 'your profile': 10, 'profile to': 5, 'to connect': 8, 'connect with': 3, 'with students': 9, 'students and': 6, 'and professionals': 0, 'professionals around': 4, 'around the': 1, 'the world': 7}


In [184]:
wd = ['I ate Ice Cream',
     'He is tall',
     'He is eating Ice Cream']

In [185]:
import spacy as s

In [186]:
nlp = s.load('en_core_web_sm')

def func(text):
    doc = nlp(text)
    
    word_list = []
    
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        word_list.append(token.lemma_)
    return ' '.join(word_list)

In [187]:
func('Build your profile to connect with students and professionals around the world')

'build profile connect student professional world'

In [188]:
wd_process = [func(text) for text in X]
wd_process

['build profile connect student professional world']

In [189]:
CV = CountVectorizer(ngram_range = (2, 2))
CV.fit(wd_process)
CV.vocabulary_

{'build profile': 0,
 'profile connect': 3,
 'connect student': 1,
 'student professional': 4,
 'professional world': 2}

In [190]:
import pandas as pd

In [191]:
df = pd.read_csv('news-article-categories.csv')
df.head()

Unnamed: 0,category,title,body
0,ARTS & CULTURE,Modeling Agencies Enabled Sexual Predators For...,"In October 2017, Carolyn Kramer received a dis..."
1,ARTS & CULTURE,Actor Jeff Hiller Talks “Bright Colors And Bol...,This week I talked with actor Jeff Hiller abou...
2,ARTS & CULTURE,New Yorker Cover Puts Trump 'In The Hole' Afte...,The New Yorker is taking on President Donald T...
3,ARTS & CULTURE,Man Surprises Girlfriend By Drawing Them In Di...,"Kellen Hickey, a 26-year-old who lives in Huds..."
4,ARTS & CULTURE,This Artist Gives Renaissance-Style Sculptures...,There’s something about combining the traditio...


In [192]:
print(df.category.value_counts())

category
ARTS & CULTURE    1002
BUSINESS           501
ENTERTAINMENT      501
ENVIRONMENT        501
POLITICS           501
RELIGION           501
SPORTS             501
TECH               501
WOMEN              501
EDUCATION          490
COMEDY             380
SCIENCE            350
MEDIA              347
CRIME              300
Name: count, dtype: int64


In [193]:
min_value_count = 300

df_art_and_culture = df[df.category == 'ARTS & CULTURE'].sample(min_value_count, random_state = 100)
df_business = df[df.category == 'BUSINESS'].sample(min_value_count, random_state = 100)
df_entertaintment = df[df.category == 'ENTERTAINMENT'].sample(min_value_count, random_state = 100)

df_environmental = df[df.category == 'ENVIRONMENT'].sample(min_value_count, random_state = 100)
df_politics = df[df.category == 'POLITICS'].sample(min_value_count, random_state = 100)
df_religinon = df[df.category == 'RELIGION'].sample(min_value_count, random_state = 100)

df_sport = df[df.category == 'SPORTS'].sample(min_value_count, random_state = 100)
df_tech = df[df.category == 'TECH'].sample(min_value_count, random_state = 100)
df_women = df[df.category == 'WOMEN'].sample(min_value_count, random_state = 100)

df_education = df[df.category == 'EDUCATION'].sample(min_value_count, random_state = 100)
df_comedy = df[df.category == 'COMEDY'].sample(min_value_count, random_state = 100)
df_scinece = df[df.category == 'SCIENCE'].sample(min_value_count, random_state = 100)

df_media = df[df.category == 'MEDIA'].sample(min_value_count, random_state = 100)
df_crime = df[df.category == 'CRIME'].sample(min_value_count, random_state = 100)


In [194]:
df_1 = pd.concat([df_art_and_culture, df_business, df_entertaintment, df_environmental, df_politics, df_religinon,
                 df_sport, df_tech, df_women, df_education, df_comedy, df_scinece, df_media, df_crime], axis = 0)

In [195]:
df_1.category.unique()

array(['ARTS & CULTURE', 'BUSINESS', 'ENTERTAINMENT', 'ENVIRONMENT',
       'POLITICS', 'RELIGION', 'SPORTS', 'TECH', 'WOMEN', 'EDUCATION',
       'COMEDY', 'SCIENCE', 'MEDIA', 'CRIME'], dtype=object)

In [196]:
df_1.category.value_counts()

category
ARTS & CULTURE    300
BUSINESS          300
ENTERTAINMENT     300
ENVIRONMENT       300
POLITICS          300
RELIGION          300
SPORTS            300
TECH              300
WOMEN             300
EDUCATION         300
COMEDY            300
SCIENCE           300
MEDIA             300
CRIME             300
Name: count, dtype: int64

In [197]:
target = {'ARTS & CULTURE' : 1, 'BUSINESS' : 2, 'ENTERTAINMENT' : 3, 'ENVIRONMENT' : 4,
       'POLITICS' : 5, 'RELIGION' : 6, 'SPORTS' : 7, 'TECH' : 8, 'WOMEN' : 9, 'EDUCATION' : 10,
       'COMEDY' : 11, 'SCIENCE' : 12, 'MEDIA' : 13, 'CRIME' : 14}

df_1['cat_num'] = df_1.category.map(target)

df_1.head()

Unnamed: 0,category,title,body,cat_num
249,ARTS & CULTURE,Help A Queens Museum Create A Permanent Home F...,Kermit the Frog. Miss Piggy. Elmo. Cookie Mons...,1
353,ARTS & CULTURE,Artist Honors The History Of Broken Promises T...,"From a distance, Gina Adams’ quilts look famil...",1
537,ARTS & CULTURE,Interview With Jeff Koons,Jeff Koons. Photo: Elena Cué American artist...,1
424,ARTS & CULTURE,D.C. Museum Dedicated To Women In Art Broke At...,"The weekend of Donald Trump’s inauguration, wo...",1
564,ARTS & CULTURE,William Kentridge's 'Greatest Drawing Ever' Un...,William Kentridge at work on Triumphs and La...,1


In [198]:
df_1 = df_1.dropna()

In [199]:
df_1.isna().sum()

category    0
title       0
body        0
cat_num     0
dtype: int64

In [200]:
from sklearn.model_selection import train_test_split

In [201]:
X_train, X_test, Y_train, Y_test = train_test_split(df_1.body, df_1.cat_num, test_size=0.2, random_state=100, stratify = df_1.cat_num)

In [202]:
print(X_train.shape)
X_train.head()

(3357,)


509     In 2012, the German artist Thomas Bayrle cause...
5767    An undefeated little league softball team in V...
2416    President Trump has a new secretary of educati...
3811    After going on a defiant media spree on Monday...
4894    A neo-Nazi website is calling for an armed pro...
Name: body, dtype: object

In [203]:
Y_test.value_counts()

cat_num
6     60
11    60
8     60
12    60
4     60
1     60
9     60
2     60
13    60
5     60
14    60
7     60
10    60
3     60
Name: count, dtype: int64

In [204]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [214]:
clf = Pipeline([
        ('CountVectorizer', CountVectorizer()),
        ('MultinomialNB', MultinomialNB())
])

clf.fit(X_train, Y_train)

Y_pred = clf.predict(X_test)

print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           1       0.69      0.88      0.77        60
           2       0.58      0.67      0.62        60
           3       0.58      0.57      0.57        60
           4       0.82      0.67      0.73        60
           5       0.60      0.77      0.67        60
           6       0.82      0.88      0.85        60
           7       0.88      0.82      0.84        60
           8       0.94      0.53      0.68        60
           9       0.66      0.70      0.68        60
          10       0.63      0.87      0.73        60
          11       0.89      0.13      0.23        60
          12       0.82      0.75      0.78        60
          13       0.60      0.80      0.69        60
          14       0.78      0.83      0.81        60

    accuracy                           0.70       840
   macro avg       0.73      0.70      0.69       840
weighted avg       0.73      0.70      0.69       840



In [215]:
print(X_test[:10])
print('')
print('Y Test:- ',Y_test[:10])
print('')
print('Y Prediction:- ',Y_pred[:10])

4780    In a rallying cry against religious extremism,...
1749    Actor Alec Baldwin called into a radio station...
1664    Seth Meyers took aim at the GOP for staying si...
5986    Surveillance video shows the shocking moment a...
5328    A giant sunspot has apparently turned on its l...
3292    This week brought several big headlines in ext...
5215    A NASA observatory witnessed an incredible dou...
24      In 2013, when Glenn Cantave was an undergrad s...
1551    Jordan Klepper of the “The Opposition” wants t...
6579    With the #MeToo movement gaining in the world ...
Name: body, dtype: object

Y Test:-  4780     6
1749    11
1664    11
5986     8
5328    12
3292     4
5215    12
24       1
1551    11
6579     9
Name: cat_num, dtype: int64

Y Prediction:-  [ 6 13 13 14 12  4 12  6  5  9]


In [213]:
"""clf = Pipeline([
        ('vectorizer_bow', CountVectorizer(ngram_range = (1, 3))),
        ('Multi NB', MultinomialNB())
])

clf.fit(X_train, Y_train)

Y_pred = clf.predict(X_test)

print(classification_report(Y_test, Y_pred))"""

"clf = Pipeline([\n        ('vectorizer_bow', CountVectorizer(ngram_range = (1, 3))),\n        ('Multi NB', MultinomialNB())\n])\n\nclf.fit(X_train, Y_train)\n\nY_pred = clf.predict(X_test)\n\nprint(classification_report(Y_test, Y_pred))"

In [208]:
#X_test[:3]

In [209]:
#print('Y Test:- ',Y_test[:3])
#print('')
#print('Y Prediction:- ',Y_pred[:3])

In [217]:
df_1['preprocess_body'] = df_1.body.apply(func)

In [218]:
df_1.head()

Unnamed: 0,category,title,body,cat_num,preprocess_body
249,ARTS & CULTURE,Help A Queens Museum Create A Permanent Home F...,Kermit the Frog. Miss Piggy. Elmo. Cookie Mons...,1,kermit Frog Miss Piggy Elmo Cookie Monster Fra...
353,ARTS & CULTURE,Artist Honors The History Of Broken Promises T...,"From a distance, Gina Adams’ quilts look famil...",1,distance Gina Adams quilt look familiar wear s...
537,ARTS & CULTURE,Interview With Jeff Koons,Jeff Koons. Photo: Elena Cué American artist...,1,Jeff Koons photo Elena Cué american artist...
424,ARTS & CULTURE,D.C. Museum Dedicated To Women In Art Broke At...,"The weekend of Donald Trump’s inauguration, wo...",1,weekend Donald Trump inauguration woman ally W...
564,ARTS & CULTURE,William Kentridge's 'Greatest Drawing Ever' Un...,William Kentridge at work on Triumphs and La...,1,William Kentridge work Triumphs Laments Joh...


In [219]:
X_train, X_test, Y_train, Y_test = train_test_split(df_1.preprocess_body, 
                                                    df_1.cat_num, 
                                                    test_size=0.2, 
                                                    random_state=100, 
                                                    stratify = df_1.cat_num)

In [220]:
clf_1 = Pipeline([
        ('CountVectorizer', CountVectorizer()),
        ('MultinomialNB', MultinomialNB())
])

clf_1.fit(X_train, Y_train)

Y_pred = clf.predict(X_test)

print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           1       0.85      0.83      0.84        60
           2       0.69      0.77      0.72        60
           3       0.61      0.68      0.65        60
           4       0.77      0.80      0.79        60
           5       0.66      0.82      0.73        60
           6       0.84      0.90      0.87        60
           7       0.86      0.93      0.90        60
           8       0.85      0.67      0.75        60
           9       0.73      0.58      0.65        60
          10       0.78      0.85      0.82        60
          11       0.94      0.55      0.69        60
          12       0.83      0.75      0.79        60
          13       0.71      0.80      0.75        60
          14       0.78      0.85      0.82        60

    accuracy                           0.77       840
   macro avg       0.78      0.77      0.77       840
weighted avg       0.78      0.77      0.77       840



In [221]:
clf_1 = Pipeline([
        ('CountVectorizer', CountVectorizer(ngram_range = (2, 2))),
        ('MultinomialNB', MultinomialNB())
])

clf_1.fit(X_train, Y_train)

Y_pred = clf.predict(X_test)

print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           1       0.85      0.83      0.84        60
           2       0.69      0.77      0.72        60
           3       0.61      0.68      0.65        60
           4       0.77      0.80      0.79        60
           5       0.66      0.82      0.73        60
           6       0.84      0.90      0.87        60
           7       0.86      0.93      0.90        60
           8       0.85      0.67      0.75        60
           9       0.73      0.58      0.65        60
          10       0.78      0.85      0.82        60
          11       0.94      0.55      0.69        60
          12       0.83      0.75      0.79        60
          13       0.71      0.80      0.75        60
          14       0.78      0.85      0.82        60

    accuracy                           0.77       840
   macro avg       0.78      0.77      0.77       840
weighted avg       0.78      0.77      0.77       840

