In [1]:
#importing libraries
import numpy as np
import pandas as pd

import os
import math
import time

import matplotlib.pyplot as plt
import seaborn as sns

#NLTK libraries
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

#feature representation using sklearn
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

#Libraries for similarity matrices using sklearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances, accuracy_score,f1_score

In [10]:
df = pd.read_json('News_Category_Dataset_v2.json',lines = True)

In [11]:
df.head()

Unnamed: 0,authors,category,date,headline,link,short_description
0,Melissa Jeltsen,CRIME,2018-05-26,There Were 2 Mass Shootings In Texas Last Week...,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...
1,Andy McDonald,ENTERTAINMENT,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.
2,Ron Dicker,ENTERTAINMENT,2018-05-26,Hugh Grant Marries For The First Time At Age 57,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...
3,Ron Dicker,ENTERTAINMENT,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...
4,Ron Dicker,ENTERTAINMENT,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ..."


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200853 entries, 0 to 200852
Data columns (total 6 columns):
authors              200853 non-null object
category             200853 non-null object
date                 200853 non-null datetime64[ns]
headline             200853 non-null object
link                 200853 non-null object
short_description    200853 non-null object
dtypes: datetime64[ns](1), object(5)
memory usage: 9.2+ MB


In [13]:
#since dataset is large we are refraining from using the whole dataset and use latest articles from year 2018
df = df[df['date'] >= pd.Timestamp(2018,1,1)]

In [14]:
df.shape

(8583, 6)

In [15]:
#Removing all short headlines articles
df = df[df['headline'].apply(lambda x :len(x.split())>5)]
df.shape

(8530, 6)

In [16]:
#removing duplicated headlined articles
df.sort_values('headline',inplace = True,ascending = False)
duplicated_df = df.duplicated('headline',keep = False)
df = df[~duplicated_df]
df.shape

(8485, 6)

In [17]:
#checking for missing values
df.isna().sum()

authors              0
category             0
date                 0
headline             0
link                 0
short_description    0
dtype: int64

In [18]:
df['authors'].nunique()

892

In [19]:
df['category'].nunique()

26

In [20]:
## Adding a new column containing both day of the week and month, it will be required later while recommending based on day of the week and month
df['day and month'] = df['date'].dt.strftime('%a') + "_" + df['date'].dt.strftime('%b')

In [27]:
df_temp = df.copy()

In [22]:
df_temp.head()

Unnamed: 0,authors,category,date,headline,link,short_description,day and month
2932,Elyse Wanshel,QUEER VOICES,2018-04-02,‘Will & Grace’ Creator To Donate Gay Bunny Boo...,https://www.huffingtonpost.com/entry/will-grac...,It's about to be a lot easier for kids in Mike...,Mon_Apr
4487,"Lyndsey Parker, Yahoo Entertainment",QUEER VOICES,2018-03-06,‘The Voice’ Blind Auditions Make History With ...,https://www.huffingtonpost.com/entry/the-voice...,"Austin Giorgio, 21: “How Sweet It Is (To Be Lo...",Tue_Mar
8255,"Sarah Emily Baum, ContributorFreelance Writer",QUEER VOICES,2018-01-05,‘The Penumbra’ Is The Queer Audio Drama You Di...,https://www.huffingtonpost.com/entry/the-penum...,"Young, fun, fantastical and, most notably, inc...",Fri_Jan
744,Ed Mazza,COMEDY,2018-05-11,‘The Opposition’ Gives Trump A Hot Lawyer Of H...,https://www.huffingtonpost.com/entry/trump-hot...,"He's here to make a ""strong case"" for the pres...",Fri_May
2893,Elyse Wanshel,ENTERTAINMENT,2018-04-03,‘Stranger Things’ Fans Will Be Able To Visit T...,https://www.huffingtonpost.com/entry/stranger-...,"Hawkins is headed to Hollywood, Orlando and Si...",Tue_Apr


In [28]:
#Text Preprocessing
#stopwords removal
#stop_words = set(stopwords.words('english'))
#for i in range(len(df_temp['headline'])):
 #   string = ""
  #  for word in df_temp['headline'][i].split():
   #     word = ("".join(e for e in word if e.isalnum()))
    #    word = word.lower
     #   if not word in stop_words:
      #      string += word + " "
       # if(i%1000==0):
        #    print(i)
        #df_temp.at[i,'headline'] = string.strip()
stop = stopwords.words('english')
df_temp['headline'] = df_temp['headline'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

df_temp['headline'] = df_temp['headline'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [29]:
df_temp['headline'].head()

2932    ‘will & grace’ creator to donate gay bunny boo...
4487    ‘the voice’ blind auditions make history with ...
8255    ‘the penumbra’ is the queer audio drama you di...
744     ‘the opposition’ gives trump a hot lawyer of h...
2893    ‘stranger things’ fans will be able to visit t...
Name: headline, dtype: object

In [30]:
#Tokenizing
from nltk.tokenize import RegexpTokenizer
#instantiate tokenizer
tokenizer = RegexpTokenizer(r'\w+')
df_temp['headline'] = df_temp['headline'].apply(lambda x: tokenizer.tokenize(x))

#Stemming
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def word_stemmer(text):
    stem_text = " ".join([porter.stem(i) for i in text])
    return stem_text
df_temp['headline'] = df_temp['headline'].apply(lambda x: word_stemmer(x))

In [31]:
df_temp.head()

Unnamed: 0,authors,category,date,headline,link,short_description,day and month
2932,Elyse Wanshel,QUEER VOICES,2018-04-02,will grace creator to donat gay bunni book to ...,https://www.huffingtonpost.com/entry/will-grac...,It's about to be a lot easier for kids in Mike...,Mon_Apr
4487,"Lyndsey Parker, Yahoo Entertainment",QUEER VOICES,2018-03-06,the voic blind audit make histori with first t...,https://www.huffingtonpost.com/entry/the-voice...,"Austin Giorgio, 21: “How Sweet It Is (To Be Lo...",Tue_Mar
8255,"Sarah Emily Baum, ContributorFreelance Writer",QUEER VOICES,2018-01-05,the penumbra is the queer audio drama you didn...,https://www.huffingtonpost.com/entry/the-penum...,"Young, fun, fantastical and, most notably, inc...",Fri_Jan
744,Ed Mazza,COMEDY,2018-05-11,the opposit give trump a hot lawyer of hi own,https://www.huffingtonpost.com/entry/trump-hot...,"He's here to make a ""strong case"" for the pres...",Fri_May
2893,Elyse Wanshel,ENTERTAINMENT,2018-04-03,stranger thing fan will be abl to visit the up...,https://www.huffingtonpost.com/entry/stranger-...,"Hawkins is headed to Hollywood, Orlando and Si...",Tue_Apr


In [33]:
from sklearn.feature_extraction.text import CountVectorizer
headline_vectorizer = CountVectorizer()
headline_features = headline_vectorizer.fit_transform(df_temp['headline'])

In [36]:
headline_features.get_shape()

(8485, 9041)

In [37]:
pd.set_option('display.max_colwidth', -1)  # To display a very long headline completely

In [38]:
def bag_of_words_based_model(row_index, num_similar_items):
    couple_dist = pairwise_distances(headline_features,headline_features[row_index])
    indices = np.argsort(couple_dist.ravel())[0:num_similar_items]
    df_1 = pd.DataFrame({'publish_date': df['date'][indices].values,
               'headline':df['headline'][indices].values,
                'Euclidean similarity with the queried article': couple_dist[indices].ravel()})
    print("="*30,"Queried article details","="*30)
    print('headline : ',df['headline'][indices[0]])
    print("\n","="*25,"Recommended articles : ","="*23)
    #return df_1.iloc[1:,1]
    return df_1.iloc[1:,]

bag_of_words_based_model(133, 11) # Change the row index for any other queried article

headline :  Can You Believe? Queer Eye Season 2 Drops Next Month



Unnamed: 0,publish_date,headline,Euclidean similarity with the queried article
1,2018-01-30,GOP Congressman Calls For Undocumented SOTU Guests To Be Arrested On The Spot,3.605551
2,2018-01-12,"Alton Sterling's Family Seeks Release Of Controversial Video, Evidence",3.605551
3,2018-04-23,"For The First Time, Here Are Jerry Garcia's Earliest Known Recorded Performances",3.605551
4,2018-02-08,Prison Or Deportation: The Impossible Choice For Asylum Seekers In Israel,3.741657
5,2018-05-17,"Boulder City Council Unanimously Votes To Ban Assault Weapons, High-Capacity Magazines",3.741657
6,2018-03-27,Larry Nassar's Longtime MSU Boss Arrested On Sexual Misconduct Charges,3.741657
7,2018-05-09,NY Legislature Has Plan In Place To Consider Replacements For Eric Schneiderman,3.741657
8,2018-01-22,"Lindsey Graham Slams Trump Aide, Says White House Staff Making Negotiations ‘Difficult’",3.741657
9,2018-05-07,"Childish Gambino's 'This Is America' Video, Explained",3.741657
10,2018-05-09,"Seth Meyers Decodes Teen Slang, Reveals What ‘Giuliani’ Really Means",3.741657


In [41]:
tfidf_headline_vectorizer = TfidfVectorizer(min_df = 0)
tfidf_headline_features = tfidf_headline_vectorizer.fit_transform(df_temp['headline'])

In [42]:
def tfidf_based_model(row_index, num_similar_items):
    couple_dist = pairwise_distances(tfidf_headline_features,tfidf_headline_features[row_index])
    indices = np.argsort(couple_dist.ravel())[0:num_similar_items]
    df_2 = pd.DataFrame({'publish_date': df['date'][indices].values,
               'headline':df['headline'][indices].values,
                'Euclidean similarity with the queried article': couple_dist[indices].ravel()})
    print("="*30,"Queried article details","="*30)
    print('headline : ',df['headline'][indices[0]])
    print("\n","="*25,"Recommended articles : ","="*23)
    
    #return df_2.iloc[1:,1]
    return df_2.iloc[1:,]
tfidf_based_model(133, 11)

headline :  Can You Believe? Queer Eye Season 2 Drops Next Month



Unnamed: 0,publish_date,headline,Euclidean similarity with the queried article
1,2018-04-26,James Comey Is 'Embarrassed And Ashamed' Of The Republican Party,1.223185
2,2018-05-11,Trump Just Launched The War On Christmas In May And People Are So Not Having It,1.251238
3,2018-01-23,"Matt Bomer, Zachary Quinto And More Prep 'Boys In The Band' For Broadway",1.25373
4,2018-02-13,New Jersey Deputy Mayor Compares Undocumented Immigrants To 'Rabid Raccoons',1.263168
5,2018-01-31,Trump Executive Order Helps Cement Guantanamo's Status As A Forever Prison,1.275761
6,2018-03-09,Colin Firth’s Wife Reveals She Had An Affair With Couple’s Alleged Stalker,1.277962
7,2018-02-09,Team USA's Openly Gay Athletes Aren't Backing Down From Feud With Vice President,1.289588
8,2018-04-23,"For The First Time, Here Are Jerry Garcia's Earliest Known Recorded Performances",1.289767
9,2018-02-09,2 Koreas Make History Marching Under Unified Flag In Olympics Opener,1.289893
10,2018-03-09,Charles Koch Complains About Corporate Influence In Politics In Surprising Op-Ed,1.292562
