In [1]:
import pandas as pd
import numpy as np

import nltk
import gensim
import json
import spacy
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer

import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt

df = pd.read_csv('news_data.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
stop = stopwords.words('english')
stemmer = SnowballStemmer('english')

  from imp import reload


In [2]:
df.columns

Index(['id', 'page_id', 'name', 'message', 'description', 'caption',
       'post_type', 'status_type', 'likes_count', 'comments_count',
       'shares_count', 'love_count', 'wow_count', 'haha_count', 'sad_count',
       'thankful_count', 'angry_count', 'link', 'picture', 'posted_at',
       'source'],
      dtype='object')

In [3]:
df.head()

Unnamed: 0,id,page_id,name,message,description,caption,post_type,status_type,likes_count,comments_count,...,love_count,wow_count,haha_count,sad_count,thankful_count,angry_count,link,picture,posted_at,source
0,"﻿""86680728811_272953252761568""",86680728811,Chief Justice Roberts Responds to Judicial Eth...,Roberts took the unusual step of devoting the ...,PAUL J. RICHARDS/AFP/Getty Images Chief Justic...,abcnews.go.com,link,shared_story,61,27,...,0,0,0,0,0,0,http://abcnews.go.com/blogs/headlines/2011/12/...,https://external.xx.fbcdn.net/safe_image.php?d...,2012-01-01 00:30:26,abc
1,"﻿""86680728811_273859942672742""",86680728811,"With Reservations, Obama Signs Act to Allow De...",Do you agree with the new law?,"In his last official act of business in 2011, ...",abcnews.go.com,link,shared_story,120,523,...,0,0,0,0,0,0,http://abcnews.go.com/blogs/politics/2011/12/w...,https://external.xx.fbcdn.net/safe_image.php?d...,2012-01-01 01:08:58,abc
2,"﻿""86680728811_10150499874478812""",86680728811,Wishes For 2012 to Fall on Times Square,Some pretty cool confetti will rain down on Ne...,The wishes of thousands of people will flutter...,abcnews.go.com,link,published_story,271,31,...,0,0,0,0,0,0,http://abcnews.go.com/blogs/headlines/2011/12/...,https://external.xx.fbcdn.net/safe_image.php?d...,2012-01-01 02:00:37,abc
3,"﻿""86680728811_244555465618151""",86680728811,Mitt Romney Vows to Veto Dream Act if President,,"Eric Gay/AP Photo SIOUX CITY, Iowa – Mitt Romn...",abcnews.go.com,link,shared_story,140,188,...,0,0,0,0,0,0,http://abcnews.go.com/blogs/politics/2011/12/m...,https://external.xx.fbcdn.net/safe_image.php?d...,2012-01-01 02:35:20,abc
4,"﻿""86680728811_252342804833247""",86680728811,"NY Pharmacy Shootout Leaves Suspect, ATF Agent...",The pharmacy was held up by a man seeking pres...,A shootout at a suburban New York family pharm...,abcnews.go.com,link,shared_story,59,51,...,0,0,0,0,0,0,http://abcnews.go.com/US/ny-pharmacy-shootout-...,https://external.xx.fbcdn.net/safe_image.php?d...,2012-01-01 03:36:01,abc


In [4]:
drop_columns = ['id', 'page_id', 'link', 'picture']
df.drop(drop_columns, axis=1,inplace=True)
df.shape

(286959, 17)

In [5]:
df.drop_duplicates(inplace=True)

In [6]:
text_data = ['name', 'message', 'description', 'caption']
quantitative = ['likes_count', 'comments_count', 'shares_count',
                'love_count','wow_count', 'haha_count', 'sad_count',
                'thankful_count', 'angry_count',]
descriptive_data = ['post_type', 'status_type', 'posted_at', 'source']

text_df = df.loc[:,text_data]
target_df = df.loc[:,quantitative]
quan_df = df.loc[:,descriptive_data]

### Text Processing

In [34]:
text_df

Unnamed: 0,name,message,description,caption
0,Chief Justice Roberts Responds to Judicial Eth...,Roberts took the unusual step of devoting the ...,PAUL J. RICHARDS/AFP/Getty Images Chief Justic...,abcnews.go.com
1,"With Reservations, Obama Signs Act to Allow De...",Do you agree with the new law?,"In his last official act of business in 2011, ...",abcnews.go.com
2,Wishes For 2012 to Fall on Times Square,Some pretty cool confetti will rain down on Ne...,The wishes of thousands of people will flutter...,abcnews.go.com
3,Mitt Romney Vows to Veto Dream Act if President,,"Eric Gay/AP Photo SIOUX CITY, Iowa – Mitt Romn...",abcnews.go.com
4,"NY Pharmacy Shootout Leaves Suspect, ATF Agent...",The pharmacy was held up by a man seeking pres...,A shootout at a suburban New York family pharm...,abcnews.go.com
...,...,...,...,...
286954,Donald Trump Says Tough Campaign Worth It Desp...,"“It took massive amounts of work, incredible a...","Donald Trump, in the last hours of his preside...",blogs.wsj.com
286955,Donald Trump Says Loss Would Spell ‘Single Gre...,"As he has many times, Donald J. Trump cast his...",Republican presidential nominee Donald Trump h...,wsj.com
286956,Donald Trump’s Loyal Numbers Man,"As the Trump Organization's finance chief, All...",Allen Weisselberg maintains a low-key profile ...,blogs.wsj.com
286957,HealthCare.gov Site Straining to Keep Up With ...,HealthCare.gov has been straining to handle th...,"Online “waiting rooms,” where people are sent ...",wsj.com


In [7]:
def stem(text):
    v_lemmed = [WordNetLemmatizer().lemmatize(word) for word in text if len(list(word))>4]
    return [stemmer.stem(word) for word in v_lemmed]

def lemmatization(texts):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc])
    return texts_out

def clean_text(post_df, column):
    post_df[column] = post_df[column].str.replace(r"[^\w\s]", " ", regex=True)
    post_df[column].fillna("-", inplace=True)
    post_df[column] = post_df[column].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
    post_df[column] = post_df[column].str.replace(r'http?.*.com?', ' ', regex=True)
    post_df[column] = post_df[column].str.split()
    post_df[column] = lemmatization(post_df[column])
    post_df[column] = post_df[column].apply(stem)

In [8]:
for col in text_df.columns:
    print(f"The {col} is being processed.")
    clean_text(text_df,col)

The name is being processed.
The message is being processed.
The description is being processed.
The caption is being processed.


In [9]:
text_df.head()

Unnamed: 0,name,message,description,caption
0,"[chief, justic, robert, respond, judici, ethic...","[robert, unusu, devot, major, annual, report, ...","[richard, getti, imag, chief, justic, robert, ...",[abcnew]
1,"[reserv, obama, sign, allow, detent, citizen]",[agre],"[offici, busi, presid, barack, obama, nation, ...",[abcnew]
2,"[time, squar]","[pretti, confetti, celebr]","[thousand, peopl, flutter, build, descend, tim...",[abcnew]
3,"[romney, dream, presid]",[],"[photo, sioux, romney, explicit, state, today,...",[abcnew]
4,"[pharmaci, shootout, leav, suspect, agent]","[pharmaci, prescript, medic]","[shootout, suburban, famili, pharmaci, leav, s...",[abcnew]


In [10]:
text_df.to_csv('./processed_text.csv', encoding='utf-8')

In [11]:
num_rows = text_df.shape[0]

In [12]:
def get_freq(col):
    word_freq = {}
    for i in range(num_rows):
        dist = nltk.FreqDist(text_df[col].iloc[i])
        for key in dist.keys():
            if key in word_freq.keys():
                word_freq[key] += dist[key]
            else:
                word_freq[key] = dist[key]
    return word_freq

col_word_freq = {}
for col in text_df.columns:
    print(f'The word Frequencies are being computed for the column:{col}')
    col_word_freq[col] = get_freq(col)

The word Frequencies are being computed for the column:name
The word Frequencies are being computed for the column:message
The word Frequencies are being computed for the column:description
The word Frequencies are being computed for the column:caption


In [13]:
import json

with open('word_freq.json', 'w') as f:
    json.dump(col_word_freq,f)

In [14]:
col_word_freq

{'name': {'chief': 912,
  'justic': 623,
  'robert': 316,
  'respond': 471,
  'judici': 5,
  'ethic': 62,
  'critic': 854,
  'reserv': 119,
  'obama': 6175,
  'sign': 122,
  'allow': 505,
  'detent': 93,
  'citizen': 176,
  'time': 1666,
  'squar': 214,
  'romney': 746,
  'dream': 432,
  'presid': 2764,
  'pharmaci': 17,
  'shootout': 118,
  'leav': 1961,
  'suspect': 2397,
  'agent': 267,
  'world': 4287,
  'ring': 41,
  'break': 1375,
  'magnitud': 74,
  'quak': 314,
  'japan': 929,
  'tsunami': 99,
  'warn': 345,
  'buyer': 136,
  'collect': 206,
  'settlement': 192,
  'coast': 636,
  'guard': 426,
  'passeng': 491,
  'limit': 406,
  'reflect': 193,
  'american': 3124,
  'weight': 275,
  'deepli': 44,
  'concern': 488,
  'creat': 520,
  'mutat': 22,
  'bird': 56,
  'again': 509,
  'polic': 5132,
  'georgia': 291,
  'woman': 4144,
  'sever': 393,
  'parent': 1152,
  'arrest': 2112,
  'alleg': 1352,
  'post': 128,
  'photo': 26554,
  'bound': 86,
  'facebook': 1689,
  'person': 664,
 

In [15]:
vocab = []
for key in col_word_freq.keys():
    vocab.extend(list(set(col_word_freq[key].keys())))
vocab = set(vocab)

In [16]:
combined = []
for i in range(text_df.shape[0]):
    temp=[]
    for col in range(len(col_word_freq.keys())):
        temp.extend(text_df.iloc[i,col])
    combined.append(list(set(temp)))

text_df['Unique_tokens'] = combined
l = []
for x in text_df.Unique_tokens:
    l.extend(x)

freq = dict(nltk.FreqDist(l))

sorted_freq = dict(sorted(freq.items(), key=lambda x: x[1], reverse=True))
filtered = {}
# for k,v in freq:
#     if (v>15) and (v<400):
#         print(freq)
temp = np.array(list(sorted_freq.values()))

In [17]:
sorted_freq

{'nbcnew': 39851,
 'nytim': 35078,
 'photo': 30264,
 'latim': 25675,
 'timelin': 22331,
 'peopl': 22087,
 'state': 18939,
 'abcnew': 18291,
 'first': 17716,
 'world': 16791,
 'presid': 16434,
 'polic': 15756,
 'trump': 15424,
 'american': 14014,
 'could': 13698,
 'report': 13688,
 'cbsnew': 13592,
 'would': 13348,
 'donald': 13267,
 'woman': 13004,
 'nation': 12562,
 'offici': 12534,
 'obama': 12088,
 'time': 11999,
 'watch': 11578,
 'shoot': 11320,
 'attack': 10991,
 'today': 10758,
 'video': 10609,
 'think': 10457,
 'countri': 10005,
 'after': 9924,
 'clinton': 9909,
 'leav': 9848,
 'offic': 9664,
 'million': 9612,
 'death': 9473,
 'famili': 9367,
 'stori': 9061,
 'break': 8658,
 'becom': 8578,
 'month': 8568,
 'hous': 8512,
 'hillari': 8385,
 'child': 8242,
 'three': 8202,
 'republican': 8014,
 'chang': 7648,
 'former': 7549,
 'right': 7527,
 'question': 7504,
 'night': 7434,
 'school': 7322,
 'campaign': 7196,
 'presidenti': 7188,
 'still': 7188,
 'includ': 7177,
 'polit': 7106,
 '

In [18]:
quantiles = [0.999,0.99, 0.98, 0.975, 0.95, 0.8, 0.7, 0.6, 0.5 ]
for qunatile in quantiles:
    print(f'Quantile: {qunatile*100} \t Freq: {np.quantile(temp, qunatile)}')

print('---------------------')

Quantile: 99.9 	 Freq: 4485.60400000005
Quantile: 99.0 	 Freq: 618.0
Quantile: 98.0 	 Freq: 236.0
Quantile: 97.5 	 Freq: 167.0
Quantile: 95.0 	 Freq: 51.0
Quantile: 80.0 	 Freq: 3.0
Quantile: 70.0 	 Freq: 1.0
Quantile: 60.0 	 Freq: 1.0
Quantile: 50.0 	 Freq: 1.0
---------------------


In [19]:
from multiprocessing.sharedctypes import Value

filtered = {}

for key in  sorted_freq.keys():
    if  (sorted_freq[key]>=5) and (sorted_freq[key]<=618):
        filtered[key] = sorted_freq[key]

final_list = list(filtered.keys())

filtered_tokens = []
print('Token filtering started.')

for row in text_df.Unique_tokens:
    temp = []
    for word in row:
        if word in final_list:
            temp.append(word)
    filtered_tokens.append(temp)

for i in range(10):
    print(f'The filtered_tokens\t :{filtered_tokens[i]}')
    print(f'The Original\t\t :{text_df.Unique_tokens[i]} \n')

text_df['Unique_tokens'] = filtered_tokens

Token filtering started.
The filtered_tokens	 :['devot', 'colleagu', 'judici', 'capabl', 'ethic']
The Original		 :['issu', 'devot', 'critic', 'major', 'getti', 'determin', 'abcnew', 'colleagu', 'chief', 'interest', 'complet', 'robert', 'judici', 'night', 'capabl', 'respond', 'report', 'ethic', 'richard', 'imag', 'confid', 'abil', 'endors', 'unusu', 'annual', 'conflict', 'justic', 'saturday'] 

The filtered_tokens	 :['detent', 'sign', 'compon', 'hawaii', 'rental', 'provis']
The Original		 :['allow', 'agre', 'abcnew', 'detent', 'sign', 'obama', 'compon', 'barack', 'offici', 'vacat', 'hawaii', 'kailua', 'busi', 'defens', 'author', 'presid', 'statement', 'citizen', 'nation', 'rental', 'provis', 'controversi', 'reserv', 'includ'] 

The filtered_tokens	 :['descend', 'flutter', 'confetti']
The Original		 :['tomorrow', 'pretti', 'build', 'celebr', 'abcnew', 'thousand', 'descend', 'flutter', 'time', 'peopl', 'icon', 'confetti', 'squar'] 

The filtered_tokens	 :['sioux', 'perman', 'explicit', 'c

In [40]:
text_df['Unique_tokens'] = filtered_tokens
texts = list(text_df.Unique_tokens)
id2word = gensim.corpora.Dictionary(text_df.Unique_tokens)
corpus = [id2word.doc2bow(doc) for doc in text_df.Unique_tokens]

In [21]:
for i in range(50):
    print(f'{list(sorted_freq.keys())[i]} : {list(sorted_freq.values())[i]}')

nbcnew : 39851
nytim : 35078
photo : 30264
latim : 25675
timelin : 22331
peopl : 22087
state : 18939
abcnew : 18291
first : 17716
world : 16791
presid : 16434
polic : 15756
trump : 15424
american : 14014
could : 13698
report : 13688
cbsnew : 13592
would : 13348
donald : 13267
woman : 13004
nation : 12562
offici : 12534
obama : 12088
time : 11999
watch : 11578
shoot : 11320
attack : 10991
today : 10758
video : 10609
think : 10457
countri : 10005
after : 9924
clinton : 9909
leav : 9848
offic : 9664
million : 9612
death : 9473
famili : 9367
stori : 9061
break : 8658
becom : 8578
month : 8568
hous : 8512
hillari : 8385
child : 8242
three : 8202
republican : 8014
chang : 7648
former : 7549
right : 7527


In [31]:
texts

[['devot', 'colleagu', 'judici', 'capabl', 'ethic'],
 ['detent', 'sign', 'compon', 'hawaii', 'rental', 'provis'],
 ['descend', 'flutter', 'confetti'],
 ['sioux', 'perman', 'explicit', 'criterion', 'proof'],
 ['pharmaci', 'suburban', 'shootout', 'confus', 'robber', 'prescript'],
 ['ring'],
 ['quak', 'magnitud', 'tsunami'],
 ['electron', 'manufactur', 'settlement', 'buyer', 'samsung', 'sharp'],
 ['implement'],
 ['transmiss', 'mutat', 'contagi', 'netherland', 'manipul', 'deepli'],
 ['blackbird', 'bird', 'again'],
 ['swipe', 'knife'],
 ['depict', 'equip', 'post', 'upsid', 'bound'],
 ['survivalist',
  'benjamin',
  'ranger',
  'anderson',
  'colton',
  'tribun',
  'barn',
  'rainier',
  'margaret'],
 [],
 ['laugh', 'creator', 'accomplish', 'stuff', 'slacker'],
 ['latino', 'fifti', 'deport', 'disapprov', 'hispan', 'scorn'],
 ['blaze', 'detent', 'arson', 'deport'],
 ['obes'],
 ['epidem', 'parallel', 'heartburn', 'reflux', 'obes', 'symptom'],
 ['chimp', 'tarzan', 'sanctuari', 'primat', 'outrea

In [41]:
for i in range(5):
    print(f'Before loop no.{i+1}:{len(texts)}')
    print(texts.count([]))
    for x in texts:
        if len(x) == 0:
            texts.remove(x)

    print(f'After loop no.{i+1}:{len(texts)}')

Before loop no.1:286951
12025
After loop no.1:286951
Before loop no.2:286951
12025
After loop no.2:286951
Before loop no.3:286951
12025
After loop no.3:286951
Before loop no.4:286951
12025
After loop no.4:286951
Before loop no.5:286951
12025
After loop no.5:286951


12025

In [None]:
x.emp

In [None]:
score=[]

for k in range(4,13): # Train LDA on different values of k
    print('Number of topics: '+str(k))


    ldamodel =gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=k,
                                       random_state=30,
                                       chunksize=100,
                                       passes=20,
                                       per_word_topics=True)
    print(f'The LDA model for {k} topics is generated. Evaluation is taking place...')

    #Calculating the coherence
    cm = gensim.models.coherencemodel.CoherenceModel(model=ldamodel, texts=texts, dictionary=id2word, coherence='c_v').get_coherence()

    print(f'The model for {k} topics is evaluated. The coherence score is {cm}')
    print('\n')
    score.append((k, cm))


score = pd.DataFrame(score)

Number of topics: 4
