In [1]:
import pandas as pd
import numpy as np

import re

from py2neo import authenticate, Graph, Node, Relationship


import os
import csv
import pickle

from time import sleep
from timeit import default_timer as timer
from datetime import datetime

from IPython.display import display, HTML

# custom general helper functions for this project
import custom_utils as cu
import importlib


In [2]:
from collections import defaultdict

In [3]:
# Note: not sure why, but on running this gensim import the kernel kept dying.
# Running the following in command line fixed it:
# conda install -f numpy
import gensim

In [4]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

In [5]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/arinai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# reload imports as needed
importlib.reload(cu);

In [7]:
# set up Pandas options
pd.set_option('display.max_columns', 25)
pd.set_option('display.max_rows', 100)
pd.set_option('display.precision', 3)
pd.options.display.float_format = '{:.2f}'.format

In [8]:
pd.options.display.max_colwidth = 100

In [59]:
# unpickle
with open("pickles/en_1218_louvain_communities_for_NLP.pkl", 'rb') as picklefile: 
    louvain_communities_for_NLP = pickle.load(picklefile)

louvain_communities_for_NLP.head(20)

Unnamed: 0,external_search_traffic,link_in_traffic,louvain_community,search_in_traffic,title
0,4576854.0,1108189.0,3,5630.0,George_H._W._Bush
1,3538068.0,639353.0,4,6451.0,Jason_Momoa
2,3475113.0,223635.0,9,23563.0,2.0_(film)
3,3251996.0,682992.0,4,10416.0,Bird_Box_(film)
4,3020671.0,31170.0,1,,Main_Page
5,2634665.0,408421.0,4,34309.0,Aquaman_(film)
6,2328884.0,200893.0,4,192.0,Bird_Box
7,2231176.0,575481.0,3,3945.0,Priyanka_Chopra
8,2226602.0,117115.0,5,958.0,List_of_most-disliked_YouTube_videos
9,2050628.0,336621.0,5,4161.0,Freddie_Mercury


In [60]:
len(louvain_communities_for_NLP)

2729767

##### Pre-process article titles

In [61]:
louvain_communities_for_NLP_proc =louvain_communities_for_NLP.copy()
louvain_communities_for_NLP_proc.rename(index=str,columns={'title':'title_raw'}, inplace=True)
louvain_communities_for_NLP_proc.head(5)

Unnamed: 0,external_search_traffic,link_in_traffic,louvain_community,search_in_traffic,title_raw
0,4576854.0,1108189.0,3,5630.0,George_H._W._Bush
1,3538068.0,639353.0,4,6451.0,Jason_Momoa
2,3475113.0,223635.0,9,23563.0,2.0_(film)
3,3251996.0,682992.0,4,10416.0,Bird_Box_(film)
4,3020671.0,31170.0,1,,Main_Page


In [63]:
# the importance of words will be weighted by search and link traffic
louvain_communities_for_NLP_proc["weight"] = louvain_communities_for_NLP_proc[[
        "external_search_traffic", "link_in_traffic", "search_in_traffic"]].sum(axis=1).astype('int64').fillna(0)

louvain_communities_for_NLP_proc.drop(["external_search_traffic", "link_in_traffic", "search_in_traffic"], 
                                      axis = 1,
                                      inplace=True)

In [64]:
louvain_communities_for_NLP_proc.head(5)

Unnamed: 0,louvain_community,title_raw,weight
0,3,George_H._W._Bush,5690673
1,4,Jason_Momoa,4183872
2,9,2.0_(film),3722311
3,4,Bird_Box_(film),3945404
4,1,Main_Page,3051841


In [66]:
# clean up the title
louvain_communities_for_NLP_proc["title"] = \
    louvain_communities_for_NLP_proc.title_raw.str.replace('_', ' ')
    
louvain_communities_for_NLP_proc.head(5)

Unnamed: 0,louvain_community,title_raw,weight,title
0,3,George_H._W._Bush,5690673,George H. W. Bush
1,4,Jason_Momoa,4183872,Jason Momoa
2,9,2.0_(film),3722311,2.0 (film)
3,4,Bird_Box_(film),3945404,Bird Box (film)
4,1,Main_Page,3051841,Main Page


In [68]:
louvain_communities_for_NLP_proc.describe()

Unnamed: 0,louvain_community,weight
count,2729767.0,2729767.0
mean,11.48,1731.93
std,64.77,13116.59
min,0.0,0.0
25%,3.0,66.0
50%,5.0,200.0
75%,10.0,735.0
max,1697.0,5690673.0


In [29]:
# Use spaCy to get Named Entities

# installation instructions: https://spacy.io/usage/

import spacy
nlp = spacy.load('en', disable=['parser', 'tagger'])

In [30]:
nlp.pipeline

[('ner', <spacy.pipeline.EntityRecognizer at 0x7ff34717b780>)]

In [None]:
print("Started running at", datetime.now(), "UTC")

start_time = timer()

nrows = len(louvain_communities_for_NLP_proc)
print("Total number of rows to process:", nrows, "\n")

for i in range(nrows):
    txt = louvain_communities_for_NLP_proc.iloc[i].title

    doc = nlp(txt)

    ents_arr = []
    for ent in doc.ents:
        ents_arr.append(ent.label_)
    
    louvain_communities_for_NLP_proc.at[str(i), "named_entities"] = " ".join(ents_arr)
    
    if (len(louvain_communities_for_NLP_proc) > nrows):
        print("ERROR!! Dataset length has increased.")
        print("Rows processed:", round(i * 100/len(louvain_communities_for_NLP_proc), 4), "%,", "count = ", i )
        print("Elapsed time:", round((timer() - start_time)/60, 4), "min\n")
        print("Last row's processing vars:")
        print("i=", i, "txt=", txt, "ents_arr=", ents_arr, "\n",
              "updated data row:\n", louvain_communities_for_NLP_proc.iloc[i], "\n")
        break
    
    if (i % 100000 == 0):
        print("Rows processed:", round(i * 100/len(louvain_communities_for_NLP_proc), 4), "%,", "count = ", i )
        print("Elapsed time:", round((timer() - start_time)/60, 4), "min\n")
        print("Last row's processing vars:")
        print("i=", i, "txt=", txt, "ents_arr=", ents_arr, "\n",
              "updated data row:\n", louvain_communities_for_NLP_proc.iloc[i], "\n")
    

cu.printRunTime(start_time)

Started running at 2019-03-01 19:57:26.153032 UTC
Total number of rows to process: 2729767 

Rows processed: 0.0 %, count =  0
Elapsed time: 0.0001 min

Last row's processing vars:
i= 0 txt= George H. W. Bush ents_arr= ['PERSON'] 
 updated data row:
 louvain_community                    3
title_raw            George_H._W._Bush
weight                         5690673
title                George H. W. Bush
named_entities                  PERSON
Name: 0, dtype: object 

Rows processed: 3.6633 %, count =  100000
Elapsed time: 6.8751 min

Last row's processing vars:
i= 100000 txt= List of cities and boroughs in Pennsylvania by population ents_arr= ['GPE'] 
 updated data row:
 louvain_community                                                           10
title_raw            List_of_cities_and_boroughs_in_Pennsylvania_by_population
weight                                                                    5628
title                List of cities and boroughs in Pennsylvania by population
named

The code above took a few hours to run, and finished successfully, but the browser tunnel got interrupted at some point, so the print statements above is incomplete, but the dataset has been successfully populated with named_entities.

In [86]:
nrows

2729767

In [85]:
i

2729766

In [87]:
txt

'Christgau (disambiguation)'

In [93]:
louvain_communities_for_NLP_proc.iloc[i]

louvain_community                             5
title_raw            Christgau_(disambiguation)
weight                                       18
title                Christgau (disambiguation)
named_entities                              GPE
Name: 2729766, dtype: object

In [89]:
louvain_communities_for_NLP_proc.head(20)

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities
0,3,George_H._W._Bush,5690673,George H. W. Bush,PERSON
1,4,Jason_Momoa,4183872,Jason Momoa,PERSON
2,9,2.0_(film),3722311,2.0 (film),CARDINAL
3,4,Bird_Box_(film),3945404,Bird Box (film),PERSON
4,1,Main_Page,3051841,Main Page,PERSON
5,4,Aquaman_(film),3077395,Aquaman (film),
6,4,Bird_Box,2529969,Bird Box,PERSON
7,3,Priyanka_Chopra,2810602,Priyanka Chopra,PERSON
8,5,List_of_most-disliked_YouTube_videos,2344675,List of most-disliked YouTube videos,ORG
9,5,Freddie_Mercury,2391410,Freddie Mercury,ORG


In [90]:
louvain_communities_for_NLP_proc.tail(20)

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities
2729747,16,Mantle_of_Luís_I,15,Mantle of Luís I,
2729748,13,List_of_Zero:_Black_Blood_episodes,14,List of Zero: Black Blood episodes,CARDINAL EVENT
2729749,22,Moskvitch_404_Sport,15,Moskvitch 404 Sport,CARDINAL
2729750,3,Caridina_loehae,11,Caridina loehae,GPE
2729751,3,Tatu_Miettunen,11,Tatu Miettunen,
2729752,2,HIST2H3C,10,HIST2H3C,
2729753,6,"Muk,_Iran",17,"Muk, Iran",GPE GPE
2729754,10,"Wila,_Missouri",67,"Wila, Missouri",PERSON GPE
2729755,3,Isaac_Rochussen,11,Isaac Rochussen,PERSON
2729756,7,Raffaello_Bertieri,26,Raffaello Bertieri,PERSON


The named entities assignment is not perfect, for example, both "Main Page" and "Bird Box" articles were labeled "PERSON", but in many cases it did quite well.  
The "Main Page" article seems to be an important node in the network, so let's fix its named_entities value.

In [95]:
louvain_communities_for_NLP_proc.iloc[4]

louvain_community            1
title_raw            Main_Page
weight                 3051841
title                Main Page
named_entities          PERSON
Name: 4, dtype: object

In [103]:
louvain_communities_for_NLP_proc.at[str(4), "named_entities"] = ''

In [104]:
louvain_communities_for_NLP_proc.iloc[4]

louvain_community            1
title_raw            Main_Page
weight                 3051841
title                Main Page
named_entities                
Name: 4, dtype: object

In [106]:
louvain_communities_for_NLP_proc.head(10)

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities
0,3,George_H._W._Bush,5690673,George H. W. Bush,PERSON
1,4,Jason_Momoa,4183872,Jason Momoa,PERSON
2,9,2.0_(film),3722311,2.0 (film),CARDINAL
3,4,Bird_Box_(film),3945404,Bird Box (film),PERSON
4,1,Main_Page,3051841,Main Page,
5,4,Aquaman_(film),3077395,Aquaman (film),
6,4,Bird_Box,2529969,Bird Box,PERSON
7,3,Priyanka_Chopra,2810602,Priyanka Chopra,PERSON
8,5,List_of_most-disliked_YouTube_videos,2344675,List of most-disliked YouTube videos,ORG
9,5,Freddie_Mercury,2391410,Freddie Mercury,ORG


In [109]:
# pickle the output
myoutfile = "pickles/en_1218_louvain_communities_for_NLP_proc.pkl"
with open(myoutfile, 'wb') as picklefile:
    pickle.dump(louvain_communities_for_NLP_proc, picklefile)

print("Pickle created: " + myoutfile)

Pickle created: pickles/en_1218_louvain_communities_for_NLP_proc.pkl


In [37]:
# unpickle
with open("pickles/en_1218_louvain_communities_for_NLP_proc.pkl", 'rb') as picklefile: 
    louvain_communities_for_NLP_proc = pickle.load(picklefile)

louvain_communities_for_NLP_proc.head(20)

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities
0,3,George_H._W._Bush,5690673,George H. W. Bush,PERSON
1,4,Jason_Momoa,4183872,Jason Momoa,PERSON
2,9,2.0_(film),3722311,2.0 (film),CARDINAL
3,4,Bird_Box_(film),3945404,Bird Box (film),PERSON
4,1,Main_Page,3051841,Main Page,
5,4,Aquaman_(film),3077395,Aquaman (film),
6,4,Bird_Box,2529969,Bird Box,PERSON
7,3,Priyanka_Chopra,2810602,Priyanka Chopra,PERSON
8,5,List_of_most-disliked_YouTube_videos,2344675,List of most-disliked YouTube videos,ORG
9,5,Freddie_Mercury,2391410,Freddie Mercury,ORG


In [30]:
louvain_communities_for_NLP_proc.named_entities.value_counts()

PERSON                                830645
                                      814109
ORG                                   390291
GPE                                   170318
DATE                                   46268
NORP                                   38910
GPE GPE                                38162
CARDINAL                               29662
ORG GPE                                28779
PERSON GPE                             21082
LOC                                    16358
DATE ORG                               15102
FAC                                    14702
PERSON PERSON                          14446
DATE EVENT                             14300
PERSON DATE                            14223
PERSON ORG                             11964
ORG ORG                                11261
EVENT                                  10618
DATE GPE                               10552
ORG DATE                                9295
WORK_OF_ART                             8606
ORG PERSON

In [36]:
louvain_communities_for_NLP_proc[louvain_communities_for_NLP_proc.louvain_community == 14][:20]

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities
2175,14,List_of_NHL_statistical_leaders,102869,List of NHL statistical leaders,ORG
3019,14,2019_World_Junior_Ice_Hockey_Championships,117350,2019 World Junior Ice Hockey Championships,DATE ORG
3411,14,Wayne_Gretzky,97377,Wayne Gretzky,PERSON
3480,14,24Hours,74156,24Hours,
4259,14,IIHF_World_U20_Championship,78838,IIHF World U20 Championship,ORG
4548,14,List_of_Stanley_Cup_champions,73879,List of Stanley Cup champions,EVENT
5399,14,Alexander_Ovechkin,69756,Alexander Ovechkin,PERSON
5408,14,2018_World_Junior_Ice_Hockey_Championships,76122,2018 World Junior Ice Hockey Championships,DATE EVENT
5525,14,Sidney_Crosby,62898,Sidney Crosby,PERSON
5554,14,Spengler_Cup,56905,Spengler Cup,EVENT


##### Lemmatizing and stemming

In [38]:
stemmer = SnowballStemmer("english")

In [39]:
def parse_title(title):
    # words = re.sub('[-():]', " ", title).split()
    words = gensim.utils.simple_preprocess(title)
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    
    parsed=[]
    for word in words:
        if word not in stopwords:
            lemmatized = stemmer.stem(WordNetLemmatizer().lemmatize(word, pos='v'))
            parsed.append(lemmatized)
            
    return parsed

In [40]:
print("Started running at", datetime.now(), "UTC")

start_time = timer()

nrows = len(louvain_communities_for_NLP_proc)
print("Num of rows:", nrows)

for i in range(nrows):
    txt = louvain_communities_for_NLP_proc.iloc[i].title
    parsed = parse_title(txt)
    
    louvain_communities_for_NLP_proc.at[str(i), "title_parsed"] = " ".join(parsed)
    
cu.printRunTime(start_time)

louvain_communities_for_NLP_proc.head(20)

Started running at 2019-03-03 03:58:15.950026 UTC
Num of rows: 2729767


Runtime: 12.51 min



Unnamed: 0,louvain_community,title_raw,weight,title,named_entities,title_parsed
0,3,George_H._W._Bush,5690673,George H. W. Bush,PERSON,georg bush
1,4,Jason_Momoa,4183872,Jason Momoa,PERSON,jason momoa
2,9,2.0_(film),3722311,2.0 (film),CARDINAL,film
3,4,Bird_Box_(film),3945404,Bird Box (film),PERSON,bird box film
4,1,Main_Page,3051841,Main Page,,main page
5,4,Aquaman_(film),3077395,Aquaman (film),,aquaman film
6,4,Bird_Box,2529969,Bird Box,PERSON,bird box
7,3,Priyanka_Chopra,2810602,Priyanka Chopra,PERSON,priyanka chopra
8,5,List_of_most-disliked_YouTube_videos,2344675,List of most-disliked YouTube videos,ORG,list dislik youtub video
9,5,Freddie_Mercury,2391410,Freddie Mercury,ORG,freddi mercuri


In [41]:
# pickle the output
myoutfile = "pickles/en_1218_louvain_communities_for_NLP_proc_2.pkl"
with open(myoutfile, 'wb') as picklefile:
    pickle.dump(louvain_communities_for_NLP_proc, picklefile)

print("Pickle created: " + myoutfile)

Pickle created: pickles/en_1218_louvain_communities_for_NLP_proc_2.pkl


In [9]:
# unpickle
with open("pickles/en_1218_louvain_communities_for_NLP_proc_2.pkl", 'rb') as picklefile: 
    louvain_communities_for_NLP_proc = pickle.load(picklefile)

louvain_communities_for_NLP_proc.head(20)

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities,title_parsed
0,3,George_H._W._Bush,5690673,George H. W. Bush,PERSON,georg bush
1,4,Jason_Momoa,4183872,Jason Momoa,PERSON,jason momoa
2,9,2.0_(film),3722311,2.0 (film),CARDINAL,film
3,4,Bird_Box_(film),3945404,Bird Box (film),PERSON,bird box film
4,1,Main_Page,3051841,Main Page,,main page
5,4,Aquaman_(film),3077395,Aquaman (film),,aquaman film
6,4,Bird_Box,2529969,Bird Box,PERSON,bird box
7,3,Priyanka_Chopra,2810602,Priyanka Chopra,PERSON,priyanka chopra
8,5,List_of_most-disliked_YouTube_videos,2344675,List of most-disliked YouTube videos,ORG,list dislik youtub video
9,5,Freddie_Mercury,2391410,Freddie Mercury,ORG,freddi mercuri


Make community docs with community ids

In [10]:
louvain_communities_for_NLP_proc["scaled_weight"] = np.log(louvain_communities_for_NLP_proc.weight)

In [11]:
louvain_communities_for_NLP_proc.head()

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities,title_parsed,scaled_weight
0,3,George_H._W._Bush,5690673,George H. W. Bush,PERSON,georg bush,15.55
1,4,Jason_Momoa,4183872,Jason Momoa,PERSON,jason momoa,15.25
2,9,2.0_(film),3722311,2.0 (film),CARDINAL,film,15.13
3,4,Bird_Box_(film),3945404,Bird Box (film),PERSON,bird box film,15.19
4,1,Main_Page,3051841,Main Page,,main page,14.93


In [45]:
print("Started running at", datetime.now(), "UTC")
start_time = timer()

nrows = len(louvain_communities_for_NLP_proc)
#nrows = 10
print("Num of rows:", nrows)

communities_docs_dict = defaultdict(list)

for i in range(nrows):
    lcomm = louvain_communities_for_NLP_proc.iloc[i].louvain_community
    title_parsed_arr = louvain_communities_for_NLP_proc.iloc[i].title_parsed.split()
    named_entities_arr = louvain_communities_for_NLP_proc.iloc[i].named_entities.split()
    weight = round(louvain_communities_for_NLP_proc.iloc[i].scaled_weight).astype("int64")
    
    communities_docs_dict[lcomm].extend((title_parsed_arr + named_entities_arr) * max(weight, 1))
    
    if (i % 100000 == 0):
        print("Rows processed:", round(i * 100/len(louvain_communities_for_NLP_proc), 4), "%,", "count = ", i )
        print("Elapsed time:", round((timer() - start_time)/60, 4), "min\n")
        print("Last row's processing vars:")
        print("i=", i, "lcomm=", lcomm, "title_parsed_arr=", title_parsed_arr, 
              "named_entities_arr=", named_entities_arr,
              "weight=", weight, "\n")
    
cu.printRunTime(start_time)

Started running at 2019-03-03 04:15:41.912996 UTC
Num of rows: 2729767
Rows processed: 0.0 %, count =  0
Elapsed time: 0.0 min

Last row's processing vars:
i= 0 lcomm= 3 title_parsed_arr= ['georg', 'bush'] named_entities_arr= ['PERSON'] weight= 16 

Rows processed: 3.6633 %, count =  100000
Elapsed time: 0.9798 min

Last row's processing vars:
i= 100000 lcomm= 10 title_parsed_arr= ['list', 'citi', 'borough', 'pennsylvania', 'popul'] named_entities_arr= ['GPE'] weight= 9 

Rows processed: 7.3266 %, count =  200000
Elapsed time: 1.9618 min

Last row's processing vars:
i= 200000 lcomm= 11 title_parsed_arr= ['frank', 'vogel'] named_entities_arr= ['PERSON'] weight= 8 

Rows processed: 10.9899 %, count =  300000
Elapsed time: 2.9442 min

Last row's processing vars:
i= 300000 lcomm= 3 title_parsed_arr= ['steven', 'berghui'] named_entities_arr= ['PERSON'] weight= 8 

Rows processed: 14.6533 %, count =  400000
Elapsed time: 3.9236 min

Last row's processing vars:
i= 400000 lcomm= 7 title_parsed

Runtime: 26.63 min



In [46]:

count = 0
for k, v in communities_docs_dict.items():
    print("\n", k, "\n", "doc length=", len(v), "\ndoc sample tokens:", v[:10])
    count += 1
    if count > 9:
        break


 0 
 doc length= 536013 
doc sample tokens: ['pink', 'line', 'delhi', 'metro', 'pink', 'line', 'delhi', 'metro', 'pink', 'line']

 1 
 doc length= 3343973 
doc sample tokens: ['main', 'page', 'main', 'page', 'main', 'page', 'main', 'page', 'main', 'page']

 2 
 doc length= 2862788 
doc sample tokens: ['list', 'presid', 'unit', 'state', 'date', 'death', 'GPE', 'list', 'presid', 'unit']

 3 
 doc length= 9786326 
doc sample tokens: ['georg', 'bush', 'PERSON', 'georg', 'bush', 'PERSON', 'georg', 'bush', 'PERSON', 'georg']

 4 
 doc length= 5335393 
doc sample tokens: ['jason', 'momoa', 'PERSON', 'jason', 'momoa', 'PERSON', 'jason', 'momoa', 'PERSON', 'jason']

 5 
 doc length= 4009506 
doc sample tokens: ['list', 'dislik', 'youtub', 'video', 'ORG', 'list', 'dislik', 'youtub', 'video', 'ORG']

 6 
 doc length= 1536687 
doc sample tokens: ['meg', 'johnson', 'poet', 'PERSON', 'meg', 'johnson', 'poet', 'PERSON', 'meg', 'johnson']

 7 
 doc length= 4911364 
doc sample tokens: ['box', 'day', '

In [47]:
# pickle the output
myoutfile = "pickles/en_1218_louvain_communities_docs_dict.pkl"
with open(myoutfile, 'wb') as picklefile:
    pickle.dump(communities_docs_dict, picklefile)

print("Pickle created: " + myoutfile)

Pickle created: pickles/en_1218_louvain_communities_docs_dict.pkl


In [48]:
# unpickle
with open("pickles/en_1218_louvain_communities_docs_dict.pkl", 'rb') as picklefile: 
    communities_docs_dict = pickle.load(picklefile)

count = 0
for k, v in communities_docs_dict.items():
    print("\n", k, "\n", "doc length=", len(v), "\ndoc sample tokens:", v[:10])
    count += 1
    if count > 9:
        break


 0 
 doc length= 536013 
doc sample tokens: ['pink', 'line', 'delhi', 'metro', 'pink', 'line', 'delhi', 'metro', 'pink', 'line']

 1 
 doc length= 3343973 
doc sample tokens: ['main', 'page', 'main', 'page', 'main', 'page', 'main', 'page', 'main', 'page']

 2 
 doc length= 2862788 
doc sample tokens: ['list', 'presid', 'unit', 'state', 'date', 'death', 'GPE', 'list', 'presid', 'unit']

 3 
 doc length= 9786326 
doc sample tokens: ['georg', 'bush', 'PERSON', 'georg', 'bush', 'PERSON', 'georg', 'bush', 'PERSON', 'georg']

 4 
 doc length= 5335393 
doc sample tokens: ['jason', 'momoa', 'PERSON', 'jason', 'momoa', 'PERSON', 'jason', 'momoa', 'PERSON', 'jason']

 5 
 doc length= 4009506 
doc sample tokens: ['list', 'dislik', 'youtub', 'video', 'ORG', 'list', 'dislik', 'youtub', 'video', 'ORG']

 6 
 doc length= 1536687 
doc sample tokens: ['meg', 'johnson', 'poet', 'PERSON', 'meg', 'johnson', 'poet', 'PERSON', 'meg', 'johnson']

 7 
 doc length= 4911364 
doc sample tokens: ['box', 'day', '

In [49]:
# make a list of doc lists (needed for gensim inputs), and a community2doc lookup dict

print("Started running at", datetime.now(), "UTC")
start_time = timer()

community_docs_list = [] # list of doc lists
community2doc = {} # comm id is key and list doc index in community_docs_list is value

i = 0
for k, v in communities_docs_dict.items():
    community_docs_list.append(v)
    community2doc[k] = i
    i += 1

    
cu.printRunTime(start_time)

Started running at 2019-03-03 04:42:30.608266 UTC


Runtime: 0.0 min



In [50]:
len(community_docs_list)

1698

In [None]:
# 1698

#### Bags of words

##### Dataset bag of words

In [51]:
# this is just an id2word mapping (the integer keys here are ids, not counts)
dictionary = gensim.corpora.Dictionary(community_docs_list)

# check results
i = 0
for k, v in dictionary.items():
    print(k, v)
    i += 1
    if i > 5:
        break

152479 myponga
222740 doosri
527118 patroon
60675 mislead
360370 pavlína
524362 mancusi


In [56]:
# Filtering
#dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n= 100000)
dictionary.filter_extremes(no_above=0.1)

In [57]:
# make bags of words for each doc in docs list, forming the corpus input for LDA model

print("Started running at", datetime.now(), "UTC")
start_time = timer()

bow_corpus = [dictionary.doc2bow(doc) for doc in community_docs_list]

cu.printRunTime(start_time)

Started running at 2019-03-03 05:07:43.481396 UTC


Runtime: 0.26 min



In [58]:

document_num = 0
bow_doc_x = bow_corpus[document_num]

for i in range(len(bow_doc_x[:10])):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0], 
                                                     dictionary[bow_doc_x[i][0]], 
                                                     bow_doc_x[i][1]))

Word 0 ("EVENT") appears 468 time.
Word 1 ("FAC") appears 4963 time.
Word 2 ("LANGUAGE") appears 19 time.
Word 3 ("LAW") appears 29 time.
Word 4 ("LOC") appears 2929 time.
Word 5 ("MONEY") appears 24 time.
Word 6 ("ORDINAL") appears 350 time.
Word 7 ("PRODUCT") appears 743 time.
Word 8 ("QUANTITY") appears 36 time.
Word 9 ("TIME") appears 24 time.


In [61]:
# Run LDA multicore

print("Started running at", datetime.now(), "UTC")
start_time = timer()

lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 10, 
                                   id2word = dictionary,                                    
                                   passes = 20,
                                   workers = 4)


cu.printRunTime(start_time)

Started running at 2019-03-03 05:17:45.023803 UTC


Runtime: 2.29 min



In [62]:

for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.061*"EVENT" + 0.039*"championship" + 0.031*"olymp" + 0.029*"world" + 0.022*"summer" + 0.019*"men" + 0.016*"game" + 0.014*"hockey" + 0.013*"women" + 0.012*"QUANTITY"


Topic: 1 
Words: 0.043*"song" + 0.014*"band" + 0.009*"discographi" + 0.009*"love" + 0.008*"music" + 0.007*"live" + 0.007*"WORK_OF_ART" + 0.005*"musician" + 0.005*"record" + 0.005*"tour"


Topic: 2 
Words: 0.025*"station" + 0.023*"railway" + 0.022*"airport" + 0.018*"FAC" + 0.015*"class" + 0.015*"air" + 0.013*"uss" + 0.010*"cricket" + 0.008*"rugbi" + 0.008*"hms"


Topic: 3 
Words: 0.014*"state" + 0.013*"new" + 0.012*"counti" + 0.009*"LOC" + 0.008*"school" + 0.008*"unit" + 0.008*"FAC" + 0.007*"california" + 0.007*"york" + 0.006*"station"


Topic: 4 
Words: 0.006*"disambigu" + 0.005*"LOC" + 0.005*"engin" + 0.004*"seri" + 0.004*"softwar" + 0.004*"PRODUCT" + 0.004*"station" + 0.004*"power" + 0.003*"grand" + 0.003*"prix"


Topic: 5 
Words: 0.037*"footbal" + 0.028*"team" + 0.026*"season" + 0.018*"school" + 0.01

Let's try looking at the topics individually: a given topic is the dataset, and a parsed article title is a doc.

In [12]:
louvain_communities_for_NLP_proc[louvain_communities_for_NLP_proc.louvain_community == 11].head(5)

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities,title_parsed,scaled_weight
74,11,Patrick_Mahomes,669193,Patrick Mahomes,PERSON,patrick mahom,13.41
78,11,Kyler_Murray,630603,Kyler Murray,PERSON,kyler murray,13.35
103,11,LeBron_James,621114,LeBron James,PERSON,lebron jam,13.34
110,11,Philip_Rivers,544167,Philip Rivers,PERSON,philip river,13.21
120,11,2018–19_NCAA_football_bowl_games,514981,2018–19 NCAA football bowl games,ORG,ncaa footbal bowl game,13.15


In [13]:
test_community = louvain_communities_for_NLP_proc[louvain_communities_for_NLP_proc.louvain_community == 11].copy()

In [14]:
test_community.describe()

Unnamed: 0,louvain_community,weight,scaled_weight
count,110397.0,110397.0,110397.0
mean,11.0,1584.24,-inf
std,0.0,9576.18,
min,11.0,0.0,-inf
25%,11.0,62.0,4.13
50%,11.0,171.0,5.14
75%,11.0,609.0,6.41
max,11.0,669193.0,13.41


In [16]:
test_community[(test_community.named_entities != "PERSON")].head(5)

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities,title_parsed,scaled_weight
120,11,2018–19_NCAA_football_bowl_games,514981,2018–19 NCAA football bowl games,ORG,ncaa footbal bowl game,13.15
252,11,2019_College_Football_Playoff_National_Championship,367824,2019 College Football Playoff National Championship,DATE,colleg footbal playoff nation championship,12.82
296,11,Urban_Meyer,326946,Urban Meyer,,urban meyer,12.7
312,11,College_Football_Playoff,346983,College Football Playoff,,colleg footbal playoff,12.76
323,11,List_of_Super_Bowl_champions,350514,List of Super Bowl champions,EVENT,list super bowl champion,12.77


In [17]:
test_community = test_community[(test_community.named_entities != "PERSON")]

In [36]:
len(test_community)

62724

In [18]:
test_community.head()

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities,title_parsed,scaled_weight
120,11,2018–19_NCAA_football_bowl_games,514981,2018–19 NCAA football bowl games,ORG,ncaa footbal bowl game,13.15
252,11,2019_College_Football_Playoff_National_Championship,367824,2019 College Football Playoff National Championship,DATE,colleg footbal playoff nation championship,12.82
296,11,Urban_Meyer,326946,Urban Meyer,,urban meyer,12.7
312,11,College_Football_Playoff,346983,College Football Playoff,,colleg footbal playoff,12.76
323,11,List_of_Super_Bowl_champions,350514,List of Super Bowl champions,EVENT,list super bowl champion,12.77


In [77]:
print("Started running at", datetime.now(), "UTC")
start_time = timer()

nrows = len(test_community)
# nrows = 10
print("Num of rows:", nrows)

article_docs = []

for i in range(nrows):
    title_parsed_arr = test_community.iloc[i].title_parsed.split()
    named_entities_arr = test_community.iloc[i].named_entities.split()
    weight = max(round(test_community.iloc[i].scaled_weight).astype("int64"), 1)
    
    # article_docs.append((title_parsed_arr + named_entities_arr) * weight)
    for i in range(weight):
        article_docs.append(title_parsed_arr + named_entities_arr)
    #article_docs.append(title_parsed_arr + named_entities_arr)
    
    
    if (i % 10000 == 0):
        print("Rows processed:", round(i * 100/len(test_community), 4), "%,", "count = ", i )
        print("Elapsed time:", round((timer() - start_time)/60, 4), "min\n")
        print("Last row's processing vars:")
        print("i=", i, "title_parsed_arr=", title_parsed_arr, 
              "named_entities_arr=", named_entities_arr,
              "weight=", weight, "\n")
    
cu.printRunTime(start_time)


Started running at 2019-03-03 18:49:19.305972 UTC
Num of rows: 62724
Rows processed: 0.0 %, count =  0
Elapsed time: 0.4626 min

Last row's processing vars:
i= 0 title_parsed_arr= ['buffalo', 'american', 'season'] named_entities_arr= ['CARDINAL', 'GPE', 'NORP'] weight= 1 

Rows processed: 0.0 %, count =  0
Elapsed time: 0.4629 min

Last row's processing vars:
i= 0 title_parsed_arr= ['yale', 'bulldog', 'footbal', 'team'] named_entities_arr= ['DATE', 'ORG'] weight= 1 

Rows processed: 0.0 %, count =  0
Elapsed time: 0.4631 min

Last row's processing vars:
i= 0 title_parsed_arr= ['atlant', 'sun'] named_entities_arr= ['ORG'] weight= 1 

Rows processed: 0.0 %, count =  0
Elapsed time: 0.4632 min

Last row's processing vars:
i= 0 title_parsed_arr= ['detroit', 'tiger', 'season'] named_entities_arr= ['DATE'] weight= 1 

Rows processed: 0.0 %, count =  0
Elapsed time: 0.4633 min

Last row's processing vars:
i= 0 title_parsed_arr= ['stauska'] named_entities_arr= [] weight= 1 

Rows processed: 0.

Runtime: 0.53 min



In [78]:
# this is just an id2word mapping (the integer keys here are ids, not counts)
art_dictionary = gensim.corpora.Dictionary(article_docs)

# check results
i = 0
for k, v in art_dictionary.iteritems():
    print(k, v)
    i += 1
    if i > 5:
        break

9265 daz
13082 ljubljana
323 cardin
14173 paok
18622 plaster
2174 glove


In [79]:
# check results
i = 0
for k, v in art_dictionary.iteritems():
    print(k, v)
    print("freq=", art_dictionary.dfs[k])
    i += 1
    if i > 5:
        break

9265 daz
freq= 6
13082 ljubljana
freq= 8
323 cardin
freq= 2134
14173 paok
freq= 4
18622 plaster
freq= 3
2174 glove
freq= 58


In [72]:
# check results
i = 0
for k, v in art_dictionary.iteritems():
    print(k, v)
    print("freq=", art_dictionary.dfs[k])
    i += 1
    if i > 5:
        break

9265 daz
freq= 1
13082 ljubljana
freq= 2
323 cardin
freq= 407
14173 paok
freq= 1
18622 plaster
freq= 1
2174 glove
freq= 9


In [57]:
type(art_dictionary.dfs)

dict

In [85]:
top_freq_word_ids = sorted(art_dictionary.dfs, key=art_dictionary.dfs.__getitem__, reverse=True)[:20]
top_freq_word_ids[:5]

[0, 5, 38, 2, 329]

In [86]:
for idx in top_freq_word_ids:
    print("token=", art_dictionary[idx], "\tfreq=", art_dictionary.dfs[idx])

token= ORG 	freq= 147752
token= DATE 	freq= 90967
token= GPE 	freq= 68997
token= footbal 	freq= 67015
token= team 	freq= 46758
token= season 	freq= 38095
token= PERSON 	freq= 31308
token= school 	freq= 27011
token= NORP 	freq= 21380
token= basketbal 	freq= 21021
token= high 	freq= 20430
token= list 	freq= 18015
token= american 	freq= 17992
token= EVENT 	freq= 16458
token= CARDINAL 	freq= 15315
token= state 	freq= 14340
token= leagu 	freq= 14116
token= bowl 	freq= 11316
token= men 	freq= 11030
token= basebal 	freq= 9589


In [46]:
round(len(article_docs)*0.5)

31362

In [47]:
# Filtering
#dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n= 100000)
#art_dictionary.filter_extremes(no_above=0.1)
#art_dictionary.filter_extremes(no_below=10)
art_dictionary.filter_extremes(keep_n=round(len(article_docs)*0.5))

In [48]:
# make bags of words for each doc in docs list, forming the corpus input for LDA model

print("Started running at", datetime.now(), "UTC")
start_time = timer()

art_bow_corpus = [art_dictionary.doc2bow(doc) for doc in article_docs]

cu.printRunTime(start_time)

Started running at 2019-03-03 18:12:34.019022 UTC


Runtime: 0.01 min



In [49]:

document_num = 0
bow_doc_x = art_bow_corpus[document_num]

for i in range(len(bow_doc_x[:10])):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0], 
                                                     art_dictionary[bow_doc_x[i][0]], 
                                                     bow_doc_x[i][1]))

Word 0 ("ORG") appears 1 time.
Word 1 ("bowl") appears 1 time.
Word 2 ("footbal") appears 1 time.
Word 3 ("game") appears 1 time.
Word 4 ("ncaa") appears 1 time.


In [50]:
# Run LDA multicore

print("Started running at", datetime.now(), "UTC")
start_time = timer()

lda_model =  gensim.models.LdaMulticore(art_bow_corpus, 
                                   num_topics = 3, 
                                   id2word = art_dictionary,                                    
                                   passes = 20,
                                   workers = 4)


cu.printRunTime(start_time)

Started running at 2019-03-03 18:12:44.185875 UTC


Runtime: 2.46 min



In [51]:
# ntopics = 3
for idx, topic in lda_model.print_topics(num_topics=-1, num_words=20):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.258*"GPE" + 0.128*"ORG" + 0.082*"school" + 0.056*"high" + 0.016*"FAC" + 0.016*"district" + 0.014*"stadium" + 0.013*"list" + 0.013*"citi" + 0.011*"fc" + 0.010*"sport" + 0.010*"north" + 0.010*"ohio" + 0.010*"carolina" + 0.010*"counti" + 0.009*"south" + 0.009*"field" + 0.009*"kansa" + 0.008*"wildcat" + 0.008*"LOC"


Topic: 1 
Words: 0.101*"DATE" + 0.088*"PERSON" + 0.068*"ORG" + 0.064*"footbal" + 0.056*"NORP" + 0.053*"EVENT" + 0.042*"american" + 0.042*"leagu" + 0.029*"basketbal" + 0.029*"basebal" + 0.025*"bowl" + 0.019*"cup" + 0.019*"championship" + 0.019*"nation" + 0.016*"list" + 0.016*"women" + 0.014*"game" + 0.013*"fiba" + 0.012*"divis" + 0.012*"ncaa"


Topic: 2 
Words: 0.167*"ORG" + 0.139*"DATE" + 0.113*"team" + 0.097*"footbal" + 0.075*"season" + 0.030*"CARDINAL" + 0.027*"state" + 0.017*"basketbal" + 0.013*"men" + 0.009*"new" + 0.008*"tiger" + 0.008*"univers" + 0.008*"colleg" + 0.006*"york" + 0.006*"bulldog" + 0.005*"nfl" + 0.005*"golden" + 0.005*"chicago" + 0.005*"g