In [1]:
import pandas as pd
import numpy as np

import re

from py2neo import authenticate, Graph, Node, Relationship


import os
import csv
import pickle

from time import sleep
from timeit import default_timer as timer
from datetime import datetime

from IPython.display import display, HTML

# custom general helper functions for this project
import custom_utils as cu
import importlib


In [2]:
from collections import defaultdict

In [3]:
# Note: not sure why, but on running this gensim import the kernel kept dying.
# Running the following in command line fixed it:
# conda install -f numpy
import gensim

In [4]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

In [5]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/arinai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# reload imports as needed
importlib.reload(cu);

In [7]:
# set up Pandas options
pd.set_option('display.max_columns', 25)
pd.set_option('display.max_rows', 100)
pd.set_option('display.precision', 3)
pd.options.display.float_format = '{:.2f}'.format

In [8]:
pd.options.display.max_colwidth = 100

In [59]:
# unpickle
with open("pickles/en_1218_louvain_communities_for_NLP.pkl", 'rb') as picklefile: 
    louvain_communities_for_NLP = pickle.load(picklefile)

louvain_communities_for_NLP.head(20)

Unnamed: 0,external_search_traffic,link_in_traffic,louvain_community,search_in_traffic,title
0,4576854.0,1108189.0,3,5630.0,George_H._W._Bush
1,3538068.0,639353.0,4,6451.0,Jason_Momoa
2,3475113.0,223635.0,9,23563.0,2.0_(film)
3,3251996.0,682992.0,4,10416.0,Bird_Box_(film)
4,3020671.0,31170.0,1,,Main_Page
5,2634665.0,408421.0,4,34309.0,Aquaman_(film)
6,2328884.0,200893.0,4,192.0,Bird_Box
7,2231176.0,575481.0,3,3945.0,Priyanka_Chopra
8,2226602.0,117115.0,5,958.0,List_of_most-disliked_YouTube_videos
9,2050628.0,336621.0,5,4161.0,Freddie_Mercury


In [60]:
len(louvain_communities_for_NLP)

2729767

##### Pre-process article titles

In [61]:
louvain_communities_for_NLP_proc =louvain_communities_for_NLP.copy()
louvain_communities_for_NLP_proc.rename(index=str,columns={'title':'title_raw'}, inplace=True)
louvain_communities_for_NLP_proc.head(5)

Unnamed: 0,external_search_traffic,link_in_traffic,louvain_community,search_in_traffic,title_raw
0,4576854.0,1108189.0,3,5630.0,George_H._W._Bush
1,3538068.0,639353.0,4,6451.0,Jason_Momoa
2,3475113.0,223635.0,9,23563.0,2.0_(film)
3,3251996.0,682992.0,4,10416.0,Bird_Box_(film)
4,3020671.0,31170.0,1,,Main_Page


In [63]:
# the importance of words will be weighted by search and link traffic
louvain_communities_for_NLP_proc["weight"] = louvain_communities_for_NLP_proc[[
        "external_search_traffic", "link_in_traffic", "search_in_traffic"]].sum(axis=1).astype('int64').fillna(0)

louvain_communities_for_NLP_proc.drop(["external_search_traffic", "link_in_traffic", "search_in_traffic"], 
                                      axis = 1,
                                      inplace=True)

In [64]:
louvain_communities_for_NLP_proc.head(5)

Unnamed: 0,louvain_community,title_raw,weight
0,3,George_H._W._Bush,5690673
1,4,Jason_Momoa,4183872
2,9,2.0_(film),3722311
3,4,Bird_Box_(film),3945404
4,1,Main_Page,3051841


In [66]:
# clean up the title
louvain_communities_for_NLP_proc["title"] = \
    louvain_communities_for_NLP_proc.title_raw.str.replace('_', ' ')
    
louvain_communities_for_NLP_proc.head(5)

Unnamed: 0,louvain_community,title_raw,weight,title
0,3,George_H._W._Bush,5690673,George H. W. Bush
1,4,Jason_Momoa,4183872,Jason Momoa
2,9,2.0_(film),3722311,2.0 (film)
3,4,Bird_Box_(film),3945404,Bird Box (film)
4,1,Main_Page,3051841,Main Page


In [68]:
louvain_communities_for_NLP_proc.describe()

Unnamed: 0,louvain_community,weight
count,2729767.0,2729767.0
mean,11.48,1731.93
std,64.77,13116.59
min,0.0,0.0
25%,3.0,66.0
50%,5.0,200.0
75%,10.0,735.0
max,1697.0,5690673.0


In [29]:
# Use spaCy to get Named Entities

# installation instructions: https://spacy.io/usage/

import spacy
nlp = spacy.load('en', disable=['parser', 'tagger'])

In [30]:
nlp.pipeline

[('ner', <spacy.pipeline.EntityRecognizer at 0x7ff34717b780>)]

In [None]:
print("Started running at", datetime.now(), "UTC")

start_time = timer()

nrows = len(louvain_communities_for_NLP_proc)
print("Total number of rows to process:", nrows, "\n")

for i in range(nrows):
    txt = louvain_communities_for_NLP_proc.iloc[i].title

    doc = nlp(txt)

    ents_arr = []
    for ent in doc.ents:
        ents_arr.append(ent.label_)
    
    louvain_communities_for_NLP_proc.at[str(i), "named_entities"] = " ".join(ents_arr)
    
    if (len(louvain_communities_for_NLP_proc) > nrows):
        print("ERROR!! Dataset length has increased.")
        print("Rows processed:", round(i * 100/len(louvain_communities_for_NLP_proc), 4), "%,", "count = ", i )
        print("Elapsed time:", round((timer() - start_time)/60, 4), "min\n")
        print("Last row's processing vars:")
        print("i=", i, "txt=", txt, "ents_arr=", ents_arr, "\n",
              "updated data row:\n", louvain_communities_for_NLP_proc.iloc[i], "\n")
        break
    
    if (i % 100000 == 0):
        print("Rows processed:", round(i * 100/len(louvain_communities_for_NLP_proc), 4), "%,", "count = ", i )
        print("Elapsed time:", round((timer() - start_time)/60, 4), "min\n")
        print("Last row's processing vars:")
        print("i=", i, "txt=", txt, "ents_arr=", ents_arr, "\n",
              "updated data row:\n", louvain_communities_for_NLP_proc.iloc[i], "\n")
    

cu.printRunTime(start_time)

Started running at 2019-03-01 19:57:26.153032 UTC
Total number of rows to process: 2729767 

Rows processed: 0.0 %, count =  0
Elapsed time: 0.0001 min

Last row's processing vars:
i= 0 txt= George H. W. Bush ents_arr= ['PERSON'] 
 updated data row:
 louvain_community                    3
title_raw            George_H._W._Bush
weight                         5690673
title                George H. W. Bush
named_entities                  PERSON
Name: 0, dtype: object 

Rows processed: 3.6633 %, count =  100000
Elapsed time: 6.8751 min

Last row's processing vars:
i= 100000 txt= List of cities and boroughs in Pennsylvania by population ents_arr= ['GPE'] 
 updated data row:
 louvain_community                                                           10
title_raw            List_of_cities_and_boroughs_in_Pennsylvania_by_population
weight                                                                    5628
title                List of cities and boroughs in Pennsylvania by population
named

The code above took a few hours to run, and finished successfully, but the browser tunnel got interrupted at some point, so the print statements above is incomplete, but the dataset has been successfully populated with named_entities.

In [86]:
nrows

2729767

In [85]:
i

2729766

In [87]:
txt

'Christgau (disambiguation)'

In [93]:
louvain_communities_for_NLP_proc.iloc[i]

louvain_community                             5
title_raw            Christgau_(disambiguation)
weight                                       18
title                Christgau (disambiguation)
named_entities                              GPE
Name: 2729766, dtype: object

In [89]:
louvain_communities_for_NLP_proc.head(20)

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities
0,3,George_H._W._Bush,5690673,George H. W. Bush,PERSON
1,4,Jason_Momoa,4183872,Jason Momoa,PERSON
2,9,2.0_(film),3722311,2.0 (film),CARDINAL
3,4,Bird_Box_(film),3945404,Bird Box (film),PERSON
4,1,Main_Page,3051841,Main Page,PERSON
5,4,Aquaman_(film),3077395,Aquaman (film),
6,4,Bird_Box,2529969,Bird Box,PERSON
7,3,Priyanka_Chopra,2810602,Priyanka Chopra,PERSON
8,5,List_of_most-disliked_YouTube_videos,2344675,List of most-disliked YouTube videos,ORG
9,5,Freddie_Mercury,2391410,Freddie Mercury,ORG


In [90]:
louvain_communities_for_NLP_proc.tail(20)

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities
2729747,16,Mantle_of_Luís_I,15,Mantle of Luís I,
2729748,13,List_of_Zero:_Black_Blood_episodes,14,List of Zero: Black Blood episodes,CARDINAL EVENT
2729749,22,Moskvitch_404_Sport,15,Moskvitch 404 Sport,CARDINAL
2729750,3,Caridina_loehae,11,Caridina loehae,GPE
2729751,3,Tatu_Miettunen,11,Tatu Miettunen,
2729752,2,HIST2H3C,10,HIST2H3C,
2729753,6,"Muk,_Iran",17,"Muk, Iran",GPE GPE
2729754,10,"Wila,_Missouri",67,"Wila, Missouri",PERSON GPE
2729755,3,Isaac_Rochussen,11,Isaac Rochussen,PERSON
2729756,7,Raffaello_Bertieri,26,Raffaello Bertieri,PERSON


The named entities assignment is not perfect, for example, both "Main Page" and "Bird Box" articles were labeled "PERSON", but in many cases it did quite well.  
The "Main Page" article seems to be an important node in the network, so let's fix its named_entities value.

In [95]:
louvain_communities_for_NLP_proc.iloc[4]

louvain_community            1
title_raw            Main_Page
weight                 3051841
title                Main Page
named_entities          PERSON
Name: 4, dtype: object

In [103]:
louvain_communities_for_NLP_proc.at[str(4), "named_entities"] = ''

In [104]:
louvain_communities_for_NLP_proc.iloc[4]

louvain_community            1
title_raw            Main_Page
weight                 3051841
title                Main Page
named_entities                
Name: 4, dtype: object

In [106]:
louvain_communities_for_NLP_proc.head(10)

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities
0,3,George_H._W._Bush,5690673,George H. W. Bush,PERSON
1,4,Jason_Momoa,4183872,Jason Momoa,PERSON
2,9,2.0_(film),3722311,2.0 (film),CARDINAL
3,4,Bird_Box_(film),3945404,Bird Box (film),PERSON
4,1,Main_Page,3051841,Main Page,
5,4,Aquaman_(film),3077395,Aquaman (film),
6,4,Bird_Box,2529969,Bird Box,PERSON
7,3,Priyanka_Chopra,2810602,Priyanka Chopra,PERSON
8,5,List_of_most-disliked_YouTube_videos,2344675,List of most-disliked YouTube videos,ORG
9,5,Freddie_Mercury,2391410,Freddie Mercury,ORG


In [109]:
# pickle the output
myoutfile = "pickles/en_1218_louvain_communities_for_NLP_proc.pkl"
with open(myoutfile, 'wb') as picklefile:
    pickle.dump(louvain_communities_for_NLP_proc, picklefile)

print("Pickle created: " + myoutfile)

Pickle created: pickles/en_1218_louvain_communities_for_NLP_proc.pkl


In [37]:
# unpickle
with open("pickles/en_1218_louvain_communities_for_NLP_proc.pkl", 'rb') as picklefile: 
    louvain_communities_for_NLP_proc = pickle.load(picklefile)

louvain_communities_for_NLP_proc.head(20)

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities
0,3,George_H._W._Bush,5690673,George H. W. Bush,PERSON
1,4,Jason_Momoa,4183872,Jason Momoa,PERSON
2,9,2.0_(film),3722311,2.0 (film),CARDINAL
3,4,Bird_Box_(film),3945404,Bird Box (film),PERSON
4,1,Main_Page,3051841,Main Page,
5,4,Aquaman_(film),3077395,Aquaman (film),
6,4,Bird_Box,2529969,Bird Box,PERSON
7,3,Priyanka_Chopra,2810602,Priyanka Chopra,PERSON
8,5,List_of_most-disliked_YouTube_videos,2344675,List of most-disliked YouTube videos,ORG
9,5,Freddie_Mercury,2391410,Freddie Mercury,ORG


In [30]:
louvain_communities_for_NLP_proc.named_entities.value_counts()

PERSON                                830645
                                      814109
ORG                                   390291
GPE                                   170318
DATE                                   46268
NORP                                   38910
GPE GPE                                38162
CARDINAL                               29662
ORG GPE                                28779
PERSON GPE                             21082
LOC                                    16358
DATE ORG                               15102
FAC                                    14702
PERSON PERSON                          14446
DATE EVENT                             14300
PERSON DATE                            14223
PERSON ORG                             11964
ORG ORG                                11261
EVENT                                  10618
DATE GPE                               10552
ORG DATE                                9295
WORK_OF_ART                             8606
ORG PERSON

In [36]:
louvain_communities_for_NLP_proc[louvain_communities_for_NLP_proc.louvain_community == 14][:20]

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities
2175,14,List_of_NHL_statistical_leaders,102869,List of NHL statistical leaders,ORG
3019,14,2019_World_Junior_Ice_Hockey_Championships,117350,2019 World Junior Ice Hockey Championships,DATE ORG
3411,14,Wayne_Gretzky,97377,Wayne Gretzky,PERSON
3480,14,24Hours,74156,24Hours,
4259,14,IIHF_World_U20_Championship,78838,IIHF World U20 Championship,ORG
4548,14,List_of_Stanley_Cup_champions,73879,List of Stanley Cup champions,EVENT
5399,14,Alexander_Ovechkin,69756,Alexander Ovechkin,PERSON
5408,14,2018_World_Junior_Ice_Hockey_Championships,76122,2018 World Junior Ice Hockey Championships,DATE EVENT
5525,14,Sidney_Crosby,62898,Sidney Crosby,PERSON
5554,14,Spengler_Cup,56905,Spengler Cup,EVENT


##### Lemmatizing and stemming

In [51]:
stemmer = SnowballStemmer("english")

In [39]:
def parse_title(title):
    # words = re.sub('[-():]', " ", title).split()
    words = gensim.utils.simple_preprocess(title)
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    
    parsed=[]
    for word in words:
        if word not in stopwords:
            lemmatized = stemmer.stem(WordNetLemmatizer().lemmatize(word, pos='v'))
            parsed.append(lemmatized)
            
    return parsed

In [40]:
print("Started running at", datetime.now(), "UTC")

start_time = timer()

nrows = len(louvain_communities_for_NLP_proc)
print("Num of rows:", nrows)

for i in range(nrows):
    txt = louvain_communities_for_NLP_proc.iloc[i].title
    parsed = parse_title(txt)
    
    louvain_communities_for_NLP_proc.at[str(i), "title_parsed"] = " ".join(parsed)
    
cu.printRunTime(start_time)

louvain_communities_for_NLP_proc.head(20)

Started running at 2019-03-03 03:58:15.950026 UTC
Num of rows: 2729767


Runtime: 12.51 min



Unnamed: 0,louvain_community,title_raw,weight,title,named_entities,title_parsed
0,3,George_H._W._Bush,5690673,George H. W. Bush,PERSON,georg bush
1,4,Jason_Momoa,4183872,Jason Momoa,PERSON,jason momoa
2,9,2.0_(film),3722311,2.0 (film),CARDINAL,film
3,4,Bird_Box_(film),3945404,Bird Box (film),PERSON,bird box film
4,1,Main_Page,3051841,Main Page,,main page
5,4,Aquaman_(film),3077395,Aquaman (film),,aquaman film
6,4,Bird_Box,2529969,Bird Box,PERSON,bird box
7,3,Priyanka_Chopra,2810602,Priyanka Chopra,PERSON,priyanka chopra
8,5,List_of_most-disliked_YouTube_videos,2344675,List of most-disliked YouTube videos,ORG,list dislik youtub video
9,5,Freddie_Mercury,2391410,Freddie Mercury,ORG,freddi mercuri


In [41]:
# pickle the output
myoutfile = "pickles/en_1218_louvain_communities_for_NLP_proc_2.pkl"
with open(myoutfile, 'wb') as picklefile:
    pickle.dump(louvain_communities_for_NLP_proc, picklefile)

print("Pickle created: " + myoutfile)

Pickle created: pickles/en_1218_louvain_communities_for_NLP_proc_2.pkl


In [9]:
# unpickle
with open("pickles/en_1218_louvain_communities_for_NLP_proc_2.pkl", 'rb') as picklefile: 
    louvain_communities_for_NLP_proc = pickle.load(picklefile)

louvain_communities_for_NLP_proc.head(20)

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities,title_parsed
0,3,George_H._W._Bush,5690673,George H. W. Bush,PERSON,georg bush
1,4,Jason_Momoa,4183872,Jason Momoa,PERSON,jason momoa
2,9,2.0_(film),3722311,2.0 (film),CARDINAL,film
3,4,Bird_Box_(film),3945404,Bird Box (film),PERSON,bird box film
4,1,Main_Page,3051841,Main Page,,main page
5,4,Aquaman_(film),3077395,Aquaman (film),,aquaman film
6,4,Bird_Box,2529969,Bird Box,PERSON,bird box
7,3,Priyanka_Chopra,2810602,Priyanka Chopra,PERSON,priyanka chopra
8,5,List_of_most-disliked_YouTube_videos,2344675,List of most-disliked YouTube videos,ORG,list dislik youtub video
9,5,Freddie_Mercury,2391410,Freddie Mercury,ORG,freddi mercuri


In [19]:
louvain_communities_for_NLP_proc["scaled_weight"] = np.log(louvain_communities_for_NLP_proc.weight)

In [11]:
louvain_communities_for_NLP_proc.head()

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities,title_parsed,scaled_weight
0,3,George_H._W._Bush,5690673,George H. W. Bush,PERSON,georg bush,15.55
1,4,Jason_Momoa,4183872,Jason Momoa,PERSON,jason momoa,15.25
2,9,2.0_(film),3722311,2.0 (film),CARDINAL,film,15.13
3,4,Bird_Box_(film),3945404,Bird Box (film),PERSON,bird box film,15.19
4,1,Main_Page,3051841,Main Page,,main page,14.93


#### Bags of words

##### Dataset bag of words

Let's try looking at the topics individually: a given topic is the dataset, and a parsed article title is a doc.

In [12]:
louvain_communities_for_NLP_proc[louvain_communities_for_NLP_proc.louvain_community == 11].head(5)

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities,title_parsed,scaled_weight
74,11,Patrick_Mahomes,669193,Patrick Mahomes,PERSON,patrick mahom,13.41
78,11,Kyler_Murray,630603,Kyler Murray,PERSON,kyler murray,13.35
103,11,LeBron_James,621114,LeBron James,PERSON,lebron jam,13.34
110,11,Philip_Rivers,544167,Philip Rivers,PERSON,philip river,13.21
120,11,2018–19_NCAA_football_bowl_games,514981,2018–19 NCAA football bowl games,ORG,ncaa footbal bowl game,13.15


In [13]:
test_community = louvain_communities_for_NLP_proc[louvain_communities_for_NLP_proc.louvain_community == 11].copy()

In [14]:
test_community.describe()

Unnamed: 0,louvain_community,weight,scaled_weight
count,110397.0,110397.0,110397.0
mean,11.0,1584.24,-inf
std,0.0,9576.18,
min,11.0,0.0,-inf
25%,11.0,62.0,4.13
50%,11.0,171.0,5.14
75%,11.0,609.0,6.41
max,11.0,669193.0,13.41


In [16]:
test_community[(test_community.named_entities != "PERSON")].head(5)

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities,title_parsed,scaled_weight
120,11,2018–19_NCAA_football_bowl_games,514981,2018–19 NCAA football bowl games,ORG,ncaa footbal bowl game,13.15
252,11,2019_College_Football_Playoff_National_Championship,367824,2019 College Football Playoff National Championship,DATE,colleg footbal playoff nation championship,12.82
296,11,Urban_Meyer,326946,Urban Meyer,,urban meyer,12.7
312,11,College_Football_Playoff,346983,College Football Playoff,,colleg footbal playoff,12.76
323,11,List_of_Super_Bowl_champions,350514,List of Super Bowl champions,EVENT,list super bowl champion,12.77


In [17]:
test_community = test_community[(test_community.named_entities != "PERSON")]

In [36]:
len(test_community)

62724

In [18]:
test_community.head()

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities,title_parsed,scaled_weight
120,11,2018–19_NCAA_football_bowl_games,514981,2018–19 NCAA football bowl games,ORG,ncaa footbal bowl game,13.15
252,11,2019_College_Football_Playoff_National_Championship,367824,2019 College Football Playoff National Championship,DATE,colleg footbal playoff nation championship,12.82
296,11,Urban_Meyer,326946,Urban Meyer,,urban meyer,12.7
312,11,College_Football_Playoff,346983,College Football Playoff,,colleg footbal playoff,12.76
323,11,List_of_Super_Bowl_champions,350514,List of Super Bowl champions,EVENT,list super bowl champion,12.77


In [77]:
print("Started running at", datetime.now(), "UTC")
start_time = timer()

nrows = len(test_community)
# nrows = 10
print("Num of rows:", nrows)

article_docs = []

for i in range(nrows):
    title_parsed_arr = test_community.iloc[i].title_parsed.split()
    named_entities_arr = test_community.iloc[i].named_entities.split()
    weight = max(round(test_community.iloc[i].scaled_weight).astype("int64"), 1)
    
    # article_docs.append((title_parsed_arr + named_entities_arr) * weight)
    for i in range(weight):
        article_docs.append(title_parsed_arr + named_entities_arr)
    #article_docs.append(title_parsed_arr + named_entities_arr)
    
    
    if (i % 10000 == 0):
        print("Rows processed:", round(i * 100/len(test_community), 4), "%,", "count = ", i )
        print("Elapsed time:", round((timer() - start_time)/60, 4), "min\n")
        print("Last row's processing vars:")
        print("i=", i, "title_parsed_arr=", title_parsed_arr, 
              "named_entities_arr=", named_entities_arr,
              "weight=", weight, "\n")
    
cu.printRunTime(start_time)


Started running at 2019-03-03 18:49:19.305972 UTC
Num of rows: 62724
Rows processed: 0.0 %, count =  0
Elapsed time: 0.4626 min

Last row's processing vars:
i= 0 title_parsed_arr= ['buffalo', 'american', 'season'] named_entities_arr= ['CARDINAL', 'GPE', 'NORP'] weight= 1 

Rows processed: 0.0 %, count =  0
Elapsed time: 0.4629 min

Last row's processing vars:
i= 0 title_parsed_arr= ['yale', 'bulldog', 'footbal', 'team'] named_entities_arr= ['DATE', 'ORG'] weight= 1 

Rows processed: 0.0 %, count =  0
Elapsed time: 0.4631 min

Last row's processing vars:
i= 0 title_parsed_arr= ['atlant', 'sun'] named_entities_arr= ['ORG'] weight= 1 

Rows processed: 0.0 %, count =  0
Elapsed time: 0.4632 min

Last row's processing vars:
i= 0 title_parsed_arr= ['detroit', 'tiger', 'season'] named_entities_arr= ['DATE'] weight= 1 

Rows processed: 0.0 %, count =  0
Elapsed time: 0.4633 min

Last row's processing vars:
i= 0 title_parsed_arr= ['stauska'] named_entities_arr= [] weight= 1 

Rows processed: 0.

Runtime: 0.53 min



In [78]:
# this is just an id2word mapping (the integer keys here are ids, not counts)
art_dictionary = gensim.corpora.Dictionary(article_docs)

# check results
i = 0
for k, v in art_dictionary.iteritems():
    print(k, v)
    print("freq=", art_dictionary.dfs[k])
    i += 1
    if i > 5:
        break

9265 daz
13082 ljubljana
323 cardin
14173 paok
18622 plaster
2174 glove


In [86]:
top_freq_word_ids = sorted(art_dictionary.dfs, key=art_dictionary.dfs.__getitem__, reverse=True)[:20]

for idx in top_freq_word_ids:
    print("token=", art_dictionary[idx], "\tfreq=", art_dictionary.dfs[idx])

token= ORG 	freq= 147752
token= DATE 	freq= 90967
token= GPE 	freq= 68997
token= footbal 	freq= 67015
token= team 	freq= 46758
token= season 	freq= 38095
token= PERSON 	freq= 31308
token= school 	freq= 27011
token= NORP 	freq= 21380
token= basketbal 	freq= 21021
token= high 	freq= 20430
token= list 	freq= 18015
token= american 	freq= 17992
token= EVENT 	freq= 16458
token= CARDINAL 	freq= 15315
token= state 	freq= 14340
token= leagu 	freq= 14116
token= bowl 	freq= 11316
token= men 	freq= 11030
token= basebal 	freq= 9589


Let's try another test community

In [87]:
louvain_communities_for_NLP_proc[louvain_communities_for_NLP_proc.louvain_community == 3].head(5)

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities,title_parsed,scaled_weight
0,3,George_H._W._Bush,5690673,George H. W. Bush,PERSON,georg bush,15.55
7,3,Priyanka_Chopra,2810602,Priyanka Chopra,PERSON,priyanka chopra,14.85
10,3,Zero_(2018_film),2045050,Zero (2018 film),CARDINAL DATE,zero film,14.53
11,3,George_W._Bush,2584078,George W. Bush,PERSON,georg bush,14.76
16,3,Jimmy_Carter,1815966,Jimmy Carter,PERSON,jimmi carter,14.41


In [102]:
test_community2 = louvain_communities_for_NLP_proc[louvain_communities_for_NLP_proc.louvain_community == 3].copy()

In [103]:
len(test_community2)

507642

In [90]:
#test_community2 = test_community2[(test_community2.named_entities != "PERSON")]

In [91]:
#len(test_community2)

329089

In [107]:
test_community2.head(10)

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities,title_parsed,scaled_weight
0,3,George_H._W._Bush,5690673,George H. W. Bush,PERSON,georg bush,15.55
7,3,Priyanka_Chopra,2810602,Priyanka Chopra,PERSON,priyanka chopra,14.85
10,3,Zero_(2018_film),2045050,Zero (2018 film),CARDINAL DATE,zero film,14.53
11,3,George_W._Bush,2584078,George W. Bush,PERSON,georg bush,14.76
16,3,Jimmy_Carter,1815966,Jimmy Carter,PERSON,jimmi carter,14.41
18,3,List_of_highest-grossing_Indian_films,1580445,List of highest-grossing Indian films,NORP,list highest gross indian film,14.27
19,3,"Mary,_Queen_of_Scots",1364237,"Mary, Queen of Scots",,mari queen scot,14.13
20,3,Ole_Gunnar_Solskjær,1215892,Ole Gunnar Solskjær,,ole gunnar solskjær,14.01
21,3,Mukesh_Ambani,1415302,Mukesh Ambani,PERSON,mukesh ambani,14.16
22,3,Miguel_Ángel_Félix_Gallardo,1534598,Miguel Ángel Félix Gallardo,PERSON,miguel ángel félix gallardo,14.24


In [120]:
print("Started running at", datetime.now(), "UTC")
start_time = timer()

nrows = len(test_community2)
# nrows = 10
print("Num of rows:", nrows)

#article_docs = []
title_docs = []
ner_docs = []

last_i = 0

for i in range(nrows):
    title_parsed_arr = test_community2.iloc[i].title_parsed.split()
    named_entities_arr = test_community2.iloc[i].named_entities.split()
    weight = max(round(test_community2.iloc[i].scaled_weight).astype("int64"), 1)
    
    # article_docs.append((title_parsed_arr + named_entities_arr) * weight)
    for j in range(weight):
        #article_docs.append(title_parsed_arr + named_entities_arr)
        title_docs.append(title_parsed_arr)
        ner_docs.append(named_entities_arr)
    #article_docs.append(title_parsed_arr + named_entities_arr)
    
    
    if ( (i % 100000 == 0) and (i != last_i) ):
        print("Rows processed:", round(i * 100/len(test_community2), 4), "%,", "count = ", i )
        print("Elapsed time:", round((timer() - start_time)/60, 4), "min\n")
        print("Last row's processing vars:")
        print("i=", i, "title_parsed_arr=", title_parsed_arr, 
              "named_entities_arr=", named_entities_arr,
              "weight=", weight, "\n")
        last_i = i
    
cu.printRunTime(start_time)


Started running at 2019-03-03 19:42:08.502904 UTC
Num of rows: 507642
Rows processed: 19.6989 %, count =  100000
Elapsed time: 0.7801 min

Last row's processing vars:
i= 100000 title_parsed_arr= ['kadir', 'mısıroğlu'] named_entities_arr= ['ORG'] weight= 6 

Rows processed: 39.3978 %, count =  200000
Elapsed time: 1.67 min

Last row's processing vars:
i= 200000 title_parsed_arr= ['key', 'yes', 'prime', 'minist'] named_entities_arr= [] weight= 6 

Rows processed: 59.0968 %, count =  300000
Elapsed time: 2.4591 min

Last row's processing vars:
i= 300000 title_parsed_arr= ['america', 'initi'] named_entities_arr= ['CARDINAL', 'GPE'] weight= 4 

Rows processed: 78.7957 %, count =  400000
Elapsed time: 3.2543 min

Last row's processing vars:
i= 400000 title_parsed_arr= ['afghan', 'morphin'] named_entities_arr= ['NORP'] weight= 5 

Rows processed: 98.4946 %, count =  500000
Elapsed time: 4.0397 min

Last row's processing vars:
i= 500000 title_parsed_arr= ['search', 'foundat'] named_entities_ar

Runtime: 4.1 min



In [121]:
# this is just an id2word mapping (the integer keys here are ids, not counts)
titles_dictionary = gensim.corpora.Dictionary(title_docs)

# check results
i = 0
for k, v in titles_dictionary.iteritems():
    print(k, v)
    print("freq=", titles_dictionary.dfs[k])
    i += 1
    if i > 5:
        break

108460 dimashqi
freq= 8
152138 aberach
freq= 3
31534 epn
freq= 7
148680 vojta
freq= 4
18912 andimuthu
freq= 8
1627 delaney
freq= 69


In [122]:
top_freq_word_ids = sorted(titles_dictionary.dfs, key=titles_dictionary.dfs.__getitem__, reverse=True)[:20]

for idx in top_freq_word_ids:
    print("token=", titles_dictionary[idx], "\tfreq=", titles_dictionary.dfs[idx])

token= list 	freq= 86275
token= unit 	freq= 62493
token= state 	freq= 55433
token= footbal 	freq= 49587
token= elect 	freq= 41775
token= nation 	freq= 39424
token= th 	freq= 30185
token= constitu 	freq= 29297
token= parti 	freq= 26282
token= battl 	freq= 23894
token= john 	freq= 23292
token= film 	freq= 22708
token= st 	freq= 22624
token= district 	freq= 20653
token= war 	freq= 19766
token= cup 	freq= 17757
token= disambigu 	freq= 16889
token= india 	freq= 16388
token= leagu 	freq= 15638
token= world 	freq= 15583


In [123]:
# this is just an id2word mapping (the integer keys here are ids, not counts)
ner_dictionary = gensim.corpora.Dictionary(ner_docs)

# check results
i = 0
for k, v in ner_dictionary.iteritems():
    print(k, v)
    print("freq=", ner_dictionary.dfs[k])
    i += 1
    if i > 5:
        break

0 PERSON
freq= 1088529
11 ORDINAL
freq= 30215
3 NORP
freq= 112900
10 LOC
freq= 28239
4 GPE
freq= 426692
8 LANGUAGE
freq= 2923


In [124]:
top_freq_word_ids = sorted(ner_dictionary.dfs, key=ner_dictionary.dfs.__getitem__, reverse=True)[:5]

for idx in top_freq_word_ids:
    print("token=", ner_dictionary[idx], "\tfreq=", ner_dictionary.dfs[idx])

token= PERSON 	freq= 1088529
token= ORG 	freq= 559783
token= GPE 	freq= 426692
token= DATE 	freq= 171238
token= NORP 	freq= 112900


Let's try another community

In [117]:
louvain_communities_for_NLP_proc[louvain_communities_for_NLP_proc.louvain_community == 4].head(5)

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities,title_parsed,scaled_weight
1,4,Jason_Momoa,4183872,Jason Momoa,PERSON,jason momoa,15.25
3,4,Bird_Box_(film),3945404,Bird Box (film),PERSON,bird box film,15.19
5,4,Aquaman_(film),3077395,Aquaman (film),,aquaman film,14.94
6,4,Bird_Box,2529969,Bird Box,PERSON,bird box,14.74
14,4,Spider-Man:_Into_the_Spider-Verse,1840563,Spider-Man: Into the Spider-Verse,,spider man spider vers,14.43


In [126]:
test_community3 = louvain_communities_for_NLP_proc[louvain_communities_for_NLP_proc.louvain_community == 4].copy()

In [127]:
len(test_community3)

267541

In [128]:
print("Started running at", datetime.now(), "UTC")
start_time = timer()

comm = test_community3

nrows = len(comm)
# nrows = 10
print("Num of rows:", nrows)

#article_docs = []
title_docs = []
ner_docs = []

last_i = 0

for i in range(nrows):
    title_parsed_arr = comm.iloc[i].title_parsed.split()
    named_entities_arr = comm.iloc[i].named_entities.split()
    weight = max(round(comm.iloc[i].scaled_weight).astype("int64"), 1)
    
    # article_docs.append((title_parsed_arr + named_entities_arr) * weight)
    for j in range(weight):
        #article_docs.append(title_parsed_arr + named_entities_arr)
        title_docs.append(title_parsed_arr)
        ner_docs.append(named_entities_arr)
    #article_docs.append(title_parsed_arr + named_entities_arr)
    
    
    if ( (i % 100000 == 0) and (i != last_i) ):
        print("Rows processed:", round(i * 100/len(comm), 4), "%,", "count = ", i )
        print("Elapsed time:", round((timer() - start_time)/60, 4), "min\n")
        print("Last row's processing vars:")
        print("i=", i, "title_parsed_arr=", title_parsed_arr, 
              "named_entities_arr=", named_entities_arr,
              "weight=", weight, "\n")
        last_i = i
    
cu.printRunTime(start_time)


Started running at 2019-03-03 19:49:38.097522 UTC
Num of rows: 267541
Rows processed: 37.3774 %, count =  100000
Elapsed time: 0.763 min

Last row's processing vars:
i= 100000 title_parsed_arr= ['list', 'barnard', 'colleg', 'peopl'] named_entities_arr= ['ORG'] weight= 6 

Rows processed: 74.7549 %, count =  200000
Elapsed time: 1.5333 min

Last row's processing vars:
i= 200000 title_parsed_arr= ['brat'] named_entities_arr= ['PERSON'] weight= 5 



Runtime: 2.08 min



In [129]:
titles_dictionary = gensim.corpora.Dictionary(title_docs)


In [130]:
top_freq_word_ids = sorted(titles_dictionary.dfs, key=titles_dictionary.dfs.__getitem__, reverse=True)[:20]

for idx in top_freq_word_ids:
    print("token=", titles_dictionary[idx], "\tfreq=", titles_dictionary.dfs[idx])

token= film 	freq= 163619
token= list 	freq= 65412
token= seri 	freq= 46817
token= tv 	freq= 41297
token= actor 	freq= 23137
token= comic 	freq= 22412
token= season 	freq= 20305
token= award 	freq= 19508
token= disambigu 	freq= 18692
token= episod 	freq= 17019
token= school 	freq= 13981
token= john 	freq= 13645
token= charact 	freq= 13494
token= novel 	freq= 12723
token= man 	freq= 12648
token= star 	freq= 11155
token= love 	freq= 11142
token= song 	freq= 11061
token= album 	freq= 10191
token= david 	freq= 9642


In [131]:
ner_dictionary = gensim.corpora.Dictionary(ner_docs)


In [132]:
top_freq_word_ids = sorted(ner_dictionary.dfs, key=ner_dictionary.dfs.__getitem__, reverse=True)[:5]

for idx in top_freq_word_ids:
    print("token=", ner_dictionary[idx], "\tfreq=", ner_dictionary.dfs[idx])

token= PERSON 	freq= 731003
token= ORG 	freq= 229863
token= DATE 	freq= 125036
token= GPE 	freq= 98258
token= CARDINAL 	freq= 36041


And let's try a smaller community.

In [151]:
louvain_communities_for_NLP_proc[louvain_communities_for_NLP_proc.louvain_community == 22].head(5)

Unnamed: 0,louvain_community,title_raw,weight,title,named_entities,title_parsed,scaled_weight
983,22,Michael_Schumacher,178035,Michael Schumacher,PERSON,michael schumach,12.09
1154,22,"Tesla,_Inc.",197704,"Tesla, Inc.",ORG,tesla,12.19
1505,22,Mercedes-Benz,143990,Mercedes-Benz,ORG,merced benz,11.88
1693,22,Bitter_Cars,117034,Bitter Cars,,bitter car,11.67
2359,22,BMW,117757,BMW,ORG,bmw,11.68


In [152]:
test_community3 = louvain_communities_for_NLP_proc[louvain_communities_for_NLP_proc.louvain_community == 22].copy()

In [153]:
len(test_community3)

37983

In [154]:
print("Started running at", datetime.now(), "UTC")
start_time = timer()

comm = test_community3

nrows = len(comm)
# nrows = 10
print("Num of rows:", nrows)

#article_docs = []
title_docs = []
ner_docs = []

last_i = 0

for i in range(nrows):
    title_parsed_arr = comm.iloc[i].title_parsed.split()
    named_entities_arr = comm.iloc[i].named_entities.split()
    weight = max(round(comm.iloc[i].scaled_weight).astype("int64"), 1)
    
    # article_docs.append((title_parsed_arr + named_entities_arr) * weight)
    for j in range(weight):
        #article_docs.append(title_parsed_arr + named_entities_arr)
        title_docs.append(title_parsed_arr)
        ner_docs.append(named_entities_arr)
    #article_docs.append(title_parsed_arr + named_entities_arr)
    
    
    if ( (i % 100000 == 0) and (i != last_i) ):
        print("Rows processed:", round(i * 100/len(comm), 4), "%,", "count = ", i )
        print("Elapsed time:", round((timer() - start_time)/60, 4), "min\n")
        print("Last row's processing vars:")
        print("i=", i, "title_parsed_arr=", title_parsed_arr, 
              "named_entities_arr=", named_entities_arr,
              "weight=", weight, "\n")
        last_i = i
    
cu.printRunTime(start_time)


Started running at 2019-03-03 20:03:44.545706 UTC
Num of rows: 37983


Runtime: 0.29 min



In [155]:
titles_dictionary = gensim.corpora.Dictionary(title_docs)


In [156]:
top_freq_word_ids = sorted(titles_dictionary.dfs, key=titles_dictionary.dfs.__getitem__, reverse=True)[:20]

for idx in top_freq_word_ids:
    print("token=", titles_dictionary[idx], "\tfreq=", titles_dictionary.dfs[idx])

token= grand 	freq= 10329
token= prix 	freq= 9949
token= championship 	freq= 6733
token= seri 	freq= 6195
token= engin 	freq= 6132
token= race 	freq= 6073
token= car 	freq= 5593
token= formula 	freq= 4185
token= list 	freq= 4161
token= ford 	freq= 4154
token= honda 	freq= 3680
token= motorcycl 	freq= 3466
token= motor 	freq= 3441
token= world 	freq= 3335
token= automobil 	freq= 2688
token= toyota 	freq= 2684
token= driver 	freq= 2427
token= merced 	freq= 2388
token= bmw 	freq= 2303
token= benz 	freq= 2291


In [157]:
ner_dictionary = gensim.corpora.Dictionary(ner_docs)


In [158]:
top_freq_word_ids = sorted(ner_dictionary.dfs, key=ner_dictionary.dfs.__getitem__, reverse=True)[:5]

for idx in top_freq_word_ids:
    print("token=", ner_dictionary[idx], "\tfreq=", ner_dictionary.dfs[idx])

token= ORG 	freq= 65251
token= PERSON 	freq= 64819
token= DATE 	freq= 23494
token= GPE 	freq= 14602
token= CARDINAL 	freq= 11542


In [159]:
len(title_docs)

214594

In [160]:
ntd = len(title_docs)

top_freq_word_ids = sorted(ner_dictionary.dfs, key=ner_dictionary.dfs.__getitem__, reverse=True)[:5]

for idx in top_freq_word_ids:
    print("token=", ner_dictionary[idx], "\tfreq % =", ner_dictionary.dfs[idx]/ntd)

token= ORG 	freq % = 0.30406721529958897
token= PERSON 	freq % = 0.30205411148494365
token= DATE 	freq % = 0.10948115977147543
token= GPE 	freq % = 0.06804477292002573
token= CARDINAL 	freq % = 0.05378528756628797


#### Make community descriptors for all communities

In [14]:
louvain_community_ids = louvain_communities_for_NLP_proc.louvain_community.unique().tolist()
louvain_community_ids[:5]

[3, 4, 9, 1, 5]

In [15]:
len(louvain_community_ids)

1698

In [40]:
print("Started running at", datetime.now(), "UTC")
start_time = timer()

community_topics_dict = {}
count = 0

for cid in louvain_community_ids:
    # get a community
    comm = louvain_communities_for_NLP_proc[louvain_communities_for_NLP_proc.louvain_community == cid]
    
    # put together title and NER doc lists per community
    nrows = len(comm)
    title_docs = []
    ner_docs = []

    for i in range(nrows):
        title_parsed_arr = comm.iloc[i].title_parsed.split()
        named_entities_arr = comm.iloc[i].named_entities.split()
        weight = max(round(comm.iloc[i].scaled_weight).astype("int64"), 1)

        for j in range(weight):
            title_docs.append(title_parsed_arr)
            ner_docs.append(named_entities_arr)
    
    # Make a gensim dictionary from title docs
    titles_dictionary = gensim.corpora.Dictionary(title_docs)
    # Get top 20 most frequent words used in the article titles within the given community
    topic_word_ids = sorted(titles_dictionary.dfs, key=titles_dictionary.dfs.__getitem__, reverse=True)[:20]

    # store the top 20 title words and their weights (weighted frequencies, kinda)
    topic_words_arr = []
    topic_words_weights = []
    for wid in topic_word_ids:
        topic_words_arr.append(titles_dictionary[wid])
        topic_words_weights.append(titles_dictionary.dfs[wid])
    
    # Make a gensim dictionary from NER docs
    ner_dictionary = gensim.corpora.Dictionary(ner_docs)
    # Get top 20 most frequent NERs used in the article titles within the given community
    topic_ner_ids = sorted(ner_dictionary.dfs, key=ner_dictionary.dfs.__getitem__, reverse=True)[:20]

    # store the top 20 title words and their weights (weighted frequencies, kinda)
    topic_ner_arr = []
    topic_ner_weights = []
    for nid in topic_ner_ids:
        topic_ner_arr.append(ner_dictionary[nid])
        topic_ner_weights.append(ner_dictionary.dfs[nid])
    
    # store the community topic arrays in a dict
    community_topics_dict[cid] = {}
    community_topics_dict[cid]["topic_words"] = topic_words_arr
    community_topics_dict[cid]["topic_words_weights"] = topic_words_weights
    community_topics_dict[cid]["topic_ner"] = topic_ner_arr
    community_topics_dict[cid]["topic_ner_weights"] = topic_ner_weights
    
    
    if (count % 100 == 0):
        print("Communities processed:", round(count * 100/len(louvain_community_ids), 4), "%,", "count = ", count )
        print("Elapsed time:", round((timer() - start_time)/60, 4), "min\n")
        print("Last community's processing vars:")
        print("cid=", cid, "\ncommunity_topics_dict entry:\n", community_topics_dict[cid])
    count += 1
    
cu.printRunTime(start_time)

Started running at 2019-03-03 23:47:18.127757 UTC
Communities processed: 0.0 %, count =  0
Elapsed time: 4.5866 min

Last community's processing vars:
cid= 3 
community_topics_dict entry:
 {'topic_words': ['list', 'unit', 'state', 'footbal', 'elect', 'nation', 'th', 'constitu', 'parti', 'battl', 'john', 'film', 'st', 'district', 'war', 'cup', 'disambigu', 'india', 'leagu', 'world'], 'topic_words_weights': [86275, 62493, 55433, 49587, 41775, 39424, 30185, 29297, 26282, 23894, 23292, 22708, 22624, 20653, 19766, 17757, 16889, 16388, 15638, 15583], 'topic_ner': ['PERSON', 'ORG', 'GPE', 'DATE', 'NORP', 'CARDINAL', 'EVENT', 'ORDINAL', 'LOC', 'FAC', 'PRODUCT', 'LAW', 'WORK_OF_ART', 'LANGUAGE', 'QUANTITY', 'MONEY', 'TIME', 'PERCENT'], 'topic_ner_weights': [1088529, 559783, 426692, 171238, 112900, 43759, 37994, 30215, 28239, 14446, 5942, 3857, 3657, 2923, 1980, 811, 496, 112]}
Communities processed: 5.8893 %, count =  100
Elapsed time: 24.535 min

Last community's processing vars:
cid= 400 
com

Runtime: 24.78 min



In [41]:
community_topics_dict[22]

{'topic_ner': ['ORG',
  'PERSON',
  'DATE',
  'GPE',
  'CARDINAL',
  'EVENT',
  'PRODUCT',
  'NORP',
  'TIME',
  'ORDINAL',
  'FAC',
  'LOC',
  'LAW',
  'WORK_OF_ART',
  'QUANTITY',
  'LANGUAGE',
  'PERCENT',
  'MONEY'],
 'topic_ner_weights': [65251,
  64819,
  23494,
  14602,
  11542,
  11079,
  5620,
  5000,
  1148,
  953,
  663,
  585,
  293,
  229,
  218,
  55,
  28,
  4],
 'topic_words': ['grand',
  'prix',
  'championship',
  'seri',
  'engin',
  'race',
  'car',
  'formula',
  'list',
  'ford',
  'honda',
  'motorcycl',
  'motor',
  'world',
  'automobil',
  'toyota',
  'driver',
  'merced',
  'bmw',
  'benz'],
 'topic_words_weights': [10329,
  9949,
  6733,
  6195,
  6132,
  6073,
  5593,
  4185,
  4161,
  4154,
  3680,
  3466,
  3441,
  3335,
  2688,
  2684,
  2427,
  2388,
  2303,
  2291]}

In [42]:
# pickle the output
myoutfile = "pickles/en_1218_louvain_community_topics_dict.pkl"
with open(myoutfile, 'wb') as picklefile:
    pickle.dump(community_topics_dict, picklefile)

print("Pickle created: " + myoutfile)

Pickle created: pickles/en_1218_louvain_community_topics_dict.pkl


In [43]:
# unpickle
with open("pickles/en_1218_louvain_community_topics_dict.pkl", 'rb') as picklefile: 
    community_topics_dict = pickle.load(picklefile)

community_topics_dict[22]

{'topic_ner': ['ORG',
  'PERSON',
  'DATE',
  'GPE',
  'CARDINAL',
  'EVENT',
  'PRODUCT',
  'NORP',
  'TIME',
  'ORDINAL',
  'FAC',
  'LOC',
  'LAW',
  'WORK_OF_ART',
  'QUANTITY',
  'LANGUAGE',
  'PERCENT',
  'MONEY'],
 'topic_ner_weights': [65251,
  64819,
  23494,
  14602,
  11542,
  11079,
  5620,
  5000,
  1148,
  953,
  663,
  585,
  293,
  229,
  218,
  55,
  28,
  4],
 'topic_words': ['grand',
  'prix',
  'championship',
  'seri',
  'engin',
  'race',
  'car',
  'formula',
  'list',
  'ford',
  'honda',
  'motorcycl',
  'motor',
  'world',
  'automobil',
  'toyota',
  'driver',
  'merced',
  'bmw',
  'benz'],
 'topic_words_weights': [10329,
  9949,
  6733,
  6195,
  6132,
  6073,
  5593,
  4185,
  4161,
  4154,
  3680,
  3466,
  3441,
  3335,
  2688,
  2684,
  2427,
  2388,
  2303,
  2291]}

In [48]:
k = 50
for cid in louvain_community_ids[:k]:
    print("\nCommunity id =", cid)
    print("Community topic words:\n", " ".join(community_topics_dict[cid]["topic_words"]))
    print("Community topic NER:\n", " ".join(community_topics_dict[cid]["topic_ner"]))


Community id = 3
Community topic words:
 list unit state footbal elect nation th constitu parti battl john film st district war cup disambigu india leagu world
Community topic NER:
 PERSON ORG GPE DATE NORP CARDINAL EVENT ORDINAL LOC FAC PRODUCT LAW WORK_OF_ART LANGUAGE QUANTITY MONEY TIME PERCENT

Community id = 4
Community topic words:
 film list seri tv actor comic season award disambigu episod school john charact novel man star love song album david
Community topic NER:
 PERSON ORG DATE GPE CARDINAL NORP WORK_OF_ART EVENT LOC ORDINAL FAC PRODUCT TIME LAW LANGUAGE QUANTITY MONEY PERCENT

Community id = 9
Community topic words:
 film list templ award sri tamil actor actress tv colleg disambigu best lanka seri station krishna district raja kannada oru
Community topic NER:
 PERSON ORG GPE DATE NORP CARDINAL LOC EVENT ORDINAL FAC WORK_OF_ART PRODUCT LAW LANGUAGE PERCENT TIME QUANTITY

Community id = 1
Community topic words:
 list disambigu softwar power station number engin space unit 

In [49]:
cid = 11
print("\nCommunity id =", cid)
print("Community topic words:\n", " ".join(community_topics_dict[cid]["topic_words"]))
print("Community topic NER:\n", " ".join(community_topics_dict[cid]["topic_ner"]))


Community id = 11
Community topic words:
 footbal team season school basketbal high list american basebal state leagu men bowl new colleg nation univers game john stadium
Community topic NER:
 PERSON ORG DATE GPE NORP EVENT CARDINAL FAC LOC PRODUCT ORDINAL WORK_OF_ART LAW TIME LANGUAGE QUANTITY MONEY


In [216]:
words = "novel book read"
words = "album discography music song"
words = "united states"


preprocessed = gensim.utils.simple_preprocess(words)
    
lemmatized = []
for w in preprocessed:
    stem = stemmer.stem(WordNetLemmatizer().lemmatize(w, pos='v'))
    lemmatized.append(stem)
    
print(lemmatized)

['unit', 'state']


In [218]:
# find communities that are related to science

for cid in louvain_community_ids:
    #if (lemmatized in community_topics_dict[cid]["topic_words"]):
    #if not set(lemmatized).isdisjoint(set(community_topics_dict[cid]["topic_words"])):
    if set(lemmatized).issubset(set(community_topics_dict[cid]["topic_words"])):
        print("\nCommunity id =", cid)
        print("Community topic words:\n", " ".join(community_topics_dict[cid]["topic_words"]))
        print("Community topic NER:\n", " ".join(community_topics_dict[cid]["topic_ner"]))


Community id = 3
Community topic words:
 list unit state footbal elect nation th constitu parti battl john film st district war cup disambigu india leagu world
Community topic NER:
 PERSON ORG GPE DATE NORP CARDINAL EVENT ORDINAL LOC FAC PRODUCT LAW WORK_OF_ART LANGUAGE QUANTITY MONEY TIME PERCENT

Community id = 6
Community topic words:
 univers school list colleg bank institut compani state intern educ group law technolog scienc unit nation john mall new busi
Community topic NER:
 ORG PERSON GPE NORP DATE CARDINAL FAC LOC ORDINAL EVENT WORK_OF_ART LANGUAGE PRODUCT LAW TIME MONEY QUANTITY PERCENT

Community id = 10
Community topic words:
 state list new counti school unit california york station park district elect john citi texa high nation hous north river
Community topic NER:
 PERSON GPE ORG DATE LOC FAC CARDINAL NORP ORDINAL EVENT WORK_OF_ART PRODUCT LANGUAGE LAW TIME MONEY QUANTITY PERCENT

Community id = 2
Community topic words:
 list disambigu syndrom nation park diseas food b