In [1]:
import pandas as pd
import numpy as np



from py2neo import authenticate, Graph, Node, Relationship


import os
import csv
import pickle

from time import sleep
from timeit import default_timer as timer
from datetime import datetime

from IPython.display import display, HTML

# custom general helper functions for this project
import custom_utils as cu
import importlib


In [2]:
# reload imports as needed:
# re-import the custom_utils module (cu) so edits made to it after the kernel
# started take effect without a restart; the trailing ';' keeps the returned
# module object from being echoed as cell output
importlib.reload(cu);

In [3]:
# Notebook-wide pandas display settings
pd.options.display.max_columns = 25
pd.options.display.max_rows = 100
pd.options.display.precision = 3
# render floats with a fixed two decimal places
pd.set_option('display.float_format', '{:.2f}'.format)

In [5]:
# allow up to 100 characters per cell before pandas truncates the text
pd.set_option('display.max_colwidth', 100)

In [9]:
# Load the pickled table of Louvain communities.
# NOTE: pickle.load can execute arbitrary code while deserialising; only open
# pickle files that come from a trusted source.
pkl_path = "pickles/en_1218_louvain_communities_for_NLP.pkl"
with open(pkl_path, mode='rb') as fh:
    louvain_communities_for_NLP = pickle.load(fh)

# preview the first rows
louvain_communities_for_NLP.head(20)

Unnamed: 0,external_search_traffic,link_in_traffic,louvain_community,search_in_traffic,title
0,4576854.0,1108189.0,3,5630.0,George_H._W._Bush
1,3538068.0,639353.0,4,6451.0,Jason_Momoa
2,3475113.0,223635.0,9,23563.0,2.0_(film)
3,3251996.0,682992.0,4,10416.0,Bird_Box_(film)
4,3020671.0,31170.0,1,,Main_Page
5,2634665.0,408421.0,4,34309.0,Aquaman_(film)
6,2328884.0,200893.0,4,192.0,Bird_Box
7,2231176.0,575481.0,3,3945.0,Priyanka_Chopra
8,2226602.0,117115.0,5,958.0,List_of_most-disliked_YouTube_videos
9,2050628.0,336621.0,5,4161.0,Freddie_Mercury


##### Pre-process article titles

In [10]:
# Build the working frame from a copy so the unpickled frame keeps its original
# `title` column; the raw title is kept under `title_raw`.
# NOTE: index=str passes every row label through str(), so the index of the
# working frame holds strings ('0', '1', ...) rather than integers.
louvain_communities_for_NLP_proc = (
    louvain_communities_for_NLP
    .copy()
    .rename(index=str, columns={'title': 'title_raw'})
)
louvain_communities_for_NLP_proc.head(5)

Unnamed: 0,external_search_traffic,link_in_traffic,louvain_community,search_in_traffic,title_raw
0,4576854.0,1108189.0,3,5630.0,George_H._W._Bush
1,3538068.0,639353.0,4,6451.0,Jason_Momoa
2,3475113.0,223635.0,9,23563.0,2.0_(film)
3,3251996.0,682992.0,4,10416.0,Bird_Box_(film)
4,3020671.0,31170.0,1,,Main_Page


In [12]:
# the importance of words will be weighted by search and link traffic
traffic_cols = ["external_search_traffic", "link_in_traffic", "search_in_traffic"]

# A missing traffic count (e.g. search_in_traffic for Main_Page) contributes 0.
# The NaNs have to be filled *before* the integer cast: an int64 column cannot
# hold NaN, so a fillna placed after astype('int64') can never have an effect.
louvain_communities_for_NLP_proc["weight"] = (
    louvain_communities_for_NLP_proc[traffic_cols]
    .fillna(0)
    .sum(axis=1)
    .astype('int64')
)

# the individual traffic columns are no longer needed once summed into `weight`
louvain_communities_for_NLP_proc = louvain_communities_for_NLP_proc.drop(columns=traffic_cols)

In [13]:
# preview the frame after the traffic columns were collapsed into `weight`
louvain_communities_for_NLP_proc.head(5)

Unnamed: 0,louvain_community,title_raw,weight
0,3,George_H._W._Bush,5690673
1,4,Jason_Momoa,4183872
2,9,2.0_(film),3722311
3,4,Bird_Box_(film),3945404
4,1,Main_Page,3051841


In [14]:
# clean up the title: swap the underscores of the raw title for spaces.
# regex=False makes this a plain literal replacement, independent of the pandas
# version's default for `regex` (which changed between releases).
louvain_communities_for_NLP_proc["title"] = (
    louvain_communities_for_NLP_proc["title_raw"].str.replace('_', ' ', regex=False)
)

louvain_communities_for_NLP_proc.head(5)

Unnamed: 0,louvain_community,title_raw,weight,title
0,3,George_H._W._Bush,5690673,George H. W. Bush
1,4,Jason_Momoa,4183872,Jason Momoa
2,9,2.0_(film),3722311,2.0 (film)
3,4,Bird_Box_(film),3945404,Bird Box (film)
4,1,Main_Page,3051841,Main Page


In [15]:
# Use spaCy to get Named Entities

# installation instructions: https://spacy.io/usage/

import spacy
# Load the English pipeline once; `nlp` is reused for every title below.
# NOTE(review): 'en' looks like a spaCy v2 shortcut link. spaCy v3 dropped
# shortcut names and expects the full package name (e.g. 'en_core_web_sm') —
# confirm which spaCy version and model this notebook is meant to run with.
nlp = spacy.load('en')

In [None]:
# NOTE(review): datetime.now() returns the machine's local time; the "UTC" label
# is only accurate if the host clock runs on UTC — confirm.
print("Started running at", datetime.now(), "UTC")

start_time = timer()

# Tag every title with spaCy and keep, per row, the space-separated entity labels.
#
# The labels are collected positionally in a list and written back as one column
# at the end. Writing row by row with `.at[i, "named_entities"]` would treat the
# loop counter as an index *label*; the working frame's labels were turned into
# strings by rename(index=str), so an integer label matches no existing row and
# pandas would append new rows instead of filling the ones read via `.iloc[i]`.
titles = louvain_communities_for_NLP_proc["title"].tolist()
n_rows = len(titles)
named_entities = []

# nlp.pipe streams the texts through the pipeline in order, in batches
for i, doc in enumerate(nlp.pipe(titles)):
    named_entities.append(" ".join(ent.label_ for ent in doc.ents))

    # print a sample
    if i < 10:
        for ent in doc.ents:
            print(ent.text, ent.start_char, ent.end_char, ent.label_)

    # periodic progress report
    if i % 100000 == 0:
        print("Rows processed:", round(i * 100/n_rows, 2), "%" )
        print("Elapsed time:", round((timer() - start_time)/60, 2), "min\n")

# list assignment is positional, so each row receives the labels of its own title
louvain_communities_for_NLP_proc["named_entities"] = named_entities

cu.printRunTime(start_time)

Started running at 2019-03-01 04:48:40.767486 UTC
George H. W. Bush 0 17 PERSON
Rows processed: 0.0 %
Elapsed time: 0.04 min

Jason Momoa 0 11 PERSON
2.0 0 3 CARDINAL
Bird Box 0 8 PERSON
Main Page 0 9 PERSON
Bird Box 0 8 PERSON
Priyanka Chopra 0 15 PERSON
YouTube 22 29 ORG
Freddie Mercury 0 15 ORG


In [None]:
# preview the frame with the new `named_entities` column
louvain_communities_for_NLP_proc.head(20)