In [1]:
import pandas as pd
import numpy as np

import re

from py2neo import authenticate, Graph, Node, Relationship


import os
import csv
import pickle

from time import sleep
from timeit import default_timer as timer
from datetime import datetime

from IPython.display import display, HTML

# custom general helper functions for this project
import custom_utils as cu
import importlib


In [2]:
from collections import defaultdict

In [3]:
# Re-import the project-local helper module (custom_utils, aliased as `cu`)
# so that edits made to it on disk are picked up without restarting the kernel.
# The trailing semicolon suppresses the cell output (the value returned by reload).
importlib.reload(cu);

In [4]:
# Notebook-wide pandas display settings: cap the number of columns/rows shown
# and render floats with two decimal places.
pd.options.display.max_columns = 25
pd.options.display.max_rows = 100
pd.options.display.precision = 3
pd.set_option('display.float_format', '{:.2f}'.format)

In [5]:
# Allow up to 100 characters per cell before pandas truncates the text with "...".
pd.set_option('display.max_colwidth', 100)

In [6]:
# Load the pickled per-article table that carries each article's Louvain
# community id alongside its traffic columns.
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open("pickles/en_1218_louvain_communities_for_NLP.pkl", 'rb') as pkl_in:
    louvain_communities_for_NLP = pickle.load(pkl_in)

# Preview the first 20 rows.
louvain_communities_for_NLP.head(20)

Unnamed: 0,external_search_traffic,link_in_traffic,louvain_community,search_in_traffic,title
0,4576854.0,1108189.0,3,5630.0,George_H._W._Bush
1,3538068.0,639353.0,4,6451.0,Jason_Momoa
2,3475113.0,223635.0,9,23563.0,2.0_(film)
3,3251996.0,682992.0,4,10416.0,Bird_Box_(film)
4,3020671.0,31170.0,1,,Main_Page
5,2634665.0,408421.0,4,34309.0,Aquaman_(film)
6,2328884.0,200893.0,4,192.0,Bird_Box
7,2231176.0,575481.0,3,3945.0,Priyanka_Chopra
8,2226602.0,117115.0,5,958.0,List_of_most-disliked_YouTube_videos
9,2050628.0,336621.0,5,4161.0,Freddie_Mercury


In [37]:
# Peek at the first five articles whose external_search_traffic is missing.
louvain_communities_for_NLP.loc[
    louvain_communities_for_NLP["external_search_traffic"].isna()
].head(5)

Unnamed: 0,external_search_traffic,link_in_traffic,louvain_community,search_in_traffic,title
2401006,,12.0,2,,Orange-brown_Atlantic_tree-rat
2401007,,16.0,4,,Eilis_Kirwan
2401008,,10.0,1,,Emile_Waxweiler
2401009,,213.0,7,,Sun_King_(disambiguation)
2401010,,29.0,3,,Ahmet_Lepenica


In [75]:
# For every Louvain community, count the articles (non-null titles) that have
# no external_search_traffic value; the result is a one-column frame indexed
# by louvain_community.
community_deepWiki_stats = (
    louvain_communities_for_NLP
    .loc[louvain_communities_for_NLP["external_search_traffic"].isna()]
    .groupby("louvain_community")["title"]
    .count()
    .to_frame("deep_wiki_articles_count")
)

community_deepWiki_stats.head(10)

Unnamed: 0_level_0,deep_wiki_articles_count
louvain_community,Unnamed: 1_level_1
0,3311
1,16439
2,28617
3,71293
4,22471
5,27513
6,5251
7,37641
8,10764
9,2280


In [36]:
# Load the pickled per-community summary table (one row per Louvain community).
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open("pickles/en_1218_louvain_communities.pkl", 'rb') as pkl_in:
    louvain_communities = pickle.load(pkl_in)

# Preview the first five rows.
louvain_communities.head(5)

Unnamed: 0,articles_count,external_search_traffic,link_edges_count,link_traffic,louvain_community,search_edges_count,search_traffic,total_visits,avg_external_search_traffic,avg_link_traffic_per_edge,avg_visits_per_article,link_network_density,link_network_density_delta
0,507642,535227964,3385462,329625267,3,102124,4188154,1124655652,1054.34,97.36,2215.45,0.0,5.67
1,302120,288453340,2037310,140416238,7,41383,1257742,592415339,954.76,68.92,1960.86,0.0,5.74
2,267541,626988532,2540296,461475849,4,98857,3938984,1350876078,2343.52,181.66,5049.23,0.0,8.5
3,233396,150495954,1209989,84128861,10,25894,932688,322074785,644.81,69.53,1379.95,0.0,4.18
4,223864,219874886,1327054,151652674,5,32332,930331,468611692,982.18,114.28,2093.29,0.0,4.93


In [73]:
# Series of article counts keyed by community id, ready to be joined onto
# other frames that are indexed by louvain_community.
louvain_communities_size = louvain_communities.set_index("louvain_community")["articles_count"]
louvain_communities_size.head(5)

louvain_community
3     507642
7     302120
4     267541
10    233396
5     223864
Name: articles_count, dtype: int64

In [77]:
# Attach each community's total article count (index-aligned join on
# louvain_community) so that a proportion can be computed from it.
#
# Any existing `articles_count` column is dropped first so this cell stays
# re-runnable: DataFrame.join raises ValueError when the two sides share a
# column name and no suffix is given, which is exactly what a second execution
# of a bare `stats = stats.join(...)` would hit. On a first run the drop is a
# no-op (errors="ignore") and the result is identical to a plain join.
community_deepWiki_stats = (
    community_deepWiki_stats
    .drop(columns="articles_count", errors="ignore")
    .join(louvain_communities_size)
)
community_deepWiki_stats.head()

Unnamed: 0_level_0,deep_wiki_articles_count,articles_count
louvain_community,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3311,29397
1,16439,218001
2,28617,209065
3,71293,507642
4,22471,267541


In [78]:
# Share of each community's articles that fall in the deep-wiki count:
# deep_wiki_articles_count divided element-wise by articles_count.
community_deepWiki_stats["deep_wiki_articles_proportion"] = (
    community_deepWiki_stats["deep_wiki_articles_count"]
    .div(community_deepWiki_stats["articles_count"])
)

community_deepWiki_stats.head(5)

Unnamed: 0_level_0,deep_wiki_articles_count,articles_count,deep_wiki_articles_proportion
louvain_community,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3311,29397,0.11
1,16439,218001,0.08
2,28617,209065,0.14
3,71293,507642,0.14
4,22471,267541,0.08


In [83]:
# Load the pickled mapping from community id to its topic information
# (each entry exposes a "topic_words" list).
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open("pickles/en_1218_louvain_community_topics_dict.pkl", 'rb') as pkl_in:
    community_topics_dict = pickle.load(pkl_in)

# Sanity check: first five topic words of community 22.
community_topics_dict[22]["topic_words"][:5]

['grand', 'prix', 'championship', 'seri', 'engin']

In [84]:
# Community ids present in the stats table (its index values).
community_ids = community_deepWiki_stats.index.values

# Add a "topic_words" column holding each community's topic words joined into
# one space-separated string. The column is built in a single assignment, in
# index order, instead of being grown cell by cell through `.at` in a loop.
# A community id missing from community_topics_dict raises KeyError, as before.
community_deepWiki_stats["topic_words"] = [
    " ".join(community_topics_dict[community_id]["topic_words"])
    for community_id in community_ids
]
       

In [85]:
# Show the first ten communities with their counts, proportion and topic words.
community_deepWiki_stats.head(10)

Unnamed: 0_level_0,deep_wiki_articles_count,articles_count,deep_wiki_articles_proportion,topic_words
louvain_community,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,3311,29397,0.11,station railway class line rail metro list park train transport british london south road tram c...
1,16439,218001,0.08,list disambigu softwar power station number engin space unit program network model theorem group...
2,28617,209065,0.14,list disambigu syndrom nation park diseas food black acid hospit state red white cell medic dog ...
3,71293,507642,0.14,list unit state footbal elect nation th constitu parti battl john film st district war cup disam...
4,22471,267541,0.08,film list seri tv actor comic season award disambigu episod school john charact novel man star l...
5,27513,223864,0.12,album song band list discographi love music live musician record tour disambigu ep best world aw...
6,5251,81807,0.06,univers school list colleg bank institut compani state intern educ group law technolog scienc un...
7,37641,302120,0.12,list disambigu church al john novel saint languag histori st film battl ii cathol roman book wil...
8,10764,73172,0.15,airport air uss list class hms airlin intern squadron flight submarin forc aircraft ship th avia...
9,2280,42420,0.05,film list templ award sri tamil actor actress tv colleg disambigu best lanka seri station krishn...


In [89]:
# Rank communities with at least 100 articles by their deep-wiki proportion
# (highest first) and show the top 50.
(
    community_deepWiki_stats
    .loc[community_deepWiki_stats["articles_count"] >= 100]
    .sort_values(by="deep_wiki_articles_proportion", ascending=False)
    .head(50)
)

Unnamed: 0_level_0,deep_wiki_articles_count,articles_count,deep_wiki_articles_proportion,topic_words
louvain_community,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
59,147,159,0.92,kingdom list locat unit templecomb zeal st west cornwal new aa ak old north monachorum zouch eas...
196,62,111,0.56,art tabl year archaeolog architectur
136,44,120,0.37,music paul rapin comet hanov treati peac vienna del francesco giudic
84,76,278,0.27,orchestra symphoni philharmon state youth dispatch new chamber san album fiedler pop unit sander...
12,13319,49144,0.27,olymp championship summer men world women game athlet metr open winter cup singl list swim tenni...
160,29,111,0.26,cyprus elect cypriot legisl presidenti parti democrat list movement newspap european anastasiad ...
19,5206,22194,0.23,footbal leagu cup nation malaysia district stadium championship fc list team asian afc station f...
44,717,3171,0.23,snooker dart world championship player master open pdc cup uk bdo leagu pool grand rank season t...
76,35,166,0.21,bc decad mentuemhat
123,57,273,0.21,album sound pass joe juli yusef lateef london blue jazz live song randi virtuoso meredith monk s...
