### Import libraries and functions

In [1]:
import spacy
import pandas as pd
import numpy as np
import re
import requests
import sys
import json
from tqdm import tqdm
from pathlib import Path

sys.path.insert(0, "../")
# Imports for NLP
from nlp import beautifulsoup as bsp
from nlp import nlp_preprocessing as nlp_prep
from nlp.read_warc2 import read_warc
from corpus_processing import ner
from corpus_processing import relation_extraction as re
from corpus_processing import relation_linking as rl
from corpus_processing import entity_linking as el 
from nlp import coref as cr

### Load WARC data

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# autoreload
%load_ext autoreload
%autoreload 2

DIR_DATA = Path("../data/warcs")
FNAME_WARC = "sample.warc.gz"

In [3]:
warc_df = read_warc(DIR_DATA / FNAME_WARC)

reading warc file...
warc list ['clueweb12-0000tw-00-00006' 'clueweb12-0000tw-00-00043'
 'clueweb12-0000tw-00-00044' 'clueweb12-0000tw-00-00101'
 'clueweb12-0000tw-00-00129' 'clueweb12-0000tw-00-00130'
 'clueweb12-0000tw-00-00141' 'clueweb12-0000tw-00-00160'
 'clueweb12-0000tw-00-00161' 'clueweb12-0000tw-00-00162'
 'clueweb12-0000tw-00-00199' 'clueweb12-0000tw-00-00205'
 'clueweb12-0000tw-00-00210' 'clueweb12-0000tw-00-00212'
 'clueweb12-0000tw-00-00214' 'clueweb12-0000tw-00-00829'
 'clueweb12-0000tw-00-00833' 'clueweb12-0000tw-00-00834'
 'clueweb12-0000tw-00-00835' 'clueweb12-0000tw-00-00836'
 'clueweb12-0000tw-00-00837' 'clueweb12-0000tw-00-00843'
 'clueweb12-0000tw-00-00844' 'clueweb12-0000tw-00-00877'
 'clueweb12-0000tw-00-00879' 'clueweb12-0000tw-00-00880'
 'clueweb12-0000tw-00-00881' 'clueweb12-0000tw-00-00941'
 'clueweb12-0000tw-00-00942' 'clueweb12-0000tw-00-00943'
 'clueweb12-0000tw-00-00944' 'clueweb12-0000tw-00-00946']


In [4]:
warc_df

Unnamed: 0,HTML_DOC,WARC-TREC-ID
0,"<html>\r\n\r\n<head>\r\n<meta http-equiv=""Cont...",clueweb12-0000tw-00-00006
1,"<html>\n\n<head>\n\n<style type=""text/css"">\n\...",clueweb12-0000tw-00-00043
2,"<html>\n<head>\n<style type=""text/css"">\n#bott...",clueweb12-0000tw-00-00044
3,<html>\n<head>\n\n<title>DHP Concerts Bristol ...,clueweb12-0000tw-00-00101
4,<html>\n<head>\n <title>12 Tourist Spots in ...,clueweb12-0000tw-00-00129
5,<html>\r\n<head>\r\n<title>Saving Money Buying...,clueweb12-0000tw-00-00130
6,<html>\n<head>\n <title>Freelance Web Desig...,clueweb12-0000tw-00-00141
7,"<html>\n<head>\n<meta http-equiv=""Content-Type...",clueweb12-0000tw-00-00160
8,"<html>\n<head>\n <meta http-equiv=""Content-...",clueweb12-0000tw-00-00161
9,"<html>\n<head>\n <meta http-equiv=""Content-...",clueweb12-0000tw-00-00162


In [5]:
selected_doc = warc_df.iloc[18]
stripped_webpage = bsp.scrape_webpage(selected_doc['HTML_DOC'])

In [6]:
spacy_processor = spacy.load("en_core_web_trf")
spacy_doc = nlp_prep.get_nlp_doc(stripped_webpage, spacy_processor)
processed_page = nlp_prep.nlp_preprocessing(spacy_doc)

In [7]:
ner_page = ner.detect_entities(spacy_doc)
ner_page

Unnamed: 0,label,ner_type
0,UP TO 50%,PERCENT
1,8 hours ago,TIME
2,@mduscavage,PERSON
3,5 hours ago,TIME
4,OP,PERSON
5,mduscavage,PERSON
7,first,ORDINAL
8,3 hours ago,TIME
17,REI,ORG
18,2/9/12 9,TIME


In [8]:
allowed_entity_types = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE']

# Filter out entities that are not in allowed_entity_types
filtered_entities = ner_page.loc[ner_page['ner_type'].isin(allowed_entity_types), 'label']
filtered_entities

2           @mduscavage
4                    OP
5            mduscavage
17                  REI
19              REI.com
22                 U.S.
24            Toblerone
29                 Woot
30    Woot Services LLC
31               Deals.
36            2004-2012
37                Deals
42           Deals.Woot
Name: label, dtype: object

In [9]:
candidate_list = []
candidate_indices = []
for entity in tqdm(filtered_entities, desc="Entity"):
    candidates = el._get_wikipedia_articles(entity)
    if candidates != []:
        candidate_indices.append(int(len(candidates)))

    candidate_list += candidates

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Entity: 100%|██████████| 13/13 [00:09<00:00,  1.39it/s]


In [10]:
candidate_views = el.p.article_views("en.wikipedia", candidate_list, granularity="monthly", start="2022010100", end="2022103100")
first_key = list(candidate_views.keys())[0]
candidate_views = candidate_views[first_key]
candidate_views.items()

dict_items([('OP', 3028), ('Op-ed', 16280), ('Cooperative_video_game', 8700), ('Takt_Op', 13801), ('Operational_amplifier', 32482), ('Op._cit.', 3037), ('Photo_op', 3427), ('OP_Financial_Group', 2607), ('Op_art', 18744), ('KO_OP', 414), ('REI', 12937), ('Rei', 1253), ('Rei_Rei', 728), ('Rei_Ayanami', 15401), ('Reis', 689), ('Cristo_Rei', 212), ('Sailor_Mars', 9516), ('Rei_(given_name)', 3051), ('Rei_Kikukawa', 480), ('Rei_Harakami', 1110), ('Deus_Salve_o_Rei', 1738), ('Novara_(company)', 848), ('Rei_Kawakubo', 9313), ('Lead_climbing', 4041), ('Figure_8_(belay_device)', 942), ('Compass', 45322), ('Sierra_Nevada', 37159), ('U.S._state', 369540), ('United_States', 1795337), ('Georgia_(U.S._state)', 188445), ('U.S._Cremonese', 40338), ('U.S._Agent', 23492), ('U.S._Bancorp', 20782), ('U.S._Woodland', 8028), ('U.S._Steel', 19109), ('U.S._Lecce', 33036), ('U-S-A!', 708), ('Toblerone', 47421), ('Toblerone_line', 1102), ('Mona_Sahlin', 4813), ("Dragon's_teeth_(fortification)", 6147), ('Theodor_

In [11]:
def replace_none(items):
    result = {}
    for key, value in items:
        if value is None:
            value = 0
        result[key] = value
    return result

In [12]:
def determine_max_views(x):
    # make second column into integer
    x[:,1] = x[:,1].astype(int)
    # return index of max value
    return np.argmax(x[:,1], axis=0)

In [13]:
json_str = json.dumps(candidate_views)
candidate_views = json.loads(json_str, object_pairs_hook=replace_none)

In [14]:
candidate_tuple_array = np.array(list(candidate_views.items()))

# split candidate_views into chunks of size candidate_indices
candidate_tuple_array = np.split(candidate_tuple_array, np.cumsum(candidate_indices)[:-1])

# determine max views for each chunk
max_views = np.array([determine_max_views(chunk) for chunk in candidate_tuple_array[:-1]])

# get the keys of the max views
max_views_keys = [chunk[max_view][0] for chunk, max_view in zip(candidate_tuple_array, max_views)]

max_views_keys

['Cooperative_video_game',
 'Sailor_Mars',
 'Figure_8_(belay_device)',
 'U.S._Woodland',
 'Theodor_Tobler',
 'Amazon_(company)',
 'Deals',
 'Erin_Andrews',
 'Sulli_Deals',
 'Meh']