### Import libraries and functions

In [1]:
import spacy
import pandas as pd
import numpy as np
import re
import requests
import sys
from tqdm import tqdm
from pathlib import Path

sys.path.insert(0, "../")
# Imports for NLP
from nlp import beautifulsoup as bsp
from nlp import nlp_preprocessing as nlp_prep
from nlp.read_warc2 import read_warc
from corpus_processing import ner
from corpus_processing import relation_extraction as re
from corpus_processing import relation_linking as rl
from corpus_processing import entity_linking as el 
from nlp import coref as cr

### Load WARC data

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# autoreload
%load_ext autoreload
%autoreload 2

DIR_DATA = Path("../data/warcs")
FNAME_WARC = "sample.warc.gz"

In [6]:
warc_df = read_warc(DIR_DATA / FNAME_WARC)

reading warc file...
warc list ['clueweb12-0000tw-00-00006' 'clueweb12-0000tw-00-00043'
 'clueweb12-0000tw-00-00044' 'clueweb12-0000tw-00-00101'
 'clueweb12-0000tw-00-00129' 'clueweb12-0000tw-00-00130'
 'clueweb12-0000tw-00-00141' 'clueweb12-0000tw-00-00160'
 'clueweb12-0000tw-00-00161' 'clueweb12-0000tw-00-00162'
 'clueweb12-0000tw-00-00199' 'clueweb12-0000tw-00-00205'
 'clueweb12-0000tw-00-00210' 'clueweb12-0000tw-00-00212'
 'clueweb12-0000tw-00-00214' 'clueweb12-0000tw-00-00829'
 'clueweb12-0000tw-00-00833' 'clueweb12-0000tw-00-00834'
 'clueweb12-0000tw-00-00835' 'clueweb12-0000tw-00-00836'
 'clueweb12-0000tw-00-00837' 'clueweb12-0000tw-00-00843'
 'clueweb12-0000tw-00-00844' 'clueweb12-0000tw-00-00877'
 'clueweb12-0000tw-00-00879' 'clueweb12-0000tw-00-00880'
 'clueweb12-0000tw-00-00881' 'clueweb12-0000tw-00-00941'
 'clueweb12-0000tw-00-00942' 'clueweb12-0000tw-00-00943'
 'clueweb12-0000tw-00-00944' 'clueweb12-0000tw-00-00946']


In [7]:
warc_df

Unnamed: 0,HTML_DOC,WARC-TREC-ID
0,"<html>\r\n\r\n<head>\r\n<meta http-equiv=""Cont...",clueweb12-0000tw-00-00006
1,"<html>\n\n<head>\n\n<style type=""text/css"">\n\...",clueweb12-0000tw-00-00043
2,"<html>\n<head>\n<style type=""text/css"">\n#bott...",clueweb12-0000tw-00-00044
3,<html>\n<head>\n\n<title>DHP Concerts Bristol ...,clueweb12-0000tw-00-00101
4,<html>\n<head>\n <title>12 Tourist Spots in ...,clueweb12-0000tw-00-00129
5,<html>\r\n<head>\r\n<title>Saving Money Buying...,clueweb12-0000tw-00-00130
6,<html>\n<head>\n <title>Freelance Web Desig...,clueweb12-0000tw-00-00141
7,"<html>\n<head>\n<meta http-equiv=""Content-Type...",clueweb12-0000tw-00-00160
8,"<html>\n<head>\n <meta http-equiv=""Content-...",clueweb12-0000tw-00-00161
9,"<html>\n<head>\n <meta http-equiv=""Content-...",clueweb12-0000tw-00-00162


In [44]:
selected_doc = warc_df.iloc[18]
stripped_webpage = bsp.scrape_webpage(selected_doc['HTML_DOC'])

array(['\nMy account |\r\n            Log out\n',
       'Hi are you new? Start here.',
       'SAVE UP TO 50% On selected ski, snowboard and end-of-season clothing & gear.',
       '\r\n                by\r\n                \ncheshire255\n\r\n                added 8 hours ago\n',
       '@mduscavage Yes, that link works, but you have to copy and paste it, not click the link. Both links error when you just click them. But thanks for the help!',
       '\n\nthe18thtee84\n\n\n\n5 hours ago\n\n',
       'Try this link: http://www.rei.com/Sale+and+Clearance When I used the other I kept getting an error. Great sale, thanks OP!',
       '\n\nmduscavage\n\n\n\n5 hours ago\n\n',
       'hmm some of the things on the list arent listed on sale or clearance. At first glance, fivefingers and toe socks (which I was looking for!) were not discounted at all.',
       '\n\ndodey23\n\n\n\n3 hours ago\n\n',
       'Try this link: http://www.rei.com/Sale+and+Clearance When I used the other I kept getting

In [10]:
spacy_processor = spacy.load("en_core_web_trf")
spacy_doc = nlp_prep.get_nlp_doc(stripped_webpage, spacy_processor)
processed_page = nlp_prep.nlp_preprocessing(spacy_doc)

In [11]:
ner_page = ner.detect_entities(spacy_doc)
ner_page

Unnamed: 0,label,ner_type
0,UP TO 50%,PERCENT
1,8 hours ago,TIME
2,@mduscavage,PERSON
3,5 hours ago,TIME
4,OP,PERSON
5,mduscavage,PERSON
7,first,ORDINAL
8,3 hours ago,TIME
17,REI,ORG
18,2/9/12 9,TIME


In [43]:
allowed_entity_types = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE']

# Filter out entities that are not in allowed_entity_types
filtered_entities = ner_page[ner_page['ner_type'].isin(allowed_entity_types)]
filtered_entities

Unnamed: 0,label,ner_type
2,@mduscavage,PERSON
4,OP,PERSON
5,mduscavage,PERSON
17,REI,ORG
19,REI.com,ORG
22,U.S.,GPE
24,Toblerone,PRODUCT
29,Woot,ORG
30,Woot Services LLC,ORG
31,Deals.,ORG


In [167]:
candidate_list = []
candidate_indices = []
for entity in tqdm(entities_to_link, desc="Entity"):
    candidates = el._get_wikipedia_articles(entity)
    if candidates != []:
        candidate_indices.append(int(len(candidates)))

    candidate_list.extend(candidates)

Entity: 100%|██████████| 13/13 [00:05<00:00,  2.50it/s]


In [168]:
candidate_views = el.p.article_views("en.wikipedia", candidate_list, granularity="monthly", start="2022100100", end="2022103100")
print(list(candidate_views.values())[0])

{'OP': 3233, 'Op-ed': 18916, 'Cooperative_video_game': 8332, 'Op._cit.': 3196, 'Operational_amplifier': 36895, 'Takt_Op': 12025, 'Photo_op': 3353, 'Op_art': 19766, 'OP_Financial_Group': 2381, 'KO_OP': 467, 'REI': 13781, 'Rei': 1185, 'Rei_Rei': 776, 'Rei_Ayanami': 14435, 'Reis': 613, 'Cristo_Rei': 172, 'Rei_(given_name)': 3261, 'Rei_Harakami': 1050, 'Rei_Kawakubo': 9795, 'Sailor_Mars': 11012, 'Deus_Salve_o_Rei': 1776, 'Novara_(company)': 791, 'Lead_climbing': 5891, 'Figure_8_(belay_device)': 970, 'Compass': 46107, 'Sierra_Nevada': 32851, 'U.S._state': 361176, 'United_States': 1775677, 'Georgia_(U.S._state)': 189548, 'U.S._Agent': 12962, 'U.S._Steel': 22078, 'U.S._Cremonese': 32611, 'U.S._Lecce': 29031, 'U.S._Bancorp': 26614, 'U-S-A!': 2284, 'U.S._Woodland': 8811, 'Toblerone': 49590, 'Toblerone_line': 5114, "Dragon's_teeth_(fortification)": 39080, 'Mona_Sahlin': 2451, 'Theodor_Tobler': 940, "Terry's_Chocolate_Orange": 8462, 'Shrinkflation': 11350, 'Heroes_(confectionery)': 2032, 'Japanes

In [177]:
first_key = list(candidate_views.keys())[0]
candidate_tuple_array = np.array(list(candidate_views[first_key].items()))

# split candidate_views into chunks of size candidate_indices
candidate_tuple_array = np.split(candidate_tuple_array, np.cumsum(candidate_indices)[:-1])

for chunk in candidate_tuple_array:
    print(np.argmax(np.max(chunk[:][1], axis=0)))



TypeError: '>=' not supported between instances of 'str' and 'int'