# Full pipeline in action (beta version)
----------

This notebook contains a set up of the full pipeline

**Inputs:**

- 
**Outputs:**
- results.csv (triple per line)

##### Table of contents:
1. Setup
2. Scraping
3. NLP Preprocessing
4. Information extraction
    - Named entity recognition
    - Entity linking
    - Relation extraction Linking
    - Relation linking
----------

### 1. Setup

##### Imports, settings & constants

In [38]:
import spacy
import pandas as pd
import numpy as np
import re
import requests
import sys
from tqdm import tqdm
from pathlib import Path

import claucy


sys.path.insert(0, "../")
# Imports for NLP
from nlp import beautifulsoup as bsp
from nlp import nlp_preprocessing as nlp_prep
from nlp.read_warc2 import read_warc
from corpus_processing import relation_extraction as cre
from corpus_processing import entity_relation_coupling as erc
from corpus_processing import ner 
from corpus_processing import relation_extraction as re
from corpus_processing import relation_linking as rl
from corpus_processing import entity_linking as el 
from corpus_processing import coref as cr

In [39]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [40]:
# autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
DIR_DATA = Path("../data/warcs")
FNAME_WARC = "sample.warc.gz"

### 1. Reading WARC files

In [5]:
warc_df = read_warc(DIR_DATA / FNAME_WARC)

reading warc file...
warc list ['clueweb12-0000tw-00-00006' 'clueweb12-0000tw-00-00043'
 'clueweb12-0000tw-00-00044' 'clueweb12-0000tw-00-00101'
 'clueweb12-0000tw-00-00129' 'clueweb12-0000tw-00-00130'
 'clueweb12-0000tw-00-00141' 'clueweb12-0000tw-00-00160'
 'clueweb12-0000tw-00-00161' 'clueweb12-0000tw-00-00162'
 'clueweb12-0000tw-00-00199' 'clueweb12-0000tw-00-00205'
 'clueweb12-0000tw-00-00210' 'clueweb12-0000tw-00-00212'
 'clueweb12-0000tw-00-00214' 'clueweb12-0000tw-00-00829'
 'clueweb12-0000tw-00-00833' 'clueweb12-0000tw-00-00834'
 'clueweb12-0000tw-00-00835' 'clueweb12-0000tw-00-00836'
 'clueweb12-0000tw-00-00837' 'clueweb12-0000tw-00-00843'
 'clueweb12-0000tw-00-00844' 'clueweb12-0000tw-00-00877'
 'clueweb12-0000tw-00-00879' 'clueweb12-0000tw-00-00880'
 'clueweb12-0000tw-00-00881' 'clueweb12-0000tw-00-00941'
 'clueweb12-0000tw-00-00942' 'clueweb12-0000tw-00-00943'
 'clueweb12-0000tw-00-00944' 'clueweb12-0000tw-00-00946']


In [6]:
warc_df

Unnamed: 0,HTML_DOC,WARC-TREC-ID
0,"<html>\r\n\r\n<head>\r\n<meta http-equiv=""Cont...",clueweb12-0000tw-00-00006
1,"<html>\n\n<head>\n\n<style type=""text/css"">\n\...",clueweb12-0000tw-00-00043
2,"<html>\n<head>\n<style type=""text/css"">\n#bott...",clueweb12-0000tw-00-00044
3,<html>\n<head>\n\n<title>DHP Concerts Bristol ...,clueweb12-0000tw-00-00101
4,<html>\n<head>\n <title>12 Tourist Spots in ...,clueweb12-0000tw-00-00129
5,<html>\r\n<head>\r\n<title>Saving Money Buying...,clueweb12-0000tw-00-00130
6,<html>\n<head>\n <title>Freelance Web Desig...,clueweb12-0000tw-00-00141
7,"<html>\n<head>\n<meta http-equiv=""Content-Type...",clueweb12-0000tw-00-00160
8,"<html>\n<head>\n <meta http-equiv=""Content-...",clueweb12-0000tw-00-00161
9,"<html>\n<head>\n <meta http-equiv=""Content-...",clueweb12-0000tw-00-00162


In [66]:
selected_doc = warc_df.iloc[18]

In [55]:
stripped_webpage = bsp.scrape_webpage(selected_doc['HTML_DOC'])

In [56]:
stripped_webpage

array(['\nMy account |\r\n            Log out\n',
       'Hi are you new? Start here.',
       'SAVE UP TO 50% On selected ski, snowboard and end-of-season clothing & gear.',
       '\r\n                by\r\n                \ncheshire255\n\r\n                added 8 hours ago\n',
       '@mduscavage Yes, that link works, but you have to copy and paste it, not click the link. Both links error when you just click them. But thanks for the help!',
       '\n\nthe18thtee84\n\n\n\n5 hours ago\n\n',
       'Try this link: http://www.rei.com/Sale+and+Clearance When I used the other I kept getting an error. Great sale, thanks OP!',
       '\n\nmduscavage\n\n\n\n5 hours ago\n\n',
       'hmm some of the things on the list arent listed on sale or clearance. At first glance, fivefingers and toe socks (which I was looking for!) were not discounted at all.',
       '\n\ndodey23\n\n\n\n3 hours ago\n\n',
       'Try this link: http://www.rei.com/Sale+and+Clearance When I used the other I kept getting

### 3. NLP Preprocessing

In [57]:
spacy_processor = spacy.load("en_core_web_md")
# nlp.add_pipe("entityLinker", last=True)  # entity linker
# claucy.add_to_pipe(nlp)  # Open IE

In [58]:
spacy_doc = nlp_prep.get_nlp_doc(stripped_webpage, spacy_processor)

In [59]:
processed_page = nlp_prep.nlp_preprocessing(spacy_doc)

### 4. Information extraction

##### 4.1 Named Entity Recognition

In [60]:
ner_page = ner.detect_entities(spacy_doc)

In [61]:
ner_page

Unnamed: 0,label,ner_type
0,SAVE UP,ORG
1,50%,PERCENT
2,clothing & gear,ORG
3,cheshire255,ORG
4,8 hours ago,TIME
5,first,ORDINAL
7,some $$,MONEY
8,REI,ORG
9,â¢Offer,PERSON
10,2/9/12 9pm,DATE


#### 4.2 Entity linking

In [115]:
warc_trec_id = [selected_doc['WARC-TREC-ID'] for x in range(len(ner_page))]

In [116]:
entities_to_link = ner_page['label'].to_list()
linked_entities = []
for entity in tqdm(entities_to_link, desc="Entity"):
    linked_entities.append(el.link_entity(entity))
entity_result = pd.concat([ner_page, pd.DataFrame(linked_entities), pd.DataFrame(warc_trec_id)], axis=1)

Entity: 100%|██████████| 25/25 [00:14<00:00,  1.70it/s]


In [114]:
entity_result.columns = ['entity_name', 'ner_type', 'wiki_link', 'warc_trec_id']
entity_result

ValueError: Length mismatch: Expected axis has 3 elements, new values have 4 elements

##### 4.2 Relation Extraction

In [77]:
coref_doc = cr.coref_resolution(spacy_doc)

In [78]:
relations = cre.extract_relations(coref_doc)

{}
(UP TO 50%, 8 hours ago, @mduscavage, 5 hours ago, @mduscavage, @mduscavage, OP, mduscavage, 5 hours ago, first, @mduscavage, 3 hours ago, @mduscavage, OP, mduscavage, 5 hours ago, mduscavage, 5 hours ago, first, @mduscavage, 3 hours ago, 3 hours ago, REI, 2/9/12 9, REI.com, 1-800-426-4840, REI, U.S., 2 hours ago, eckerput, Toblerone Chocolate, an hour ago, 34, 10, 4, Woot Services LLC, Deals.Deals, Deals, Deals, Deals, Deals, Woot, Inc., 2012, Dealsin, Woot, Deals, Deals, REI, Deals, REI, REI, REI, REI, Deals.Deals, Deals, Deals, Deals, five minutes)


In [79]:
relations

[('Start here', None, UP TO 50%),
 ('SAVE UP', None, 8 hours ago),
 ('selected', UP TO 50%, 8 hours ago),
 ('added', UP TO 50%, 8 hours ago),
 ('works Both', @mduscavage, 5 hours ago),
 ('have to', @mduscavage, 5 hours ago),
 ('copy', @mduscavage, 5 hours ago),
 ('paste that', @mduscavage, 5 hours ago),
 ('click that', @mduscavage, 5 hours ago),
 ('works Both', @mduscavage, 5 hours ago),
 ('click that', @mduscavage, 5 hours ago),
 ('Try this', 5 hours ago, @mduscavage),
 ('getting an', @mduscavage, OP),
 ('listed on', 5 hours ago, first),
 ('looking for', @mduscavage, 3 hours ago),
 ('discounted at all', @mduscavage, 3 hours ago),
 ('Try this', 5 hours ago, @mduscavage),
 ('getting an', @mduscavage, OP),
 ('have to', @mduscavage, 5 hours ago),
 ('copy', @mduscavage, 5 hours ago),
 ('paste that', @mduscavage, 5 hours ago),
 ('click that', @mduscavage, 5 hours ago),
 ('works Both', @mduscavage, 5 hours ago),
 ('click that', @mduscavage, 5 hours ago),
 ('listed on', 5 hours ago, first),
 

In [84]:
# Turn list of tuples into dataframe
relations_df = pd.DataFrame(relations, columns=['relation', 'subject', 'object'])
relations_df = relations_df.dropna()

In [85]:
relations_df['object'] = relations_df['object'].apply(lambda x: x.text)
relations_df['subject'] = relations_df['subject'].apply(lambda x: x.text)

In [86]:
object_entities = relations_df['object'].to_list()
subject_entities = relations_df['subject'].to_list()
obj_ents, subj_ents = [], []
print(f'Linking object entities')
for entity in tqdm(object_entities):
    obj_ents.append(el.link_entity(entity))
print(f'Linking subject entities')
for entity in tqdm(subject_entities):
    subj_ents.append(el.link_entity(entity))

Linking object entities


100%|██████████| 105/105 [00:59<00:00,  1.77it/s]


Linking subject entities


100%|██████████| 105/105 [00:49<00:00,  2.14it/s]


Making sure all entities in the relations appear in the found entities

In [89]:
relations_df['subject_wiki'] = subj_ents
relations_df['object_wiki'] = obj_ents
relations_df

[autoreload of corpus_processing.relation_extraction failed: Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 257, in check
    superreload(m, reload, self.old_objects)
  File "/opt/homebrew/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 455, in superreload
    module = reload(module)
  File "/opt/homebrew/Cellar/python@3.10/3.10.4/Frameworks/Python.framework/Versions/3.10/lib/python3.10/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 619, in _exec
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "/Users/falksinke/Library/Mobile Documents/com~apple~CloudDocs/02B-Study/02VU/MSc BA/jaar2/WebData-Processing-Tech/WebDataProcessingSystems/notebooks/../corpus_processing/relation_extraction.py", line 98, in <mod

Unnamed: 0,relation,subject,object,object_wiki,subject_wiki
2,selected,UP TO 50%,8 hours ago,https://en.wikipedia.org/wiki/Year,https://en.wikipedia.org/wiki/Gmail
3,added,UP TO 50%,8 hours ago,https://en.wikipedia.org/wiki/Windows_8,https://en.wikipedia.org/wiki/Gmail
4,works Both,@mduscavage,5 hours ago,https://en.wikipedia.org/wiki/Grand_Theft_Auto_V,
5,have to,@mduscavage,5 hours ago,https://en.wikipedia.org/wiki/Grand_Theft_Auto_V,
6,copy,@mduscavage,5 hours ago,https://en.wikipedia.org/wiki/Grand_Theft_Auto_V,
7,paste that,@mduscavage,5 hours ago,https://en.wikipedia.org/wiki/Grand_Theft_Auto_V,
8,click that,@mduscavage,5 hours ago,https://en.wikipedia.org/wiki/Grand_Theft_Auto_V,
9,works Both,@mduscavage,5 hours ago,https://en.wikipedia.org/wiki/Grand_Theft_Auto_V,
10,click that,@mduscavage,5 hours ago,https://en.wikipedia.org/wiki/Grand_Theft_Auto_V,
11,Try this,5 hours ago,@mduscavage,,https://en.wikipedia.org/wiki/Grand_Theft_Auto_V


In [92]:
relations_df.loc[(relations_df['object_wiki'].isin(linked_entities)) & (relations_df['subject_wiki'].isin(linked_entities)) , :]

Unnamed: 0,relation,subject,object,object_wiki,subject_wiki
37,operated by,4,Woot Services LLC,https://en.wikipedia.org/wiki/Amazon_(company),https://en.wikipedia.org/wiki/4
54,logged in with the same,REI,REI,https://en.wikipedia.org/wiki/Rei_Ayanami,https://en.wikipedia.org/wiki/Rei_Ayanami
61,Deals,4,Woot Services LLC,https://en.wikipedia.org/wiki/Amazon_(company),https://en.wikipedia.org/wiki/4
70,bought,REI,REI,https://en.wikipedia.org/wiki/Rei_Ayanami,https://en.wikipedia.org/wiki/Rei_Ayanami
72,make sure,REI,REI,https://en.wikipedia.org/wiki/Rei_Ayanami,https://en.wikipedia.org/wiki/Rei_Ayanami
73,logged in with the same,REI,REI,https://en.wikipedia.org/wiki/Rei_Ayanami,https://en.wikipedia.org/wiki/Rei_Ayanami
74,used to,REI,REI,https://en.wikipedia.org/wiki/Rei_Ayanami,https://en.wikipedia.org/wiki/Rei_Ayanami
75,buy from,REI,REI,https://en.wikipedia.org/wiki/Rei_Ayanami,https://en.wikipedia.org/wiki/Rei_Ayanami


In [111]:
# converta all to string
relations_df['subject_wiki'] = relations_df['subject_wiki'].astype(str)
relations_df['object_wiki'] = relations_df['object_wiki'].astype(str)

entity_result['wiki_link'] = entity_result['wiki_link'].astype(str)
entity_result['label'] = entity_result['entity_name'].astype(str)

# lstrip wiki links
relations_df['subject_wiki'] = relations_df['subject_wiki'].apply(lambda x: x.lstrip(x))
relations_df['object_wiki'] = relations_df['object_wiki'].apply(lambda x: x.lstrip(x))
entity_result['wiki_link'] = entity_result['wiki_link'].apply(lambda x: x.lstrip(x))

In [113]:
entity_result

Unnamed: 0,warc_trec_id,entity_name,wiki_link
0,clueweb12-0000tw-00-00835,SAVE UP,
1,clueweb12-0000tw-00-00835,50%,
2,clueweb12-0000tw-00-00835,clothing & gear,
3,clueweb12-0000tw-00-00835,cheshire255,
4,clueweb12-0000tw-00-00835,8 hours ago,
5,clueweb12-0000tw-00-00835,first,
7,clueweb12-0000tw-00-00835,some $$,
8,clueweb12-0000tw-00-00835,REI,
9,clueweb12-0000tw-00-00835,â¢Offer,
10,clueweb12-0000tw-00-00835,2/9/12 9pm,


In [112]:
entity_result = entity_result[['warc_trec_id', 'entity_name', 'wiki_link']]
for entity in entity_result.iterrows():
    print(f'ENTITY:{entity[1]["warc_trec_id"]}	{entity[1]["entity_name"]}	{entity[1]["wiki_link"]}')

ENTITY:clueweb12-0000tw-00-00835	SAVE UP	
ENTITY:clueweb12-0000tw-00-00835	50%	
ENTITY:clueweb12-0000tw-00-00835	clothing & gear	
ENTITY:clueweb12-0000tw-00-00835	cheshire255	
ENTITY:clueweb12-0000tw-00-00835	8 hours ago	
ENTITY:clueweb12-0000tw-00-00835	first	
ENTITY:clueweb12-0000tw-00-00835	some $$	
ENTITY:clueweb12-0000tw-00-00835	REI	
ENTITY:clueweb12-0000tw-00-00835	â¢Offer	
ENTITY:clueweb12-0000tw-00-00835	2/9/12 9pm	
ENTITY:clueweb12-0000tw-00-00835	2/20/12 11:59pm	
ENTITY:clueweb12-0000tw-00-00835	REI.com	
ENTITY:clueweb12-0000tw-00-00835	1-800-426	
ENTITY:clueweb12-0000tw-00-00835	U.S.	
ENTITY:clueweb12-0000tw-00-00835	Toblerone Chocolate	
ENTITY:clueweb12-0000tw-00-00835	madcow19



	
ENTITY:clueweb12-0000tw-00-00835	34	
ENTITY:clueweb12-0000tw-00-00835	10	
ENTITY:clueweb12-0000tw-00-00835	4	
ENTITY:clueweb12-0000tw-00-00835	Woot	
ENTITY:nan	Woot Services LLC	
ENTITY:nan	Wootâs	
ENTITY:nan	a Quality Post	
ENTITY:nan	2004-2012	
ENTITY:nan	five minutes	
ENTITY:clueweb12-000

In [None]:
def generate_results(entity_result, relations_df):
    
    entity_re

### 5. Linking