# Full pipeline in action (beta version)
----------

This notebook contains a set up of the full pipeline

**Inputs:**

- 
**Outputs:**
- results.csv (triple per line)

##### Table of contents:
1. Setup
2. Scraping
3. NLP Preprocessing
4. Information extraction
    -  Named entity recognition
    - Relation extraction
5. Linking
    - Entity linking
    - Relation linking
----------

### 1. Setup

##### Imports, settings & constants

In [1]:
import spacy
import pandas as pd
import numpy as np
import re
import requests
import sys
from tqdm import tqdm
from pathlib import Path

import claucy


sys.path.insert(0, "../")
# Imports for NLP
from nlp import beautifulsoup as bsp
from nlp import nlp_preprocessing as nlp_prep
from nlp.read_warc2 import read_warc
from corpus_processing import relation_extraction as cre
from corpus_processing import entity_relation_coupling as erc
from corpus_processing import ner 
from corpus_processing import relation_extraction as re
from corpus_processing import relation_linking as rl
from corpus_processing import entity_linking as el 
from corpus_processing import coref as cr

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
# autoreload
%load_ext autoreload
%autoreload 2

In [4]:
DIR_DATA = Path("../data/warcs")
FNAME_WARC = "sample.warc.gz"

### 1. Reading WARC files

In [5]:
warc_df = read_warc(DIR_DATA / FNAME_WARC)

reading warc file...
warc list ['clueweb12-0000tw-00-00006' 'clueweb12-0000tw-00-00043'
 'clueweb12-0000tw-00-00044' 'clueweb12-0000tw-00-00101'
 'clueweb12-0000tw-00-00129' 'clueweb12-0000tw-00-00130'
 'clueweb12-0000tw-00-00141' 'clueweb12-0000tw-00-00160'
 'clueweb12-0000tw-00-00161' 'clueweb12-0000tw-00-00162'
 'clueweb12-0000tw-00-00199' 'clueweb12-0000tw-00-00205'
 'clueweb12-0000tw-00-00210' 'clueweb12-0000tw-00-00212'
 'clueweb12-0000tw-00-00214' 'clueweb12-0000tw-00-00829'
 'clueweb12-0000tw-00-00833' 'clueweb12-0000tw-00-00834'
 'clueweb12-0000tw-00-00835' 'clueweb12-0000tw-00-00836'
 'clueweb12-0000tw-00-00837' 'clueweb12-0000tw-00-00843'
 'clueweb12-0000tw-00-00844' 'clueweb12-0000tw-00-00877'
 'clueweb12-0000tw-00-00879' 'clueweb12-0000tw-00-00880'
 'clueweb12-0000tw-00-00881' 'clueweb12-0000tw-00-00941'
 'clueweb12-0000tw-00-00942' 'clueweb12-0000tw-00-00943'
 'clueweb12-0000tw-00-00944' 'clueweb12-0000tw-00-00946']


In [6]:
warc_df

Unnamed: 0,HTML_DOC,WARC-TREC-ID
0,"<html>\r\n\r\n<head>\r\n<meta http-equiv=""Cont...",clueweb12-0000tw-00-00006
1,"<html>\n\n<head>\n\n<style type=""text/css"">\n\...",clueweb12-0000tw-00-00043
2,"<html>\n<head>\n<style type=""text/css"">\n#bott...",clueweb12-0000tw-00-00044
3,<html>\n<head>\n\n<title>DHP Concerts Bristol ...,clueweb12-0000tw-00-00101
4,<html>\n<head>\n <title>12 Tourist Spots in ...,clueweb12-0000tw-00-00129
5,<html>\r\n<head>\r\n<title>Saving Money Buying...,clueweb12-0000tw-00-00130
6,<html>\n<head>\n <title>Freelance Web Desig...,clueweb12-0000tw-00-00141
7,"<html>\n<head>\n<meta http-equiv=""Content-Type...",clueweb12-0000tw-00-00160
8,"<html>\n<head>\n <meta http-equiv=""Content-...",clueweb12-0000tw-00-00161
9,"<html>\n<head>\n <meta http-equiv=""Content-...",clueweb12-0000tw-00-00162


In [7]:
stripped_webpages = [bsp.scrape_webpage(webpage) for webpage in warc_df['HTML_DOC']]

### 3. NLP Preprocessing

In [8]:
spacy_processor = spacy.load("en_core_web_md")
# nlp.add_pipe("entityLinker", last=True)  # entity linker
# claucy.add_to_pipe(nlp)  # Open IE

In [9]:
spacy_docs = [nlp_prep.get_nlp_doc(page, spacy_processor) for page in stripped_webpages]

In [10]:
processed_pages = [nlp_prep.nlp_preprocessing(doc) for doc in spacy_docs]

### 4. Information extraction

##### 4.1 Named Entity Recognition

In [11]:
ner_pages = [ner.detect_entities(doc) for doc in spacy_docs]

#### 4.2 Entity linking

In [13]:
linked_pages = []
for document in tqdm(ner_pages, desc="Document"):
    new_doc = document.copy()
    entities_to_link = document['label'].to_list()
    linked_entities = []
    for entity in tqdm(entities_to_link, desc="Entity", leave=False):
        linked_entities.append(el.link_entity(entity))
    new_doc = pd.concat([new_doc, pd.DataFrame(linked_entities)], axis=1)
    linked_pages.append(new_doc)

Document: 100%|██████████| 32/32 [03:31<00:00,  6.62s/it]


##### 4.2 Relation Extraction

In [21]:
relations = [cre.extract_relations(doc) for doc in spacy_docs]

{}
(First, first, first)
{}
(5$, 1$, 90$, 5$    

					|   , 1$    

					|, 90$ 



					- Signup, 1$, end of month, 90$, Alertpay, Liberty, reserve, & Western Union)
{}
(5$, 1$, 90$, 5$    
					|   , 1$    
					|, 90$)
{}
(DHP Concerts, Bristol, Bristol)
{}
(The Serious Eats Team, July 25, 2011, 10:30 AM, first, New York, Norma, Magnolia, late morning, Lombardi, Carmine, Serendipity 3, NYC, tomorrow, NYC, Post a Comment



Favorite, 83, Balthazar, Doughnut Plant, Eataly, Grand Central Oyster Bar, Gray's Papaya, Joe's, Katz's, Murray, Bagel, Papaya King, Peter Luger, Pommes Frites, Shake Shack, Spotted Pig, a moment, up to a minute, Chicken Wingsssss!, 0, NYC, 2, Indian Restaurant, Jackson Heights, NY, 13, No More Whipped Cream, Shake Shack, 8, NY 3, New York Penn Station, 14, New York, The Vegan Experience, Day 24, 8, Los Angeles, El Taurino's, Salsa Rojo, Better Than the Actual, Tacos, 0, Gluten-Free, Tuesday, 0, Gluten-Free Peanut Butter Chocolate Chip Bars, 0, Atlanta, 2, Eats, 

In [22]:
relations

[[('Avoid', None, First),
  ('Creating', None, First),
  ('said', First, first),
  ('get', First, first),
  ('commit', first, first),
  ('publish', first, first),
  ('help', first, None),
  ('minimize those', first, None),
  ('make the whole', first, None),
  ('Read the', first, None),
  ('Related Articles:-', first, None)],
 [('joining', None, 5$),
  ('=', None, 5$),
  ('=', None, 5$),
  ('leading',
   90$,
   5$    
   
   					|   ),
  ('earning',
   90$,
   5$    
   
   					|   ),
  ('sharing referral',
   90$,
   5$    
   
   					|   ),
  ('get',
   90$ 
   
   
   
   					- Signup,
   1$),
  ('earning',
   90$,
   5$    
   
   					|   ),
  ('visit', 5$, 1$),
  ('Get', 1$, end of month),
  ('refer the referral', & Western Union, None),
  ('get',
   90$ 
   
   
   
   					- Signup,
   1$),
  ('earning',
   90$,
   5$    
   
   					|   ),
  ('open', & Western Union, None)],
 [('joining', None, 5$),
  ('=', None, 5$),
  ('=', None, 5$),
  ('leading',
   90$,
   5$    
   	

In [23]:
# Turn list of tuples into dataframe
relations_frames = []
for relation_set in relations:
    relations_df = pd.DataFrame(relation_set, columns=['relation', 'object', 'subject'])
    relations_df['object'] = relations_df['object'].apply(lambda x: x.text)
    relations_df['subject'] = relations_df['subject'].apply(lambda x: x.text)
    relations_frames.append(relations_df)

AttributeError: 'NoneType' object has no attribute 'text'

In [33]:
object_entities = relations_df['object'].to_list()
subject_entities = relations_df['subject'].to_list()
obj_ents, subj_ents = [], []
print(f'Linking object entities')
for entity in tqdm(object_entities):
    obj_ents.append(el.link_entity(entity))
print(f'Linking subject entities')
for entity in tqdm(subject_entities):
    subj_ents.append(el.link_entity(entity))

Linking object entities
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 145/145 [01:22<00:00,  1.75it/s]


Linking subject entities


100%|██████████| 145/145 [01:25<00:00,  1.70it/s]


Making sure all entities in the relations appear in the found entities

In [36]:
relations_df['object_wiki'] = obj_ents
relations_df['subject_wiki'] = subj_ents
relations_df

Unnamed: 0,relation,object,subject,object_wiki,subject_wiki
0,convicted of,Donald Trump's,Tuesday,https://en.wikipedia.org/wiki/Donald_Trump,https://en.wikipedia.org/wiki/Tuesday_Weld
1,brought by the,Tuesday,Manhattan,https://en.wikipedia.org/wiki/Tuesday_Weld,https://en.wikipedia.org/wiki/New_York_City
2,found in the,Trump,Constitution,https://en.wikipedia.org/wiki/Donald_Trump,https://en.wikipedia.org/wiki/U.S._state
3,including,17,the second day,https://en.wikipedia.org/wiki/1,https://en.wikipedia.org/wiki/Battle_of_Thermo...
4,falsifying,17,the second day,https://en.wikipedia.org/wiki/1,https://en.wikipedia.org/wiki/Battle_of_Thermo...
...,...,...,...,...,...
140,Join the,our Submission Guidelines,Audience Relations,https://en.wikipedia.org/wiki/BDSM,https://en.wikipedia.org/wiki/Public_relations
141,create a,CBC,Canadians,https://en.wikipedia.org/wiki/Columbia_Pictures,https://en.wikipedia.org/wiki/Canada
142,including,17,the second day,https://en.wikipedia.org/wiki/1,https://en.wikipedia.org/wiki/Battle_of_Thermo...
143,Described,Canadians,CBC,https://en.wikipedia.org/wiki/Canada,https://en.wikipedia.org/wiki/Columbia_Pictures


In [54]:
relations_df.loc[(relations_df['object_wiki'].isin(linked_entities)) & (relations_df['subject_wiki'].isin(linked_entities)) , :]

Unnamed: 0,relation,object,subject,object_wiki,subject_wiki
0,convicted of,Donald Trump's,Tuesday,https://en.wikipedia.org/wiki/Donald_Trump,https://en.wikipedia.org/wiki/Tuesday_Weld
1,brought by the,Tuesday,Manhattan,https://en.wikipedia.org/wiki/Tuesday_Weld,https://en.wikipedia.org/wiki/New_York_City
2,found in the,Trump,Constitution,https://en.wikipedia.org/wiki/Donald_Trump,https://en.wikipedia.org/wiki/U.S._state
3,including,17,the second day,https://en.wikipedia.org/wiki/1,https://en.wikipedia.org/wiki/Battle_of_Thermo...
4,falsifying,17,the second day,https://en.wikipedia.org/wiki/1,https://en.wikipedia.org/wiki/Battle_of_Thermo...
...,...,...,...,...,...
140,Join the,our Submission Guidelines,Audience Relations,https://en.wikipedia.org/wiki/BDSM,https://en.wikipedia.org/wiki/Public_relations
141,create a,CBC,Canadians,https://en.wikipedia.org/wiki/Columbia_Pictures,https://en.wikipedia.org/wiki/Canada
142,including,17,the second day,https://en.wikipedia.org/wiki/1,https://en.wikipedia.org/wiki/Battle_of_Thermo...
143,Described,Canadians,CBC,https://en.wikipedia.org/wiki/Canada,https://en.wikipedia.org/wiki/Columbia_Pictures


### 5. Linking