In [1]:
import json
import requests
import glob

In [2]:
relations = json.loads('[{ "subject": "Q76", "predictedPredicate": "P26", "obj": "Q13133", "sentence": "bla bla", "source": "the user page", "probability": "0.99" }]')    

relations[0]['subject']
relations.append({"subject": "Q76", "predictedPredicate": "P26", "obj": "Q13133", "sentence": "another sentence", "source": "the user page", "probability": "0.80" })
relations.append({"subject": "Q76", "predictedPredicate": "P19", "obj": "Q234", "sentence": "unrelated", "source":"hej", "probability": "0.3"})
relations.append({"subject": "Q76", "predictedPredicate": "P19", "obj": "Q234", "sentence": "unrelated 2", "source":"hej 2", "probability": "0.45"})
relations.append({'subject': 'Q76', 'predictedPredicate': 'P26', 'obj': 'Q11696', 'sentence': "barack is married to his job", 'source': 'urn:wikidata:Q76', 'probability': 0.8461098093518024})
relations

[{'obj': 'Q13133',
  'predictedPredicate': 'P26',
  'probability': '0.99',
  'sentence': 'bla bla',
  'source': 'the user page',
  'subject': 'Q76'},
 {'obj': 'Q13133',
  'predictedPredicate': 'P26',
  'probability': '0.80',
  'sentence': 'another sentence',
  'source': 'the user page',
  'subject': 'Q76'},
 {'obj': 'Q234',
  'predictedPredicate': 'P19',
  'probability': '0.3',
  'sentence': 'unrelated',
  'source': 'hej',
  'subject': 'Q76'},
 {'obj': 'Q234',
  'predictedPredicate': 'P19',
  'probability': '0.45',
  'sentence': 'unrelated 2',
  'source': 'hej 2',
  'subject': 'Q76'},
 {'obj': 'Q11696',
  'predictedPredicate': 'P26',
  'probability': 0.8461098093518024,
  'sentence': 'barack is married to his job',
  'source': 'urn:wikidata:Q76',
  'subject': 'Q76'}]

In [3]:
data_cache = []

In [20]:
def check_relation(relation):
    sub = relation['subject']
    obj = relation['obj']
    pred = relation['predictedPredicate']

    if not data_cache:
        # read data
        files = glob.glob("extractions/part-*")
        for path in files:
            with open(path) as file:
                lines = file.readlines()
                data_cache.extend([json.loads(l) for l in lines])
    matches = [match for match in data_cache if match['predictedPredicate'] == pred and match['subject'] == sub]
    
    if len(matches) == 0:
        return ("unknown", [])
    
    for match in matches:
        if match['obj'] == obj:
            # found one match, the relation is considered True
            return ("verified", [match])

    return ("conflicting", matches)

In [22]:
# group extracted relations together
def keyfunc(relation):
    return (relation['subject'], relation['predictedPredicate'], relation['obj'])
from itertools import groupby
extractions = []
uniquekeys = []
data = sorted(relations, key=keyfunc)
for k, g in groupby(data, keyfunc):
    extractions.append(list(g))      # Store group iterator as a list
    uniquekeys.append(k)

In [23]:
cache = {}
def label_for(q):
    q = q.upper()
    if q in cache:
        return cache[q]
    url = 'https://www.wikidata.org/w/api.php?action=wbgetentities&props=labels&ids=%s&languages=en&format=json' % q
    resp = requests.get(url)
    v = resp.json()['entities'][q]['labels']['en']['value']
    cache[q] = v
    return v

In [26]:
def link_for(q):
    return f"https://www.wikidata.org/wiki/{q}"
def wiki_link_for(name):
    name_quoted = name_quoated.replace(" ", "_")
    return f"https://en.wikipedia.org/wiki/{name_quoted}"

results = []
# result will have structure: [(('labelQ1', 'labelP', 'labelQ2'))]

def trim_evidence(evidence):
    name = evidence['source'].split(":")

    if len(name) == 3:
        name = name[2]
    else:
        name = ""
        
    return {
        'subject': label_for(evidence['subject']),
        'object': label_for(evidence['obj']),
        'predicate': label_for(evidence['predictedPredicate']),
        'snippet': label_for(evidence['sentence']),
        'link': wiki_link_for(label_for(name))
    }

for extraction in extractions:
    # only check once per actual relation
    print(extraction[0])
    evidence = check_relation(extraction[0])
    print(evidence)
    result = {}
    result['subject'] = {
        'name': label_for(extraction[0]['subject']),
        'link': link_for(extraction[0]['subject'])
    }
    result['object'] = {
        'name': label_for(extraction[0]['obj']),
        'link': link_for(extraction[0]['obj'])
    }
    result['predicate'] = {
        'name': label_for(extraction[0]['predictedPredicate']),
        'link': link_for(extraction[0]['predictedPredicate'])
    }
    result['sentences'] = list(map(lambda r: r['sentence'], extraction))
    result['type'] = evidence[0]
    for match in evidence[1]:
        match['subject'] = label_for(match['subject'])
        match['predictedPredicate'] = label_for(match['predictedPredicate'])
        match['obj'] = label_for(match['obj'])

    result['evidence'] = list(map(lambda evidence: trim_evidence(evidence), evidence[1])) 
    
    results.append(result)

results

{'subject': 'Q76', 'predictedPredicate': 'P19', 'obj': 'Q234', 'sentence': 'unrelated', 'source': 'hej', 'probability': '0.3'}
('unknown', [])
{'subject': 'Q76', 'predictedPredicate': 'P26', 'obj': 'Q11696', 'sentence': 'barack is married to his job', 'source': 'urn:wikidata:Q76', 'probability': 0.8461098093518024}
('unknown', [])
{'subject': 'Q76', 'predictedPredicate': 'P26', 'obj': 'Q13133', 'sentence': 'bla bla', 'source': 'the user page', 'probability': '0.99'}
('unknown', [])


[{'evidence': [],
  'object': {'link': 'https://www.wikidata.org/wiki/Q234',
   'name': 'Flemish Region'},
  'predicate': {'link': 'https://www.wikidata.org/wiki/P19',
   'name': 'place of birth'},
  'sentences': ['unrelated', 'unrelated 2'],
  'subject': {'link': 'https://www.wikidata.org/wiki/Q76',
   'name': 'Barack Obama'},
  'type': 'unknown'},
 {'evidence': [],
  'object': {'link': 'https://www.wikidata.org/wiki/Q11696',
   'name': 'President of the United States of America'},
  'predicate': {'link': 'https://www.wikidata.org/wiki/P26', 'name': 'spouse'},
  'sentences': ['barack is married to his job'],
  'subject': {'link': 'https://www.wikidata.org/wiki/Q76',
   'name': 'Barack Obama'},
  'type': 'unknown'},
 {'evidence': [],
  'object': {'link': 'https://www.wikidata.org/wiki/Q13133',
   'name': 'Michelle Obama'},
  'predicate': {'link': 'https://www.wikidata.org/wiki/P26', 'name': 'spouse'},
  'sentences': ['bla bla', 'another sentence'],
  'subject': {'link': 'https://www.wi

In [4]:
from bs4 import BeautifulSoup
import requests

In [31]:
page = requests.get('http://www.breitbart.com/jerusalem/2017/06/09/iran-turns-wrath-u-s-saudi-attacks/').text

soup = BeautifulSoup(page, 'html.parser')

print(soup.prettify())

<!DOCTYPE html>
<html dir="ltr" lang="en" prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# dc: http://purl.org/dc/elements/1.1/# v: http://rdf.data-vocabulary.org/# op: http://media.facebook.com/op#" xml:lang="en">
 <head>
  <meta charset="utf-8">
   <meta content="width=device-width,initial-scale=1" name="viewport">
    <title>
     Iran Turns Wrath on U.S., Saudi Over Terror Attacks
    </title>
    <link href="//media.breitbart.com" rel="preconnect">
     <link href="//breitbartproduction.disqus.com" rel="dns-prefetch">
      <link as="style" crossorigin="" href="https://fonts.googleapis.com/css?family=Fjalla+One" rel="preload">
       <link as="font" crossorigin="" href="/t/assets/fonts/bbn.woff2" rel="preload" type="font/woff2">
        <link as="style" href="/t/style-014970640.css" rel="preload">
         <script>
          if ( ! document.cookie || document.cookie.indexOf('Cs04') === -1 ) { var a=document.createElement("link");a.rel="preload";a.as="image";a.href="/t/assets

In [34]:
import re

#[s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]

#visible_text = soup.getText()
result = "\n".join([p.getText().strip() for p in soup.find_all('p')])
result

"by Breitbart Jerusalem9 Jun 20170\nSIGN UP FOR OUR NEWSLETTER\nThe assault by gunmen and suicide bombers Wednesday on Tehran’s parliament complex and the shrine of revolutionary leader Ayatollah Ruhollah Khomeini killed 17 people and wounded more than 50.\nMourners were paying respects to those killed at a ceremony in parliament on Friday morning, in the presence of newly re-elected moderate President Hassan Rouhani.\nSupreme leader Ayatollah Ali Khamenei initially played down the attacks this week, describing them as “firecrackers” that “will not have the slightest effect on the will of the people”.\nAt Friday’s funeral, however, he turned his wrath for the attacks on the United States and Saudi Arabia, his country’s fiercest rivals.\n“Such acts will have no other result than to reinforce hatred for the US government and its agents in the region, like the Saudi (government),” Khamenei wrote in a message of condolence to the families of the dead.\nParliament speaker Ali Larijani also 

In [70]:
text = "Here is a sentence. And another. And two. Threee sentencenes is too much almost. But certainly four at a time is excessive?"

In [83]:
def chunk_text(text, chunks):
    if not text:
        return chunks
    else:
        chunk = ".".join(text.split(".")[:2]) + "."
        rest = ".".join(text.split(".")[2:])
        return chunk_text(rest, chunks + [chunk])
    
chunk_text(text, [])



['Here is a sentence. And another.',
 ' And two. Threee sentencenes is too much almost.',
 ' But certainly four at a time is excessive?.']

In [78]:
res = []
res.append("hsj")
res

['hsj']