In [1]:
from tqdm.autonotebook import tqdm

import findspark
findspark.init()
from pyspark import SparkContext
import pyspark
# Spark resources for this notebook: 7 executors x 2 cores x 8g, plus a large driver
# because several cells collect() multi-million-row results back to the driver.
conf = pyspark.SparkConf().setAll([
    ('spark.executor.memory', '8g'),
    ('spark.executor.cores', '2'),
    ('spark.executor.instances', '7'),
    ('spark.driver.memory', '150g'),
    ('spark.driver.maxResultSize', '100g'),
])
# getOrCreate() reuses the running context when this cell is re-executed in a live
# kernel; calling SparkContext(conf=conf) a second time raises instead.
sc = SparkContext.getOrCreate(conf=conf)

from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, FloatType, StringType

from pyspark.sql.types import Row
from pyspark.sql import SparkSession
# Wrap the SparkContext in a SparkSession so the DataFrame API used below
# (spark.read, spark.createDataFrame) is available.
spark = SparkSession(sc)

import json
import numpy as np
import re

from operator import add

from urllib.parse import unquote

  """Entry point for launching an IPython kernel.


In [7]:
def numpy_describe(array):
    """Print summary statistics of ``array`` to stdout, one ``label value`` pair per line.

    Reports count, min, max, mean and std, followed by the 10/25/50/60/75/80/90th
    percentiles. Nothing is returned.
    """
    basic_stats = (
        ('count', len),
        ('min', np.min),
        ('max', np.max),
        ('mean', np.mean),
        ('std', np.std),
    )
    for label, stat in basic_stats:
        print(label, stat(array))
    for pct in (10, 25, 50, 60, 75, 80, 90):
        print('%d%%' % pct, np.percentile(array, pct))

In [6]:
# The index_enwiki SQLite dump can be created with this library: https://github.com/jcklie/wikimapper
wikipedia_wikidata_mapping = (
    spark.read.format("jdbc")
    .options(
        url="jdbc:sqlite:/data/deng.595/workspace/wikimapper/index/index_enwiki-latest.db",
        driver="org.sqlite.JDBC",
        dbtable="mapping",
    )
    .load()
)
wikipedia_wikidata_mapping.show()

+------------+--------------------+-----------+
|wikipedia_id|     wikipedia_title|wikidata_id|
+------------+--------------------+-----------+
|          10| AccessibleComputing|   Q3097841|
|          12|           Anarchism|      Q6199|
|          13|  AfghanistanHistory|    Q188872|
|          14|AfghanistanGeography|   Q1637198|
|          15|   AfghanistanPeople|   Q1075999|
|          18|AfghanistanCommun...|   Q2658920|
|          19|AfghanistanTransp...|    Q509443|
|          20| AfghanistanMilitary|  Q11062919|
|          21|AfghanistanTransn...|   Q4113710|
|          23| AssistiveTechnology|    Q688498|
|          24|        AmoeboidTaxa|    Q506524|
|          25|              Autism|     Q38404|
|          27|      AlbaniaHistory|    Q213833|
|          29|       AlbaniaPeople|    Q583150|
|          30|        AsWeMayThink|    Q610709|
|          35|   AlbaniaGovernment|    Q917351|
|          36|      AlbaniaEconomy|      Q8055|
|          39|              Albedo|    Q

In [9]:
# Freebase-related information can be ignored: we use DBpedia abstracts and types.
# Each line of mid2wiki.txt is whitespace-separated: field 0 is the Freebase mid,
# field 1 the Wikipedia page id.
wiki_mid_mapping = spark.createDataFrame(
    sc.textFile('../../../freebase_utils/freebase_dumped/mid2wiki.txt')
    .map(lambda line: line.split())
    .map(lambda fields: Row(
        wikipedia_id=int(fields[1]),
        freebase_mid=fields[0],
    ))
)

In [26]:
# wikidata_id -> list of DBpedia type names, collected to the driver as a plain dict.
# Each whitespace-split line is read as <subject> <predicate> <object>; the subject is
# reduced to its Wikipedia title and the object to the last path segment of its URI.
dbpedia_types = dict(
    spark.createDataFrame(
        sc.textFile('../../../freebase_utils/dbpedia_2019_08_30/instance_types_en.ttl')
        .map(lambda line: line.split())
        .map(lambda triple: Row(
            wikipedia_title=unquote(triple[0][1:-1]).replace('http://dbpedia.org/resource/', ''),
            type=triple[2][1:-1].split('/')[-1],
        ))
    )
    .join(wikipedia_wikidata_mapping, 'wikipedia_title', 'inner')
    .rdd
    .map(lambda rec: (rec['wikidata_id'], [rec['type']]))
    .reduceByKey(add)
    .collect()
)
print(len(dbpedia_types))

3132402


In [40]:
# wikidata_id -> short abstract, collected to the driver as a plain dict.
# Lines that do not match `<subject> <predicate> "literal"` are dropped; escaped
# double quotes inside the literal are unescaped.
dbpedia_abstract = dict(
    spark.createDataFrame(
        sc.textFile('../../../freebase_utils/dbpedia_2019_08_30/short_abstracts_en.ttl')
        .map(lambda line: re.match(r'(<.+>) (<.+>) (\".+\")', line))
        .filter(lambda match: match is not None)
        .map(lambda match: Row(
            wikipedia_title=unquote(match.group(1)[1:-1]).replace('http://dbpedia.org/resource/', ''),
            abstract=match.group(3)[1:-1].replace("\\\"", "\""),
        ))
    )
    .join(wikipedia_wikidata_mapping, 'wikipedia_title', 'inner')
    .rdd
    .map(lambda rec: (rec['wikidata_id'], rec['abstract']))
    .collect()
)
print(len(dbpedia_abstract))

4714263


In [103]:
# NOTE(review): the cell that builds dbpedia_abstract wraps the collected pairs in
# dict(...), and a dict has no .show(). The table printed below presumably comes from an
# earlier DataFrame version of this variable - confirm before relying on this cell.
dbpedia_abstract.show()

+--------------------+--------------------+
|            abstract|     wikipedia_title|
+--------------------+--------------------+
|!!! is a dance-pu...|                 !!!|
|!!! is the eponym...|         !!!_(album)|
|!!Destroy-Oh-Boy!...|  !!Destroy-Oh-Boy!!|
|!Action Pact! wer...|       !Action_Pact!|
|!Arriba! La Pacha...|!Arriba!_La_Pachanga|
|!HERO is a rock o...|               !Hero|
|!Hero is an album...|       !Hero_(album)|
|!Kung /ˈkʊŋ/ (!Xu...|      !Kung_language|
|!Oka Tokat is a p...|          !Oka_Tokat|
|!PAUS3, or THEE P...|              !PAUS3|
|!T.O.O.H.! (an ac...|          !T.O.O.H.!|
|!WOWOW! is a coll...|             !WOWOW!|
|!Women Art Revolu...|!Women_Art_Revolu...|
|! is an album by ...|!_(The_Dismemberm...|
|$1,000 Reward is ...|       $1,000_Reward|
|$1,000 a Touchdow...|  $1,000_a_Touchdown|
|The $1,000 genome...|       $1,000_genome|
|$1.99 Romances is...|      $1.99_Romances|
|The $100,000 Fort...|$100,000_Fortune_...|
|The $100,000 infi...|    $100,0

In [49]:
# Freebase types per mid. Each line is `<mid>\t<JSON list of types>`; types in the
# user.*, base.* and common.* namespaces are filtered out.
fb_en_types = spark.createDataFrame(
    sc.textFile("/data/deng.595/workspace/freebase_utils/freebase_dumped/fb_en_types.txt")
    .map(lambda line: line.split('\t'))
    .map(lambda fields: Row(
        freebase_mid=fields[0],
        types=[t for t in json.loads(fields[1]) if not t.startswith(('user.', 'base.', 'common.'))],
    ))
)

In [400]:
# Load the raw tables: one JSON document per line, one RDD per split.
data_dir = "/srv/samba/group_workspace_1/deng.595/workspace/table_transformer/data/wikitable_entity/v2/"
train_tables = sc.textFile(data_dir + "train_tables.jsonl").map(json.loads)
val_tables = sc.textFile(data_dir + "dev_tables.jsonl").map(json.loads)
test_tables = sc.textFile(data_dir + "test_tables.jsonl").map(json.loads)

In [401]:
# Sanity check: count the test tables by extracting each table's '_id'.
test_tables.map(lambda x:x['_id']).count()

4964

In [402]:
def get_mentions(table):
    """Extract one Row per linked entity cell of a table.

    Args:
        table: dict parsed from one line of the *_tables.jsonl files. Reads the keys
            '_id', 'pgTitle', 'tableData' (list of rows, each a list of cells),
            'entityColumn' (column indices to inspect) and 'entityCell' (2-D 0/1 mask).

    Returns:
        list of Row(table_id, table_pgTitle, i, j, mention, wikipedia_id,
        wikipedia_title), built from the first surface link of every cell whose mask
        value is 1. Tables with missing or empty 'tableData' yield an empty list.
    """
    rows = table.get("tableData") or []
    if not rows:
        # Nothing to scan; previously this crashed on rows[0].
        return []
    entity_columns = table.get("entityColumn", [])
    entity_cells = np.array(table.get("entityCell", [[]]))
    results = []
    for i in range(len(rows)):
        for j in entity_columns:
            if entity_cells[i, j] == 1:
                # Only the first surface link of the cell is used.
                link = rows[i][j]['surfaceLinks'][0]
                results.append(Row(
                    table_id=table['_id'],
                    table_pgTitle=table['pgTitle'],
                    i=i, j=j,
                    mention=link['surface'],
                    wikipedia_id=link['target']['id'],
                    wikipedia_title=link['target']['title']
                ))
    return results

In [403]:
# Mention-level DataFrames for our own train/dev/test splits: every Row returned by
# get_mentions for every table becomes one DataFrame row.
train_mentions= spark.createDataFrame(train_tables.flatMap(get_mentions))
val_mentions= spark.createDataFrame(val_tables.flatMap(get_mentions))
test_mentions= spark.createDataFrame(test_tables.flatMap(get_mentions))

In [14]:
# Preview the first rows of the training mentions.
train_mentions.show()

+---+---+--------------------+---------+--------------------+------------+--------------------+
|  i|  j|             mention| table_id|       table_pgTitle|wikipedia_id|     wikipedia_title|
+---+---+--------------------+---------+--------------------+------------+--------------------+
|  0|  0|Public Against Vi...|2728176-1|Slovak parliament...|     1314381|Public_Against_Vi...|
|  1|  0|Christian Democra...|2728176-1|Slovak parliament...|      482855|Christian_Democra...|
|  2|  0|Slovak National P...|2728176-1|Slovak parliament...|     1381773|Slovak_National_P...|
|  3|  0|Communist Party o...|2728176-1|Slovak parliament...|    11450391|Communist_Party_o...|
|  4|  0|         Coexistence|2728176-1|Slovak parliament...|    40341252|Coexistence_(poli...|
|  5|  0|    Democratic Party|2728176-1|Slovak parliament...|     1973976|Democratic_Party_...|
|  6|  0|     Party of Greens|2728176-1|Slovak parliament...|    47633978|     Party_of_Greens|
|  9|  0|       Freedom Party|2728176-1|

In [9]:
# WikiGS gold entity annotations. Each input line is a JSON table record whose
# 'mappings' entries are read as (DBpedia URI, mention text, row index); every
# mapping becomes one Row flagged with is_gs=1.
wikipedia_gs_entity_mentions = spark.createDataFrame(
    sc.textFile('../../data/entity_linking/WikipediaGS_json/entities_instance')
    .map(json.loads)
    .flatMap(lambda table: [
        Row(
            i=mapping[2],
            tableId=unquote(table['tableId']),
            is_gs=1,
            table_pgTitle=unquote(table['url']).split('/')[-1].replace('_', ' '),
            wikipedia_title=unquote(mapping[0]).replace('http://dbpedia.org/resource/', ''),
            mention=mapping[1],
        )
        for mapping in table['mappings']
    ])
)

In [41]:
# Preview the first rows of the WikiGS gold mentions.
wikipedia_gs_entity_mentions.show()

+---+-----+--------------------+--------------------+-------------------+--------------------+
|  i|is_gs|             mention|             tableId|      table_pgTitle|     wikipedia_title|
+---+-----+--------------------+--------------------+-------------------+--------------------+
|  1|    1|           Tapeheads|https://en.wikipe...|"Weird Al" Yankovic|           Tapeheads|
|  2|    1|The Naked Gun: Fr...|https://en.wikipe...|"Weird Al" Yankovic|The_Naked_Gun:_Fr...|
|  3|    1|                 UHF|https://en.wikipe...|"Weird Al" Yankovic|          UHF_(film)|
|  4|    1|The Naked Gun 2Â½...|https://en.wikipe...|"Weird Al" Yankovic|The_Naked_Gun_2½:...|
|  6|    1|            Spy Hard|https://en.wikipe...|"Weird Al" Yankovic|            Spy_Hard|
|  7|    1|       Safety Patrol|https://en.wikipe...|"Weird Al" Yankovic|Safety_Patrol_(film)|
| 10|    1|  Haunted Lighthouse|https://en.wikipe...|"Weird Al" Yankovic|  Haunted_Lighthouse|
| 11|    1|        Halloween II|https://en.wikipe.

In [10]:
def get_gs_mentions(table):
    """Collect every linked cell of a WikiGS table.

    Walks table['contents'] (rows of cell dicts) and emits one
    Row(tableId, i, j, mention, wikipedia_title) for each cell that carries a
    'wikiPageId' key. The table id and the page id are URL-unquoted.
    """
    table_url = unquote(table['tableId'])
    mentions = []
    for row_idx, row in enumerate(table.get('contents', {})):
        for col_idx, cell in enumerate(row):
            if 'wikiPageId' not in cell:
                continue
            mentions.append(Row(
                tableId=table_url,
                i=row_idx, j=col_idx,
                mention=cell['data'],
                wikipedia_title=unquote(cell['wikiPageId'])
            ))
    return mentions

In [21]:
def get_gs_context(table):
    """Build the context record (ids, titles, caption, headers) of a WikiGS table.

    Returns a dict with keys 'tableId' (URL-unquoted), 'pgTitle' (page title without
    the '- Wikipedia, the free encyclopedia' suffix), 'sectionTitle' (always ''),
    'tableCaption' (the 'context' field without '[edit]') and
    'processed_tableHeaders' (one string per column). A missing or None title/context
    becomes ''.
    """
    table_id = unquote(table['tableId'])

    page_title = table.get('title', '')
    page_title = '' if page_title is None else page_title.replace('- Wikipedia, the free encyclopedia', '')

    caption = table.get('context', '')
    if caption is None:
        caption = ''
    caption = caption.replace('[edit]', '')

    # Every column starts with an empty token, so each joined header string begins
    # with a space (or is '' when the column has no header cell).
    column_headers = []
    for row in table['contents']:
        for col, cell in enumerate(row):
            if col >= len(column_headers):
                column_headers.append([''])
            if cell.get('isHeader', False):
                column_headers[col].append(cell['data'])

    return {
        'tableId': table_id,
        'pgTitle': page_title,
        'sectionTitle': '',
        'tableCaption': caption,
        'processed_tableHeaders': [' '.join(tokens) for tokens in column_headers],
    }

In [11]:
# Raw WikiGS tables: each line of the file is parsed as one JSON document.
wikipedia_gs_tables = sc.textFile('../../data/entity_linking/WikipediaGS_json/tables_instance').map(json.loads)

In [12]:
# All linked cells found in the raw WikiGS tables (one row per Row from get_gs_mentions).
# Must run while wikipedia_gs_tables still holds the raw tables, i.e. before the cell
# that maps get_gs_context over it.
wikipedia_gs_raw_mentions = spark.createDataFrame(wikipedia_gs_tables.flatMap(get_gs_mentions))

In [22]:
# Replace the raw tables by their context records.
# NOTE(review): this rebinds wikipedia_gs_tables in place, so the cell is not idempotent:
# get_gs_context reads table['contents'], which its own output does not contain, so a
# second run would fail once the RDD is evaluated.
wikipedia_gs_tables = wikipedia_gs_tables.map(get_gs_context)

In [88]:
# Inspect the first ten context records.
wikipedia_gs_tables.take(10)

[{'tableId': 'https://en.wikipedia.org/wiki/"Weird_Al"_Yankovic#0',
  'pgTitle': '"Weird Al" Yankovic ',
  'sectionTitle': '',
  'tableCaption': 'Film',
  'processed_tableHeaders': [' Year', ' Title', ' Role', ' Notes']},
 {'tableId': 'https://en.wikipedia.org/wiki/Ángel_Berlanga#0',
  'pgTitle': 'Ángel Berlanga ',
  'sectionTitle': '',
  'tableCaption': 'Statistics accurate as of 21 March 2014',
  'processed_tableHeaders': [' Club Club Career total',
   ' Season Season 56',
   ' League Apps 4',
   ' League Cup Goals 0',
   ' Other Domestic Cup Apps 0',
   ' International Goals 0',
   ' Total Apps 0',
   ' Goals 16',
   ' Apps 1',
   ' Goals 72',
   ' Apps 5',
   ' Goals']},
 {'tableId': 'https://en.wikipedia.org/wiki/Åge_Hareide#0',
  'pgTitle': 'Åge Hareide ',
  'sectionTitle': '',
  'tableCaption': 'Playing career',
  'processed_tableHeaders': [' Club performance[20] Season Norway England Norway Total Total Career total',
   ' League Club League League League Norway England 266',
  

In [45]:
# Preview the first rows of the raw WikiGS mentions.
wikipedia_gs_raw_mentions.show()

+---+---+--------------------+--------------------+--------------------+
|  i|  j|             mention|            table_id|     wikipedia_title|
+---+---+--------------------+--------------------+--------------------+
|  1|  1|           Tapeheads|https://en.wikipe...|           Tapeheads|
|  2|  1|The Naked Gun: Fr...|https://en.wikipe...|The_Naked_Gun:_Fr...|
|  3|  1|                 UHF|https://en.wikipe...|          UHF_(film)|
|  4|  1|The Naked Gun 2Â½...|https://en.wikipe...|The_Naked_Gun_2½:...|
|  6|  1|            Spy Hard|https://en.wikipe...|            Spy_Hard|
|  7|  1|       Safety Patrol|https://en.wikipe...|Safety_Patrol_(film)|
| 10|  1|  Haunted Lighthouse|https://en.wikipe...|  Haunted_Lighthouse|
| 11|  1|        Halloween II|https://en.wikipe...|Halloween_II_(200...|
| 12|  1|    Batman vs. Robin|https://en.wikipe...|    Batman_vs._Robin|
| 12|  2|       The Dollmaker|https://en.wikipe...|  Dollmaker_(comics)|
|  2|  0|       Auckland City|https://en.wikipe...|

In [13]:
# Keep only gold annotations that match a linked cell of the raw tables on
# (i, tableId, mention, wikipedia_title).
# NOTE(review): rebinds wikipedia_gs_entity_mentions, so re-running this cell joins the
# already-joined frame with the raw mentions again.
wikipedia_gs_entity_mentions = wikipedia_gs_entity_mentions.join(wikipedia_gs_raw_mentions,['i','tableId','mention','wikipedia_title'],'inner')

In [66]:
# Remove from train/dev every mention whose page title also appears in the WikiGS
# benchmark: left-join the GS page titles and keep rows where no GS flag matched.
# Counts are printed before and after each filter.
gs_page_titles = wikipedia_gs_entity_mentions.select('table_pgTitle', 'is_gs').dropDuplicates()

print(train_mentions.count())
train_mentions = (
    train_mentions
    .join(gs_page_titles, 'table_pgTitle', 'left')
    .where(F.isnull('is_gs'))
)
print(train_mentions.count())

print(val_mentions.count())
val_mentions = (
    val_mentions
    .join(gs_page_titles, 'table_pgTitle', 'left')
    .where(F.isnull('is_gs'))
)
print(val_mentions.count())

10677313
7288339
287663
189923


In [107]:
# Coverage of WikiGS entity titles by the DBpedia types and abstracts (distinct titles).
# NOTE(review): dbpedia_types and dbpedia_abstract are built with dict(...) in their own
# cells, while this cell uses them as DataFrames (.select / join target). The counts below
# presumably come from an earlier DataFrame version of those variables - confirm.
print(wikipedia_gs_entity_mentions.select('wikipedia_title').dropDuplicates().count())
print(dbpedia_types.select('wikipedia_title').dropDuplicates().count())
print(wikipedia_gs_entity_mentions.select('wikipedia_title').join(dbpedia_types,'wikipedia_title','inner').select('wikipedia_title').dropDuplicates().count())
print(wikipedia_gs_entity_mentions.select('wikipedia_title').join(dbpedia_abstract,'wikipedia_title','inner').select('wikipedia_title').dropDuplicates().count())

1222358
3829240
616491
967614


In [115]:
# Total number of WikiGS mentions, number of distinct mention strings, and a preview.
print(wikipedia_gs_entity_mentions.count())
print(wikipedia_gs_entity_mentions.select('mention').dropDuplicates().count())
wikipedia_gs_entity_mentions.show()

4453329
1490691
+--------------------+--------------------+
|             mention|     wikipedia_title|
+--------------------+--------------------+
|           Tapeheads|           Tapeheads|
|The Naked Gun: Fr...|The_Naked_Gun:_Fr...|
|                 UHF|          UHF_(film)|
|The Naked Gun 2Â½...|The_Naked_Gun_2½:...|
|            Spy Hard|            Spy_Hard|
|       Safety Patrol|Safety_Patrol_(film)|
|  Haunted Lighthouse|  Haunted_Lighthouse|
|        Halloween II|Halloween_II_(200...|
|    Batman vs. Robin|    Batman_vs._Robin|
|       Auckland City|    Auckland_City_FC|
|        Sporting Goa|Sporting_Clube_de...|
|       Auckland City|    Auckland_City_FC|
|               HÃ¸dd|             IL_Hødd|
|               Molde|            Molde_FK|
|     Manchester City|Manchester_City_F.C.|
|        Norwich City|   Norwich_City_F.C.|
|               Molde|            Molde_FK|
|            BjorÃ¸yl|             Bjorøyl|
|          Norlandair|          Norlandair|
|            Ish

In [6]:
# Table mentions file: each line is parsed as one JSON record.
# Show one example record and the total number of records.
entity_mentions = sc.textFile('../../data/entity_linking/tableMentions.json').map(json.loads)
display(entity_mentions.take(1))
display(entity_mentions.count())

[{'_id': {'$oid': '535891dca3103b9804b3bd01'},
  'candidates': [],
  'cellCol': 4,
  'cellRow': 0,
  'endOffset': 11,
  'goldAnnotation': {'titleId': 278728, 'title': 'Maddah'},
  'isTest': False,
  'order': 0.7678070396650583,
  'pgId': 2204,
  'pgTitle': 'Arabic alphabet',
  'startOffset': 0,
  'surfaceForm': 'alif maddah',
  'tableId': 6}]

30478955

In [7]:
# Distinct surface forms in tableMentions.json (kept as an RDD here).
entity_mentions_surface = entity_mentions.map(lambda x:(x['surfaceForm'])).distinct()
print(entity_mentions_surface.count())

2759237


In [8]:
# Distinct WikiGS mention strings, collected to the driver as a Python list.
# NOTE(review): this reuses the name entity_mentions_surface, which the tableMentions cell
# binds to an RDD; the lookup loops slice this variable, so they need the list version.
entity_mentions_surface = wikipedia_gs_entity_mentions.rdd.map(lambda x:x['mention']).distinct().collect()
print(len(entity_mentions_surface))

1490691


In [16]:
from google.cloud import language
from google.oauth2 import service_account
import urllib.parse
import urllib.request
from multiprocessing import Pool
import time

In [347]:
def wikidata_lookup(query):
    """Search Wikidata entities for ``query`` via the wbsearchentities API.

    The request is attempted up to three times; HTTP 429 and 503 are retried after a
    one-second pause, any other failure stops immediately.

    Args:
        query: search string; it is URL-quoted before being placed in the request.

    Returns:
        ``[query, response]`` where ``response`` is the decoded JSON body (dict) on
        success, the integer HTTP status code on an HTTP error (including when all
        retries were exhausted), or None when the request failed with a URLError.
    """
    service_url = 'https://www.wikidata.org/w/api.php?action=wbsearchentities&search={}&language=en&limit=50&format=json'
    url = service_url.format(urllib.parse.quote(query))
    for i in range(3):
        try:
            http_response = urllib.request.urlopen(url)
        except urllib.error.HTTPError as e:
            if e.code == 429 or e.code == 503:
                # Rate-limited or temporarily unavailable: back off briefly and retry.
                response = e.code
                time.sleep(1)
                continue
            else:
                response = e.code
                break
        except urllib.error.URLError as e:
            response = None
            break
        else:
            # Close the connection as soon as the body is read; the original left the
            # response object open for every successful request.
            with http_response:
                response = json.loads(http_response.read())
            break
    # Optional post-processing, currently disabled: reduce the raw response to
    # [id, label, description] triples.
#     if isinstance(response, dict):
#         response = [[z.get('id'),z.get('label'),z.get('description')] for z in response.get('search', [])]
    return [query, response]

In [348]:
# Smoke test of the lookup; this performs a live HTTP request against wikidata.org.
wikidata_lookup("Michael Grant")

['Michael Grant',
 {'searchinfo': {'search': 'Michael Grant'},
  'search': [{'id': 'Q1351047',
    'title': 'Q1351047',
    'pageid': 1290671,
    'repository': 'wikidata',
    'url': '//www.wikidata.org/wiki/Q1351047',
    'concepturi': 'http://www.wikidata.org/entity/Q1351047',
    'label': 'Michael Grant',
    'description': 'Michael Grant; classical scholar, wrote histories of the ancient world; b. Nov. 21, 1914; d. Oct. 4 in London, aged 89',
    'match': {'type': 'label', 'language': 'en', 'text': 'Michael Grant'}},
   {'id': 'Q1752975',
    'title': 'Q1752975',
    'pageid': 1685711,
    'repository': 'wikidata',
    'url': '//www.wikidata.org/wiki/Q1752975',
    'concepturi': 'http://www.wikidata.org/entity/Q1752975',
    'label': 'Michael Grant',
    'description': 'author from the United States. science fiction writer',
    'match': {'type': 'label', 'language': 'en', 'text': 'Michael Grant'}},
   {'id': 'Q438561',
    'title': 'Q438561',
    'pageid': 413572,
    'repository

In [11]:
# Start with an empty result list; the lookup loop in the next cell appends to it.
# Re-running this cell discards any lookups gathered so far.
entity_wikidata_candidates = []

In [12]:
# Look up Wikidata candidates for every mention, 10000 mentions per batch.
# Resume support: continue after the results that are already stored.
if entity_wikidata_candidates is not None:
    i = len(entity_wikidata_candidates)
else:
    entity_wikidata_candidates = []
    i = 0
# The context manager releases the worker processes even when a lookup raises;
# the original only closed the pool after a fully successful run.
with Pool(processes=16) as pool:
    while i < len(entity_mentions_surface):
        print(i)
        batch = entity_mentions_surface[i:i+10000]
        # total is the real batch size so the progress bar of the final, shorter
        # batch can complete (it was hard-coded to 10000).
        tmp = list(tqdm(pool.imap(wikidata_lookup, batch, chunksize=150), total=len(batch)))
        entity_wikidata_candidates.extend(tmp)
        i += 10000

0


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


10000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


20000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


30000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


40000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


50000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


60000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


70000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


80000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


90000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


100000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


110000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


120000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


130000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


140000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


150000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


160000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


170000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


180000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


190000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


200000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


210000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


220000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


230000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


240000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


250000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


260000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


270000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


280000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


290000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


300000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


310000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


320000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


330000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


340000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


350000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


360000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


370000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


380000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


390000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


400000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


410000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


420000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


430000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


440000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


450000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


460000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


470000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


480000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


490000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


500000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


510000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


520000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


530000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


540000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


550000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


560000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


570000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


580000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


590000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


600000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


610000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)




1230000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1240000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1250000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1260000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1270000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1280000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1290000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1300000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1310000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1320000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1330000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1340000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1350000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1360000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1370000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1380000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1390000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1400000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1410000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1420000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1430000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1440000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1450000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1460000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1470000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1480000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


1490000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [13]:
# Number of lookups collected so far.
len(entity_wikidata_candidates)

1490691

In [None]:
import re
# Normalized mentions: strip one non-word character from the start and/or the end, and
# keep only the mentions that actually changed. The pattern is a raw string ('\W' in a
# plain string literal is an invalid escape sequence) and is applied once per mention.
entity_mentions_surface_normed_0 = [
    stripped
    for original, stripped in ((x, re.sub(r'^\W|\W$', '', x)) for x in entity_mentions_surface)
    if stripped != original
]
entity_wikidata_candidates_normed_0 = []
i = 0
# The context manager releases the worker processes even when a lookup raises.
with Pool(processes=16) as pool:
    while i < len(entity_mentions_surface_normed_0):
        print(i)
        batch = entity_mentions_surface_normed_0[i:i+10000]
        # total is the real batch size so the progress bar of the final batch completes.
        tmp = list(tqdm(pool.imap(wikidata_lookup, batch, chunksize=300), total=len(batch)))
        entity_wikidata_candidates_normed_0.extend(tmp)
        i += 10000

In [29]:
# Look up candidates for the mentions in missing_mentions.
# NOTE(review): missing_mentions is not defined anywhere in the visible part of this
# notebook - it presumably comes from a cell that is not shown or was deleted; confirm.
missing_wikidata_candidates = []
i = 0
# The context manager releases the worker processes even when a lookup raises.
with Pool(processes=16) as pool:
    while i < len(missing_mentions):
        print(i)
        batch = missing_mentions[i:i+10000]
        # total is the real batch size so the progress bar of the final batch completes.
        tmp = list(tqdm(pool.imap(wikidata_lookup, batch, chunksize=300), total=len(batch)))
        missing_wikidata_candidates.extend(tmp)
        i += 10000

0


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [None]:
# Fraction of mentions that are changed by the edge-character normalization.
len(entity_mentions_surface_normed_0)/len(entity_mentions_surface)

In [None]:
# normalized mention -> candidate list, keeping only non-empty list results.
# NOTE(review): wikidata_lookup as written returns the raw response (dict, int status or
# None) in x[1]; the list conversion inside it is commented out. With that version no
# entry passes the isinstance(list) filter and this dict stays empty - confirm which
# version of the lookup produced the data.
entity_wikidata_candidates_normed_0_dict = {x[0]:x[1] for x in entity_wikidata_candidates_normed_0 if (isinstance(x[1],list) and len(x[1])!=0)}

In [None]:
# Append the candidates found for the normalized mention to the candidates of the
# original mention. The cell mutates entity_wikidata_candidates in place, so running
# it twice appends the same candidates twice.
for x in entity_wikidata_candidates:
    # Raw string: '\W' in a plain string literal is an invalid escape sequence.
    processed = re.sub(r'^\W|\W$', '', x[0])
    if processed != x[0] and processed in entity_wikidata_candidates_normed_0_dict:
        # Only extend real candidate lists. A failed lookup stores an int status or
        # None, and `+=` with a list would raise TypeError on it; such entries are
        # left untouched so they remain recognisable as failures.
        if isinstance(x[1], list):
            x[1].extend(entity_wikidata_candidates_normed_0_dict[processed])

In [22]:
# Gold targets for the tableMentions data: one row per mention, keyed by a
# pgId_tableId_cellRow_cellCol cell id, enriched with the Wikidata id and the
# Freebase mid through inner joins on wikipedia_id.
entity_wikidata_target = (
    spark.createDataFrame(
        entity_mentions.map(lambda rec: Row(
            id=rec['_id']['$oid'],
            mention=rec['surfaceForm'],
            wikipedia_id=rec['goldAnnotation']['titleId'],
            cell_id='%d_%d_%d_%d' % (rec['pgId'], rec['tableId'], rec['cellRow'], rec['cellCol']),
        ))
    )
    .join(wikipedia_wikidata_mapping, 'wikipedia_id', 'inner')
    .join(wiki_mid_mapping, 'wikipedia_id', 'inner')
)

In [148]:
# Gold targets for the WikiGS mentions: attach the Wikidata id (by title) and the
# Freebase mid (by wikipedia_id) with left joins, so unmatched mentions are kept.
# This rebinds entity_wikidata_target, replacing the tableMentions-based version.
entity_wikidata_target = wikipedia_gs_entity_mentions.join(wikipedia_wikidata_mapping,'wikipedia_title','left')\
                                                    .join(wiki_mid_mapping,'wikipedia_id', 'left')

In [116]:
# Preview the first rows of the gold targets.
entity_wikidata_target.show()

+------------+--------------------+--------------------+-----------+------------+
|wikipedia_id|     wikipedia_title|             mention|wikidata_id|freebase_mid|
+------------+--------------------+--------------------+-----------+------------+
|        1677|Alfonso_XIII_of_S...|Alfonso XIII of S...|     Q18363|      m.0s2v|
|        1677|Alfonso_XIII_of_S...|Alfonso XIII, the...|     Q18363|      m.0s2v|
|        1677|Alfonso_XIII_of_S...|Alfonso XIII of S...|     Q18363|      m.0s2v|
|        1677|Alfonso_XIII_of_S...|Alfonso XIII of S...|     Q18363|      m.0s2v|
|        1677|Alfonso_XIII_of_S...|Alfonso XIII of S...|     Q18363|      m.0s2v|
|        1806|Arnold_Schwarzene...|Arnold Schwarzene...|      Q2685|      m.0tc7|
|        1806|Arnold_Schwarzene...|Schwarzenegger (R...|      Q2685|      m.0tc7|
|        1806|Arnold_Schwarzene...|Schwarzenegger (R...|      Q2685|      m.0tc7|
|        1806|Arnold_Schwarzene...|Arnold Schwarzene...|      Q2685|      m.0tc7|
|        1806|Ar

In [140]:
import os
def load_entity_vocab(data_dir, ignore_bad_title=True, min_ent_count=1):
    """Read ``entity_vocab.txt`` from ``data_dir`` into an index-keyed dict.

    Every line must hold exactly five tab-separated fields. The first field is
    ignored; the others are read as entity id, title, mid and count.

    Args:
        data_dir: Directory that contains ``entity_vocab.txt``.
        ignore_bad_title: When true, rows with an empty title are dropped.
        min_ent_count: Rows whose count is below this value are dropped.

    Returns:
        A dict mapping 0..n-1 (kept rows, in file order) to
        ``{'wiki_id': int, 'wiki_title': str, 'mid': str, 'count': int}``.

    Side effects:
        Prints the number of kept rows and how many rows each filter removed.
    """
    vocab_path = os.path.join(data_dir, 'entity_vocab.txt')
    kept_rows = []
    empty_title_rows = 0
    rare_rows = 0
    with open(vocab_path, 'r', encoding="utf-8") as vocab_file:
        for raw_line in vocab_file:
            fields = raw_line.strip().split('\t')
            _, wiki_id, title, mid, freq = fields
            if ignore_bad_title and title == '':
                empty_title_rows += 1
                continue
            if int(freq) < min_ent_count:
                rare_rows += 1
                continue
            kept_rows.append({
                'wiki_id': int(wiki_id),
                'wiki_title': title,
                'mid': mid,
                'count': int(freq)
            })
    # Keys are the running index of the kept rows, starting at 0.
    entity_vocab = dict(enumerate(kept_rows))
    print('total number of entity: %d\nremove because of empty title: %d\nremove because count<%d: %d'%(len(entity_vocab),empty_title_rows,min_ent_count,rare_rows))
    return entity_vocab

# Load the v2 entity vocabulary, dropping empty titles and entities counted fewer than twice.
data_dir = "/srv/samba/group_workspace_1/deng.595/workspace/table_transformer/data/wikitable_entity/v2/"
entity_vocab = load_entity_vocab(data_dir, ignore_bad_title=True, min_ent_count=2)
# Distinct non-empty mids, and distinct wiki ids, of the kept vocabulary entries.
train_all_entities = {entry['mid'] for entry in entity_vocab.values() if entry['mid'] != ''}
train_all_entities_wiki_id = {entry['wiki_id'] for entry in entity_vocab.values()}

total number of entity: 926131
remove because of empty title: 14206
remove because count<2: 847401


In [14]:
# Reload the candidate list cached in wikipedia_gs_wikidata_candidates.json.
# The commented-out lines are the matching dump that writes this cache.
# NOTE(review): the following cell binds the same variable from wikidata_candidates.json,
# so which list is live depends on execution order.
# with open('wikipedia_gs_wikidata_candidates.json', "w", encoding='utf8') as f:
#     json.dump(entity_wikidata_candidates, f)
with open('wikipedia_gs_wikidata_candidates.json', "r", encoding='utf8') as f:
    entity_wikidata_candidates = json.load(f)

In [404]:
# Reload the candidate list cached in wikidata_candidates.json.
# The commented-out lines are the matching dump that writes this cache.
# with open('wikidata_candidates.json', "w", encoding='utf8') as f:
#     json.dump(entity_wikidata_candidates, f)
with open('wikidata_candidates.json', "r", encoding='utf8') as f:
    entity_wikidata_candidates = json.load(f)

In [30]:
# Append the separately looked-up entries to the candidate list.
# NOTE(review): in-place extension, so re-running this cell appends the same entries again.
entity_wikidata_candidates += missing_wikidata_candidates

In [406]:
# One row per [mention, candidates] entry. A second element that is not a list is
# replaced by an empty candidate list.
entity_wikidata_candidates_df = spark.createDataFrame(sc.parallelize(entity_wikidata_candidates).map(\
                                                lambda x:Row(mention=x[0], candidates=x[1] if isinstance(x[1],list) else [])
                                                ))

In [17]:
# Attach candidates (by mention) and Wikidata ids (by wikipedia_title) to the gold-standard
# mentions. Both are left joins, so every mention is kept even without a match.
wikipedia_gs_entity_mentions_with_candidate = wikipedia_gs_entity_mentions.join(entity_wikidata_candidates_df, "mention", 'left')\
                                            .join(wikipedia_wikidata_mapping,'wikipedia_title','left')

In [407]:
# For each split: attach candidates by mention (left join, so `candidates` is null for
# mentions without an entry) and keep only mentions whose wikipedia_title has a Wikidata
# mapping (inner join).
# The join key is wikipedia_title, so the result carries two `wikipedia_id` columns,
# as the show() outputs of these frames display.
train_mentions_with_candidate = train_mentions.join(entity_wikidata_candidates_df, "mention", 'left')\
                                            .join(wikipedia_wikidata_mapping,'wikipedia_title','inner')
val_mentions_with_candidate = val_mentions.join(entity_wikidata_candidates_df, "mention", 'left')\
                                            .join(wikipedia_wikidata_mapping,'wikipedia_title','inner')
test_mentions_with_candidate = test_mentions.join(entity_wikidata_candidates_df, "mention", 'left')\
                                            .join(wikipedia_wikidata_mapping,'wikipedia_title','inner')

In [20]:
val_mentions_with_candidate.show()

+--------------------+--------------------+---+---+----------+--------------------+------------+--------------------+------------+-----------+
|     wikipedia_title|             mention|  i|  j|  table_id|       table_pgTitle|wikipedia_id|          candidates|wikipedia_id|wikidata_id|
+--------------------+--------------------+---+---+----------+--------------------+------------+--------------------+------------+-----------+
|    (14415)_1991_RQ7|               14415| 14|  0| 9004474-1|List of minor pla...|    32821609|                  []|    32821609|   Q1417966|
|129th_Duke_of_Con...|129th Duke of Con...|335|  1|  750087-1|List of First Wor...|    17141447|[[Q4548643, 129th...|    17141447|   Q4548643|
|          13954_Born|          13954 Born| 53|  0| 9006017-1|List of minor pla...|    16426563|[[Q2623111, 13954...|    16426563|   Q1829378|
|        1944_in_film|                1944| 11|  0| 3419262-1|         Gloria Jean|      172410|[[Q5268, 1944, ye...|      172410|    Q738597|

In [25]:
# Collect the distinct validation mentions that have no candidate entry at all
# (null `candidates` after the left join) and print how many there are.
# NOTE(review): the second line repeats the validation query, so it contributes nothing
# once the list is de-duplicated; it was presumably meant to cover another split — confirm.
missing_mentions = val_mentions_with_candidate.where(F.isnull('candidates')).rdd.map(lambda x:x['mention']).collect()
missing_mentions += val_mentions_with_candidate.where(F.isnull('candidates')).rdd.map(lambda x:x['mention']).collect()
missing_mentions = list(set(missing_mentions))
print(len(missing_mentions))

2199


In [26]:
train_mentions_with_candidate.dropDuplicates(['mention','wikidata_id']).where(F.size('candidates')!=0).count()

1550534

In [149]:
# Keep only the target mentions that have an entry in the candidate table (inner join on mention).
entity_wikidata_target_candidate = entity_wikidata_target.join(entity_wikidata_candidates_df, "mention", 'inner')

In [19]:
# Training mentions with a candidate list of non-zero size, then how many of them have
# their wikipedia_title in dbpedia_types and in dbpedia_abstract respectively.
# NOTE(review): `dbpedia_types` is used as a DataFrame here (.select), while the
# build_for_own helpers call `dbpedia_types.get(...)` on it like a dict, so the name
# must be rebound between those uses.
print(train_mentions_with_candidate.where(F.size('candidates')!=0).count())
print(train_mentions_with_candidate.where(F.size('candidates')!=0).join(dbpedia_types.select('wikipedia_title').dropDuplicates(),'wikipedia_title','inner').count())
print(train_mentions_with_candidate.where(F.size('candidates')!=0).join(dbpedia_abstract.select('wikipedia_title').dropDuplicates(),'wikipedia_title','inner').count())

6853976
3913687
5678815


In [21]:
train_mentions_with_candidate.show()

+--------------------+--------------------+--------------------+---+---+----------+------------+-----+--------------------+------------+-----------+
|     wikipedia_title|             mention|       table_pgTitle|  i|  j|  table_id|wikipedia_id|is_gs|          candidates|wikipedia_id|wikidata_id|
+--------------------+--------------------+--------------------+---+---+----------+------------+-----+--------------------+------------+-----------+
|  'N_Beetje_Verliefd|  'N Beetje Verliefd|       Ad van Kempen|  1|  1|37275079-1|    13837343| null|[[Q2242962, 'N Be...|    13837343|   Q2242962|
|   (100132)_1993_RR8|              100132|List of minor pla...| 31|  0| 3067641-1|    32708188| null|                  []|    32708188|   Q2366578|
|    (18389)_1992_JU2|               18389|List of minor pla...| 88|  0| 8698700-1|    32568869| null|                  []|    32568869|   Q1829383|
|  (24978)_1998_HJ151|          1998 HJ151|List of trans-Nep...| 21|  2|  551374-2|     8673196| null|[[Q1

In [412]:
def build_for_own(x, chunk_size=50):
    """Turn one ``(table_id, (mentions, table_meta))`` record into model examples.

    The mentions of a table are cut into consecutive groups of at most
    ``chunk_size``. Each group becomes one example with its own candidate list,
    so the label and candidate indices are local to that example.

    Args:
        x: ``(table_id, (all_entities, meta))``. ``meta`` is indexed as
            ``[pgTitle, secTitle, caption, headers]``. Every item of
            ``all_entities`` is indexed as ``[i, j, mention, gold_id, candidates]``
            and every candidate as ``[id, name, description]``.
        chunk_size: Maximum number of mentions per example. Defaults to 50,
            the value that used to be hard-coded.

    Returns:
        A list with one entry per group:
        ``[table_id, pgTitle, secTitle, caption, headers, entities,
        candidate_entities, labels, cand_for_each]`` where

        * ``entities`` is ``[[[i, j], mention], ...]``,
        * ``candidate_entities`` is ``[[name, description, types], ...]`` in order
          of first appearance within the group,
        * ``labels[k]`` is the index of mention k's gold id in ``candidate_entities``,
        * ``cand_for_each[k]`` lists the indices of mention k's own candidates.

        An empty mention list yields an empty result.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
        KeyError: if a mention's gold id is not among the candidate ids of its group.

    Note:
        Types come from the module-level ``dbpedia_types``, which must support
        ``.get(id, default)`` when this function runs.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')
    table_id = x[0]
    all_entities = x[1][0]
    table_meta = x[1][1]
    pgTitle, secTitle, caption, headers = table_meta[0], table_meta[1], table_meta[2], table_meta[3]
    all_processed = []
    for start in range(0, len(all_entities), chunk_size):
        chunk = all_entities[start:start + chunk_size]
        # candidate id -> [local index, name, description, types]; the local index is
        # the order in which the id was first seen inside this group.
        candidate_entities = {}
        for z in chunk:
            for cand in z[4]:
                if cand[0] not in candidate_entities:
                    candidate_entities[cand[0]] = [len(candidate_entities),cand[1],cand[2],dbpedia_types.get(cand[0],[])]
        entities = [[[z[0],z[1]],z[2]] for z in chunk]
        labels = [candidate_entities[z[3]][0] for z in chunk]
        cand_for_each = [[candidate_entities[cand[0]][0] for cand in z[4]] for z in chunk]
        # Lay the candidates out as a list addressed by their local index.
        tmp_candidate_entities = [0]*len(candidate_entities)
        for v in candidate_entities.values():
            tmp_candidate_entities[v[0]] = v[1:]
        all_processed.append([table_id, pgTitle, secTitle, caption, headers, entities, tmp_candidate_entities, labels, cand_for_each])
    return all_processed

In [413]:
def build_for_own_with_wikidata_id(x, chunk_size=50):
    """Turn one ``(table_id, (mentions, table_meta))`` record into model examples,
    keeping each candidate's id in the output.

    The mentions of a table are cut into consecutive groups of at most
    ``chunk_size``. Each group becomes one example with its own candidate list,
    so the label and candidate indices are local to that example.

    Args:
        x: ``(table_id, (all_entities, meta))``. ``meta`` is indexed as
            ``[pgTitle, secTitle, caption, headers]``. Every item of
            ``all_entities`` is indexed as ``[i, j, mention, gold_id, candidates]``
            and every candidate as ``[id, name, description]``.
        chunk_size: Maximum number of mentions per example. Defaults to 50,
            the value that used to be hard-coded.

    Returns:
        A list with one entry per group:
        ``[table_id, pgTitle, secTitle, caption, headers, entities,
        candidate_entities, labels, cand_for_each]`` where

        * ``entities`` is ``[[[i, j], mention], ...]``,
        * ``candidate_entities`` is ``[[name, description, types, id], ...]`` in
          order of first appearance within the group,
        * ``labels[k]`` is the index of mention k's gold id in ``candidate_entities``,
        * ``cand_for_each[k]`` lists the indices of mention k's own candidates.

        An empty mention list yields an empty result.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
        KeyError: if a mention's gold id is not among the candidate ids of its group.

    Note:
        Types come from the module-level ``dbpedia_types``, which must support
        ``.get(id, default)`` when this function runs.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')
    table_id = x[0]
    all_entities = x[1][0]
    table_meta = x[1][1]
    pgTitle, secTitle, caption, headers = table_meta[0], table_meta[1], table_meta[2], table_meta[3]
    all_processed = []
    for start in range(0, len(all_entities), chunk_size):
        chunk = all_entities[start:start + chunk_size]
        # candidate id -> [local index, name, description, types, id]; the local index
        # is the order in which the id was first seen inside this group.
        candidate_entities = {}
        for z in chunk:
            for cand in z[4]:
                if cand[0] not in candidate_entities:
                    candidate_entities[cand[0]] = [len(candidate_entities),cand[1],cand[2],dbpedia_types.get(cand[0],[]),cand[0]]
        entities = [[[z[0],z[1]],z[2]] for z in chunk]
        labels = [candidate_entities[z[3]][0] for z in chunk]
        cand_for_each = [[candidate_entities[cand[0]][0] for cand in z[4]] for z in chunk]
        # Lay the candidates out as a list addressed by their local index.
        tmp_candidate_entities = [0]*len(candidate_entities)
        for v in candidate_entities.values():
            tmp_candidate_entities[v[0]] = v[1:]
        all_processed.append([table_id, pgTitle, secTitle, caption, headers, entities, tmp_candidate_entities, labels, cand_for_each])
    return all_processed

In [96]:
# Only keep examples with recall > 0 for reranking: a mention survives the filter only
# if its gold Wikidata id is among its candidates, so mentions with a missing, empty or
# all-wrong candidate list are dropped.
# The null check mirrors the val/test cells: `candidates` is null for mentions without a
# candidate entry (left join), and iterating a null list in the filter lambda would raise.
# Each row becomes [table_id, i, j, mention, wikidata_id, candidates]; rows are grouped
# per table, joined with the table's titles/caption/headers and expanded by build_for_own.
train_mentions_local = train_mentions_with_candidate.select('table_id','wikidata_id','candidates','i','j','mention')\
                    .dropDuplicates(['mention','wikidata_id']).where(~F.isnull('candidates'))\
                    .rdd.map(lambda x:[x['table_id'],x['i'],x['j'],x['mention'],x['wikidata_id'],x['candidates']])\
                    .filter(lambda x:x[4] in [z[0] for z in x[5]])\
                    .map(lambda x:(x[0],[x[1:]]))\
                    .reduceByKey(add).join(train_tables.map(lambda x:(
                    x['_id'],\
                    [x['pgTitle'],x['sectionTitle'],x['tableCaption'],x['processed_tableHeaders']]
                    ))).flatMap(build_for_own).collect()

In [97]:
# Validation examples: de-duplicate on (mention, wikidata_id), drop mentions with null
# candidates, keep only mentions whose gold Wikidata id is among their candidates, group
# them per table, join with the table's titles/caption/headers and expand with build_for_own.
val_mentions_local = val_mentions_with_candidate.select('table_id','wikidata_id','candidates','i','j','mention')\
                    .dropDuplicates(['mention','wikidata_id']).where(~F.isnull('candidates'))\
                    .rdd.map(lambda x:[x['table_id'],x['i'],x['j'],x['mention'],x['wikidata_id'],x['candidates']])\
                    .filter(lambda x:x[4] in [z[0] for z in x[5]])\
                    .map(lambda x:(x[0],[x[1:]]))\
                    .reduceByKey(add).join(val_tables.map(lambda x:(
                    x['_id'],\
                    [x['pgTitle'],x['sectionTitle'],x['tableCaption'],x['processed_tableHeaders']]
                    ))).flatMap(build_for_own).collect()

In [414]:
#08/20
# Group the test mentions per table once and cache the result, so that the two exports
# below are built from exactly the same grouping. Evaluating the pipeline twice (as the
# copy-pasted version did) repeats the Spark work, and the order in which reduceByKey
# concatenates a table's mentions is not guaranteed to match between two evaluations,
# which could split the same table into different groups in the two files.
# Unlike the train/val cells this one does not de-duplicate on (mention, wikidata_id).
test_mentions_grouped = test_mentions_with_candidate.select('table_id','wikidata_id','candidates','i','j','mention')\
                    .where(~F.isnull('candidates'))\
                    .rdd.map(lambda x:[x['table_id'],x['i'],x['j'],x['mention'],x['wikidata_id'],x['candidates']])\
                    .filter(lambda x:x[4] in [z[0] for z in x[5]])\
                    .map(lambda x:(x[0],[x[1:]]))\
                    .reduceByKey(add).join(test_tables.map(lambda x:(
                    x['_id'],\
                    [x['pgTitle'],x['sectionTitle'],x['tableCaption'],x['processed_tableHeaders']]
                    ))).cache()
test_mentions_local = test_mentions_grouped.flatMap(build_for_own).collect()
print(len(test_mentions_local))
test_mentions_local_with_wikidata_id = test_mentions_grouped.flatMap(build_for_own_with_wikidata_id).collect()
print(len(test_mentions_local_with_wikidata_id))
# Both exports are collected; release the cached partitions.
test_mentions_grouped.unpersist()

7291
7291


In [68]:
# Number of distinct table ids (element 0) and total number of mentions (length of
# element 5) across the test examples.
print(sc.parallelize(test_mentions_local).map(lambda x:x[0]).distinct().count())
print(sc.parallelize(test_mentions_local).map(lambda x:len(x[5])).sum())

4920
225798


In [73]:
print(len(train_mentions_local))
print(len(val_mentions_local))

194045
3236


In [98]:
# Write the collected train and validation examples as JSON under data_dir.
with open(data_dir+'train.table_entity_linking.json','w') as f:
    json.dump(train_mentions_local,f)
with open(data_dir+'dev.table_entity_linking.json','w') as f:
    json.dump(val_mentions_local,f)

In [45]:
# Write the collected test examples as JSON under data_dir.
with open(data_dir+'test_own.table_entity_linking.json','w') as f:
    json.dump(test_mentions_local,f)

In [415]:
# Write the 08/20 test examples twice: once with the candidate Wikidata ids kept
# (build_for_own_with_wikidata_id output) and once without (build_for_own output).
with open(data_dir+'test_own_0820.table_entity_linking.with_wikidata_id.json','w') as f:
    json.dump(test_mentions_local_with_wikidata_id,f)
with open(data_dir+'test_own_0820.table_entity_linking.json','w') as f:
    json.dump(test_mentions_local,f)

In [127]:
# Reload the dumped training examples and distribute them as an RDD.
# NOTE(review): this rebinds `train_mentions_local` from a Python list to an RDD; cells
# that call len() on it or pass it to sc.parallelize() expect the list form.
with open(data_dir+'train.table_entity_linking.json','r') as f:
    train_mentions_local = sc.parallelize(json.load(f))

In [48]:
test_mentions_local[0]

['23235546-1',
 'Ivan Lendl career statistics',
 'Singles: 19 finals (8 titles, 11 runner-ups)',
 '',
 ['outcome',
  'year',
  'championship',
  'surface',
  'opponent in the final',
  'score in the final'],
 [[[0, 4], 'Björn Borg'],
  [[9, 2], 'Wimbledon'],
  [[0, 2], 'French Open'],
  [[10, 4], 'Miloslav Mečíř'],
  [[1, 2], 'US Open'],
  [[3, 2], 'Australian Open'],
  [[3, 4], 'Mats Wilander'],
  [[4, 4], 'John McEnroe'],
  [[9, 4], 'Boris Becker'],
  [[8, 4], 'Mikael Pernfors'],
  [[17, 4], 'Stefan Edberg'],
  [[1, 4], 'Jimmy Connors'],
  [[12, 4], 'Pat Cash']],
 [['Björn Borg', 'Swedish tennis player', []],
  ['Björn Borg', 'Swedish swimmer', ['Swimmer']],
  ['Björn Borg', None, []],
  ['Björn Borg', 'Wikimedia disambiguation page', []],
  ['Björn Borg', 'Finnish writer', []],
  ['Bjørn Borgen', 'Norwegian association football player', []],
  ['Björn Borg career statistics', None, []],
  ['Bjørn Borge', 'Norwegian musician and conductor', []],
  ['Björn Borgmann', 'German painter',

In [25]:
# Same example-building pipeline applied to the gold-standard mentions, which are keyed
# by 'tableId' and joined with wikipedia_gs_tables.
# NOTE(review): this rebinds `test_mentions_local`, the name the own-test-split cell
# also assigns, so which data it holds depends on execution order.
test_mentions_local = wikipedia_gs_entity_mentions_with_candidate.select('tableId','wikidata_id','candidates','i','j','mention')\
                    .where(~F.isnull('candidates'))\
                    .rdd.map(lambda x:[x['tableId'],x['i'],x['j'],x['mention'],x['wikidata_id'],x['candidates']])\
                    .filter(lambda x:x[4] in [z[0] for z in x[5]])\
                    .map(lambda x:(x[0],[x[1:]]))\
                    .reduceByKey(add).join(wikipedia_gs_tables.map(lambda x:(
                    x['tableId'],\
                    [x['pgTitle'],x['sectionTitle'],x['tableCaption'],x['processed_tableHeaders']]
                    ))).flatMap(build_for_own).collect()

In [96]:
# Write whatever `test_mentions_local` currently holds to test.table_entity_linking.json
# under the v2 data directory (data_dir is re-assigned here to the same path used earlier).
data_dir = "/srv/samba/group_workspace_1/deng.595/workspace/table_transformer/data/wikitable_entity/v2/"
with open(data_dir+'test.table_entity_linking.json','w') as f:
    json.dump(test_mentions_local,f)

In [81]:
def get_labels_and_candidate(tables):
    """Flatten one example into per-mention ``(key, [label, candidates])`` pairs.

    ``tables`` is indexed like the lists produced by ``build_for_own``: element 0
    is the table id, element 5 the mentions (``[[i, j], mention]``), element 7 the
    per-mention labels and element 8 the per-mention candidate index lists.

    Returns:
        A list of ``((table_id, i, j), [label, candidates])`` in mention order.
    """
    return [
        ((tables[0], mention[0][0], mention[0][1]), [tables[7][pos], tables[8][pos]])
        for pos, mention in enumerate(tables[5])
    ]

# Evaluation with dumped model results

In [39]:
# Load the dumped model output for the gold-standard test set.
# NOTE(review): pickle.load executes code embedded in the file; only load dumps you produced yourself.
import pickle
with open("/srv/samba/group_workspace_1/deng.595/workspace/table_transformer/data/wikitable_entity/v2/test_entity_linking_results_2.pkl","rb") as f:
    gs_test_results = pickle.load(f)

In [63]:
# Load the dumped model output for the own test split.
# NOTE(review): the following cell rebinds `test_results` from a different file.
# NOTE(review): pickle.load executes code embedded in the file; only load dumps you produced yourself.
import pickle
with open("/srv/samba/group_workspace_1/deng.595/workspace/table_transformer/data/wikitable_entity/v2/test_own_entity_linking_results_2.pkl","rb") as f:
    test_results = pickle.load(f)

In [419]:
# Load the dumped model output for the 08/20 own test split.
# NOTE(review): pickle.load executes code embedded in the file; only load dumps you produced yourself.
import pickle
with open("/srv/samba/group_workspace_1/deng.595/workspace/table_transformer/data/wikitable_entity/v2/test_own_0820_entity_linking_results_0.pkl","rb") as f:
    test_results = pickle.load(f)

In [416]:
def get_tp(result):
    """Score one joined record as a true positive (1) or not (0).

    ``result`` is ``(key, ([label, candidates], ranking))``. The first item of
    ``ranking`` that is also in ``candidates`` decides the outcome: 1 if it equals
    ``label``, otherwise 0. If no item of ``ranking`` is in ``candidates`` the
    result is 0.
    """
    joined = result[1]
    gold_info, ranking = joined[0], joined[1]
    for predicted in ranking:
        if predicted not in gold_info[1]:
            continue
        # Only the first prediction that belongs to this mention's candidates counts.
        return 1 if predicted == gold_info[0] else 0
    return 0

In [420]:
# Count the mentions the model gets right: key both the gold/candidate info and the
# model output by (table id, i, j), join them, and sum get_tp over the matched pairs.
# assumes each test_results item is indexed as
# (table id, [[i, j], ...], [per-mention output, ...]) — TODO confirm against the dump.
our_tp = sc.parallelize(test_mentions_local).flatMap(get_labels_and_candidate)\
    .join(sc.parallelize(test_results).flatMap(lambda x:[((x[0],z[0],z[1]),x[2][i]) for i,z in enumerate(x[1])]))\
    .map(get_tp).sum()

In [77]:
# Union (as a set) of every type attached to any candidate entity (element 2 of each
# item in element 6) in the training examples.
# NOTE(review): sc.parallelize needs `train_mentions_local` to be a list here, not the
# RDD that the reload-from-JSON cell binds to the same name.
mentioned_dbpedia_types = sc.parallelize(train_mentions_local).map(lambda x:set([z for y in x[6] for z in y[2]])).reduce(lambda a,b:a|b)

In [80]:
# Write the DBpedia type vocabulary as 'id<TAB>type' lines: id 0 is reserved for
# '[PAD]' and the types follow from id 1.
# The types are sorted first. `mentioned_dbpedia_types` is a set, and the iteration
# order of a set of strings changes between kernel restarts (hash randomization), so
# without sorting the same type would get a different id on every run.
# assumes the types are mutually comparable (all strings) — TODO confirm.
with open(data_dir+"dbpedia_type_vocab.txt", "w") as f:
    f.write('{}\t{}\n'.format(0,'[PAD]'))
    for i,t in enumerate(sorted(mentioned_dbpedia_types)):
        f.write('{}\t{}\n'.format(i+1,t))

In [150]:
# Mentions whose gold Wikidata id is among the candidates but is not the first candidate.
wrong_mentions = spark.createDataFrame(entity_wikidata_target_candidate.rdd.filter(lambda x:x['wikidata_id'] not in [z[0] for z in x['candidates'][:1]] and x['wikidata_id'] in [z[0] for z in x['candidates'][:]]))

In [141]:
# Size of the error set: distinct rows that have a Wikipedia id, the distinct Wikipedia
# ids among them, and how many of those ids are in train_all_entities_wiki_id.
print(wrong_mentions.where('wikipedia_id is not null').distinct().count())
entities = set(wrong_mentions.where('wikipedia_id is not null').rdd.map(lambda x:x['wikipedia_id']).distinct().collect())
print(len(entities))
print(len(entities&train_all_entities_wiki_id))

185395
181802
81854


In [139]:
# Error-set mentions with a candidate list of non-zero size, then how many of them have
# their wikipedia_title in dbpedia_types and in dbpedia_abstract respectively.
# `dbpedia_types` must be a DataFrame when this cell runs (it is used with .select).
print(wrong_mentions.where(F.size('candidates')!=0).count())
print(wrong_mentions.where(F.size('candidates')!=0).join(dbpedia_types.select('wikipedia_title').dropDuplicates(),'wikipedia_title','inner').count())
print(wrong_mentions.where(F.size('candidates')!=0).join(dbpedia_abstract.select('wikipedia_title').dropDuplicates(),'wikipedia_title','inner').count())

672354
394656
575000


In [151]:
wrong_mentions.where(F.size('candidates')!=0).join(dbpedia_types,'wikipedia_title','left').where('type is null').show()

+--------------------+--------------------+------------+--------------------+-----------+------------+--------------------+----+
|     wikipedia_title|             mention|wikipedia_id|       table_pgTitle|wikidata_id|freebase_mid|          candidates|type|
+--------------------+--------------------+------------+--------------------+-----------+------------+--------------------+----+
|1959–60_Minneapol...|  Minneapolis Lakers|    33347427|NBA Conference Fi...|    Q568978|   m.0h7llw9|[[Q121783, Los An...|null|
|2003_World_Series...|World Series by N...|    49451927|Bas Leinders - Wi...|   Q2425538|        null|[[Q1465453, World...|null|
|2007_China_Open_S...|China Open Super ...|    14313553|Koo Kien Keat - W...|    Q568542|   m.03d06yw|[[Q1073355, China...|null|
|2007_China_Open_S...|China Open Super ...|    14313553|Wong Mew Choo - W...|    Q568542|   m.03d06yw|[[Q1073355, China...|null|
|2008_Formula_3_Eu...|Formula 3 Euro Se...|    49441477|Nico Hülkenberg -...|    Q651980|        

In [88]:
def get_index(x,cands):
    """Return the position of the first item of ``cands`` equal to ``x``, or 999 if there is none."""
    return next((pos for pos, cand in enumerate(cands) if x == cand), 999)
# Rank of the gold Wikidata id within each validation mention's candidate list
# (0 = first candidate, 999 = not present), computed over de-duplicated
# (mention, wikidata_id) pairs that have a non-null, non-empty candidate list.
best_recall = val_mentions_with_candidate.select('table_id','wikidata_id','candidates','i','j','mention')\
                    .dropDuplicates(['mention','wikidata_id']).where(~F.isnull('candidates')).rdd.filter(lambda x:len(x['candidates'])!=0).map(lambda x:get_index(x['wikidata_id'],[z[0] for z in x['candidates']])).collect()

In [86]:
numpy_describe(best_recall)

count 1550534
min 0
max 999
mean 185.64289399651992
std 387.10171744575104
10% 0.0
25% 0.0
50% 0.0
60% 0.0
75% 4.0
80% 21.0
90% 999.0


In [89]:
numpy_describe(best_recall)

count 93191
min 0
max 999
mean 177.84304278310137
std 380.5128418675207
10% 0.0
25% 0.0
50% 0.0
60% 0.0
75% 4.0
80% 17.0
90% 999.0


In [100]:
for i in range(60,80):
    print(i,np.percentile(best_recall,i))

60 0.0
61 0.0
62 0.0
63 0.0
64 0.0
65 1.0
66 1.0
67 1.0
68 1.0
69 1.0
70 1.0
71 2.0
72 2.0
73 3.0
74 3.0
75 4.0
76 5.0
77 6.0
78 8.0
79 11.0


In [53]:
wikipedia_gs_entity_mentions.count()

4475523

In [110]:
gs_wikidata_P

4453329

In [115]:
gs_wikidata_TP

2193152

In [118]:
our_tp

2582097

In [89]:
test_mentions_with_candidate.count()

297018

In [417]:
# Counts for the candidate-ranking baseline on the test split:
#   all_predicted - mentions with at least one candidate
#   TP            - of those, mentions whose first candidate is the gold Wikidata id
#   P             - all test mentions
#   best_TP       - mentions whose gold Wikidata id appears anywhere in the candidate list
test_wikidata_all_predicted = test_mentions_with_candidate.where(F.size('candidates')>=1).count()
test_wikidata_TP = test_mentions_with_candidate.where(F.size('candidates')>=1).rdd.map(lambda x:1 if x['wikidata_id'] in [z[0] for z in x['candidates'][:1]] else 0).sum()
test_wikidata_P = test_mentions_with_candidate.count()
test_wikidata_best_TP = test_mentions_with_candidate.where(F.size('candidates')>=1).rdd.map(lambda x:1 if x['wikidata_id'] in [z[0] for z in x['candidates']] else 0).sum()

In [50]:
# Top-1 candidate baseline on the test split: precision over the mentions that have at
# least one candidate, recall over all test mentions. Prints F1, precision, recall.
precision = test_wikidata_TP/test_wikidata_all_predicted
recall = test_wikidata_TP/test_wikidata_P
f1 = 2*precision*recall/(precision+recall)
print(f1,precision,recall)

0.6190818842959042 0.6432029497797102 0.5967045768269936


In [51]:
# Upper bound on the test split: a mention counts as correct when its gold id appears
# anywhere in the candidate list. Prints F1, precision, recall.
precision = test_wikidata_best_TP/test_wikidata_all_predicted
recall = test_wikidata_best_TP/test_wikidata_P
f1 = 2*precision*recall/(precision+recall)
print(f1,precision,recall)

0.7887258018317603 0.8194566424480849 0.7602165525321698


In [58]:
# Model scores on the test split: our_tp over the same denominators as the baseline.
# `our_tp` is whatever the our_tp cell last computed from the loaded results.
precision = our_tp/test_wikidata_all_predicted
recall = our_tp/test_wikidata_P
f1 = 2*precision*recall/(precision+recall)
print(f1,precision,recall)

0.6838501896731194 0.7104947994164313 0.6591317697917298


In [62]:
# Same computation as the model-score cell, tagged 'no description'.
# NOTE(review): nothing in this cell selects the ablation; the numbers depend on which
# results were loaded and turned into `our_tp` before it was run.
print('no description')
precision = our_tp/test_wikidata_all_predicted
recall = our_tp/test_wikidata_P
f1 = 2*precision*recall/(precision+recall)
print(f1,precision,recall)

no description
0.6025282763149621 0.626004369506362 0.5807493148563386


In [66]:
# Same computation as the model-score cell, tagged 'no type'.
# NOTE(review): nothing in this cell selects the ablation; the numbers depend on which
# results were loaded and turned into `our_tp` before it was run.
print('no type')
precision = our_tp/test_wikidata_all_predicted
recall = our_tp/test_wikidata_P
f1 = 2*precision*recall/(precision+recall)
print(f1,precision,recall)

no type
0.671177370564688 0.6973282138009624 0.6469170218639948


In [27]:
gs_wikidata_all_predicted/gs_wikidata_P

0.7332761626190205

In [32]:
# Counts for the candidate-ranking baseline on the gold-standard mentions:
#   all_predicted - mentions with at least one candidate
#   TP            - of those, mentions whose first candidate is the gold Wikidata id
#   P             - all gold-standard mentions
#   best_TP       - mentions whose gold Wikidata id appears anywhere in the candidate list
gs_wikidata_all_predicted = wikipedia_gs_entity_mentions_with_candidate.where(F.size('candidates')>=1).count()
gs_wikidata_TP = wikipedia_gs_entity_mentions_with_candidate.where(F.size('candidates')>=1).rdd.map(lambda x:1 if x['wikidata_id'] in [z[0] for z in x['candidates'][:1]] else 0).sum()
gs_wikidata_P = wikipedia_gs_entity_mentions_with_candidate.count()
gs_wikidata_best_TP = wikipedia_gs_entity_mentions_with_candidate.where(F.size('candidates')>=1).rdd.map(lambda x:1 if x['wikidata_id'] in [z[0] for z in x['candidates']] else 0).sum()

In [121]:
# Top-1 candidate baseline on the gold-standard mentions: precision over the mentions
# that have at least one candidate, recall over all mentions. Prints F1, precision, recall.
precision = gs_wikidata_TP/gs_wikidata_all_predicted
recall = gs_wikidata_TP/gs_wikidata_P
f1 = 2*precision*recall/(precision+recall)
print(f1,precision,recall)

0.5684905968962586 0.6716738275849047 0.4927879937160417


In [125]:
# Upper bound on the gold-standard mentions: a mention counts as correct when its gold
# id appears anywhere in the candidate list. Prints F1, precision, recall.
precision = gs_wikidata_best_TP/gs_wikidata_all_predicted
recall = gs_wikidata_best_TP/gs_wikidata_P
f1 = 2*precision*recall/(precision+recall)
print(f1,precision,recall)

0.7426943401975001 0.8774962205700879 0.6437940325633451


In [122]:
# Model scores against the gold-standard denominators.
# NOTE(review): presumably `our_tp` was computed from the gold-standard examples and
# results before this ran; the same variable is also used for the own test split — confirm.
precision = our_tp/gs_wikidata_all_predicted
recall = our_tp/gs_wikidata_P
f1 = 2*precision*recall/(precision+recall)
print(f1,precision,recall)

0.6655672245974301 0.7863702367305769 0.5769374886465782


In [38]:
# Same computation as the gold-standard model-score cell, tagged 'no description'.
# NOTE(review): nothing in this cell selects the ablation; the numbers depend on which
# results were loaded and turned into `our_tp` before it was run.
print('no description')
precision = our_tp/gs_wikidata_all_predicted
recall = our_tp/gs_wikidata_P
f1 = 2*precision*recall/(precision+recall)
print(f1,precision,recall)

no description
0.596581015266358 0.7048627649712325 0.5171377736188597


In [41]:
# Same computation as the gold-standard model-score cell, tagged 'no type'.
# NOTE(review): nothing in this cell selects the ablation; the numbers depend on which
# results were loaded and turned into `our_tp` before it was run.
print('no type')
precision = our_tp/gs_wikidata_all_predicted
recall = our_tp/gs_wikidata_P
f1 = 2*precision*recall/(precision+recall)
print(f1,precision,recall)

no type
0.6609115221932683 0.7808695064265536 0.5729017591910487


In [126]:
our_tp/gs_wikidata_best_TP

0.8961522776926507

In [24]:
entity_wikidata_target_candidate.where(F.size('candidates')>=1).count()

3265520

In [None]:
# NOTE(review): unexecuted cell. The metric cells in this notebook bind `recall` to a
# float ratio, for which len() raises TypeError; this presumably expected a collected
# list — confirm or remove.
len(recall)/entity_wikidata_target_candidate.count()

In [None]:
# One Row(id=<line>) per line of 35k_test.ids.txt, used below to restrict by `id`.
wiki_35k_test = spark.createDataFrame(
    sc.textFile('../../data/entity_linking/35k_test.ids.txt')\
    .map(lambda x:Row(id=x))
)

In [None]:
# For the mentions whose id is listed in wiki_35k_test, collect one boolean per row:
#   *_recall    - the gold Wikidata id appears anywhere in the candidate list
#   *_precision - the gold Wikidata id is the first candidate
wiki_35k_test_recall = entity_wikidata_target_candidate.join(wiki_35k_test,'id','inner')\
    .rdd.map(lambda x:x['wikidata_id'] in [z[0] for z in x['candidates']]).collect()
wiki_35k_test_precision = entity_wikidata_target_candidate.join(wiki_35k_test,'id','inner')\
    .rdd.map(lambda x:x['wikidata_id'] in [z[0] for z in x['candidates'][:1]]).collect()

In [None]:
sum(wiki_35k_test_recall)/len(wiki_35k_test_recall)

In [None]:
sum(wiki_35k_test_precision)/sum(wiki_35k_test_recall)

In [None]:
# Target mentions whose id is listed in wiki_35k_test (inner join on id).
wiki_35k_mentions = entity_wikidata_target.join(wiki_35k_test,'id','inner')

In [41]:
# Keep only the target mentions present in both candidate tables (inner joins on mention).
# NOTE(review): entity_wikidata_candidates_df names its list column `candidates`; if
# entity_googlekg_candidates_df does too, the result has two columns with that name — confirm.
entity_wikidata_target_candidate = entity_wikidata_target\
            .join(entity_googlekg_candidates_df, "mention", 'inner')\
            .join(entity_wikidata_candidates_df, "mention", 'inner')

In [44]:
entity_wikidata_target_candidate.count()

1055751

In [45]:
len(entity_mentions_surface)

2759237

# Efthymiou
## T2D

In [81]:
import csv

In [92]:
# One (table id, parsed JSON) pair per file in tables_instance_with_context. The table
# id is the file name without its last five characters (presumably the '.json' suffix).
t2d_tables = sc.wholeTextFiles('../../data/efthymiou/t2d/tables_instance_with_context').map(lambda x:(x[0].split('/')[-1][:-5],json.loads(x[1])))

In [93]:
t2d_tables.take(1)

[('89511064_0_2199624509082573904',
  {'relation': [['Continent',
     'Asia',
     'Europe',
     'Africa',
     'Oceania',
     'Europe',
     'Africa',
     'Americas',
     'Americas',
     'Americas',
     'Asia',
     'Americas',
     'Oceania',
     'Europe',
     'Asia',
     'Americas',
     'Asia',
     'Asia',
     'Americas',
     'Europe',
     'Europe',
     'Americas',
     'Africa',
     'Americas',
     'Asia',
     'Americas',
     'Europe',
     'Africa',
     'Americas',
     'Americas',
     'Asia',
     'Europe',
     'Africa',
     'Africa',
     'Asia',
     'Africa',
     'Americas',
     'Africa',
     'Americas',
     'Africa',
     'Africa',
     'Americas',
     'Asia',
     'Asia',
     'Asia',
     'Americas',
     'Africa',
     'Africa',
     'Oceania',
     'Americas',
     'Africa',
     'Europe',
     'Americas',
     'Asia',
     'Europe',
     'Europe',
     'Africa',
     'Americas',
     'Americas',
     'Americas',
     'Africa',
     'Americas'

In [122]:
sc.wholeTextFiles('../../data/efthymiou/t2d/entities_instance')\
    .map(lambda x:(x[0].split('/')[-1][:-4],list(csv.reader(x[1].split('\n'))))).flatMap(lambda x:[y for y in x[1] if len(y)==3]).count()

26124

In [88]:
# One row per 3-field CSV record found in the entities_instance files:
#   table_id        - file name without its last four characters
#   wikipedia_title - last '/'-separated segment of field 0
#   mention         - field 1 with '&nbsp;' / '&nbsp' removed
#   i               - field 2 as an int; j is fixed to 0
# The inner join keeps only rows whose wikipedia_title has a Wikidata mapping.
t2d_entities = spark.createDataFrame(sc.wholeTextFiles('../../data/efthymiou/t2d/entities_instance')\
    .map(lambda x:(x[0].split('/')[-1][:-4],list(csv.reader(x[1].split('\n')))))\
    .flatMap(lambda x:[Row(table_id=x[0], wikipedia_title=y[0].split('/')[-1],j=0,i=int(y[2]),mention=y[1].replace('&nbsp;','').replace('&nbsp','')) for y in x[1] if len(y)==3]))\
    .join(wikipedia_wikidata_mapping,'wikipedia_title','inner')

In [89]:
t2d_entities.show()

+--------------------+---+---+--------------------+--------------------+------------+-----------+
|     wikipedia_title|  i|  j|             mention|            table_id|wikipedia_id|wikidata_id|
+--------------------+---+---+--------------------+--------------------+------------+-----------+
|  A_Passage_to_India| 64|  0|  a passage to india|6869358_0_1379459...|      113448|    Q622303|
|Anastasius_I_Dicorus|  7|  0|          anastasius|98312357_0_639541...|       25876|    Q173470|
|Anastasius_I_Dicorus| 88|  0|        anastasius i|51130304_0_303582...|       25876|    Q173470|
|Anastasius_I_Dicorus|130|  0|          anastasius|64207896_0_320670...|       25876|    Q173470|
|             Antwerp| 25|  0|             antwerp|29886325_0_144817...|    32149462|     Q12892|
|             Antwerp| 32|  0|             antwerp|75154185_0_160762...|    32149462|     Q12892|
|           Bangalore| 68|  0|           bangalore|91474256_0_964747...|    44275267|      Q1355|
|           Bangalor

In [118]:
t2d_entities.count()

25396

In [90]:
# Distinct T2D mention strings. The list order is the iteration order of the set,
# so it is not meaningful.
t2d_entity_mentions = list(set(t2d_entities.rdd.map(lambda x:x['mention']).collect()))
print(len(t2d_entity_mentions))

14991


In [87]:
t2d_entity_mentions[:100]

['red-tailed ant-thrush',
 'frontier airlines',
 'peeping tom',
 'gray catbird                *',
 'eiger',
 'tombstone',
 'broad-tailed hummingbird',
 'mary janes last dance&nbsp;',
 'esgeiriau       gwynion',
 'creep&nbsp;',
 'national art museum of the republic of belarus',
 'rock pratincole',
 'from dusk till dawn',
 'thunacar knott',
 'puerto rico (u.s.)',
 'piz platta',
 'dunlin',
 'fire-crested alethe',
 'elephants',
 'dreamhost',
 'willet',
 'neuroimage',
 'israel museum',
 'mississippi slimy salamander',
 'pope sixtus iii',
 'hartmut michel',
 'spider',
 'jet moto 2',
 'elymus lanceolatus',
 'landesmuseum für kunst und kulturgeschichte',
 'larry jackson',
 'flagler hospital',
 'pinus     resinosa',
 'goldman sachs group',
 'xena: warrior princess: the talisman of fate',
 'lake lucerne',
 'mad hot ballroom',
 "l'�ge d'or",
 'uniola paniculata',
 'need for speed: underground',
 'hoch fulen',
 '{super mario galaxy 2|[132]}',
 'nosferatu',
 'francis crick',
 'final fight 3',
 'com

In [91]:
# Run wikidata_lookup over every distinct T2D mention with a 16-process pool, in
# batches of 10000, printing the start offset of each batch.
# `total` is the real batch length so the progress bar also completes for the short
# final batch. The pool is closed and joined in `finally`, so the worker processes are
# released even if a lookup raises.
entity_t2d_candidates = []
pool = Pool(processes=16)
try:
    for i in range(0, len(t2d_entity_mentions), 10000):
        print(i)
        batch = t2d_entity_mentions[i:i+10000]
        tmp = list(tqdm(pool.imap(wikidata_lookup, batch, chunksize=150),total=len(batch)))
        entity_t2d_candidates.extend(tmp)
finally:
    pool.close()
    pool.join()

0


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


10000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [94]:
# One row per [mention, candidates] lookup result. A second element that is not a list
# is replaced by an empty candidate list.
entity_t2d_candidates_df = spark.createDataFrame(sc.parallelize(entity_t2d_candidates).map(\
                                                lambda x:Row(mention=x[0], candidates=x[1] if isinstance(x[1],list) else [])
                                                ))

In [95]:
# Attach candidates by mention. It is a left join, so entities without a lookup result
# are kept with null candidates.
t2d_entities_with_candidates = t2d_entities.join(entity_t2d_candidates_df, 'mention','left')

In [96]:
t2d_entities_with_candidates.show()

+--------------------+--------------------+---+---+--------------------+------------+-----------+--------------------+
|             mention|     wikipedia_title|  i|  j|            table_id|wikipedia_id|wikidata_id|          candidates|
+--------------------+--------------------+---+---+--------------------+------------+-----------+--------------------+
|      archer's robin| Archer's_Robin-chat|310|  0|13719111_1_571940...|    36823518|   Q3728578|                  []|
|               aruba|               Aruba|  3|  0|68779923_4_183235...|         690|     Q21203|[[Q21203, Aruba, ...|
|               aruba|               Aruba| 10|  0|74491133_0_717783...|         690|     Q21203|[[Q21203, Aruba, ...|
|               aruba|               Aruba|  5|  0|12183399_0_672506...|         690|     Q21203|[[Q21203, Aruba, ...|
|               aruba|               Aruba| 11|  0|3917335_0_7791699...|         690|     Q21203|[[Q21203, Aruba, ...|
|               aruba|               Aruba|151| 

In [97]:
# Take one joined record, (table id, (mentions, table JSON)), to inspect its shape.
# Mentions are limited to those with non-null candidates that contain the gold Wikidata id.
sample = t2d_entities_with_candidates.select('table_id','wikidata_id','candidates','i','j','mention')\
                    .where(~F.isnull('candidates'))\
                    .rdd.map(lambda x:[x['table_id'],x['i'],x['j'],x['mention'],x['wikidata_id'],x['candidates']])\
                    .filter(lambda x:x[4] in [z[0] for z in x[5]])\
                    .map(lambda x:(x[0],[x[1:]]))\
                    .reduceByKey(add).join(t2d_tables).take(1)[0]

In [99]:
sample[1][0]

[[3,
  0,
  'australia',
  'Q408',
  [['Q408', 'Australia', 'island country in the Southern hemisphere'],
   ['Q3960', 'Australia', "continent on the Earth's Southern Hemisphere"],
   ['Q781285', 'Australia', 'Wikimedia disambiguation page'],
   ['Q275180', 'Australia', '2008 film by Baz Luhrmann'],
   ['Q1189568', '8088 Australia', 'main-belt asteroid'],
   ['Q50776', 'Australian rules football', 'sport'],
   ['Q623578',
    'National Library of Australia',
    'national reference library in Canberra, Australia'],
   ['Q16835533', 'Australia', 'Racehorse'],
   ['Q205546', 'Australia', '1989 film by Jean-Jacques Andrien'],
   ['Q4823546', 'Australia', 'Manic Street Preachers song'],
   ['Q3258',
    'Australian Capital Territory',
    'federal territory of Australia, containing the capital city, Canberra'],
   ['Q781244', 'Australia', 'board game'],
   ['Q127990',
    'Australian National University',
    'national research university in Canberra, Australian Capital Territory, Australi

In [112]:
def build_for_own(x):
    """Convert one joined T2D record into entity-linking chunks.

    Parameters
    ----------
    x : (table_id, (mentions, table))
        ``mentions`` is a list of ``[row, col, mention, gold_id, candidates]``
        where every candidate is ``[wikidata_id, label, description]``.
        ``table`` is a dict read through the keys 'pageTitle', 'title',
        'headerRowIndex', 'keyColumnIndex' and 'relation' (a list of columns).

    Returns
    -------
    list
        One entry per chunk of mentions, each shaped as
        ``[table_id, pgTitle, secTitle, caption, headers, entities,
        candidate_entities, labels, cand_for_each]``. ``entities`` holds
        ``[[row, col_offset], text]`` cells: offset 0 is the linked mention,
        offsets 1 and 2 are the cells of the two columns that follow the key
        column. ``labels`` / ``cand_for_each`` are parallel to ``entities`` and
        index into the chunk-local ``candidate_entities`` list; context cells
        get label 0 and an empty candidate list.
    """
    table_id = x[0]
    mentions, table = x[1][0], x[1][1]
    page_title = table['pageTitle']
    section_title = ''
    caption = table['title']
    header_row = table['headerRowIndex']
    key_col = table['keyColumnIndex']
    relation = table['relation']
    headers = [col[header_row] for col in relation[key_col:]]
    # Only the two columns right after the key column are used as context.
    context_columns = relation[key_col + 1:key_col + 3]

    n_mentions = len(mentions)
    # Split into roughly equal chunks (at most ~50 mentions each).
    chunk_size = n_mentions // max(1, n_mentions // 25) + 1

    chunks = []
    for start in range(0, n_mentions, chunk_size):
        entities, labels, cand_for_each = [], [], []
        cand_index = {}  # wikidata id -> position in this chunk's candidate list
        cand_info = []   # position -> [label, description, dbpedia types]
        for mention in mentions[start:start + chunk_size]:
            row_i, surface, gold_id, cands = mention[0], mention[2], mention[3], mention[4]
            entities.append([[row_i, 0], surface])
            for cand in cands:
                if cand[0] not in cand_index:
                    cand_index[cand[0]] = len(cand_index)
                    cand_info.append([cand[1], cand[2], dbpedia_types.get(cand[0], [])])
            labels.append(cand_index[gold_id])
            cand_for_each.append([cand_index[cand[0]] for cand in cands])
            for offset, column in enumerate(context_columns, start=1):
                # Columns may be shorter than the key column; skip missing cells.
                if len(column) > row_i:
                    cell_text = column[row_i].replace('&nbsp;', '').replace('&nbsp', '')
                    entities.append([[row_i, offset], cell_text])
                    labels.append(0)
                    cand_for_each.append([])
        chunks.append([table_id, page_title, section_title, caption, headers,
                       entities, cand_info, labels, cand_for_each])
    return chunks

In [110]:
build_for_own(sample)[1]

['24859353_0_7027810986004269522',
 'Rubicon:� World Currency Exchange Rates',
 '',
 '',
 ['ISO 4217',
  'CURRENCY',
  'EXCHANGE RATE (USD per Unit)',
  'CONVERTED AMOUNT (Units per USD)',
  ''],
 [[[51, 0], 'venezuela'],
  [[51, 1], 'VEB'],
  [[51, 2], 'Venezuelan Bolivar'],
  [[50, 0], 'united states'],
  [[50, 1], 'USD'],
  [[50, 2], 'United States Dollars'],
  [[39, 0], 'saudi arabia'],
  [[39, 1], 'SAR'],
  [[39, 2], 'Saudi Arabian Riyal'],
  [[40, 0], 'singapore'],
  [[40, 1], 'SGD'],
  [[40, 2], 'Singapore Dollars'],
  [[15, 0], 'egypt'],
  [[15, 1], 'EGP'],
  [[15, 2], 'Egyptian Pounds'],
  [[18, 0], 'great britain'],
  [[18, 1], 'GBP'],
  [[18, 2], 'United Kingdom Pounds'],
  [[41, 0], 'slovakia'],
  [[41, 1], 'SKK'],
  [[41, 2], 'Slovakian Koruna'],
  [[5, 0], 'barbados'],
  [[5, 1], 'BBD'],
  [[5, 2], 'Barbados Dollars'],
  [[30, 0], 'malaysia'],
  [[30, 1], 'MYR'],
  [[30, 2], 'Malaysian Ringgit'],
  [[38, 0], 'russia'],
  [[38, 1], 'RUR'],
  [[38, 2], 'Russian Rubles'],
  

In [113]:
# Build the full T2D entity-linking dataset on the driver: keep mentions whose
# gold Wikidata id is among the lookup candidates, group them per table, join
# with the raw tables and expand every table into chunks via build_for_own.
t2d_local = (
    t2d_entities_with_candidates
    .select('table_id', 'wikidata_id', 'candidates', 'i', 'j', 'mention')
    .where(F.col('candidates').isNotNull())
    .rdd
    .map(lambda row: [row['table_id'], row['i'], row['j'], row['mention'], row['wikidata_id'], row['candidates']])
    .filter(lambda rec: rec[4] in (cand[0] for cand in rec[5]))
    .map(lambda rec: (rec[0], [rec[1:]]))
    .reduceByKey(add)
    .join(t2d_tables)
    .flatMap(build_for_own)
    .collect()
)

In [133]:
t2d_local[70][8]

[[0, 1, 2, 3, 4, 5],
 [],
 [],
 [6],
 [],
 [],
 [7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56],
 [],
 [],
 [57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106],
 [],
 [],
 [107, 108, 109],
 [],
 [],
 [110, 111, 112],
 [],
 [],
 [113],
 [],
 [],
 [114],
 [],
 [],
 [115],
 [],
 [],
 [116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139,
  140,
  141,
  142,
  143,
  144,
  145,
  146,


In [134]:
def get_labels_and_candidate(tables):
    """Flatten one build_for_own chunk into per-cell evaluation records.

    ``tables`` is a chunk list: index 0 is the table id, 5 the entity cells
    (``[[row, col], text]``), 6 the chunk's candidate list, 7 the labels and
    8 the per-cell candidate indices. Cells with no candidates are dropped.

    Returns a list of ``((table_id, row, col), [label, candidate_ids, candidates])``.
    """
    return [
        ((tables[0], cell[0][0], cell[0][1]), [tables[7][pos], tables[8][pos], tables[6]])
        for pos, cell in enumerate(tables[5])
        if len(tables[8][pos]) > 0
    ]

In [124]:
data_dir = "/srv/samba/group_workspace_1/deng.595/workspace/table_transformer/data/wikitable_entity/v2/"
with open(data_dir+'t2d.table_entity_linking.json','w') as f:
    json.dump(t2d_local,f)

In [115]:
# Lookup-only baseline counts on T2D.
#   t2d_all_predicted: mentions for which the lookup returned >= 1 candidate
#   t2d_TP:            ... whose first candidate is the gold Wikidata id
#   t2d_P:             all gold mentions
#   t2d_best_TP:       ... whose gold id appears anywhere in the candidate list
t2d_all_predicted = t2d_entities_with_candidates.where(F.size('candidates') >= 1).count()
t2d_TP = (
    t2d_entities_with_candidates
    .where(F.size('candidates') >= 1)
    .rdd
    .map(lambda row: int(row['wikidata_id'] in [cand[0] for cand in row['candidates'][:1]]))
    .sum()
)
t2d_P = t2d_entities_with_candidates.count()
t2d_best_TP = (
    t2d_entities_with_candidates
    .where(F.size('candidates') >= 1)
    .rdd
    .map(lambda row: int(row['wikidata_id'] in [cand[0] for cand in row['candidates']]))
    .sum()
)

In [292]:
precision = t2d_TP/t2d_all_predicted
recall = t2d_TP/t2d_P
f1 = 2*precision*recall/(precision+recall)
print(f1,precision,recall)

0.8000926179299894 0.8595269322961422 0.7483461962513782


In [117]:
t2d_P

25396

In [349]:
precision = t2d_best_TP/t2d_all_predicted
recall = t2d_best_TP/t2d_P
f1 = 2*precision*recall/(precision+recall)
print(f1,precision,recall)

0.8954469867598459 0.9619646329881054 0.8375334698377698


In [457]:
# Load the model's T2D entity-linking predictions from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load result files produced by our own runs.
# Later cells read each item as x[0] = table id, x[1] = list of (row, col)
# positions, x[2] / x[3] = per-position lists — presumably ranked candidate ids
# and their scores; confirm against the script that wrote this file.
import pickle
with open("/srv/samba/group_workspace_1/deng.595/workspace/table_transformer/data/wikitable_entity/v2/t2d_entity_linking_results_0.pkl","rb") as f:
    test_results = pickle.load(f)

In [395]:
def get_tp(result, score_ratio=0.8):
    """Return 1 if the combined model/lookup decision hits the gold label, else 0.

    Parameters
    ----------
    result : (key, (gold_side, model_side))
        ``gold_side`` is ``[gold_label, candidate_ids, candidate_entities]`` as
        produced by get_labels_and_candidate; ``candidate_ids[0]`` is treated as
        the lookup's own top choice. ``model_side`` is ``(ranked_ids, scores)``,
        two parallel sequences (presumably the model's ranking and scores).
    score_ratio : float, optional
        The model's best in-candidate prediction overrides the lookup's top
        choice only if ``score * score_ratio`` exceeds the score the model gave
        to the lookup's choice. Defaults to 0.8.

    Notes
    -----
    If none of the ranked ids belongs to this cell's candidate list the lookup's
    top choice is used (this case used to raise IndexError).
    """
    gold_side, model_side = result[1][0], result[1][1]
    gold, candidates = gold_side[0], gold_side[1]
    ranked_ids, scores = model_side[0], model_side[1]
    lookup_id = candidates[0]

    # Best-ranked model prediction that is a valid candidate for this cell.
    pred = next(
        ((pid, scores[k]) for k, pid in enumerate(ranked_ids) if pid in candidates),
        None,
    )
    # Score the model assigned to the lookup's top candidate (0 if unranked).
    lookup_score = next(
        (scores[k] for k, pid in enumerate(ranked_ids) if pid == lookup_id),
        0,
    )

    if pred is None:
        final = lookup_id
    else:
        pred_id, pred_score = pred
        if pred_id == lookup_id or pred_score * score_ratio > lookup_score:
            final = pred_id
        else:
            final = lookup_id
    return 1 if final == gold else 0

In [249]:
sample_result = sc.parallelize(t2d_local).flatMap(get_labels_and_candidate)\
    .join(sc.parallelize(test_results).flatMap(lambda x:[((x[0],z[0],z[1]),(x[2][i], x[3][i])) for i,z in enumerate(x[1])])).take(1)

In [259]:
get_tp(sample_result[0])

1

In [396]:
our_tp = sc.parallelize(t2d_local).flatMap(get_labels_and_candidate)\
    .join(sc.parallelize(test_results).flatMap(lambda x:[((x[0],z[0],z[1]),(x[2][i], x[3][i])) for i,z in enumerate(x[1])]))\
    .map(get_tp).sum()

In [397]:
precision = our_tp/t2d_all_predicted
recall = our_tp/t2d_P
f1 = 2*precision*recall/(precision+recall)
print(f1,precision,recall)

0.8200896709958533 0.8810094523088056 0.7670499291226965


In [398]:
t2d_TP

19005

In [399]:
our_tp

19480

In [485]:
def get_tp(result, score_ratio=0.8):
    """Score one joined record and also return the entity that was chosen.

    This redefinition shadows the earlier ``get_tp`` in this notebook, which
    returned only the 0/1 flag; cells that call ``.map(get_tp).sum()`` need
    that earlier version.

    Parameters
    ----------
    result : (key, (gold_side, model_side))
        ``gold_side`` is ``[gold_label, candidate_ids, candidate_entities]`` as
        produced by get_labels_and_candidate; ``candidate_ids[0]`` is treated as
        the lookup's own top choice. ``model_side`` is ``(ranked_ids, scores)``,
        two parallel sequences (presumably the model's ranking and scores).
    score_ratio : float, optional
        The model's best in-candidate prediction overrides the lookup's top
        choice only if ``score * score_ratio`` exceeds the score the model gave
        to the lookup's choice. Defaults to 0.8.

    Returns
    -------
    (int, object)
        ``(1 or 0, candidate_entities[final])`` where ``final`` is the chosen
        candidate index and the flag says whether it equals the gold label.

    Notes
    -----
    If none of the ranked ids belongs to this cell's candidate list the lookup's
    top choice is used (this case used to raise IndexError).
    """
    gold_side, model_side = result[1][0], result[1][1]
    gold, candidates, candidate_entities = gold_side[0], gold_side[1], gold_side[2]
    ranked_ids, scores = model_side[0], model_side[1]
    lookup_id = candidates[0]

    # Best-ranked model prediction that is a valid candidate for this cell.
    pred = next(
        ((pid, scores[k]) for k, pid in enumerate(ranked_ids) if pid in candidates),
        None,
    )
    # Score the model assigned to the lookup's top candidate (0 if unranked).
    lookup_score = next(
        (scores[k] for k, pid in enumerate(ranked_ids) if pid == lookup_id),
        0,
    )

    if pred is None:
        final = lookup_id
    else:
        pred_id, pred_score = pred
        if pred_id == lookup_id or pred_score * score_ratio > lookup_score:
            final = pred_id
        else:
            final = lookup_id
    return (1 if final == gold else 0, candidate_entities[final])

In [480]:
sample = sc.parallelize(t2d_local).flatMap(get_labels_and_candidate)\
    .join(sc.parallelize(test_results).flatMap(lambda x:[((x[0],z[0],z[1]),(x[2][i], x[3][i])) for i,z in enumerate(x[1])])).take(1)

In [484]:
sample[0][1][0][2]

[['Venezuela', 'sovereign state in northern South America', ['Country']],
 ['9357 Venezuela', 'main-belt asteroid', []],
 ['Venezuela', 'city in Cuba', []],
 ['Venezuela', 'Wikimedia disambiguation page', []],
 ['United States of Venezuela', None, ['Country']],
 ['Venezuela', '1958 album by Aldemaro Romero', ['Album']],
 ['Venezuela', 'genus of arachnids', []],
 ['Venezuela', 'parish of Venezuela', []],
 ['Venezuela', 'family name', []],
 ['INDEPABIS', "Venezuela's consumer protection agency", []],
 ['Venezuela', 'sector in Río Piedras Pueblo', []],
 ['Venezuela', 'song', []],
 ['Venezuela', None, []],
 ['San Pedro del Río', None, []],
 ['Tucupido', None, []],
 ['El Callao', None, []],
 ['San José de Guaribe', None, []],
 ['Venezuela', 'photograph in the National Gallery of Art (NGA 208616)', []],
 ['Venezuela', 'scientific article published on 01 September 2004', []],
 ['Venezuela national football team',
  "men's national association football team representing Venezuela",
  ['SoccerC

In [486]:
our_results = sc.parallelize(t2d_local).flatMap(get_labels_and_candidate)\
    .join(sc.parallelize(test_results).flatMap(lambda x:[((x[0],z[0],z[1]),(x[2][i], x[3][i])) for i,z in enumerate(x[1])]))\
    .map(lambda x:(x[0],get_tp(x)))

In [470]:
lookup_results = t2d_entities_with_candidates.where(F.size('candidates')>=1).rdd.map(lambda x:((x['table_id'],x['i'],x['j']),(x['mention'],x['candidates'],1 if x['wikidata_id'] in [z[0] for z in x['candidates'][:1]] else 0)))

In [487]:
all_results = our_results.join(lookup_results)

In [488]:
errors = all_results.filter(lambda x:x[1][0][0]==0 and x[1][1][-1]==1).collect()

In [489]:
correct = all_results.filter(lambda x:x[1][0][0]==1 and x[1][1][-1]==0).collect()

In [520]:
print(len(errors))
print(len(correct))

490
965


In [475]:
len(correct)

965

In [363]:
len(correct)

1319

In [490]:
correct[0]

(('41480166_0_6681239260286218499', 178, 0),
 ((1, ['Philip the Apostle', 'Christian saint and apostle', ['Saint']]),
  ('philip',
   [['Q827311', 'Philip', 'male given name'],
    ['Q1817', 'Philip the Arab', 'Roman Emperor (204-249)'],
    ['Q733021', 'Philip', 'Wikimedia disambiguation page'],
    ['Q593141',
     'Philip',
     'city in and county seat of Haakon County, South Dakota'],
    ['Q64905', 'Philip', 'Elector Palatine'],
    ['Q312861', 'Philip', 'Italian priest, antipope in 768'],
    ['Q27235644', 'Philip', 'family name'],
    ['Q248817', 'Philip', None],
    ['Q19413497', 'Philip', 'male given name (Піліп)'],
    ['Q2087893', 'Philip', 'Satrap of Sogdiana'],
    ['Q1850877', 'Philip', 'builder of Alexandria on the Indus'],
    ['Q928', 'Philippines', 'sovereign state in Southeast Asia'],
    ['Q3621605', 'Philip', 'son of Antipater'],
    ['Q7183088', 'Philip', 'son of Lysimachus, king of Thrace'],
    ['Q2086016', 'Philip', 'Belgian comics writer'],
    ['Q3380911', '

In [518]:
errors[60]

(('71137051_0_8039724067857124984', 217, 0),
 ((0,
   ['Purple Finch', 'print in the National Gallery of Art (NGA 32145)', []]),
  ('purple finch',
   [['Q27075782', 'Haemorhous purpureus', 'species of bird'],
    ['Q65087408',
     'Purple Finch',
     'print in the National Gallery of Art (NGA 32145)']],
   1)))

In [182]:
len(set([x[0][0] for x in errors]))

200

In [181]:
len(set([x[0][0] for x in correct]))

164

In [180]:
set([x[0][0] for x in errors])

{'10151359_0_8168779773862259178',
 '10579449_0_1681126353774891032',
 '10630177_0_4831842476649004753',
 '11278409_0_3742771475298785475',
 '1146722_1_7558140036342906956',
 '11599512_1_280388135214354946',
 '11688006_0_8123036130090004213',
 '11833461_1_3811022039809817402',
 '12183399_0_6725061928072492226',
 '12193237_0_8699643798888088574',
 '12271141_0_8517913935669973086',
 '12746760_0_6703465836620308483',
 '13719111_1_5719401842463579519',
 '14067031_0_559833072073397908',
 '14311244_0_7604843865524657408',
 '14380604_4_3329235705746762392',
 '1614988_0_8789868670151796042',
 '16767252_0_2409448375013995751',
 '16949304_0_2573050770332101882',
 '18422942_0_3659164506677528063',
 '19073331_0_2742992342272078110',
 '19272019_0_1508498249156534553',
 '19361188_0_1640726405141876003',
 '19654359_0_3020273135042459469',
 '20135078_0_7570343137119682530',
 '21245481_0_8730460088443117515',
 '21329809_0_5526008408364682899',
 '21333456_2_1886495893795687264',
 '21337553_0_88323789996

In [205]:
[[x,t2d_tables_local[x]['pageTitle']] for x in list(set([x[0][0] for x in errors]))]

[['28494901_6_7026744149694237309', 'Scientists Who Believe in God'],
 ['53822652_0_5767892317858575530', "Bryan's Movie Blog"],
 ['28079336_1_3124145965038277571', 'Watch the World with Amish'],
 ['25404227_0_2240631045609013057',
  'Phi Phenomenon -- Top Adventure Films (#1-100)'],
 ['56224555_0_3713922722778385817', 'The Global 2000 - Forbes.com'],
 ['24142265_0_4577466141408796359', 'The GameFAQs Top 100 - GameFAQs'],
 ['19654359_0_3020273135042459469',
  'US Airport Codes by Airport Name | Airportcodes.me'],
 ['55004961_0_2904467548072189860',
  '1Up Travel - Maps of all the G - Category Countries of the world.'],
 ['96203994_0_2127964719640427252',
  'Financial Planning-The Dow Jones Industrial'],
 ['61121469_0_6337620713408906340',
  '1Up Travel - Read about Geography and Geographic info of all the M - Category Countries of the world. Geography and Facts'],
 ['89511064_0_2199624509082573904', 'International Country Codes'],
 ['11278409_0_3742771475298785475',
  '2012 Temkin Rati

In [None]:
[x for x in errors if x[0][0]=='41194422_0_7231546114369966811']

In [165]:
t2d_tables_local = dict(t2d_tables.collect())

In [519]:
t2d_tables_local['71137051_0_8039724067857124984']

{'relation': [['Common Name',
   'Red-throated Loon',
   'Pacific Loon',
   'Common Loon',
   'Pied-billed Grebe',
   'Horned Grebe',
   'Red-necked Grebe',
   'Eared Grebe',
   'Western Grebe',
   "Clark's Grebe",
   'Wandering Albatross',
   'Pink-footed Shearwater',
   'Flesh-footed Shearwater',
   'Sooty Shearwater',
   'Brown Pelican',
   "Brandt's Cormorant",
   'Double-crested Cormorant',
   'Pelagic Cormorant',
   'Magnificent Frigatebird',
   'Great Blue Heron',
   'Great Egret',
   'Snowy Egret',
   'Cattle Egret',
   'Green Heron',
   'Black-crowned Night-Heron',
   'Turkey Vulture',
   'Canada Goose',
   'Brant',
   'Wood Duck',
   'American Wigeon',
   'Mallard',
   'Cinnamon Teal',
   'Northern Shoveler',
   'Northern Pintail',
   'Green-winged Teal',
   'Harlequin Duck',
   'Surf Scoter',
   'White-winged Scoter',
   'Black Scoter',
   'Bufflehead',
   'Common Goldeneye',
   'Hooded Merganser',
   'Common Merganser',
   'Red-breasted Merganser',
   'Ruddy Duck',
   'Ospr

# Limaye

In [183]:
limaye_tables = sc.wholeTextFiles('../../data/efthymiou/LimayeGS/tables_instance').map(lambda x:(x[0].split('/')[-1][:-4],list(csv.reader(x[1].split('\n')))))

In [184]:
limaye_tables.take(1)

[('file589737_0_cols1_rows34',
  [['Aquae Sulis', 'Bath', '', ''],
   ['', '', '', ''],
   ['Isca', 'Caerleon', '', ''],
   ['Venta Silurum', 'Newport', '', ''],
   ['Venta Icenorum', '', '', ''],
   ['', '', '', ''],
   ['', 'Cambridge', '', ''],
   ['', '', '', ''],
   ['Durovernum Cantiacorum', 'Canterbury', '', ''],
   ['', '', '', ''],
   ['Moridunum', 'Carmarthen', '', ''],
   ['Deva', 'Chester', '', ''],
   ['Noviomagus Regneses', 'Chichester', '', ''],
   ['Corinium Dobunnorum', 'Cirencester', '', ''],
   ['Camulodunum', 'Colchester', '', ''],
   ['Durnovaria', 'Dorchester', '', ''],
   ['Dubris', 'Dover', '', ''],
   ['Isca Dumnonnorum', 'Exeter', '', ''],
   ['Glevum', 'Gloucester', '', ''],
   ['Lindinis', 'Ilchester', '', ''],
   ['Ratae Corieltauvum', 'Leicester', '', ''],
   ['', 'Lincoln', '', ''],
   ['Londinium', 'London', '', ''],
   ['', '', '', ''],
   ['', '', '', ''],
   ['', '', '', ''],
   ['Calleva Atrebatum', 'Berkshire', '', ''],
   ['Verulamium', 'St . Alban

In [185]:
sc.wholeTextFiles('../../data/efthymiou/LimayeGS/entities_instance')\
    .map(lambda x:(x[0].split('/')[-1][:-4],list(csv.reader(x[1].split('\n'))))).flatMap(lambda x:[y for y in x[1] if len(y)==3]).count()

5278

In [186]:
limaye_entities = spark.createDataFrame(sc.wholeTextFiles('../../data/efthymiou/LimayeGS/entities_instance')\
    .map(lambda x:(x[0].split('/')[-1][:-4],list(csv.reader(x[1].split('\n')))))\
    .flatMap(lambda x:[Row(table_id=x[0], wikipedia_title=y[0].split('/')[-1],j=0,i=int(y[2]),mention=y[1].replace('&nbsp;','').replace('&nbsp','')) for y in x[1] if len(y)==3]))\
    .join(wikipedia_wikidata_mapping,'wikipedia_title','inner')

In [187]:
limaye_entity_mentions = list(set(limaye_entities.rdd.map(lambda x:x['mention']).collect()))
print(len(limaye_entity_mentions))

1002


In [188]:
limaye_entity_mentions[:100]

['Dilip Kumar',
 'Dev Anand',
 'Lakehead University',
 'Vajiravudh ( Rama VI',
 'The Bronze Bow',
 'The Quest of the Missing Map',
 'Alexander Mackenzie',
 'Raleigh , NC',
 'Philadelphia',
 'Dr No',
 'The Great Mouse Detective',
 'Doctor Zhivago',
 'Jean Lorrah',
 'The Secret Of Red Gate Farm',
 '5 The Secret at Shadow Ranch',
 'Corvallis , OR',
 'The Spy who Loved Me',
 'Leopold I',
 'James Mirrlees',
 'Deep Creek Lake Rec . Area',
 'Sulochana',
 '2 , The Hidden Staircase',
 'Balance Point',
 'European Union',
 'Durovernum Cantiacorum',
 'Mujhse Dosti Karoge',
 'Mansfield College',
 'Another Simpson Clip Show',
 '19 The Quest of the Missing Map',
 'Secret of the Old Clock The',
 'John Pople',
 'Smoky , the Cow Horse',
 'Bunty Aur Babli',
 'Martin Sheen',
 'Homer : Bad Man',
 'College Park , MD',
 'Poisson',
 'Fear Of Flying',
 'Isca Silurum',
 'Dwight D . Eisenhower Presidential Library',
 'The Clue in the Jewel Box',
 'Lord Rayleigh',
 'Where the Wild Things Are',
 'Vasco Nunez de Ba

In [189]:
# Query Wikidata candidates for every distinct Limaye mention, one batch at a
# time, fanning each batch out over 16 worker processes.
entity_limaye_candidates = []
batch_size = 10000
i = 0
# The context manager releases the worker processes even if a lookup raises.
with Pool(processes=16) as pool:
    while i < len(limaye_entity_mentions):
        print(i)
        batch = limaye_entity_mentions[i:i+batch_size]
        # Use the real batch length: a fixed total of 10000 leaves the progress
        # bar stuck part-way whenever the (last) batch is shorter.
        tmp = list(tqdm(pool.imap(wikidata_lookup, batch, chunksize=150), total=len(batch)))
        entity_limaye_candidates.extend(tmp)
        i += batch_size

0


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [190]:
entity_limaye_candidates_df = spark.createDataFrame(sc.parallelize(entity_limaye_candidates).map(\
                                                lambda x:Row(mention=x[0], candidates=x[1] if isinstance(x[1],list) else [])
                                                ))

In [191]:
limaye_entities_with_candidates = limaye_entities.join(entity_limaye_candidates_df, 'mention','left')

In [192]:
limaye_entities_with_candidates.show()

+--------------------+--------------------+---+---+--------------------+------------+-----------+--------------------+
|             mention|     wikipedia_title|  i|  j|            table_id|wikipedia_id|wikidata_id|          candidates|
+--------------------+--------------------+---+---+--------------------+------------+-----------+--------------------+
|Grandpa vs . Sexu...|Grampa_vs._Sexual...|  5|  0|file219474_12_col...|     3532224|   Q2719593|                  []|
|Homer Goes To Col...|Homer_Goes_to_Col...|  3|  0|file219474_11_col...|     1751158|     Q94318|[[Q94318, Homer G...|
|  Prayer for a Child|  Prayer_for_a_Child| 61|  0|file267997_0_cols...|     7569028|   Q7239020|[[Q7239020, Praye...|
|  Prayer for a Child|  Prayer_for_a_Child| 58|  0|file224977_0_cols...|     7569028|   Q7239020|[[Q7239020, Praye...|
|  Prayer for a Child|  Prayer_for_a_Child| 58|  0|file365069_0_cols...|     7569028|   Q7239020|[[Q7239020, Praye...|
|  Prayer for a Child|  Prayer_for_a_Child| 59| 

In [193]:
# Grab a single joined Limaye record, (table_id, (mentions, rows)), to prototype
# on. Only mentions whose gold Wikidata id is among the lookup candidates are
# kept; they are grouped per table and then joined with the parsed table rows.
sample = (
    limaye_entities_with_candidates
    .select('table_id', 'wikidata_id', 'candidates', 'i', 'j', 'mention')
    .where(F.col('candidates').isNotNull())
    .rdd
    .map(lambda row: [row['table_id'], row['i'], row['j'], row['mention'], row['wikidata_id'], row['candidates']])
    .filter(lambda rec: rec[4] in (cand[0] for cand in rec[5]))
    .map(lambda rec: (rec[0], [rec[1:]]))
    .reduceByKey(add)
    .join(limaye_tables)
    .take(1)[0]
)

In [197]:
sample[1][1]

[['101 Dalmatians', ''],
 ['', ''],
 ['A Bug`s Life', ''],
 ['Aladdin', ''],
 ['Aristocats', ''],
 ['Atlantis', ''],
 ['Bambi', ''],
 ['Beauty &amp; the Beast', ''],
 ['', ''],
 ['', ''],
 ['Brother Bear', ''],
 ['', ''],
 ['Cinderella', ''],
 ['Dinosaur', ''],
 ['Dumbo', ''],
 ['', ''],
 ['Fantasia', ''],
 ['Fantasia 2000', ''],
 ['Finding Nemo', ''],
 ['', ''],
 ['', ''],
 ['', ''],
 ['Hercules', ''],
 ['Home on the Range', ''],
 ['Hunchback of Notre Dame', ''],
 ['Jungle Book', ''],
 ['Jungle Book 2', ''],
 ['', ''],
 ['Lady &amp; the Tramp', ''],
 ['Lilo &amp; Stitch', ''],
 ['Lion King', ''],
 ['Little Mermaid', ''],
 ['', ''],
 ['', ''],
 ['', ''],
 ['', ''],
 ['', ''],
 ['', ''],
 ['Monsters Inc .', ''],
 ['', ''],
 ['Mulan', ''],
 ['', ''],
 ['Peter Pan', ''],
 ['Piglet`s Big Movie', ''],
 ['Pinocchio', ''],
 ['', ''],
 ['', ''],
 ['Pocahontas', ''],
 ['', ''],
 ['Rescuers Down Under', ''],
 ['', ''],
 ['', ''],
 ['Sleeping Beauty', ''],
 ['Snow White', ''],
 ['', ''],
 ['Tarza

In [209]:
def build_for_own(x):
    """Convert one joined Limaye record into entity-linking chunks.

    Note: this redefines the T2D ``build_for_own`` from earlier in the
    notebook; Limaye tables are plain row lists without titles or headers.

    Parameters
    ----------
    x : (table_id, (mentions, rows))
        ``mentions`` is a list of ``[row, col, mention, gold_id, candidates]``
        where every candidate is ``[wikidata_id, label, description]``.
        ``rows`` is the table as a list of rows (lists of cell strings).

    Returns
    -------
    list
        One entry per chunk of mentions, each shaped as
        ``[table_id, pgTitle, secTitle, caption, headers, entities,
        candidate_entities, labels, cand_for_each]``. Titles and caption are
        empty strings and ``headers`` holds one empty string per cell of the
        first row. ``entities`` holds ``[[row, col], text]`` cells: column 0
        is the linked mention, the other non-empty cells of the same row follow
        as context with label 0 and an empty candidate list.
    """
    table_id = x[0]
    mentions, rows = x[1][0], x[1][1]
    page_title = section_title = caption = ''
    headers = [''] * len(rows[0])

    n_mentions = len(mentions)
    # Split into roughly equal chunks (at most ~50 mentions each).
    chunk_size = n_mentions // max(1, n_mentions // 25) + 1

    chunks = []
    for start in range(0, n_mentions, chunk_size):
        entities, labels, cand_for_each = [], [], []
        cand_index = {}  # wikidata id -> position in this chunk's candidate list
        cand_info = []   # position -> [label, description, dbpedia types]
        for mention in mentions[start:start + chunk_size]:
            row_i, surface, gold_id, cands = mention[0], mention[2], mention[3], mention[4]
            entities.append([[row_i, 0], surface])
            for cand in cands:
                if cand[0] not in cand_index:
                    cand_index[cand[0]] = len(cand_index)
                    cand_info.append([cand[1], cand[2], dbpedia_types.get(cand[0], [])])
            labels.append(cand_index[gold_id])
            cand_for_each.append([cand_index[cand[0]] for cand in cands])
            for offset, cell in enumerate(rows[row_i][1:], start=1):
                if cell != '':
                    entities.append([[row_i, offset], cell])
                    labels.append(0)
                    cand_for_each.append([])
        chunks.append([table_id, page_title, section_title, caption, headers,
                       entities, cand_info, labels, cand_for_each])
    return chunks

In [210]:
build_for_own(sample)[0]

['file68383_1_cols1_rows60',
 '',
 '',
 '',
 ['', ''],
 [[[58, 0], 'Treasure Planet'],
  [[49, 0], 'Rescuers Down Under'],
  [[10, 0], 'Brother Bear'],
  [[42, 0], 'Peter Pan'],
  [[30, 0], 'Lion King'],
  [[14, 0], 'Dumbo'],
  [[55, 0], 'Tarzan'],
  [[3, 0], 'Aladdin'],
  [[44, 0], 'Pinocchio'],
  [[6, 0], 'Bambi'],
  [[4, 0], 'Aristocats'],
  [[12, 0], 'Cinderella'],
  [[40, 0], 'Mulan'],
  [[23, 0], 'Home on the Range']],
 [['Treasure Planet',
   '2002 American animated science fiction film produced by Walt Disney Feature Animation',
   []],
  ['Treasure Planet', '2002 video game', []],
  ['The Treasure Planet', '1982 film', ['Film']],
  ['Treasure Planet: Battle at Procyon', '2002 video game', ['VideoGame']],
  ['Treasure Planet universe',
   "fictional universe of the ''Treasure Planet'' franchise",
   []],
  ['Treasure Planet: Broadside Blast', '2002 video game', []],
  ['Treasure Planet: Etherium Rescue', '2002 video game', []],
  ['Treasure Planet: Treasure Racer', '2002 video 

In [211]:
# Build the full Limaye entity-linking dataset on the driver: keep mentions
# whose gold Wikidata id is among the lookup candidates, group them per table,
# join with the parsed tables and expand every table via build_for_own.
limaye_local = (
    limaye_entities_with_candidates
    .select('table_id', 'wikidata_id', 'candidates', 'i', 'j', 'mention')
    .where(F.col('candidates').isNotNull())
    .rdd
    .map(lambda row: [row['table_id'], row['i'], row['j'], row['mention'], row['wikidata_id'], row['candidates']])
    .filter(lambda rec: rec[4] in (cand[0] for cand in rec[5]))
    .map(lambda rec: (rec[0], [rec[1:]]))
    .reduceByKey(add)
    .join(limaye_tables)
    .flatMap(build_for_own)
    .collect()
)

In [212]:
limaye_local[70][8]

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
 [],
 [],
 [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22],
 [],
 [23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47],
 [],
 [48, 49],
 [],
 [50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69],
 [],
 [],
 [70, 71],
 [],
 [],
 [72],
 [],
 [],
 [73, 74, 75, 76, 77],
 [],
 [78, 79, 80, 81, 82, 83],
 [],
 [],
 [84, 85, 86, 87, 88, 89],
 [],
 [],
 [90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139],
 [],
 [140, 141, 142, 143, 144, 145],
 [],
 []]

In [134]:
def get_labels_and_candidate(tables):
    """Flatten one build_for_own chunk into per-cell evaluation records.

    ``tables`` is a chunk list: index 0 is the table id, 5 the entity cells
    (``[[row, col], text]``), 6 the chunk's candidate list, 7 the labels and
    8 the per-cell candidate indices. Cells with no candidates are dropped.

    Returns a list of ``((table_id, row, col), [label, candidate_ids, candidates])``.
    """
    return [
        ((tables[0], cell[0][0], cell[0][1]), [tables[7][pos], tables[8][pos], tables[6]])
        for pos, cell in enumerate(tables[5])
        if len(tables[8][pos]) > 0
    ]

In [213]:
data_dir = "/srv/samba/group_workspace_1/deng.595/workspace/table_transformer/data/wikitable_entity/v2/"
with open(data_dir+'limaye.table_entity_linking.json','w') as f:
    json.dump(limaye_local,f)

In [198]:
# Lookup-only baseline counts on Limaye.
#   limaye_all_predicted: mentions for which the lookup returned >= 1 candidate
#   limaye_TP:            ... whose first candidate is the gold Wikidata id
#   limaye_P:             all gold mentions
#   limaye_best_TP:       ... whose gold id appears anywhere in the candidate list
limaye_all_predicted = limaye_entities_with_candidates.where(F.size('candidates') >= 1).count()
limaye_TP = (
    limaye_entities_with_candidates
    .where(F.size('candidates') >= 1)
    .rdd
    .map(lambda row: int(row['wikidata_id'] in [cand[0] for cand in row['candidates'][:1]]))
    .sum()
)
limaye_P = limaye_entities_with_candidates.count()
limaye_best_TP = (
    limaye_entities_with_candidates
    .where(F.size('candidates') >= 1)
    .rdd
    .map(lambda row: int(row['wikidata_id'] in [cand[0] for cand in row['candidates']]))
    .sum()
)

In [199]:
precision = limaye_TP/limaye_all_predicted
recall = limaye_TP/limaye_P
f1 = 2*precision*recall/(precision+recall)
print(f1,precision,recall)

0.7047294418406477 0.7831439393939394 0.6405886909372579


In [215]:
# Load the model's Limaye entity-linking predictions from disk; this rebinds
# `test_results`, which earlier held the T2D predictions.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load result files produced by our own runs.
import pickle
with open("/srv/samba/group_workspace_1/deng.595/workspace/table_transformer/data/wikitable_entity/v2/limaye_entity_linking_results_0.pkl","rb") as f:
    test_results = pickle.load(f)

In [216]:
# Count how many Limaye cells the combined model/lookup decision gets right.
# NOTE(review): the T2D version of this cell joins (x[2][i], x[3][i]) per cell,
# but here only x[2][i] is joined, while both get_tp definitions in this
# notebook index the model side as result[1][0] and result[1][1]. In addition,
# the later get_tp definition returns a tuple, which .sum() cannot add up.
# Confirm which get_tp and which test_results layout this cell was run with
# before trusting `our_tp`.
our_tp = sc.parallelize(limaye_local).flatMap(get_labels_and_candidate)\
    .join(sc.parallelize(test_results).flatMap(lambda x:[((x[0],z[0],z[1]),x[2][i]) for i,z in enumerate(x[1])]))\
    .map(get_tp).sum()

In [219]:
precision = our_tp/limaye_all_predicted
recall = our_tp/limaye_P
f1 = 2*precision*recall/(precision+recall)
print(f1,precision,recall)

0.6721346399659139 0.7469223484848485 0.6109604957397367
