In [None]:
!pip install ujson

Collecting ujson
  Downloading ujson-5.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.9/53.9 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ujson
Successfully installed ujson-5.8.0


In [None]:
import random
import os
import gzip
import bz2
import csv
import ujson as json
import glob
import math

In [None]:
def write_file(out_file,mkdir=True,binary=False):
  """Open ``out_file`` for writing and return the file object.

  The opener is chosen from the file name: ``gzip.open`` for names ending in
  ``.gz``, ``bz2.open`` for names ending in ``.bz2`` and the builtin ``open``
  for everything else.

  Args:
    out_file: Path (str) of the file to create or truncate.
    mkdir: When true, create the parent directory of ``out_file`` first.
      Nothing is created when the path has no directory component.
    binary: When true, open in binary mode (``'wb'``); otherwise open in
      text mode with UTF-8 encoding.

  Returns:
    A writable file object. The caller is responsible for closing it.
  """
  if mkdir:
    parent = os.path.dirname(out_file)
    if parent:
      os.makedirs(parent,exist_ok=True)

  # Pick the compression-aware opener once, then choose the mode.
  if out_file.endswith('.gz'):
    compressed_open = gzip.open
  elif out_file.endswith('.bz2'):
    compressed_open = bz2.open
  else:
    compressed_open = None

  if compressed_open is None:
    if binary:
      return open(out_file,'wb')
    return open(out_file,'w',encoding='utf-8')

  if binary:
    return compressed_open(out_file,'wb')
  return compressed_open(out_file,'wt',encoding='utf-8')




In [None]:
def read_file(in_file,binary=False,errors=None):
  """Open ``in_file`` for reading and return the file object.

  The opener is chosen from the file name: ``gzip.open`` for names ending in
  ``.gz``, ``bz2.open`` for names ending in ``.bz2`` and the builtin ``open``
  for everything else.

  Args:
    in_file: Path (str) of the file to read.
    binary: When true, open in binary mode (``'rb'``); otherwise open in
      text mode with UTF-8 decoding.
    errors: Decoding error policy forwarded to the opener in text mode
      (ignored in binary mode).

  Returns:
    A readable file object. The caller is responsible for closing it.
  """
  # Pick the compression-aware opener once, then choose the mode.
  if in_file.endswith('.gz'):
    compressed_open = gzip.open
  elif in_file.endswith('.bz2'):
    compressed_open = bz2.open
  else:
    compressed_open = None

  if binary:
    if compressed_open is None:
      return open(in_file,'rb')
    return compressed_open(in_file,'rb')

  if compressed_open is None:
    return open(in_file,'r',encoding='utf-8',errors=errors)
  return compressed_open(in_file,'rt',encoding='utf-8',errors=errors)


In [None]:
def shuffle_blocks(it,*,block_size=20000,rand=random):
  """Yield the items of ``it`` in a locally shuffled order.

  Items are collected in a buffer. Each time the buffer holds ``block_size``
  items it is shuffled and ``block_size//2`` items are emitted from its end;
  the rest stay in the buffer and are mixed with later items. When ``it`` is
  exhausted, the remaining buffer is shuffled once more and emitted.

  Args:
    it: Any iterable.
    block_size: Buffer size that triggers a shuffle; must be at least 4.
    rand: Object providing ``shuffle(list)`` (the ``random`` module by
      default, or e.g. a seeded ``random.Random``).

  Yields:
    Every item of ``it`` exactly once.
  """
  assert block_size>=4
  emit_per_round = block_size//2
  buffer = []
  for item in it:
    buffer.append(item)
    if len(buffer) < block_size:
      continue
    rand.shuffle(buffer)
    emitted = 0
    while emitted < emit_per_round:
      yield buffer.pop()
      emitted += 1

  # Flush whatever is left once the input is exhausted.
  rand.shuffle(buffer)
  yield from buffer

In [None]:
def expand_file(input,file_pattern='*',files=None):
  """Expand paths and directories into a sorted list of file paths.

  Args:
    input: Either a string or an iterable of path strings. A string is split
      on ``':'`` so several paths can be passed at once.
    file_pattern: Glob pattern applied (recursively) to the files found
      under each directory in ``input``.
    files: Optional collection of paths to leave out of the result.

  Returns:
    A sorted list of paths. Entries of ``input`` that are not directories
    are passed through unchanged (they are not checked for existence);
    directories are replaced by the matching non-directory paths below them,
    excluding anything already listed in ``input`` or ``files``.

  Side effects:
    Prints every entry of ``input`` as it is processed.
  """
  if isinstance(input, str):
    # split(':') on a string without ':' yields a one-element list.
    input = input.split(':')

  all_inputs = []
  if files is None:
    files = []

  for i in input:
    print(i)
    if i in files:
      continue
    if os.path.isdir(i):
      # Escape the directory part: without this, a directory whose name
      # contains glob metacharacters ('[', '*', '?') would be read as a
      # pattern and silently match nothing (or the wrong files).
      pattern = glob.escape(i)+"/**/"+file_pattern
      sub_file = [
        f for f in glob.glob(pattern,recursive=True)
        if not os.path.isdir(f) and f not in input and f not in files
      ]
      all_inputs.extend(sub_file)
    else:
      all_inputs.append(i)

  all_inputs.sort()
  return all_inputs



In [None]:
def j_lines(input,files=None,limit=0,report_every=100000,*,errors=None,shuffled=None):
  """Yield the lines of the JSONL files found under ``input``.

  ``input`` is expanded with ``expand_file`` using the pattern ``*.jsonl*``
  (so compressed variants match too); paths ending in ``.lock`` are dropped.
  All remaining arguments are forwarded to ``read_lines`` unchanged.
  """
  jsonl_paths = []
  for path in expand_file(input,"*.jsonl*",files):
    if path.endswith('.lock'):
      continue
    jsonl_paths.append(path)

  return read_lines(
    jsonl_paths,
    limit=limit,
    report_every=report_every,
    errors=errors,
    shuffled=shuffled,
  )


In [None]:
def read_lines(input,limit=0,report_every=100000,*,errors=None,shuffled=None):
  """Yield the lines of one or more (optionally compressed) text files.

  Args:
    input: A path, a ':'-separated string of paths, or an iterable of
      paths; it is expanded with ``expand_file``.
    limit: When positive, stop after this many lines have been yielded
      (honoured in both sequential and shuffled mode).
    report_every: Accepted for interface compatibility; not used here.
    errors: Decoding error policy forwarded to ``read_file``.
    shuffled: Falsy for sequential reading. Otherwise lines from several
      files are interleaved at random; pass a ``random.Random`` instance to
      control the randomness, any other truthy value uses a fresh
      ``random.Random()``.

  Yields:
    Lines exactly as returned by the underlying file objects.
  """
  count =0
  input = expand_file(input)
  if shuffled:
    # isinstance (rather than an exact type check) keeps caller-supplied
    # subclasses of random.Random instead of silently replacing them.
    if not isinstance(shuffled, random.Random):
      shuffled = random.Random()

    # Files are processed in groups so that at most 32 are open at a time.
    open_blocks = int(math.ceil(len(input)/32.0))
    for open_i in range(open_blocks):
      open_files = []
      try:
        for path in input[open_i::open_blocks]:
          open_files.append(read_file(path,errors=errors))

        while len(open_files)>0:
          # Pick one of the still-open files and take its next line; the
          # order of lines within a single file is preserved.
          fx = shuffled.randint(0,len(open_files)-1)
          next_l = open_files[fx].readline()
          if next_l:
            yield next_l
            count +=1
            if 0<limit<= count:
              return
          else:
            open_files.pop(fx).close()
      finally:
        # Also runs when the consumer abandons the generator or a read
        # fails, so no handle of the current group is leaked.
        for fp in open_files:
          fp.close()

  else:
    for i in input:
      with read_file(i,errors=errors) as fp:
        for l in fp:
          yield l
          count +=1
          if 0<limit<= count:
            return



## WikiTableQuestions

In [None]:
!git clone https://github.com/ppasupat/WikiTableQuestions.git


Cloning into 'WikiTableQuestions'...
remote: Enumerating objects: 19153, done.[K
remote: Counting objects: 100% (14/14), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 19153 (delta 4), reused 10 (delta 3), pack-reused 19139[K
Receiving objects: 100% (19153/19153), 57.71 MiB | 11.38 MiB/s, done.
Resolving deltas: 100% (2100/2100), done.
Updating files: 100% (19914/19914), done.


In [None]:
import csv
import re

In [None]:
# Parse every file found under WikiTableQuestions/csv/<group>/ (no extension
# filter is applied) and index the parsed rows by "csv/<group>/<file name>".
csv_dir = os.path.join("WikiTableQuestions", 'csv')
tid2rows = dict()
# Loop variables deliberately avoid the names `dir` and `file`: at notebook
# scope they would overwrite the builtin `dir` for every later cell.
# Listings are sorted so the insertion order of tid2rows is reproducible.
for group_name in sorted(os.listdir(csv_dir)):
  group_dir = os.path.join(csv_dir,group_name)
  for file_name in sorted(os.listdir(group_dir)):
    with read_file(os.path.join(group_dir,file_name)) as cf:
      # The reader is configured for backslash escaping, not doubled quotes.
      table_rows = list(csv.reader(cf,doublequote=False,escapechar='\\'))
    tid2rows[f'csv/{group_name}/{file_name}'] = table_rows

In [None]:
# Preview a few parsed tables instead of echoing every table into the cell output.
{tid: tid2rows[tid] for tid in list(tid2rows)[:3]}

In [None]:
# Directory holding the question/answer TSV files that the processing loop reads.
data_dir = os.path.join("WikiTableQuestions", 'data')
id2split = {"ns-39":"train","ns-43":"train","ns-54":"train","ns-68":"train","ns-82":"train","ns-95":"train","ns-97":"train","ns-104":"train","ns-106":"train","ns-118":"train","ns-135":"train","ns-147":"train","ns-210":"train","ns-236":"train","ns-245":"train","ns-258":"train","ns-272":"train","ns-299":"train","ns-302":"train","ns-353":"train","ns-355":"train","ns-374":"train","ns-376":"train","ns-385":"train","ns-399":"train","ns-421":"train","ns-430":"train","ns-439":"train","ns-463":"train","ns-464":"train","ns-487":"train","ns-495":"train","ns-511":"train","ns-515":"train","ns-578":"train","ns-593":"train","ns-624":"train","ns-640":"train","ns-645":"train","ns-676":"train","ns-695":"train","ns-702":"train","ns-858":"train","ns-860":"train","ns-864":"train","ns-883":"train","ns-888":"train","ns-917":"train","ns-931":"train","ns-946":"train","ns-949":"train","ns-952":"train","ns-972":"train","ns-999":"train","ns-1014":"train","ns-1065":"train","ns-1072":"train","ns-1110":"train","ns-1168":"train","ns-1186":"train","ns-1202":"train","ns-1222":"train","ns-1243":"train","ns-1312":"train","ns-1348":"train","ns-1390":"train","ns-1412":"train","ns-1413":"train","ns-1435":"train","ns-1446":"train","ns-1483":"train","ns-1505":"train","ns-1569":"train","ns-1611":"train","ns-1612":"train","ns-1617":"train","ns-1622":"train","ns-1627":"train","ns-1648":"train","ns-1658":"train","ns-1675":"train","ns-1695":"train","ns-1700":"train","ns-1701":"train","ns-1704":"train","ns-1791":"train","ns-1811":"train","ns-1844":"train","ns-1849":"train","ns-1900":"train","ns-1922":"train","ns-1925":"train","ns-1931":"train","ns-1970":"train","ns-2038":"train","ns-2128":"train","ns-2142":"train","ns-2150":"train","ns-2167":"train","ns-2209":"train","ns-2218":"train","ns-2225":"train","ns-2284":"train","ns-2315":"train","ns-2341":"train","ns-2344":"train","ns-2380":"train","ns-2412":"train","ns-2431":"train","ns-2461":"train","ns-2495":"train","ns-2538":"train","ns-2543":"train","ns-2565":"trai
n","ns-2570":"train","ns-2577":"train","ns-2593":"train","ns-2603":"train","ns-2620":"train","ns-2634":"train","ns-2643":"train","ns-2664":"train","ns-2686":"train","ns-2701":"train","ns-2719":"train","ns-2730":"train","ns-2735":"train","ns-2742":"train","ns-2747":"train","ns-2758":"train","ns-2759":"train","ns-2807":"train","ns-2889":"train","ns-2933":"train","ns-2936":"train","ns-2963":"train","ns-2970":"train","ns-2991":"train","ns-3002":"train","ns-3027":"train","ns-3050":"train","ns-3052":"train","ns-3062":"train","ns-3125":"train","ns-3198":"train","ns-3228":"train","ns-3231":"train","ns-3252":"train","ns-3255":"train","ns-3294":"train","ns-3299":"train","ns-3309":"train","ns-3310":"train","ns-3318":"train","ns-3319":"train","ns-3368":"train","ns-3370":"train","ns-3377":"train","ns-3411":"train","ns-3415":"train","ns-3422":"train","ns-3441":"train","ns-3444":"train","ns-3464":"train","ns-3470":"train","ns-3536":"train","nt-7":"train","nt-54":"train","nt-62":"train","nt-95":"train","nt-143":"train","nt-168":"train","nt-172":"train","nt-176":"train","nt-179":"train","nt-223":"train","nt-228":"train","nt-241":"train","nt-246":"train","nt-254":"train","nt-271":"train","nt-290":"train","nt-311":"train","nt-318":"train","nt-339":"train","nt-343":"train","nt-365":"train","nt-370":"train","nt-372":"train","nt-389":"train","nt-427":"train","nt-445":"train","nt-452":"train","nt-473":"train","nt-484":"train","nt-508":"train","nt-510":"train","nt-517":"train","nt-534":"train","nt-538":"train","nt-553":"train","nt-575":"train","nt-580":"train","nt-583":"train","nt-641":"train","nt-665":"train","nt-667":"train","nt-676":"train","nt-711":"train","nt-714":"train","nt-715":"train","nt-718":"train","nt-721":"train","nt-724":"train","nt-727":"train","nt-750":"train","nt-761":"train","nt-798":"train","nt-803":"train","nt-915":"train","nt-931":"train","nt-948":"train","nt-960":"train","nt-991":"train","nt-1015":"train","nt-1026":"train","nt-1039":"train","nt-1066":"train","nt-1076
":"train","nt-1081":"train","nt-1097":"train","nt-1109":"train","nt-1151":"train","nt-1177":"train","nt-1193":"train","nt-1218":"train","nt-1247":"train","nt-1261":"train","nt-1300":"train","nt-1339":"train","nt-1383":"train","nt-1385":"train","nt-1387":"train","nt-1417":"train","nt-1428":"train","nt-1450":"train","nt-1474":"train","nt-1500":"train","nt-1520":"train","nt-1530":"train","nt-1541":"train","nt-1557":"train","nt-1562":"train","nt-1563":"train","nt-1570":"train","nt-1574":"train","nt-1579":"train","nt-1580":"train","nt-1603":"train","nt-1619":"train","nt-1620":"train","nt-1657":"train","nt-1668":"train","nt-1691":"train","nt-1742":"train","nt-1749":"train","nt-1767":"train","nt-1770":"train","nt-1790":"train","nt-1809":"train","nt-1812":"train","nt-1819":"train","nt-1820":"train","nt-1835":"train","nt-1847":"train","nt-1872":"train","nt-1884":"train","nt-1920":"train","nt-1929":"train","nt-1947":"train","nt-1992":"train","nt-1999":"train","nt-2002":"train","nt-2004":"train","nt-2022":"train","nt-2038":"train","nt-2040":"train","nt-2054":"train","nt-2063":"train","nt-2076":"train","nt-2091":"train","nt-2110":"train","nt-2122":"train","nt-2124":"train","nt-2170":"train","nt-2184":"train","nt-2208":"train","nt-2239":"train","nt-2285":"train","nt-2286":"train","nt-2293":"train","nt-2314":"train","nt-2416":"train","nt-2424":"train","nt-2434":"train","nt-2445":"train","nt-2450":"train","nt-2454":"train","nt-2462":"train","nt-2463":"train","nt-2472":"train","nt-2516":"train","nt-2529":"train","nt-2552":"train","nt-2577":"train","nt-2578":"train","nt-2579":"train","nt-2668":"train","nt-2705":"train","nt-2729":"train","nt-2761":"train","nt-2777":"train","nt-2788":"train","nt-2842":"train","nt-2843":"train","nt-2855":"train","nt-2866":"train","nt-2874":"train","nt-2898":"train","nt-2932":"train","nt-2933":"train","nt-2950":"train","nt-2957":"train","nt-3010":"train","nt-3018":"train","nt-3057":"train","nt-3074":"train","nt-3081":"train","nt-3131":"train","nt-3136":
"train","nt-3140":"train","nt-3142":"train","nt-3175":"train","nt-3178":"train","nt-3209":"train","nt-3224":"train","nt-3225":"train","nt-3243":"train","nt-3264":"train","nt-3304":"train","nt-3323":"train","nt-3336":"train","nt-3347":"train","nt-3351":"train","nt-3357":"train","nt-3370":"train","nt-3390":"train","nt-3422":"train","nt-3424":"train","nt-3437":"train","nt-3523":"train","nt-3551":"train","nt-3560":"train","nt-3571":"train","nt-3573":"train","nt-3615":"train","nt-3627":"train","nt-3647":"train","nt-3659":"train","nt-3685":"train","nt-3709":"train","nt-3752":"train","nt-3760":"train","nt-3763":"train","nt-3766":"train","nt-3772":"train","nt-3773":"train","nt-3775":"train","nt-3781":"train","nt-3826":"train","nt-3844":"train","nt-3856":"train","nt-3889":"train","nt-3963":"train","nt-3977":"train","nt-3983":"train","nt-4031":"train","nt-4086":"train","nt-4116":"train","nt-4133":"train","nt-4156":"train","nt-4178":"train","nt-4274":"train","nt-4290":"train","nt-4292":"train","nt-4301":"train","nt-4306":"train","nt-4321":"train","nt-4333":"train","nt-4372":"train","nt-4409":"train","nt-4417":"train","nt-4420":"train","nt-4433":"train","nt-4438":"train","nt-4463":"train","nt-4486":"train","nt-4510":"train","nt-4521":"train","nt-4546":"train","nt-4596":"train","nt-4620":"train","nt-4646":"train","nt-4664":"train","nt-4672":"train","nt-4703":"train","nt-4714":"train","nt-4715":"train","nt-4728":"train","nt-4741":"train","nt-4750":"train","nt-4759":"train","nt-4798":"train","nt-4799":"train","nt-4802":"train","nt-4816":"train","nt-4862":"train","nt-4874":"train","nt-4877":"train","nt-4929":"train","nt-4946":"train","nt-4973":"train","nt-4974":"train","nt-4998":"train","nt-4999":"train","nt-5005":"train","nt-5026":"train","nt-5063":"train","nt-5138":"train","nt-5144":"train","nt-5228":"train","nt-5229":"train","nt-5252":"train","nt-5285":"train","nt-5294":"train","nt-5322":"train","nt-5323":"train","nt-5331":"train","nt-5333":"train","nt-5337":"train","nt-5339":"t
rain","nt-5356":"train","nt-5364":"train","nt-5411":"train","nt-5412":"train","nt-5423":"train","nt-5476":"train","nt-5495":"train","nt-5503":"train","nt-5504":"train","nt-5539":"train","nt-5543":"train","nt-5551":"train","nt-5560":"train","nt-5579":"train","nt-5618":"train","nt-5621":"train","nt-5625":"train","nt-5634":"train","nt-5679":"train","nt-5716":"train","nt-5730":"train","nt-5733":"train","nt-5740":"train","nt-5767":"train","nt-5799":"train","nt-5815":"train","nt-5816":"train","nt-5899":"train","nt-5907":"train","nt-5916":"train","nt-5933":"train","nt-5949":"train","nt-5954":"train","nt-5968":"train","nt-5969":"train","nt-5984":"train","nt-6093":"train","nt-6103":"train","nt-6105":"train","nt-6115":"train","nt-6143":"train","nt-6184":"train","nt-6196":"train","nt-6218":"train","nt-6225":"train","nt-6247":"train","nt-6251":"train","nt-6253":"train","nt-6283":"train","nt-6291":"train","nt-6306":"train","nt-6329":"train","nt-6332":"train","nt-6446":"train","nt-6450":"train","nt-6453":"train","nt-6458":"train","nt-6529":"train","nt-6568":"train","nt-6592":"train","nt-6593":"train","nt-6627":"train","nt-6629":"train","nt-6694":"train","nt-6729":"train","nt-6750":"train","nt-6796":"train","nt-6817":"train","nt-6829":"train","nt-6830":"train","nt-6837":"train","nt-6847":"train","nt-6876":"train","nt-6886":"train","nt-6893":"train","nt-6904":"train","nt-6914":"train","nt-6936":"train","nt-6941":"train","nt-6981":"train","nt-7039":"train","nt-7077":"train","nt-7092":"train","nt-7095":"train","nt-7099":"train","nt-7129":"train","nt-7131":"train","nt-7133":"train","nt-7154":"train","nt-7197":"train","nt-7213":"train","nt-7243":"train","nt-7290":"train","nt-7292":"train","nt-7318":"train","nt-7332":"train","nt-7333":"train","nt-7346":"train","nt-7418":"train","nt-7422":"train","nt-7426":"train","nt-7489":"train","nt-7523":"train","nt-7528":"train","nt-7534":"train","nt-7537":"train","nt-7562":"train","nt-7591":"train","nt-7612":"train","nt-7631":"train","nt-7644":"tra
in","nt-7647":"train","nt-7648":"train","nt-7680":"train","nt-7686":"train","nt-7687":"train","nt-7702":"train","nt-7711":"train","nt-7712":"train","nt-7720":"train","nt-7737":"train","nt-7755":"train","nt-7763":"train","nt-7767":"train","nt-7794":"train","nt-7818":"train","nt-7823":"train","nt-7863":"train","nt-7867":"train","nt-7879":"train","nt-7940":"train","nt-7979":"train","nt-7992":"train","nt-8028":"train","nt-8069":"train","nt-8088":"train","nt-8133":"train","nt-8152":"train","nt-8169":"train","nt-8212":"train","nt-8219":"train","nt-8239":"train","nt-8245":"train","nt-8255":"train","nt-8287":"train","nt-8366":"train","nt-8415":"train","nt-8417":"train","nt-8462":"train","nt-8495":"train","nt-8521":"train","nt-8535":"train","nt-8536":"train","nt-8542":"train","nt-8554":"train","nt-8578":"train","nt-8596":"train","nt-8604":"train","nt-8669":"train","nt-8683":"train","nt-8693":"train","nt-8717":"train","nt-8721":"train","nt-8741":"train","nt-8748":"train","nt-8782":"train","nt-8800":"train","nt-8828":"train","nt-8886":"train","nt-8902":"train","nt-8920":"train","nt-8921":"train","nt-9001":"train","nt-9003":"train","nt-9022":"train","nt-9027":"train","nt-9029":"train","nt-9050":"train","nt-9062":"train","nt-9109":"train","nt-9188":"train","nt-9190":"train","nt-9231":"train","nt-9244":"train","nt-9267":"train","nt-9300":"train","nt-9303":"train","nt-9316":"train","nt-9327":"train","nt-9335":"train","nt-9355":"train","nt-9359":"train","nt-9360":"train","nt-9411":"train","nt-9436":"train","nt-9468":"train","nt-9480":"train","nt-9494":"train","nt-9526":"train","nt-9529":"train","nt-9551":"train","nt-9612":"train","nt-9642":"train","nt-9660":"train","nt-9672":"train","nt-9717":"train","nt-9723":"train","nt-9727":"train","nt-9728":"train","nt-9745":"train","nt-9771":"train","nt-9782":"train","nt-9790":"train","nt-9827":"train","nt-9835":"train","nt-9847":"train","nt-9850":"train","nt-9889":"train","nt-9892":"train","nt-9903":"train","nt-9932":"train","nt-9933":"train
","nt-9934":"train","nt-9973":"train","nt-9990":"train","nt-10023":"train","nt-10026":"train","nt-10027":"train","nt-10043":"train","nt-10062":"train","nt-10085":"train","nt-10098":"train","nt-10119":"train","nt-10154":"train","nt-10157":"train","nt-10166":"train","nt-10178":"train","nt-10211":"train","nt-10246":"train","nt-10247":"train","nt-10267":"train","nt-10275":"train","nt-10290":"train","nt-10303":"train","nt-10380":"train","nt-10407":"train","nt-10420":"train","nt-10424":"train","nt-10578":"train","nt-10613":"train","nt-10615":"train","nt-10770":"train","nt-10793":"train","nt-10800":"train","nt-10809":"train","nt-10820":"train","nt-10833":"train","nt-10847":"train","nt-10883":"train","nt-10916":"train","nt-10968":"train","nt-11029":"train","nt-11094":"train","nt-11109":"train","nt-11159":"train","nt-11172":"train","nt-11191":"train","nt-11245":"train","nt-11293":"train","nt-11298":"train","nt-11318":"train","nt-11371":"train","nt-11397":"train","nt-11425":"train","nt-11431":"train","nt-11460":"train","nt-11480":"train","nt-11492":"train","nt-11506":"train","nt-11508":"train","nt-11516":"train","nt-11525":"train","nt-11535":"train","nt-11540":"train","nt-11585":"train","nt-11593":"train","nt-11594":"train","nt-11607":"train","nt-11664":"train","nt-11669":"train","nt-11682":"train","nt-11740":"train","nt-11830":"train","nt-11890":"train","nt-11925":"train","nt-11963":"train","nt-11965":"train","nt-11998":"train","nt-12001":"train","nt-12060":"train","nt-12065":"train","nt-12097":"train","nt-12105":"train","nt-12135":"train","nt-12140":"train","nt-12148":"train","nt-12164":"train","nt-12169":"train","nt-12190":"train","nt-12217":"train","nt-12226":"train","nt-12253":"train","nt-12260":"train","nt-12287":"train","nt-12298":"train","nt-12299":"train","nt-12309":"train","nt-12330":"train","nt-12339":"train","nt-12362":"train","nt-12386":"train","nt-12388":"train","nt-12410":"train","nt-12415":"train","nt-12452":"train","nt-12467":"train","nt-12478":"train","nt-12
480":"train","nt-12515":"train","nt-12532":"train","nt-12565":"train","nt-12575":"train","nt-12591":"train","nt-12599":"train","nt-12649":"train","nt-12661":"train","nt-12697":"train","nt-12709":"train","nt-12742":"train","nt-12749":"train","nt-12772":"train","nt-12856":"train","nt-12872":"train","nt-12878":"train","nt-12885":"train","nt-12909":"train","nt-12918":"train","nt-12959":"train","nt-12977":"train","nt-12983":"train","nt-13019":"train","nt-13107":"train","nt-13118":"train","nt-13137":"train","nt-13185":"train","nt-13198":"train","nt-13222":"train","nt-13241":"train","nt-13243":"train","nt-13248":"train","nt-13284":"train","nt-13323":"train","nt-13336":"train","nt-13351":"train","nt-13376":"train","nt-13377":"train","nt-13390":"train","nt-13425":"train","nt-13470":"train","nt-13472":"train","nt-13549":"train","nt-13575":"train","nt-13599":"train","nt-13607":"train","nt-13621":"train","nt-13638":"train","nt-13664":"train","nt-13680":"train","nt-13699":"train","nt-13726":"train","nt-13740":"train","nt-13744":"train","nt-13751":"train","nt-13756":"train","nt-13785":"train","nt-13786":"train","nt-13802":"train","nt-13822":"train","nt-13825":"train","nt-13890":"train","nt-13907":"train","nt-13921":"train","nt-13929":"train","nt-13941":"train","nt-14030":"train","nt-14085":"train","nt-14106":"train","nt-14108":"train","nt-14109":"train","nt-14127":"train","ns-140":"dev","ns-557":"dev","ns-763":"dev","ns-1278":"dev","ns-1807":"dev","ns-2195":"dev","ns-2250":"dev","ns-2484":"dev","ns-2852":"dev","ns-2899":"dev","ns-3324":"dev","ns-3336":"dev","ns-3505":"dev","nu-52":"dev","nu-69":"dev","nu-129":"dev","nu-173":"dev","nu-214":"dev","nu-270":"dev","nu-292":"dev","nu-301":"dev","nu-319":"dev","nu-361":"dev","nu-416":"dev","nu-473":"dev","nu-497":"dev","nu-573":"dev","nu-604":"dev","nu-692":"dev","nu-724":"dev","nu-758":"dev","nu-795":"dev","nu-820":"dev","nu-862":"dev","nu-877":"dev","nu-894":"dev","nu-960":"dev","nu-963":"dev","nu-1003":"dev","nu-1019":"dev","nu-1066"
:"dev","nu-1067":"dev","nu-1095":"dev","nu-1210":"dev","nu-1247":"dev","nu-1320":"dev","nu-1507":"dev","nu-1533":"dev","nu-1563":"dev","nu-1595":"dev","nu-1597":"dev","nu-1682":"dev","nu-1829":"dev","nu-1917":"dev","nu-1964":"dev","nu-1968":"dev","nu-2124":"dev","nu-2159":"dev","nu-2182":"dev","nu-2354":"dev","nu-2368":"dev","nu-2372":"dev","nu-2453":"dev","nu-2470":"dev","nu-2483":"dev","nu-2577":"dev","nu-2747":"dev","nu-2849":"dev","nu-2926":"dev","nu-2939":"dev","nu-3028":"dev","nu-3064":"dev","nu-3089":"dev","nu-3114":"dev","nu-3140":"dev","nu-3192":"dev","nu-3212":"dev","nu-3239":"dev","nu-3383":"dev","nu-3475":"dev","nu-3510":"dev","nu-3595":"dev","nu-3770":"dev","nu-3788":"dev","nu-3841":"dev","nu-3902":"dev","nu-3944":"dev","nu-4063":"dev","nu-4094":"dev","nu-4187":"dev","nu-4203":"dev","nu-4257":"dev","nu-4287":"dev","nt-166":"dev","nt-645":"dev","nt-770":"dev","nt-884":"dev","nt-969":"dev","nt-1142":"dev","nt-1959":"dev","nt-2538":"dev","nt-2548":"dev","nt-3414":"dev","nt-3507":"dev","nt-5031":"dev","nt-5958":"dev","nt-6476":"dev","nt-6625":"dev","nt-6761":"dev","nt-7547":"dev","nt-7678":"dev","nt-7851":"dev","nt-8084":"dev","nt-8114":"dev","nt-8372":"dev","nt-8515":"dev","nt-8599":"dev","nt-8720":"dev","nt-9619":"dev","nt-10161":"dev","nt-10370":"dev","nt-10985":"dev","nt-11432":"dev","nt-13221":"dev","ns-138":"test","ns-608":"test","ns-1516":"test","ns-1580":"test","ns-1768":"test","ns-2040":"test","ns-2116":"test","ns-2324":"test","ns-2674":"test","ns-3210":"test","ns-3311":"test","ns-3462":"test","nu-5":"test","nu-55":"test","nu-79":"test","nu-97":"test","nu-99":"test","nu-269":"test","nu-271":"test","nu-279":"test","nu-283":"test","nu-288":"test","nu-297":"test","nu-325":"test","nu-336":"test","nu-372":"test","nu-427":"test","nu-475":"test","nu-484":"test","nu-492":"test","nu-494":"test","nu-594":"test","nu-633":"test","nu-643":"test","nu-650":"test","nu-654":"test","nu-701":"test","nu-716":"test","nu-798":"test","nu-819":"test","nu-850":"test","nu-8
54":"test","nu-1002":"test","nu-1013":"test","nu-1021":"test","nu-1023":"test","nu-1102":"test","nu-1122":"test","nu-1131":"test","nu-1135":"test","nu-1211":"test","nu-1225":"test","nu-1231":"test","nu-1232":"test","nu-1308":"test","nu-1312":"test","nu-1377":"test","nu-1439":"test","nu-1456":"test","nu-1457":"test","nu-1464":"test","nu-1468":"test","nu-1501":"test","nu-1530":"test","nu-1544":"test","nu-1555":"test","nu-1572":"test","nu-1583":"test","nu-1622":"test","nu-1665":"test","nu-1685":"test","nu-1688":"test","nu-1693":"test","nu-1741":"test","nu-1742":"test","nu-1746":"test","nu-1760":"test","nu-1762":"test","nu-1832":"test","nu-1859":"test","nu-1875":"test","nu-1884":"test","nu-1906":"test","nu-1914":"test","nu-1947":"test","nu-1954":"test","nu-2008":"test","nu-2068":"test","nu-2078":"test","nu-2082":"test","nu-2107":"test","nu-2114":"test","nu-2147":"test","nu-2210":"test","nu-2247":"test","nu-2266":"test","nu-2366":"test","nu-2370":"test","nu-2401":"test","nu-2463":"test","nu-2488":"test","nu-2489":"test","nu-2511":"test","nu-2531":"test","nu-2565":"test","nu-2589":"test","nu-2625":"test","nu-2626":"test","nu-2719":"test","nu-2738":"test","nu-2743":"test","nu-2748":"test","nu-2836":"test","nu-2844":"test","nu-2923":"test","nu-2925":"test","nu-2928":"test","nu-2953":"test","nu-2980":"test","nu-3029":"test","nu-3049":"test","nu-3107":"test","nu-3115":"test","nu-3141":"test","nu-3187":"test","nu-3205":"test","nu-3224":"test","nu-3348":"test","nu-3389":"test","nu-3499":"test","nu-3511":"test","nu-3515":"test","nu-3522":"test","nu-3542":"test","nu-3561":"test","nu-3576":"test","nu-3610":"test","nu-3646":"test","nu-3692":"test","nu-3718":"test","nu-3721":"test","nu-3726":"test","nu-3743":"test","nu-3763":"test","nu-3769":"test","nu-3797":"test","nu-3833":"test","nu-3875":"test","nu-3903":"test","nu-3952":"test","nu-3978":"test","nu-3995":"test","nu-4015":"test","nu-4037":"test","nu-4067":"test","nu-4147":"test","nu-4160":"test","nu-4182":"test","nu-4185":"test",
"nu-4194":"test","nu-4210":"test","nu-4228":"test","nu-4251":"test","nu-4253":"test","nu-4284":"test","nu-4342":"test","nt-103":"test","nt-245":"test","nt-669":"test","nt-910":"test","nt-1035":"test","nt-1055":"test","nt-1312":"test","nt-1357":"test","nt-1485":"test","nt-1585":"test","nt-1886":"test","nt-1937":"test","nt-1938":"test","nt-1996":"test","nt-2487":"test","nt-2492":"test","nt-2532":"test","nt-2654":"test","nt-2742":"test","nt-2808":"test","nt-3072":"test","nt-3941":"test","nt-4220":"test","nt-4300":"test","nt-4940":"test","nt-5035":"test","nt-5038":"test","nt-5064":"test","nt-5357":"test","nt-5653":"test","nt-6309":"test","nt-6485":"test","nt-6553":"test","nt-6624":"test","nt-6657":"test","nt-6681":"test","nt-6771":"test","nt-7203":"test","nt-7895":"test","nt-7932":"test","nt-7967":"test","nt-7999":"test","nt-8087":"test","nt-8249":"test","nt-8289":"test","nt-8815":"test","nt-8928":"test","nt-9167":"test","nt-9567":"test","nt-9608":"test","nt-9632":"test","nt-9680":"test","nt-9823":"test","nt-10164":"test","nt-10414":"test","nt-10525":"test","nt-10595":"test","nt-11170":"test","nt-11462":"test","nt-11564":"test","nt-11796":"test","nt-11865":"test","nt-12224":"test","nt-12329":"test","nt-12472":"test","nt-12587":"test","nt-12677":"test","nt-12801":"test","nt-13407":"test","nt-13459":"test","nt-13588":"test","nt-13653":"test","nt-13675":"test","nt-13770":"test","nt-13995":"test"}

# One gzip-compressed JSONL writer per dataset split, keyed by split name.
# The handles stay open until the processing cell closes them.
splits = dict()
for split in ('train', 'dev', 'test'):
  lookup_path = os.path.join("./", f'{split}_lookup.jsonl.gz')
  splits[split] = write_file(lookup_path)

In [None]:
def normalize_data(data):
  """Return a punctuation- and whitespace-insensitive form of ``data``.

  Two hard-coded spellings are rewritten first ('Muneca brava' and
  'Costa Rican'). Then every run of non-word characters is replaced by a
  single space and the result is stripped. If that leaves nothing (the text
  consisted only of non-word characters), the text is instead returned with
  only its whitespace runs collapsed, so it does not normalize to ''.

  Args:
    data: The string to normalize.

  Returns:
    The normalized string.
  """
  overrides = {
    'Muneca brava': 'Muñeca brava',
    'Costa Rican': 'Costa Rica',
  }
  data = overrides.get(data, data)

  whitespace_run = re.compile(r'\s+', flags=re.MULTILINE)
  non_word_run = re.compile(r'\W+', flags=re.MULTILINE)

  words_only = whitespace_run.sub(' ', non_word_run.sub(' ', data)).strip()
  if words_only:
    return words_only

  # Nothing but punctuation/whitespace: keep the symbols, tidy the spacing.
  return whitespace_run.sub(' ', data).strip()

In [None]:
# table_id -> all parsed rows of that table (first row included); filled by the
# processing loop for every question whose id appears in id2split.
data_wtq = {}

In [None]:
# The TSV fields encode newline, pipe and backslash as the two-character
# escapes \n, \p and \\ . They are decoded in one left-to-right pass: chained
# str.replace calls would mis-read an escaped backslash that is followed by a
# literal "n" or "p" (backslash, backslash, n would become backslash + newline).
_WTQ_ESCAPES = {'n': '\n', 'p': '|', '\\': '\\'}
_WTQ_ESCAPE_RE = re.compile(r'\\([np\\])')

def _unescape_wtq(text):
  # Decode the backslash escapes of a single TSV field.
  return _WTQ_ESCAPE_RE.sub(lambda m: _WTQ_ESCAPES[m.group(1)], text)


# For every question whose id is listed in id2split, find the table column(s)
# containing the answer and, when exactly one column matches, write a JSON
# record to the output file of the question's split.
try:
  for in_file in ['training.tsv', 'pristine-seen-tables.tsv','pristine-unseen-tables.tsv']:
    for ind,l in enumerate(read_lines(os.path.join(data_dir,in_file))):
      parts = l.strip().split('\t')
      assert len(parts) ==4

      # Skip the first line of each file (presumably the column header).
      if ind==0:
        continue

      # Named example_id so the builtin `id` is not overwritten at notebook scope.
      example_id = parts[0]
      if example_id not in id2split:
        continue

      split = id2split[example_id]
      query = _unescape_wtq(parts[1])
      table_id = parts[2]
      # Answers are '|'-separated; split first, then decode each item so an
      # escaped pipe inside an answer is not treated as a separator.
      answers = [_unescape_wtq(p) for p in parts[3].split('|')]
      norm_answers = [normalize_data(ans) for ans in answers]
      all_rows = tid2rows[table_id]
      header = all_rows[0]
      rows = all_rows[1:]

      data_wtq[table_id] = all_rows

      target_columns = set()
      matched_answers = set()

      # Pass 1: cells whose normalized text equals a normalized answer.
      for row in all_rows:
        for cind,cell in enumerate(row):
          if normalize_data(cell) in norm_answers:
            target_columns.add(cind)
            matched_answers.add(cell)

      # Pass 2 (only if pass 1 found nothing): cells that contain an answer
      # which makes up at least 75% of the normalized cell text.
      if len(target_columns)==0:
        for row in all_rows:
          for cind,cell in enumerate(row):
            ncell = normalize_data(cell)
            if any(ans in ncell and len(ans)/len(ncell) >= 0.75 for ans in norm_answers):
              target_columns.add(cind)
              matched_answers.add(cell)

      file_out = splits[split]
      # Questions with no matching column, or with matches in several
      # columns, are dropped.
      if len(target_columns) != 1:
        continue

      data = dict()
      data['id'] = example_id
      data['table_id'] = table_id
      data['question'] = query
      data['header'] = header
      data['target_column'] = list(target_columns)[0]
      answers = sorted(matched_answers)
      data['answers'] = answers
      data['rows'] = rows

      # NOTE: echoes every written record to the cell output.
      print(data)
      file_out.write(json.dumps(data)+'\n')
finally:
  # Close (and thereby flush) the output files even if processing fails.
  for f in splits.values():
    f.close()







WikiTableQuestions/data/training.tsv
{'id': 'nt-7', 'table_id': 'csv/204-csv/341.csv', 'question': 'which is deeper, lake tuz or lake palas tuzla?', 'header': ['Name in English', 'Name in Turkish', 'Area (km2)', 'Depth', 'Location (districts and/or provinces)'], 'target_column': 0, 'answers': ['Lake Palas Tuzla'], 'rows': [['Lake Van', 'Van Gölü', '3755 km2', '451 m', 'Van, Bitlis'], ['Lake Tuz', 'Tuz Gölü', '1500 km2', '2 m', 'Aksaray, Ankara, Konya'], ['Lake Beyşehir', 'Beyşehir Gölü', '656 km2', '10 m', 'Beyşehir in Konya, Isparta'], ['Lake Eğirdir', 'Eğirdir Gölü', '482 km2', '', 'Isparta'], ['Lake İznik', 'İznik Gölü', '308 km2', '', 'İznik in Bursa, Yalova'], ['Lake Burdur', 'Burdur Gölü', '200 km2', '', 'Burdur, Isparta'], ['Lake Manyas', 'Manyas Gölü', '166 km2', '', 'Balıkesir'], ['Lake Acıgöl', 'Acıgöl', '153 km2', '', 'Denizli, Afyonkarahisar'], ['Lake Uluabat', 'Uluabat Gölü', '134 km2', '1–2 m', 'Bursa'], ['Lake Çıldır', 'Çıldır Gölü', '115 km2', '', 'Ardahan, Kars'], ['La