In [None]:
!pip install ujson

Collecting ujson
  Downloading ujson-5.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.9/53.9 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ujson
Successfully installed ujson-5.8.0


In [None]:
import random
import os
import gzip
import bz2
import csv
import ujson as json
import glob
import math

In [None]:
def write_file(out_file,mkdir=True,binary=False):
  """Open ``out_file`` for writing and return the open file object.

  The opener is chosen from the file name: ``.gz`` uses gzip, ``.bz2`` uses
  bz2, anything else uses the built-in ``open``. Text mode always encodes
  as UTF-8.

  Args:
    out_file: Destination path (``str``).
    mkdir: When true, create the parent directory first if the path has one.
    binary: When true, open in binary mode; otherwise open in text mode.

  Returns:
    The opened, writable file object. The caller is responsible for closing it.
  """
  if mkdir:
    parent = os.path.dirname(out_file)
    if parent:
      os.makedirs(parent,exist_ok=True)

  # Pick the opener once from the extension; the mode is decided afterwards.
  if out_file.endswith('.gz'):
    opener = gzip.open
  elif out_file.endswith('.bz2'):
    opener = bz2.open
  else:
    opener = open

  if binary:
    return opener(out_file,'wb')

  # gzip/bz2 default to binary, so text mode must be requested with 't'.
  text_mode = 'w' if opener is open else 'wt'
  return opener(out_file,text_mode,encoding='utf-8')




In [None]:
def read_file(in_file,binary=False,errors=None):
  """Open ``in_file`` for reading and return the open file object.

  The opener is chosen from the file name: ``.gz`` uses gzip, ``.bz2`` uses
  bz2, anything else uses the built-in ``open``.

  Args:
    in_file: Path to read (``str``).
    binary: When true, open in binary mode; otherwise decode as UTF-8 text.
    errors: Decoding error handler passed to the opener in text mode. It is
      not used in binary mode.

  Returns:
    The opened, readable file object. The caller is responsible for closing it.
  """
  # Pick the opener once from the extension; the mode is decided afterwards.
  if in_file.endswith('.gz'):
    opener = gzip.open
  elif in_file.endswith('.bz2'):
    opener = bz2.open
  else:
    opener = open

  if binary:
    return opener(in_file,'rb')

  # gzip/bz2 default to binary, so text mode must be requested with 't'.
  text_mode = 'r' if opener is open else 'rt'
  return opener(in_file,text_mode,encoding='utf-8',errors=errors)


In [None]:
# JSONL input files. pre_process/BM25 parse each line as a JSON object that
# maps a table id to a list of rows, with the header row first.
path1 = "data_table_mcq.jsonl"
path2 = "data_wtq.jsonl"
path3 = "wiki_sql_data_lookup.jsonl"
# NOTE: path4 is defined here but is not part of BM25's default `paths`.
path4 = "wiki_sql_data_agg.jsonl"


In [None]:
! pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [None]:
import re
from rank_bm25 import BM25Okapi

In [None]:
def pre_process(path):
  """Tokenize every table stored in a JSONL file.

  Each non-blank line is parsed as a JSON object mapping a table id to a list
  of rows, the first row being the header. All cells of a table are tokenized
  with ``preprocess_query`` (the same tokenizer applied to queries, so index
  terms and query terms always match) and concatenated into one flat list.

  Args:
    path: File to read; opened through ``read_file``, so ``.gz``/``.bz2``
      files are supported.

  Returns:
    Dict mapping table id -> flat list of lower-cased tokens, header tokens
    first, followed by the cell tokens in row order. If an id occurs more
    than once in the file, the last occurrence wins.
  """
  di = {}
  with read_file(path) as fp:
    for line in fp:
      if not line.strip():
        # Tolerate blank/trailing lines instead of failing in json.loads.
        continue
      for table_id,table in json.loads(line).items():
        tokens = []
        # The header is simply the first row, so walking the rows in order
        # yields header tokens first and then the data-cell tokens.
        for row in table:
          for cell in row:
            if cell is None:
              continue
            # JSON numbers/booleans are indexed by their text form rather
            # than crashing the regex-based tokenizer.
            text = cell if isinstance(cell,str) else str(cell)
            tokens.extend(preprocess_query(text))
        di[table_id] = tokens
  return di


In [None]:
def preprocess_query(query):
  """Lower-case ``query`` and split it into tokens, treating punctuation as spaces.

  Args:
    query: Text to tokenize (``str``).

  Returns:
    List of lower-cased tokens; empty when the text holds only punctuation
    and/or whitespace.
  """
  # Every ASCII punctuation character. The hyphen is escaped so it is matched
  # as a literal: unescaped, `,-\.` is parsed as a character *range* that only
  # happens to contain '-'.
  punc_pattern = r"[!\"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]"
  res = re.sub(punc_pattern,' ',query)
  # Raw string: "\s" in a normal string literal is an invalid escape sequence.
  res = re.sub(r"\s+",' ',res)
  tokenized_query = res.lower().split()
  return tokenized_query

In [None]:
def ranking_docs(query,di):
  """Score every document in ``di`` against ``query`` with BM25Okapi.

  Args:
    query: Raw query text; tokenized with ``preprocess_query``.
    di: Dict mapping document id -> list of tokens.

  Returns:
    Dict mapping document id -> BM25 score, ordered from highest to lowest
    score. Empty when ``di`` is empty.
  """
  if not di:
    # rank_bm25 averages document length over the corpus size, so building
    # an index from an empty corpus raises ZeroDivisionError.
    return {}

  tokenized_query = preprocess_query(query)
  # Materialise ids and documents together so scores[i] lines up with
  # doc_ids[i], and the index gets a real list rather than a dict view.
  doc_ids = list(di.keys())
  bm25 = BM25Okapi([di[doc_id] for doc_id in doc_ids])
  scores = bm25.get_scores(tokenized_query)
  ranked_documents = dict(sorted(zip(doc_ids, scores), key=lambda x: x[1], reverse=True))
  return ranked_documents



In [None]:
def BM25(query,top=300,paths=[path1,path2,path3]):
  """Return the tables that best match ``query`` across several JSONL files.

  Every file is tokenized with ``pre_process`` and scored with
  ``ranking_docs``. The per-file scores are pooled, the ``top`` highest-scoring
  table ids are kept, and the files are read again to fetch the raw tables
  for those ids.

  NOTE(review): each file gets its own BM25 index, so the pooled scores come
  from corpora with different statistics. Confirm that comparing them
  directly is intended.

  Args:
    query: Free-text query.
    top: Maximum number of tables to return.
    paths: JSONL files to search. Any number of files is accepted. If a table
      id appears in more than one file, the later file wins.

  Returns:
    Dict mapping table id -> raw table (list of rows, header row first).
    Entries follow file order, not score order.
  """
  # Pool the per-file rankings; a later file overwrites a duplicate id.
  result = {}
  for path in paths:
    result.update(ranking_docs(query,pre_process(path)))

  final_result = dict(sorted(result.items(), key=lambda x: x[1], reverse=True)[:top])

  # Second pass: pull the untokenized tables for the selected ids.
  tables = {}
  for path in paths:
    with read_file(path) as fp:
      for line in fp:
        if not line.strip():
          continue
        for k,v in json.loads(line).items():
          if k in final_result:
            tables[k] = v

  return tables

In [None]:
# Example free-text query used for the retrieval run below.
query = "what is the temperature?"

In [None]:
# Retrieve the best-matching tables using BM25's defaults (top=300, path1..path3).
# Each call re-reads and re-tokenizes every file.
tables = BM25(query)

In [None]:
# Display the retrieved tables: id -> list of rows, header row first.
tables

{'auto-18': [['Term',
   'Type',
   'POS (most common)',
   'WN2.0 Name',
   'WordNet SenseKey',
   'WN2.0 Synset',
   'WordNet gloss',
   'WordNet example usage'],
  ['ability',
   '?',
   'n',
   'ability_n1',
   'ability%1:07:00::',
   '104904666',
   'the quality of being able to perform; a quality that permits or facilitates achievement or accomplishment',
   ''],
  ['able',
   '?',
   'adj',
   'able_a1',
   'able%3:00:00::',
   '300001740',
   "(usually followed by `to') having the necessary means or skill or know-how or authority to do something;",
   'able to swim; she was able to program her computer; we were at last able to buy a car; able to get a grant for the project'],
  ['about',
   '?',
   'r',
   'about_r1',
   'about%4:02:00::',
   '400006660',
   '(of quantities) imprecise but fairly close to correct;',
   "lasted approximately an hour; in just about a minute; he's about 30 years old; I've had about all I can stand; we meet about once a month; some forty people came

In [None]:
# Number of tables returned; capped by BM25's `top` argument (default 300).
len(tables)

300