# WikiTableQuestions

### Uploading Dataset to Hugging Face


In [None]:
import os

from huggingface_hub import HfApi

# Upload the local wiki_table_questions/ folder to the
# TableSenseAI/WikiTableQuestions dataset repository.
upload_settings = {
    "folder_path": "wiki_table_questions/",
    "repo_id": "TableSenseAI/WikiTableQuestions",
    "repo_type": "dataset",
}

api = HfApi()
api.upload_large_folder(**upload_settings)


### Preprocessing Dataset

In [None]:
import csv
import json
import os

input_tsv = 'WikiTableQuestions-data/data/pristine-unseen-tables.tsv'         # Replace with your input file
output_json = 'examples-test.json'

# Convert every TSV row into one example record. The "context" value is treated
# as a path: its extension is dropped so that the .csv / .html / .tsv variants
# of the same table can be derived from a common base.
data = []

# newline='' hands raw line endings to the csv module, and QUOTE_NONE keeps a
# literal double quote inside a field from being parsed as CSV quoting (which
# would strip the quote or merge the following fields/rows into one value).
with open(input_tsv, 'r', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
    for idx, row in enumerate(reader):
        base_path, _ = os.path.splitext(row["context"])
        data.append({
            "id": f"nu-{idx}",  # id is generated from the row position
            "utterance": row["utterance"],
            "target_value": row["targetValue"],
            "context": {
                "csv": base_path + ".csv",
                "html": base_path + ".html",
                "tsv": base_path + ".tsv",
            },
        })

# The file is written as UTF-8, so non-ASCII text is stored as-is instead of
# being turned into \uXXXX escapes.
with open(output_json, 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print(f"Wrote {len(data)} entries to {output_json}")


# FreeformTableQA

### Preprocessing Dataset

In [3]:
import csv
import html
import json
import os


input_dev = 'freeform-table-qa/raw-data/fetaQA-v1_test.jsonl'
output_json = 'freeform-table-qa/examples/examples-test.json'

# NOTE: these three directories are not used by this cell; the tables are
# written to the csv-test / tsv-test / html-test directories defined below.
csv_dir = 'freeform-table-qa/examples/tables/csv'
tsv_dir = 'freeform-table-qa/examples/tables/tsv'
html_dir = 'freeform-table-qa/examples/tables/html'

# Root of the dataset folder and the per-format table directories, the latter
# relative to that root (these relative paths are what ends up in the JSON).
base_dir = 'freeform-table-qa'
table_dirs = {
    "csv": "examples/tables/csv-test",
    "tsv": "examples/tables/tsv-test",
    "html": "examples/tables/html-test",
}

# Create the output directories up front so the open(..., 'w') calls below do
# not fail on a fresh checkout.
for rel_dir in table_dirs.values():
    os.makedirs(os.path.join(base_dir, rel_dir), exist_ok=True)

# Load the JSONL input: one JSON object per line.
# dict_keys(['feta_id', 'table_source_json', 'page_wikipedia_url', 'table_page_title', 'table_section_title', 'table_array', 'highlighted_cell_ids', 'question', 'answer'])
data = []
with open(input_dev, 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        if line.strip():
            data.append(json.loads(line))


result = []
for item in data:
    # Single quotes inside the f-string keep this valid on Python < 3.12.
    example_id = f"feta_{item['feta_id']}"

    # Output paths relative to the dataset root; stored in the example record.
    csv_path = f"{table_dirs['csv']}/{example_id}.csv"
    tsv_path = f"{table_dirs['tsv']}/{example_id}.tsv"
    html_path = f"{table_dirs['html']}/{example_id}.html"

    # Paths used for writing the files (relative to the working directory).
    csv_full = os.path.join(base_dir, csv_path)
    tsv_full = os.path.join(base_dir, tsv_path)
    html_full = os.path.join(base_dir, html_path)

    # Write CSV
    with open(csv_full, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(item["table_array"])

    # Write TSV
    with open(tsv_full, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f, delimiter='\t').writerows(item["table_array"])

    # Write HTML. Cell text is escaped so that '<', '>' and '&' in the data
    # cannot break the table markup.
    with open(html_full, 'w', encoding='utf-8') as f:
        f.write('<table border="1">\n')
        for row in item["table_array"]:
            cells = ''.join(
                f'<td>{html.escape(str(cell), quote=False)}</td>' for cell in row
            )
            f.write(f'  <tr>{cells}</tr>\n')
        f.write('</table>\n')

    # Each context key points at the file of the matching format.
    result.append({
        "id": example_id,
        "utterance": item["question"],
        "target_value": item["answer"],
        "context": {
            "csv": csv_path,
            "html": html_path,
            "tsv": tsv_path,
        },
    })

# Save all example records to a single JSON file.
with open(output_json, "w", encoding="utf-8") as f:
    json.dump(result, f, indent=2, ensure_ascii=False)




### Uploading Dataset to Hugging Face

In [None]:
import os

from huggingface_hub import HfApi, login

# Do not paste the access token into the notebook. It is read from the
# HF_TOKEN environment variable; when that is unset or empty, token=None is
# passed so that login() prompts for the token instead.
login(token=os.environ.get("HF_TOKEN") or None)

# Upload the local freeform-table-qa/ folder to the
# TableSenseAI/FreeformTableQA dataset repository.
api = HfApi()
api.upload_large_folder(
    folder_path="freeform-table-qa/",
    repo_id="TableSenseAI/FreeformTableQA",
    repo_type="dataset",
)

  from .autonotebook import tqdm as notebook_tqdm
Recovering from metadata files: 100%|██████████| 30998/30998 [00:04<00:00, 6423.39it/s]





---------- 2025-06-23 11:55:53 (0:00:00) ----------
Files:   hashed 8/30998 (20.1M/66.2M) | pre-uploaded: 1/1 (14.3M/66.2M) (+30995 unsure) | committed: 3/30998 (19.6M/66.2M) | ignored: 0
Workers: hashing: 9 | get upload mode: 1 | pre-uploading: 0 | committing: 0 | waiting: 0
---------------------------------------------------
[K[F
---------- 2025-06-23 11:56:53 (0:01:00) ----------
Files:   hashed 30998/30998 (66.2M/66.2M) | pre-uploaded: 1/1 (14.3M/66.2M) | committed: 678/30998 (25.0M/66.2M) | ignored: 0
Workers: hashing: 0 | get upload mode: 0 | pre-uploading: 0 | committing: 1 | waiting: 9
---------------------------------------------------
[K[F                       
---------- 2025-06-23 11:57:53 (0:02:00) ----------
Files:   hashed 30998/30998 (66.2M/66.2M) | pre-uploaded: 1/1 (14.3M/66.2M) | committed: 2553/30998 (26.7M/66.2M) | ignored: 0
Workers: hashing: 0 | get upload mode: 0 | pre-uploading: 0 | committing: 1 | waiting: 9
--------------------------------------------