# WikiTableQuestions

### Uploading the Dataset to Hugging Face


In [None]:
import os
from dotenv import load_dotenv

from huggingface_hub import HfApi

# Populate the process environment from the local .env file so that the
# Hub token does not have to be hard-coded in the notebook.
load_dotenv()

# Read the access token; this is None when HF_TOKEN is not defined.
hf_token = os.environ.get("HF_TOKEN")

# Push the whole local tabmwp/ folder to the dataset repository on the Hub.
api = HfApi(token=hf_token)
api.upload_large_folder(
    repo_id="TableSenseAI/TabMWP",
    repo_type="dataset",
    folder_path="tabmwp/",
)


### Preprocessing the WikiTableQuestions Dataset

In [None]:
import csv
import json
import os

# Convert a WikiTableQuestions split (TSV with the columns "utterance",
# "context" and "targetValue") into a JSON list of example records.
input_tsv = 'WikiTableQuestions-data/data/pristine-unseen-tables.tsv'         # Replace with your input file
output_json = 'examples-test.json'

data = []

# newline='' lets the csv module do its own line-ending handling, as the csv
# documentation requires for file objects passed to a reader.
# quoting=csv.QUOTE_NONE makes the reader treat double quotes as ordinary
# characters, so questions or answers that contain '"' are kept verbatim
# instead of being stripped or merged by CSV quote processing.
with open(input_tsv, 'r', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
    for idx, row in enumerate(reader):
        # Drop the extension of the referenced table file so that the same
        # stem can be reused for the csv/html/tsv variants.
        base_path, _ = os.path.splitext(row["context"])
        entry = {
            # Ids are regenerated from the row position in the file.
            "id": f"nu-{idx}",
            "utterance": row["utterance"],
            "target_value": row["targetValue"],
            "context": {
                "csv": base_path + ".csv",
                "html": base_path + ".html",
                "tsv": base_path + ".tsv"
            }
        }
        data.append(entry)

with open(output_json, 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2)

print(f"Wrote {len(data)} entries to {output_json}")


### TabMWP Preprocessing

In [None]:
import json
import pandas as pd
import os

# Convert TabMWP problem files into (a) one csv/html/tsv table file per
# problem, spread over numbered "csv-<n>/" directories, and (b) a JSON index
# of examples that point at those table files.

# Settings that were used for the train split:
# input_json = 'C:/Users/Marco/workspace/TabMWP/PromptPG/data/tabmwp/problems_train.json'         # Replace with your input file
# output_json = 'tabmwp/examples-train.json'
input_json1 = 'C:/Users/Marco/workspace/TabMWP/PromptPG/data/tabmwp/problems_dev.json'         # Replace with your input file
input_json2 = 'C:/Users/Marco/workspace/TabMWP/PromptPG/data/tabmwp/problems_test.json'
output_json = 'tabmwp/examples-test.json'

data = []
directory_id = 7          # number of the first "csv-<n>/" directory to fill
table_id = 0              # running table number inside the current directory
location = 'tabmwp/'      # root folder that receives the table files
tables_per_directory = 3333  # a new directory is started after this many tables

# with open (input_json, 'r') as f:
#     orig = json.load(f)
# Read the inputs explicitly as UTF-8 (the platform default encoding is not
# UTF-8 everywhere, e.g. on Windows) and close them before the conversion.
with open(input_json1, 'r', encoding='utf-8') as f1, \
        open(input_json2, 'r', encoding='utf-8') as f2:
    in1 = json.load(f1)
    in2 = json.load(f2)

# Merge both splits; if a key occurs in both files the second file wins.
orig = {**in1, **in2}

for key, value in orig.items():
    # Roll over to the next directory once the current one is full.
    if table_id == tables_per_directory:
        directory_id += 1
        table_id = 0

    basepath = f"csv-{directory_id}/{table_id}"
    dirpath = os.path.dirname(basepath)
    if table_id == 0:
        # First table of a directory: make sure the directory exists.
        os.makedirs(location + dirpath, exist_ok=True)

    # Write the table in all three formats under the same stem.
    df = pd.DataFrame(value["table_for_pd"])
    df.to_csv(location + basepath + ".csv", index=False)
    df.to_html(location + basepath + ".html", index=False)
    df.to_csv(location + basepath + ".tsv", sep="\t", index=False)

    # Paths in "context" are stored relative to `location`.
    entry = {
        "id": f"nu-{key}",
        "utterance": value["question"],
        "target_value": value["answer"],
        "context": {
            "csv": basepath + ".csv",
            "html": basepath + ".html",
            "tsv": basepath + ".tsv"
        }
    }
    data.append(entry)
    table_id += 1

with open(output_json, 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2)

print(f"Wrote {len(data)} entries to {output_json}")