In [1]:
"""
Author: Chao Yan (ceewye@outlook.com)
preprocess.ipynb (c) 2022
Desc: transform data
Created:  2022-08-05T07:32:46.964Z
"""

import pandas as pd
import json
import os
from valid import *

In [2]:


def csv_to_dict(filename):
    """Read a tab-separated cell dump and rebuild one cell matrix per table.

    The file must provide the columns ``FileName``, ``TableId``,
    ``TableRowId`` and ``CellText``. Rows are grouped by file, then table,
    then table row; each table row becomes one list of cell strings.

    Args:
        filename: Path to the tab-separated file.

    Returns:
        dict mapping ``'<FileName>-table_<TableId>'`` to
        ``{'table_array': mat}``, where ``mat`` is a list of rows and each
        row is a list of cell strings. Tables rejected by ``is_mat`` are
        left out.
    """
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
    # `on_bad_lines='skip'` is the supported spelling of the same behaviour.
    demo_df = pd.read_csv(filename, sep='\t', on_bad_lines='skip')

    tables = {}
    for fn, df_fn in demo_df.groupby('FileName'):
        for tid, df_fn_tid in df_fn.groupby('TableId'):
            # The last character of every cell text is dropped -- presumably
            # a trailing marker in the raw export (TODO confirm).
            mat = [
                [s[:-1] for s in df_fn_tid_rid['CellText']]
                for _, df_fn_tid_rid in df_fn_tid.groupby('TableRowId')
            ]

            # `is_mat` comes from the star import of `valid`; tables it
            # rejects are skipped.
            if not is_mat(mat):
                continue

            tables[f'{fn}-table_{tid}'] = {'table_array': mat}
    return tables
        

In [6]:
splits = ['train', 'valid', 'test']

# `tables.json` caches the parsed tables of all splits: reuse it when it is
# already on disk, otherwise parse every split's CSV once and write the cache.
fname = 'tables.json'
if os.path.exists(fname):
    with open(fname) as f:
        tables = json.load(f)
else:
    tables = {}
    for s in splits:
        tables.update(csv_to_dict(f'./raw/csv/{s}.csv'))
    with open(fname, 'w') as f:
        json.dump(tables, f)

In [4]:
# Load the label file of the train split and display it for a visual check
# (columns shown in the output: file_name, table_id, detail_label).
train_labs = pd.read_csv('raw/label/train.csv', sep=',')
train_labs

Unnamed: 0,file_name,table_id,detail_label
0,Sample-Document-Retention-Policy-12.31.15.docx,0,Layout
1,Sample-Document-Retention-Policy-12.31.15.docx,1,SimpleHeadersSimpleContent
2,RenUnitOutSS8-2014.docx,0,ComplexListSimpleContent
3,micc_contr_guide.docx,0,SimpleHeadersSimpleContent
4,INF2.docx,1,Layout
...,...,...,...
9132,2+VfM+NSMC+Background+documents.docx,10,SimpleHeadersSimpleContent
9133,2+VfM+NSMC+Background+documents.docx,11,SimpleHeadersSimpleContent
9134,2+VfM+NSMC+Background+documents.docx,0,SimpleHeadersSimpleContent
9135,nomination-examiners-research.docx,0,ComplexFormComplexContent


In [5]:
# Fine-grained table-type label names; `get_id` looks up a row's
# `detail_label` among them.
a = [
    'SimpleFormComplexContent',
    'SimpleFormSimpleContent',
    'SimpleHeadersComplexContent',
    'SimpleHeadersSimpleContent',
    'SimpleListComplexContent',
    'SimpleListSimpleContent',
    'ComplexFormComplexContent',
    'ComplexFormSimpleContent',
    'ComplexListComplexContent',
    'ComplexListSimpleContent',
    'InternalHeadersSimpleContent',
    'InternalHeadersComplexContent',
    'Layout',
    'Other',
]

# Binary target: the two "SimpleHeaders*" labels map to 1, every other label
# to 0. (A distinct id per label would instead be dict(zip(a, range(len(a)))).)
_header_labels = ("SimpleHeadersSimpleContent", "SimpleHeadersComplexContent")
ids = {label: int(label in _header_labels) for label in a}
ids

{'SimpleFormComplexContent': 0,
 'SimpleFormSimpleContent': 0,
 'SimpleHeadersComplexContent': 1,
 'SimpleHeadersSimpleContent': 1,
 'SimpleListComplexContent': 0,
 'SimpleListSimpleContent': 0,
 'ComplexFormComplexContent': 0,
 'ComplexFormSimpleContent': 0,
 'ComplexListComplexContent': 0,
 'ComplexListSimpleContent': 0,
 'InternalHeadersSimpleContent': 0,
 'InternalHeadersComplexContent': 0,
 'Layout': 0,
 'Other': 0}

In [7]:
def get_id(row):
    """Turn one label row into a ``(table key, label id)`` pair.

    Args:
        row: Object exposing ``file_name``, ``table_id`` and
            ``detail_label`` as attributes (a DataFrame row when called
            through ``DataFrame.apply(..., axis=1)``).

    Returns:
        Tuple ``('<file_name>-table_<table_id>', ids[detail_label])``.

    Raises:
        KeyError: If ``row.detail_label`` is not a key of the module-level
            ``ids`` mapping.
    """
    table_key = f'{row.file_name}-table_{row.table_id}'
    return table_key, ids[row.detail_label]
def parse_labels(df):
    """Build a ``{table key: label id}`` dict from a label DataFrame.

    Each row is converted with ``get_id``; when several rows produce the same
    key, the last one wins (plain ``dict`` semantics).

    Args:
        df: DataFrame with the columns read by ``get_id``
            (``file_name``, ``table_id``, ``detail_label``).

    Returns:
        dict mapping ``'<file_name>-table_<table_id>'`` to its label id;
        an empty dict when ``df`` has no rows.
    """
    # On a zero-row frame `apply(axis=1)` hands back an empty DataFrame
    # instead of a Series of tuples; iterating it yields column names, which
    # `dict()` rejects with a ValueError. Short-circuit that case.
    if len(df) == 0:
        return {}
    return dict(list(df.apply(get_id, axis=1)))

In [8]:
# Rows of the relevance file: [split index, 0, table key, label id].
qtrel = []
tab_to_split_id = {}
# Number of labelled tables that have no entry in `tables`.
out_cnt = 0
for i, s in enumerate(splits):
    # One single-line query file per split: "<split index>\t<split name>".
    with open(f'{s}_query.txt', 'w') as f:
        f.write(f'{i}\t{s}\n')

    tabs = parse_labels(pd.read_csv(f'raw/label/{s}.csv', sep=','))
    print(f'#{s} = {len(tabs)}')

    for tab, lab in tabs.items():
        if tab in tables:
            qtrel.append([i, 0, tab, lab])
        else:
            out_cnt += 1
print(f'#out = {out_cnt}')

#train = 9137
#valid = 1322
#test = 2747
#out = 6850


In [9]:
# Sanity check: number of relevance rows kept vs. number of parsed tables.
len(qtrel), len(tables)

(6356, 6356)

In [10]:
# Tabulate the collected rows; no column names are given, so the columns are
# positional (0: split index, 1: constant 0, 2: table key, 3: label id).
qtrel_df = pd.DataFrame.from_records(qtrel)

In [11]:
# Write the rows as tab-separated text without header or index column.
qtrel_df.to_csv('qtrels.txt', sep='\t', header=False, index=False)

In [12]:
# Distinct label ids present in the last column (column 3).
sorted(qtrel_df[3].unique())

[0, 1]