In [35]:
import sys
sys.path.append('../mixsc')

from mixsc.dataset.wikitq import WikiTQDataset
from mixsc.dataset.tabfact import TabFactDataset
from mixsc.dataset.tabmwp import TabMWPDataset

In [39]:
# Instantiate the dataset wrappers with their default constructor arguments.
wtq = WikiTQDataset()
tabfact = TabFactDataset()
# TabMWP is imported above but intentionally left uninstantiated here.
# tabmwp = TabMWPDataset()

In [43]:
# Inspect a single TabFact example; other indices tried earlier: 34, 36, 50.
index = 100
el = tabfact.get_item(index)

# Show the statement and its gold label, one per line.
print(el['question'], el['answer'], sep='\n')

# Leave the table as the cell's last expression so it is rendered richly.
el['table']

Answer whether the following statement is true or false: there be a 74.64 point difference between the highest score (186.92) and the lowest score (112.28)
true


Unnamed: 0,rank,name,nation,sp + fs,points,places
0,1,linda fratianne,united states,1,186.92,11
1,2,anett pötzsch,east germany,3,184.36,18
2,3,emi watanabe,japan,4,180.52,31
3,4,dagmar lurz,west germany,6,179.96,33
4,5,denise biellmann,switzerland,2,177.28,49
5,6,lisa - marie allen,united states,5,176.68,54
6,7,claudia kristofics - binder,austria,7,175.44,63
7,8,susanna driano,italy,9,173.46,70
8,9,carola weißenberg,east germany,11,170.54,88
9,10,kristiina wegelius,finland,15,169.26,98


In [13]:
# Ordinal word -> integer rank, for 'first' (1) through 'twentieth' (20).
# Built from an ordered tuple so each rank is simply its 1-based position.
ORDINAL_HASHMAP = {
    word: rank
    for rank, word in enumerate(
        (
            'first', 'second', 'third', 'fourth', 'fifth',
            'sixth', 'seventh', 'eighth', 'ninth', 'tenth',
            'eleventh', 'twelfth', 'thirteenth', 'fourteenth', 'fifteenth',
            'sixteenth', 'seventeenth', 'eighteenth', 'nineteenth', 'twentieth',
        ),
        start=1,
    )
}

# Symbol / unit abbreviation -> spelled-out English word.
# Written as ordered pairs; the resulting dict keeps exactly this insertion order.
SYMBOL_HASHMAP = dict([
    ('%', 'percent'),
    ('$', 'dollar'),
    ('°', 'degree'),
    ('°C', 'degree celsius'),
    ('°F', 'degree fahrenheit'),
    ('ft', 'feet'),
    ('lb', 'pound'),
    ('oz', 'ounce'),
    ('gal', 'gallon'),
    ('in', 'inch'),
    ('mi', 'mile'),
    ('mph', 'mile per hour'),
    ('km', 'kilometer'),
])

In [14]:
import math
import re
from datetime import datetime

import inflect # for ordinal conversion
import pandas as pd
import pint # for unit conversion
import yaml
from babel.dates import format_date # for date formatting
from dateutil import parser as dparser
from num2words import num2words # for number-word conversion
from quantulum3 import parser as qparser # for unit conversion
from word2number import w2n # for word-number conversion

class TableFormatter:
    """Rewrite the surface form of every cell in a table.

    The formatter works on a private copy of the DataFrame it is given and
    offers one converter per format family (dates, number words, ordinals,
    symbols, physical units, scientific notation). Every converter is
    best-effort: a value it does not recognise is returned unchanged.

    Converters that take a ``forward`` flag run the forward direction when the
    flag is truthy and the reverse direction when it is falsy.
    """

    def __init__(self, df: pd.DataFrame, locale='en_US', symbol_map=None, ordinal_words=None):
        """Store a copy of ``df`` and set up the conversion helpers.

        Args:
            df: Table to transform; it is copied, so the caller's frame is
                never modified.
            locale: Locale identifier handed to ``babel.dates.format_date``.
            symbol_map: Optional ``{symbol: word}`` mapping. Defaults to the
                module-level ``SYMBOL_HASHMAP``.
            ordinal_words: Optional ``{ordinal word: int}`` mapping. Defaults
                to the module-level ``ORDINAL_HASHMAP`` so that the reverse
                ordinal conversion covers 'first' through 'twentieth'.
        """
        self.df = df.copy()
        self.locale = locale
        self.inflect_engine = inflect.engine()
        self.ureg = pint.UnitRegistry()

        # Copies, so later edits to the shared module constants do not leak in.
        self.symbol_map = dict(SYMBOL_HASHMAP if symbol_map is None else symbol_map)
        self.ORDINAL_WORDS = dict(ORDINAL_HASHMAP if ordinal_words is None else ordinal_words)

    @staticmethod
    def _is_finite_number(value):
        """Return True for real ints and finite floats (bools, NaN and inf excluded)."""
        if isinstance(value, bool):
            return False
        if isinstance(value, int):
            return True
        return isinstance(value, float) and math.isfinite(value)

    @staticmethod
    def _token_pattern(token):
        """Build a regex for a literal token that is not embedded in a longer word.

        A letter-boundary lookaround is added only on a side where the token
        itself starts or ends with a letter, so '100%' and '5ft' still match
        while the 'in' inside 'linda' does not.
        """
        pattern = re.escape(token)
        if token[0].isalpha():
            pattern = r'(?<![^\W\d_])' + pattern
        if token[-1].isalpha():
            pattern = pattern + r'(?![^\W\d_])'
        return pattern

    def _symbol_replacer(self, forward):
        """Return ``(compiled regex or None, lookup dict)`` for one direction.

        The result is cached per direction on the instance. Tokens are tried
        longest first so that '°C' wins over '°' and 'mile per hour' over 'mile'.
        """
        cache = self.__dict__.setdefault('_symbol_cache', {})
        direction = bool(forward)
        if direction not in cache:
            lookup = {}
            for symbol, word in self.symbol_map.items():
                source, target = (symbol, word) if direction else (word, symbol)
                if source:  # an empty token would match at every position
                    lookup.setdefault(source, target)
            regex = None
            if lookup:
                tokens = sorted(lookup, key=len, reverse=True)
                regex = re.compile('|'.join(self._token_pattern(t) for t in tokens))
            cache[direction] = (regex, lookup)
        return cache[direction]

    def date_conversion(self, value, date_format):
        """Re-render a date string with ``date_format`` (a babel date pattern).

        Only strings that spell out year, month and day are converted. dateutil
        fills missing fields from a default date, so the string is parsed
        against two different defaults; if the results disagree, some field was
        invented (e.g. '5', '2021', '10:30') and the value is left alone.

        Returns the formatted date string, or ``value`` unchanged when
        ``date_format`` is falsy, ``value`` is not a string, or parsing or
        formatting fails.
        """
        if not date_format or not isinstance(value, str):
            return value
        try:
            parsed = dparser.parse(value, fuzzy=False, default=datetime(2000, 1, 1))
            probe = dparser.parse(value, fuzzy=False, default=datetime(2001, 2, 2))
            if parsed.date() != probe.date():
                return value
            return format_date(parsed, format=date_format, locale=self.locale)
        except Exception:
            # Best-effort: anything that is not a cleanly parseable date stays as is.
            return value

    def num_word_conversion(self, value, forward):
        """Convert between numbers and English number words.

        forward: int/float -> words via ``num2words``. Bools and non-finite
            floats (NaN, inf) are returned unchanged.
        reverse: string -> number via ``w2n.word_to_num``; a leading 'minus '
            negates the result.

        Returns ``value`` unchanged when it cannot be converted.
        """
        if forward:
            if self._is_finite_number(value):
                return num2words(value, lang='en')
            return value

        if not isinstance(value, str):
            return value
        text = value.lower().strip()
        negative = text.startswith('minus ')
        if negative:
            text = text[len('minus '):]
        try:
            number = w2n.word_to_num(text)
        except Exception:
            return value
        return -number if negative else number

    def ordinal_conversion(self, value, forward):
        """Convert between numeric ordinals and ordinal words.

        forward: '1st' -> 'first' (any digits followed by st/nd/rd/th).
        reverse: 'first' -> '1st', for the words listed in ``self.ORDINAL_WORDS``.

        Surrounding whitespace is ignored when matching. Returns ``value``
        unchanged when it is not a string or does not match.
        """
        if not isinstance(value, str):
            return value
        text = value.strip()
        if forward:
            match = re.match(r'^(\d+)(st|nd|rd|th)$', text)
            if match:
                return num2words(int(match.group(1)), to='ordinal', lang='en')
            return value

        rank = self.ORDINAL_WORDS.get(text.lower())
        if rank is not None:
            return num2words(rank, to='ordinal_num')
        return value

    def symbol_conversion(self, value, forward):
        """Swap symbols/abbreviations and their spelled-out words.

        forward: symbol -> word ('100%' -> '100percent'); a cell that is just
            '-' becomes 'minus'.
        reverse: word -> symbol ('100percent' -> '100%'); a cell that is just
            'minus' becomes '-'.

        All replacements happen in a single pass, longest token first, and a
        token that starts or ends with a letter is only replaced when it is
        not part of a longer alphabetic word. Matching is case-sensitive.

        NOTE: a stand-alone token is still replaced, so the English
        preposition 'in' turns into 'inch' in the forward direction.
        """
        if not isinstance(value, str):
            return value
        if not forward and value.strip() == 'minus':
            return '-'

        regex, lookup = self._symbol_replacer(forward)
        if regex is not None:
            value = regex.sub(lambda m: lookup[m.group(0)], value)

        if forward and value.strip() == '-':
            value = 'minus'
        return value

    def unit_conversion(self, value, unit_map):
        """Convert the first physical quantity found in a string cell.

        ``unit_map`` maps a pint dimensionality to the target unit, e.g.::

            {
                '[length]': 'cm',
                '[mass]': 'kg'
            }

        The first quantity parsed by quantulum3 is converted to the target unit
        of the first dimensionality it satisfies. The whole cell is replaced by
        ``"<magnitude> <target unit>"``; any other text in the cell is dropped.

        Returns ``value`` unchanged when it is not a string, ``unit_map`` is
        empty, no quantity is found, no dimensionality matches, or conversion
        fails.
        """
        if not unit_map or not isinstance(value, str):
            return value
        try:
            quants = qparser.parse(value)
            if not quants:
                return value
            quant = quants[0]
            # Use the Quantity constructor rather than multiplication: pint
            # rejects multiplying by offset units such as degrees Celsius.
            q = self.ureg.Quantity(quant.value, self.ureg.parse_units(quant.unit.name))
            # Convert using the first dimensionality in unit_map that q satisfies.
            for dim_key, target_unit in unit_map.items():
                if q.check(dim_key):
                    q_new = q.to(self.ureg.parse_units(target_unit))
                    return f"{q_new.magnitude} {target_unit}"
        except Exception:
            # On any parse/conversion failure, fall back to the original value.
            return value
        return value

    def scientific_conversion(self, value, forward):
        """Convert between plain numbers and scientific-notation strings.

        forward: number -> string formatted with ``.0e`` (1000 -> '1e+03').
            NOTE: this keeps a single significant digit, so the conversion is
            lossy. Bools and non-finite floats are returned unchanged.
        reverse: any string accepted by ``float()`` -> float
            ('1e-4' -> 0.0001).

        Returns ``value`` unchanged otherwise.
        """
        if forward:
            if self._is_finite_number(value):
                return f"{value:.0e}"
            return value

        if isinstance(value, str):
            try:
                return float(value)
            except ValueError:
                return value
        return value

    def transform(self, config):
        """Apply every configured conversion to every cell and return the table.

        A step runs only when its key is present in ``config``; the value under
        the key is passed to the converter (direction flag, date pattern or
        unit map). Steps always run in this order: ordinal, symbol, unit,
        date_format, num_word, scientific. A ``None`` or empty config applies
        no step.

        The result is stored back on ``self.df`` and returned, so calling
        ``transform`` again operates on the already transformed table.
        """
        config = config or {}
        steps = (
            ('ordinal', self.ordinal_conversion),         # 1st -> first
            ('symbol', self.symbol_conversion),           # %, $ -> percent, dollar
            ('unit', self.unit_conversion),               # e.g. m -> cm
            ('date_format', self.date_conversion),
            ('num_word', self.num_word_conversion),       # 100 -> one hundred
            ('scientific', self.scientific_conversion),   # 1000 -> 1e+03
        )
        active = [(convert, config[key]) for key, convert in steps if key in config]

        def apply_pipeline(val):
            for convert, option in active:
                val = convert(val, option)
            return val

        for col in self.df.columns:
            self.df[col] = self.df[col].apply(apply_pipeline)
        return self.df

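# Example config.yaml (a key that is present enables its step; true = forward, false = reverse; omit a key to skip the step)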
# date_format: 'dd MMMM yyyy'
# num_word: true
# ordinal: true
# symbol: true
# unit:
#   '[length]': 'cm'
#   '[mass]': 'kg'
#   '[time]': 's'
#   '[temperature]': 'K'
# scientific: true


if __name__ == "__main__":
    # Read the conversion settings from config.yaml in the working directory.
    with open('config.yaml', 'r') as f:
        config = yaml.safe_load(f)

    print(config)

    # Small demo table mixing a number, a date, a percentage, an ordinal,
    # two quantities, a currency amount and a scientific-notation string.
    data = {
        'A': [-2, '2021-05-17', '100%', '1st'],
        'B': ['1.85 m', '5$', '1e-4', '3t']
    }

    df = pd.DataFrame(data)
    # Show the input table followed by a separator line.
    print(df, '---------------------------------', sep='\n')

    tf = TableFormatter(df)
    df_transformed = tf.transform(config)

    print(df_transformed)


{'date_format': 'dd MMMM yyyy', 'num_word': True, 'ordinal': True, 'symbol': True, 'unit': {'[length]': 'cm', '[mass]': 'kg', '[time]': 's', '[temperature]': 'K'}, 'scientific': False}
            A       B
0          -2  1.85 m
1  2021-05-17      5$
2        100%    1e-4
3         1st      3t
---------------------------------
             A          B
0    minus two   185.0 cm
1  17 May 2021    5dollar
2   100percent     0.0001
3        first  3000.0 kg
