In [1]:
import pickle as pkl 

In [2]:
# Benchmarks and model families to report on; each (model, dataset) pair
# is used to build a results path of the form "<model>/<dataset>/...".
datasets = [
    'wikitables',
    'wikibio',
    'tanq',
    'rotowire',
]
models = [
    'llama',
    'gpt',
    'gemini',
]

In [3]:
def table_to_dict_list(table_string, suffix=""):
    """Parse the first pipe-delimited (markdown) table in a string into row dicts.

    The first line containing ``|`` is taken as the header. Following lines are
    read as data rows until a blank line or a line containing a code fence
    (three backticks) is reached. Separator rows such as ``|:---|---:|`` are
    skipped.

    Args:
        table_string: Text that may contain a markdown table, optionally
            wrapped in a fenced code block tagged ``markdown``.
        suffix: Unused; kept so existing callers that pass it keep working.

    Returns:
        A list with one dict per data row, keyed by the non-empty header names.
        A cell that is present but empty maps to ``""``; a cell that is missing
        because the row is shorter than the header maps to ``None``. Returns
        ``[]`` when no line contains ``|``.
    """

    def _split_cells(line):
        # Drop the optional outer pipes, then split *positionally* so that an
        # empty cell keeps its slot instead of shifting later values left.
        text = line.strip()
        if text.startswith('|'):
            text = text[1:]
        if text.endswith('|'):
            text = text[:-1]
        return [cell.strip() for cell in text.split('|')]

    def _is_separator(line):
        # A separator row holds nothing but pipes, dashes, colons and spaces.
        # Requiring '---' keeps rows of single-dash placeholders ("| - | - |")
        # as data; requiring the restricted character set keeps data rows that
        # merely contain '---' inside a cell.
        text = line.strip()
        return '---' in text and set(text) <= set('|-: \t')

    # Only remove the language tag of a code fence, not the word "markdown"
    # wherever it appears (it may legitimately occur inside a cell).
    table_string = table_string.replace("```markdown", "```")
    lines = table_string.strip().split('\n')

    # Find the first line that contains the table header (i.e., a line with '|')
    table_start_idx = next(
        (i for i, line in enumerate(lines) if '|' in line), None)

    # If no table is found, return an empty list
    if table_start_idx is None:
        return []

    # Remember each named column together with its position in the header, so
    # an unnamed column (e.g. a leading index column) does not misalign values.
    header_cells = _split_cells(lines[table_start_idx])
    columns = [(idx, name) for idx, name in enumerate(header_cells) if name]

    table_as_dicts = []
    for line in lines[table_start_idx + 1:]:
        # Stop processing if the table ends
        if '```' in line or not line.strip():
            break

        if _is_separator(line):
            continue

        cells = _split_cells(line)
        table_as_dicts.append({
            name: (cells[idx] if idx < len(cells) else None)  # None = missing
            for idx, name in columns
        })

    return table_as_dicts

In [4]:
import pandas as pd

def get_avg_statistics(scores):
    """Average size-normalised cell and row/column statistics over all scores.

    For every entry whose ``alignment`` table parses to at least one row and
    one column, the numeric columns of ``cell_stats`` are scaled by
    ``100 / (n_rows * n_cols)``; in ``row_col_statistics`` the first row is
    scaled by ``100 / n_rows`` and the second row by ``100 / n_cols``. The
    scaled frames are then summed per index label and divided by the number
    of usable entries.

    The input DataFrames are not modified, so calling this function twice on
    the same ``scores`` gives the same result.

    Args:
        scores: Iterable of mappings with the keys ``'alignment'`` (a table
            string understood by ``table_to_dict_list``), ``'cell_stats'`` and
            ``'row_col_statistics'`` (pandas DataFrames).

    Returns:
        Tuple ``(df_cells_avg, df_rc_avg)`` of DataFrames holding the averages
        of the numeric columns.

    Raises:
        ValueError: If no entry in ``scores`` has a usable alignment table.
    """
    all_dfs_cells = []
    all_dfs_rc = []
    for data in scores:
        alignment = table_to_dict_list(data['alignment'])
        if not alignment:
            continue
        n_rows = len(alignment)
        n_cols = len(alignment[0])
        if n_cols == 0:
            # A header without any named column would make the factor divide by zero.
            continue
        factor = 100 / (n_rows * n_cols)

        # Cell statistics: build a new scaled frame instead of writing back
        # into data['cell_stats'].
        df = data['cell_stats']
        numeric_cols = df.select_dtypes(include='number').columns
        all_dfs_cells.append(df[numeric_cols].astype(float) * factor)

        # Row/column statistics: work on a copy so the caller's frame is untouched.
        df_rc = data['row_col_statistics'].copy()
        numeric_cols_rc = df_rc.select_dtypes(include='number').columns
        df_rc[numeric_cols_rc] = df_rc[numeric_cols_rc].astype(float)
        # Positional slice 1: skips the first column — presumably a non-numeric
        # label column; TODO confirm against the producer of these frames.
        df_rc.iloc[0, 1:] = df_rc.iloc[0, 1:] * 100 / n_rows  # first row: per n_rows
        df_rc.iloc[1, 1:] = df_rc.iloc[1, 1:] * 100 / n_cols  # second row: per n_cols
        all_dfs_rc.append(df_rc[numeric_cols_rc].copy())

    if not all_dfs_cells:
        raise ValueError("no score entry contains a parseable alignment table")

    # Calculate average for cell_stats
    df_cells_sum = pd.concat(all_dfs_cells).groupby(level=0).sum()
    df_cells_avg = df_cells_sum / len(all_dfs_cells)

    # Calculate average for row_col_statistics
    df_rc_sum = pd.concat(all_dfs_rc).groupby(level=0).sum()
    df_rc_avg = df_rc_sum / len(all_dfs_rc)

    return df_cells_avg, df_rc_avg


In [5]:

# Print the averaged statistics for every (dataset, model) combination.
# A combination whose results cannot be loaded or processed is reported and
# skipped, so one bad file does not stop the rest of the report.
for dataset in datasets:
    for model in models:
        print("-"*100)
        print(f"MODEL: {model} DATASET: {dataset}\n")
        score_path = f'{model}/{dataset}/tabscore_final_modified.pkl'
        try:
            # NOTE(security): pickle.load can execute arbitrary code; only
            # load result files that were produced by a trusted run.
            with open(score_path, 'rb') as f:
                tabscore = pkl.load(f)
            df_cells, df_rc = get_avg_statistics(tabscore)
        except FileNotFoundError:
            print(f"SKIPPED: {score_path} not found")
            continue
        except Exception as exc:
            # Exception (not a bare except) lets KeyboardInterrupt through,
            # and printing the error keeps real bugs visible.
            print(f"SKIPPED: {type(exc).__name__}: {exc}")
            continue

        print("CELL STATS\n")
        print(df_cells)

        print("\n\n")
        print("ROW COL STATS\n")
        print(df_rc)
        print("-"*100)

----------------------------------------------------------------------------------------------------
MODEL: llama DATASET: wikitables

CELL STATS

         Numerical     String  Bool      Date  List      Time   Others
EI        0.054107   4.504762   0.0  0.175074   0.0  0.000000  0.12755
MI        0.013333   0.806419   0.0  0.032292   0.0  0.000000  0.00000
Partial   0.217827  24.986072   0.0  0.350714   0.0  0.008125  0.08744



ROW COL STATS

         MI         EI         EM
0  8.788531  15.726743  25.590121
1  0.000000   0.000000   1.675000
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
MODEL: gpt DATASET: wikitables

CELL STATS

         Numerical     String  Bool      Date  List      Time    Others
EI        0.034583   1.185853   0.0  0.015000   0.0  0.000000  0.140417
MI        0.005000   1.093671   0.0  0.003571   0.0  0.0000

In [11]:
import json

In [31]:
# Location of the table pairs inspected below. Kept as a named constant so the
# machine-specific absolute path is easy to spot and change.
INPUT_TABLES_PATH = "/home/turning/Jainit/TANQ/EVALUATION_OF_MODELS/llama/wikitables/input_tables_for_score.json"

# Read as UTF-8 explicitly: the default text encoding of open() depends on the
# platform locale and could mis-decode non-ASCII table contents.
with open(INPUT_TABLES_PATH, encoding="utf-8") as f:
    data = json.load(f)

In [37]:
# For each entry in `data`, print its two table strings and record the gap
# between their line counts relative to the larger of the two counts.
row_differences = []
divider = "_"*100
spacer = "\n\n"
for t in data:
    pair = data[t]
    # Number of newline-separated lines in each table string.
    rows1 = pair[0].count('\n') + 1
    rows2 = pair[1].count('\n') + 1
    gap = abs(rows1 - rows2)

    print(divider)
    print(spacer)

    for table_text in (pair[0], pair[1]):
        print(f"TABLE:\n{table_text}")
        print(spacer)

    print(f"ROW DIFFERENCE: {gap}")
    print(spacer)
    print(divider)

    row_differences.append(gap/max(rows1, rows2))
    

____________________________________________________________________________________________________



TABLE:
| Field    | Value                        |
|:---------|:-----------------------------|
| title    | portuguese heritage society  |
| subtitle | other activities             |
| name     | michelle schimel             |
| office   | new york state assemblywoman |



TABLE:
| Field    | Value                        |
|:---------|:-----------------------------|
| title    | portuguese heritage society  |
| subtitle | other activities             |
| name     | michelle schimel             |
| office   | new york state assemblywoman |



ROW DIFFERENCE: 0



____________________________________________________________________________________________________
____________________________________________________________________________________________________



TABLE:
| Field    | Value                    |
|:---------|:-------------------------|
| subtitle | football              

In [38]:
row_differences = sum(row_differences)/len(row_differences)

In [39]:
(row_differences)*100

12.49077110389599