In [28]:
import pandas as pd
import pickle
from tqdm import tqdm
# Base directory that every dataset path in this notebook is built from.
# It is empty here, so paths resolve from the filesystem root; set it before running.
root = ''
root_git = f'{root}/GitTables/'
root_wiki = f'{root}/WikiTables/'
# Scratch directory holding temporary outputs read by some cells further down.
tmp_dir = ''

In [13]:
import matplotlib.pyplot as plt
import pickle
import pandas as pd
from tqdm import tqdm
import numpy as np
import seaborn as sns

def plot_data_distribution(df_path: str | pd.DataFrame, label: str='a%', label_y: str='n_samples') -> None:
    """Given a labelled dataset print the data distribution of its samples

    Args:
        df_path (str | pd.DataFrame): path to the labelled dataframe or the dataframe.
        label (str, optional): label of the column to analyze. Defaults to 'a%'.
        label_y (str, optional): label of the y axis of the chart. Defaults to 'n_samples'.
    """
    if isinstance(df_path, str):
        data = pd.read_csv(df_path)
    else:
        data = df_path
    d = {}
    for i in range(1, 11, 1):
        i /= 10
        prev = round(i-0.1, 2)
        t = data[data[label] >= prev]
        t = t[t[label] < i]
        d[f'{prev}_{i}'] = t.shape[0]
    ##
    t = data[data[label] == 1]
    d['0.9_1.0']+=t.shape[0]
    ## 
    keys = list(d.keys())
    values = list(d.values())
    
    bar_width = 0.5
    
    # Create the bar plot
    plt.bar(keys, values, width=bar_width, color='grey')
    
    for i, v in enumerate(values):
        plt.text(i, v + 0.5, str(v), ha='center', va='bottom')
    
    plt.xticks(ha='center', fontsize=8)  # Ruota le etichette sull'asse x di 45 gradi
    plt.subplots_adjust(bottom=0.2) 
    
    # Adding labels and title
    plt.xlabel(f'{label} Range')
    plt.ylabel(label_y)

    # Show the plot
    plt.show()

In [14]:
def get_1_0_cols_rows_tables(td: dict[str, pd.DataFrame]) -> tuple[list, list]:
    """Find the tables with a degenerate shape.

    Args:
        td (dict[str, pd.DataFrame]): mapping from table name to its dataframe.

    Returns:
        tuple[list, list]: ``(zero_cols_rows, one_col_row)`` where
            * ``one_col_row`` holds the names of tables with exactly one row or
              exactly one column;
            * ``zero_cols_rows`` holds the names of the remaining tables that have
              zero rows or zero columns.
        The "one" test runs first, so a (0, 1) or (1, 0) table is reported in
        ``one_col_row``, not in ``zero_cols_rows``.
    """
    zero_cols_rows = []
    one_col_row = []
    for name, table in tqdm(td.items()):
        n_rows, n_cols = table.shape
        if n_rows == 1 or n_cols == 1:
            one_col_row.append(name)
        elif n_rows == 0 or n_cols == 0:
            zero_cols_rows.append(name)
    return zero_cols_rows, one_col_row

def find_influenced_samples(triples: pd.DataFrame, one_column_tables: list[str]) -> pd.DataFrame:
    """Select the samples that involve at least one of the given tables.

    Args:
        triples (pd.DataFrame): samples whose first two columns hold table names.
        one_column_tables (list[str]): names of the tables to look for.

    Returns:
        pd.DataFrame: the rows of `triples` whose first or second column value is
            in `one_column_tables`, in their original order and with their
            original index labels.
    """
    wanted = list(set(one_column_tables))
    # Vectorised membership test on the two name columns, addressed by position
    # so the function does not depend on how the columns are named.
    in_first = triples.iloc[:, 0].isin(wanted)
    in_second = triples.iloc[:, 1].isin(wanted)
    return triples.loc[(in_first | in_second).to_numpy()]

# Gittables

## Tables

In [3]:
with open(root+'/gittables/dictionaries/table_dict.pkl', 'rb') as f:
    td_gittables = pickle.load(f)

In [None]:
zero_git, one_git = get_1_0_cols_rows_tables(td_gittables)

In [None]:
len(zero_git)

In [None]:
len(one_git)

In [None]:
len(td_gittables.keys())

Almost 4% of the tables have a single column: 10,284 out of 256,834 tables.

In [8]:
# Despite its name, this list holds the tables of `one_git` that have exactly one row.
zero_rows = [name for name in one_git if td_gittables[name].shape[0] == 1]

In [None]:
len(zero_rows)

## Influenced samples

In [None]:
# Load the three GitTables splits.
train_gittables = pd.read_csv(root + '/gittables/train.csv')
test_gittables = pd.read_csv(root + '/gittables/test.csv')
valid_gittables = pd.read_csv(root + '/gittables/valid.csv')

# Keep only the samples that involve at least one table listed in `one_git`.
influenced_train_git = find_influenced_samples(train_gittables, one_git)
influenced_test_git = find_influenced_samples(test_gittables, one_git)
influenced_valid_git = find_influenced_samples(valid_gittables, one_git)

# Report influenced / total samples for each split.
print(
    f'Train: {len(influenced_train_git)}/{train_gittables.shape[0]}\n'
    f'Test: {len(influenced_test_git)}/{test_gittables.shape[0]}\n'
    f'Valid: {len(influenced_valid_git)}/{valid_gittables.shape[0]}'
)

In [None]:
plot_data_distribution(influenced_train_git)

In [None]:
plot_data_distribution(influenced_test_git)

In [None]:
plot_data_distribution(influenced_valid_git)

# Wikilast

In [11]:
with open(root+'/wikilast/dictionaries/table_dict.pkl', 'rb') as f:
    td_wikilast = pickle.load(f)

In [None]:
zero_wiki, one_wiki = get_1_0_cols_rows_tables(td_wikilast)

In [None]:
len(zero_wiki)

In [None]:
len(one_wiki)

In [None]:
# Despite its name, this list holds the tables of `one_wiki` that have exactly one row.
zero_rows_wiki = [name for name in one_wiki if td_wikilast[name].shape[0] == 1]
len(zero_rows_wiki)

## Influenced samples

In [None]:
# Load the three Wikilast splits.
train_wikilast = pd.read_csv(root + '/wikilast/train.csv')
test_wikilast = pd.read_csv(root + '/wikilast/test.csv')
valid_wikilast = pd.read_csv(root + '/wikilast/valid.csv')

# Keep only the samples that involve at least one table listed in `one_wiki`.
influenced_train_wiki = find_influenced_samples(train_wikilast, one_wiki)
influenced_test_wiki = find_influenced_samples(test_wikilast, one_wiki)
influenced_valid_wiki = find_influenced_samples(valid_wikilast, one_wiki)

# Report influenced / total samples for each split.
print(
    f'Train: {len(influenced_train_wiki)}/{train_wikilast.shape[0]}\n'
    f'Test: {len(influenced_test_wiki)}/{test_wikilast.shape[0]}\n'
    f'Valid: {len(influenced_valid_wiki)}/{valid_wikilast.shape[0]}'
)

In [None]:
print(len(td_wikilast))
# Sanity check: every table referenced in the first two columns of a split must be
# a key of td_wikilast. Each split is checked against its own rows, so the loop
# bound and the rows inspected always come from the same dataframe.
for split_name, split in (('train', train_wikilast), ('test', test_wikilast), ('valid', valid_wikilast)):
    referenced = set(split.iloc[:, 0]) | set(split.iloc[:, 1])
    missing = referenced.difference(td_wikilast)
    assert not missing, f'{split_name}: {len(missing)} referenced tables are missing from td_wikilast'

In [None]:
plot_data_distribution(influenced_train_wiki)

In [None]:
plot_data_distribution(influenced_test_wiki)

In [None]:
plot_data_distribution(influenced_valid_wiki)

# Unused tables identification (Wikilast)

In [2]:
def get_tables_list(df: pd.DataFrame) -> list:
    """Return the distinct values found in the first two columns of `df`.

    Args:
        df (pd.DataFrame): samples whose first two columns hold table names.

    Returns:
        list: every value that appears in the first or second column, without
            duplicates. The order is unspecified because the values pass through a set.
    """
    # Read whole columns at once instead of one scalar lookup per cell.
    first = set(df.iloc[:, 0].tolist())
    second = set(df.iloc[:, 1].tolist())
    return list(first | second)

In [None]:
import pandas as pd
import pickle
from tqdm import tqdm

# Load the three Wikilast splits.
train_wikilast = pd.read_csv(root + '/wikilast/train.csv')
test_wikilast = pd.read_csv(root + '/wikilast/test.csv')
valid_wikilast = pd.read_csv(root + '/wikilast/valid.csv')

# Distinct tables referenced by each split.
train_tables_wiki = get_tables_list(train_wikilast)
test_tables_wiki = get_tables_list(test_wikilast)
valid_tables_wiki = get_tables_list(valid_wikilast)

# Tables referenced by at least one split.
all_tables_wiki = set().union(train_tables_wiki, test_tables_wiki, valid_tables_wiki)

print(
    f'Total number:{len(all_tables_wiki)}\n'
    f'Train:{len(train_tables_wiki)}\n'
    f'Test:{len(test_tables_wiki)}\n'
    f'Valid:{len(valid_tables_wiki)}'
)



In [7]:
with open(root+'/wikilast/dictionaries/table_dict.pkl', 'rb') as f:
    td_wikilast = pickle.load(f)

In [None]:
len(td_wikilast.keys())

In [None]:
for k in tqdm(all_tables_wiki):
    td_wikilast[k].to_csv(root+'/wikilast/csv_minimal/'+k, index=False)

In [35]:
import os

filenames = os.listdir(tmp_dir+'/out/wikilast')

In [None]:
len(filenames)

In [None]:
type(filenames)

In [None]:
len(set(filenames))

In [1]:
import pandas as pd
tt = pd.read_csv(root+'/wikilast/train.csv')

In [None]:
tt

In [8]:
import pickle
with open(root+'/gittables/dictionaries/embedding_dictionaries/emb_dict_bert_lines_300_300.pkl', 'rb') as f:
    ed = pickle.load(f)

In [None]:
ed

In [None]:
len(ed.keys())

In [1]:
import pickle
with open(root+'/wikilast/dictionaries/embedding_dictionaries/emb_dict_turl_tables_300_300.pkl', 'rb') as f:
    ed = pickle.load(f)

In [2]:
# Keys whose stored embedding is missing.
# Identity check on purpose: `== None` is dispatched to the value's own `__eq__`,
# which for array-like embeddings compares element-wise and does not yield a plain bool.
nones = [key for key, embedding in ed.items() if embedding is None]

In [None]:
len(nones)

In [None]:
keys = list(ed.keys())
ed[keys[0]].shape

In [6]:
with open(root+'/gittables/dictionaries/table_dictionaries/table_dict.pkl', 'rb') as f:
    td = pickle.load(f)

In [7]:
lk = list(td.keys())

In [None]:
td[lk[121500]]

In [1]:
import pandas as pd

In [2]:
dd = pd.read_csv(root+'/wikilast/test.csv')

In [None]:
dd.head()

In [15]:
t1 = pd.read_csv(root+'/wikilast/csv/198.7650.csv')

In [None]:
t1

In [5]:
import pickle
with open(root+'/wikilast/dictionaries/table_dictionaries/table_dict.pkl', 'rb') as f:
    td = pickle.load(f)

In [None]:
td['198.7650.csv']

In [None]:
t1.shape

In [19]:
# One Python list per column of t1, in column order.
l_t1 = [list(t1.iloc[:, col]) for col in range(t1.shape[1])]

In [None]:
len(l_t1)

In [None]:
type(l_t1)

In [1]:
import pandas as pd

In [2]:
dw = pd.read_csv(root+'/wikilast/tmp/wrong_labels_test.csv')

In [None]:
dw

In [3]:
dg = pd.read_csv(root+'/gittables/tmp/wrong_labels_test.csv')

In [1]:
import pandas as pd
pp = pd.read_csv(root+'/gittables/tmp/wrong_labels_test.csv')

In [None]:
pp

In [3]:
dd = pd.read_csv(root+'/wikilast/tmp/wrong_labels_test.csv')

In [None]:
dd

In [1]:
import pandas as pd
t1 = pd.read_csv(root+'/wikilast/csv/123.66036.csv')
t2 = pd.read_csv(root+'/wikilast/csv/123.66050.csv')

In [None]:
t1 = pd.read_csv(root+'/wikilast/csv/123.66036.csv')
t1

In [None]:
t2 = pd.read_csv(root+'/wikilast/csv/123.66050.csv')
print((t2.iloc[1,0]))
print(type(t2.iloc[1,0]))
t2

In [None]:
# `table_dict` is never defined in this notebook; the Wikilast table dictionary
# is loaded into `td`, so the same table is looked up there.
# NOTE(review): confirm `td` still holds the Wikilast dictionary when this cell runs.
tk = td['123.66050.csv']

tk

In [None]:
print(type(tk.iloc[1,0]))

In [None]:
type(pd.NA)

In [1]:
import pandas as pd
from tqdm import tqdm

def remap_names(df):
    """Return a copy of `df` with an `_id` column built from its first column.

    Each `_id` value is the string form of the corresponding first-column value
    with '.csv' appended. An existing `_id` column is overwritten in the copy;
    otherwise the column is appended. The input dataframe is not modified.

    Args:
        df (pd.DataFrame): dataframe whose first column holds the identifiers.

    Returns:
        pd.DataFrame: the copy with the `_id` column set.
    """
    # Read the first column as a column: materialising each row as a Series would
    # upcast mixed dtypes (an integer id next to a float column turns into '1.0').
    first_column = df.iloc[:, 0].tolist() if df.shape[0] else []
    out = df.copy()
    out['_id'] = [str(value) + '.csv' for value in first_column]
    return out


In [None]:
train_metadata = root+'/WikiTables/train/metadata_train.csv'
train_metadata_out = root+'/WikiTables/train/metadata.csv'
test_metadata = root+'/WikiTables/test/metadata_test.csv'
test_metadata_out = root+'/WikiTables/test/metadata.csv'
valid_metadata = root+'/WikiTables/valid/metadata_valid.csv'
valid_metadata_out = root+'/WikiTables/valid/metadata.csv'

remap_names(pd.read_csv(train_metadata, dtype=str)).to_csv(train_metadata_out, index=False)
remap_names(pd.read_csv(test_metadata, dtype=str)).to_csv(test_metadata_out, index=False)
remap_names(pd.read_csv(valid_metadata, dtype=str)).to_csv(valid_metadata_out, index=False)

In [39]:
meta = pd.read_csv(root+'/WikiTables/test/metadata.csv')

In [None]:
meta

In [None]:
pd.read_csv(root+'/WikiTables/test/tables/81.46316.csv', dtype=str, header=None, skiprows=2)

In [1]:
import pandas as pd

In [4]:
dd = pd.read_csv(root+'/WikiTables/charts/effe_effi_wikilast.csv')

In [None]:
dd

In [None]:
dd['armadillo_wikilast_AE'].mean()

In [1]:
import pandas as pd
dd = pd.read_csv(root+'/WikiTables/train.csv')

In [3]:
import pickle
with open(root+'/WikiTables/dictionaries/table_dict.pkl', 'rb') as f:
    tt = pickle.load(f)

In [None]:
dd.iloc[219]

In [None]:
tt['213.49660.csv']

In [None]:
tt['616.55267.csv']

In [None]:
dd.iloc[218]

In [None]:
tt['429.158740.csv']

In [None]:
tt['429.98504.csv']

In [None]:
import pandas as pd
dd = pd.read_csv(root+'/WikiTables/test.csv')
dd

In [None]:
dd['jsim_time'].sum()

In [4]:
dd = pd.read_csv(root+'/WikiTables/train.csv')

In [13]:
# Ten consecutive (start, stop) row ranges of 50_000 rows each, covering rows 0..500_000.
l = [(start, start + 50_000) for start in range(0, 500_000, 50_000)]
# One positional slice of dd per range.
fff = [dd.iloc[start:stop] for start, stop in l]
    

In [None]:
fff

In [None]:
fff[9]

In [None]:
fff[8]

In [None]:
len(fff)

In [None]:
fff[9]

In [None]:
str((0,50_000))

In [30]:
import pickle

In [31]:
with open(root+'/WikiTables/dictionaries/embedding_dictionaries/embdi/embedding_dict_mockup.pkl','rb') as f:
    ed = pickle.load(f)

In [32]:
with open(root+'/WikiTables/dictionaries/embedding_dictionaries/embdi/t_execs_mockup.pkl','rb') as f:
    td = pickle.load(f)

In [None]:
ed

In [None]:
ed.keys()

In [None]:
td['451.108746.csv|482.134731.csv']

In [1]:
import pickle

In [2]:
with open(tmp_dir+'/test_emb_exec_t/emb_dict_bert_lines_300_300.pkl', 'rb') as f:
    ed = pickle.load(f)

with open(tmp_dir+'/test_emb_exec_t/t_execs_bert_rows_300_300_gittables.pkl', 'rb') as f:
    ted = pickle.load(f)

In [None]:
ted

In [None]:
ed['437.47993.csv'].shape

In [3]:
import pickle
with open(root+'/WikiTables/dictionaries/embedding_dictionaries/emb_dict_embdi.pkl', 'rb') as f:
    ed = pickle.load(f)

with open(root+'/WikiTables/dictionaries/embedding_dictionaries/t_execs_embdi.pkl', 'rb') as f:
    ted = pickle.load(f)

In [4]:
import pandas as pd
train = pd.read_csv(root+'/WikiTables/train.csv')
test = pd.read_csv(root+'/WikiTables/test.csv')
valid = pd.read_csv(root+'/WikiTables/valid.csv')

In [5]:
from tqdm import tqdm
def checkkk(d, s):
    """List the 'r_id|s_id' keys of the rows of `d` that are missing from `s`.

    Args:
        d (pd.DataFrame): dataframe with `r_id` and `s_id` columns.
        s: container supporting `in` (a set is used by the callers) holding the
            known 'r_id|s_id' keys.

    Returns:
        list: one '<r_id>|<s_id>' string per row of `d` whose key is not in `s`,
            in row order (duplicates are kept).
    """
    # Build the keys from the two columns directly: materialising every row as a
    # Series is slow and upcasts mixed dtypes (integer ids would be formatted as floats).
    pair_keys = (f'{r_id}|{s_id}' for r_id, s_id in zip(d['r_id'].tolist(), d['s_id'].tolist()))
    return [key for key in pair_keys if key not in s]

In [None]:
len(set(list(ed.keys())))

In [None]:
len(list(ted.keys()))

In [8]:
import pandas as pd
train = pd.read_csv(root+'/WikiTables/train.csv')
test = pd.read_csv(root+'/WikiTables/test.csv')
valid = pd.read_csv(root+'/WikiTables/valid.csv')

In [None]:
# Build the set of known keys once and reuse it for the three splits.
# NOTE(review): checkkk looks up 'r_id|s_id' pair keys — confirm `ed` is keyed by pairs.
known_keys = set(ed)
train_bd = checkkk(train, known_keys)
test_bd = checkkk(test, known_keys)
valid_bd = checkkk(valid, known_keys)

In [None]:
train_bd

In [None]:
test_bd

In [None]:
valid_bd

In [1]:
import pickle

In [2]:
with open(root+'/WikiTables/charts/embedding_gen_time_wikilast.pkl','rb') as f:
    dfd = pickle.load(f)

In [None]:
dfd

In [4]:
with open(root+'/WikiTables/charts/embedding_file_wiki_on_wiki.pkl','rb') as f:
    dkd = pickle.load(f)

In [None]:
type(dkd['437.47993.csv'])

In [9]:
import pandas as pd
res = pd.read_csv(root+'/WikiTables/evalutation/eval_wiki.csv')

In [None]:
res.describe()

In [None]:
res.columns

In [None]:
# Timing columns first, then absolute-error columns, in the order they are displayed.
summary_columns = [
    'sloth_time',
    'jsim_time',
    'armadillo_wikilast_overlap_time',
    'embdi_wikilast_overlap_time',
    'turl_wikilast_overlap_time',
    'bert_tables_300_300_wikilast_overlap_time',
    'bert_tables_anon_300_300_wikilast_overlap_time',
    'roberta_tables_300_300_wikilast_overlap_time',
    'roberta_tables_anon_300_300_wikilast_overlap_time',
    'bert_rows_300_300_wikilast_overlap_time',
    'roberta_rows_300_300_wikilast_overlap_time',
    'jsim_AE',
    'armadillo_wikilast_ae',
    'embdi_wikilast_ae',
    'turl_wikilast_ae',
    'bert_tables_300_300_wikilast_ae',
    'bert_tables_anon_300_300_wikilast_ae',
    'roberta_tables_300_300_wikilast_ae',
    'roberta_tables_anon_300_300_wikilast_ae',
    'bert_rows_300_300_wikilast_ae',
    'roberta_rows_300_300_wikilast_ae',
]
res.describe()[summary_columns]