In [None]:
# Run the standalone split-creation script through the shell with the python3.9 interpreter.
!python3.9 create_train_valid_test_split.py

In [1]:
import os
import sys
sys.path.append("..")
from dotenv import load_dotenv
load_dotenv(override=True)
from glob import glob
from functools import reduce
import operator
import numpy as np

In [None]:
from SportsDB_data_loader import get_LabelEncoder

# Label encoder provided by the project's SportsDB data loader
label_enc = get_LabelEncoder()

# Number of classes known to the encoder
len(label_enc.classes_)

In [None]:
from SportsDB_data_loader import SportsTablesDGLDataset, convert_Table_to_dgl_graph, get_LabelEncoder, SportsTablesDGLDataset_enriched
from transformers import BertTokenizer, BertModel
import torch

# Pretrained BERT encoder and its matching tokenizer
model = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Enriched graph dataset restricted to the baseball domain, with the column order left unshuffled
# NOTE(review): max_length=10 is presumably the per-column token limit — confirm in SportsDB_data_loader
train_dataset = SportsTablesDGLDataset_enriched(tokenizer=tokenizer, sport_domains=["baseball"], max_length=10, shuffle_cols=False)
# Disabled: validation and test datasets built with the non-enriched dataset class
#valid_dataset = SportsTablesDGLDataset(tokenizer=tokenizer, sport_domains=["baseball"], split="valid")
#test_dataset = SportsTablesDGLDataset(tokenizer=tokenizer, sport_domains=["baseball"], split="test")

In [None]:
# Display the `df` attribute of the dataset object
train_dataset.df

In [None]:
# Per-node token id lists of the first graph in the dataset
token_ids_list = train_dataset[0].ndata["data_tensor"].tolist()
# Flatten in one pass: reduce(operator.add, ...) copies the growing list on every
# step (quadratic) and raises on an empty list.
token_ids = torch.LongTensor(
    [token_id for node_tokens in token_ids_list for token_id in node_tokens])
# Start offset of every node's token sequence inside the flattened tensor
# (presumably the position of each sequence's [CLS] token — TODO confirm)
sequence_lengths = [len(node_tokens) for node_tokens in token_ids_list]
cls_index_list = torch.LongTensor([0] + np.cumsum(sequence_lengths).tolist()[:-1])


In [None]:
# Display the nested per-node token id lists
token_ids_list

In [None]:
# Display the flattened token id tensor
token_ids

In [None]:
from model.gcn import CA_GCN_Tablewise
import dgl

# Instantiate the project's table-wise GCN model on top of bert-base-uncased.
# NOTE(review): the meaning of 512 and 462 is not visible here (presumably a hidden size
# and the number of classes) — confirm against model/gcn.py
model = CA_GCN_Tablewise("bert-base-uncased", 512, 462)

# Take the first graph of the dataset and add a reverse edge for every existing edge
g = train_dataset[0]
g = dgl.add_reverse_edges(g)

# Forward pass on the single graph as a smoke test
model(g)

# Create train, valid, test split for SportsTables

In [None]:
import os
from os.path import basename, join
import sys
sys.path.append("..")

from glob import glob
from dotenv import load_dotenv
load_dotenv(override=True)
import pandas as pd
from sklearn.model_selection import train_test_split
import json

sport_domains = ["baseball", "basketball", "football", "hockey", "soccer"]
# One split file is written per seed and sport domain
random_states = [1, 2, 3, 4, 5]

for random_state in random_states:
    for sport_domain in sport_domains:
        domain_dir = join(os.environ["SportsTables"], sport_domain)
        # glob returns files in arbitrary, filesystem-dependent order; sorting makes the
        # seeded splits reproducible across machines. "*.csv" (with the dot) only matches
        # real CSV files.
        table_list = sorted(basename(table_path) for table_path in glob(join(domain_dir, "*.csv")))
        # 80/20 train/test split, then another 80/20 split of train into train/valid
        train, test = train_test_split(table_list, test_size=0.2, random_state=random_state)
        train, valid = train_test_split(train, test_size=0.2, random_state=random_state)
        print(sport_domain, len(table_list), len(train), len(valid), len(test))
        with open(join(domain_dir, f"train_valid_test_split_{random_state}.json"), "w") as f:
            json.dump({
                "train": train,
                "valid": valid,
                "test": test
            }, f)
     
    

In [None]:
# Size of the training split left over from the last iteration of the split loop above
len(train)

In [None]:
# Size of the validation split left over from the last iteration of the split loop above
len(valid)

In [None]:
# Size of the test split left over from the last iteration of the split loop above
len(test)

# Special Numerical Column Feature Set

In [None]:
import os
from dotenv import load_dotenv
load_dotenv(override=True)
import pandas as pd
from os.path import join
import json
from ast import literal_eval

# Sport domains (sub-directories of the SportsTables directory) processed by the next cell
sport_domains = ["baseball", "basketball", "football", "hockey", "soccer"]
# Selects which train_valid_test_split_<random_state>.json file is loaded
random_state = 1
# When True, the column order of every table is shuffled before processing
shuffle_cols = False

In [2]:
import random  # the shuffle_cols branch calls random.shuffle; the setup cell does not import it

# Length of the zero vector used when a column's statistics string cannot be parsed
NUM_STATS_FEATURE_LENGTH = 15

# load sherlock features
# NOTE(review): absolute, user-specific path — this cell only runs where that file exists
df_sherlock_features = pd.read_csv("/home/slangenecker/sato/extract/out/features/type_SportsTables/SportsTables_type_SportsTables_sherlock_features.csv")
with open(join(os.environ["MAIN_DIR"], "data_loader","valid_sherlock_features.json"), "r") as f:
    selected_sherlock_feature_set = json.load(f)
df_sherlock_features = df_sherlock_features[["locator", "dataset_id", "field_id", "header"]+selected_sherlock_feature_set]

# load additional numerical stats features
df_num_stats_features = pd.read_csv(join(os.environ["MAIN_DIR"],"data_loader", "features","SportsTables_numerical_statistic_features.csv"))

# Collects [column_name, table_name] for every numerical column whose statistics
# string could not be parsed; written to CSV at the end of the cell.
data_list_num_stats = []
for idx_sport_domain, sport_domain in enumerate(sport_domains):
    # load metadata.json containing semantic types of the columns
    with open(join(os.environ["SportsTables"], sport_domain, "metadata.json")) as f:
        metadata = json.load(f)
    with open(join(os.environ["SportsTables"], sport_domain, f"train_valid_test_split_{random_state}.json")) as f:
        train_valid_test_split = json.load(f)

    all_tables = train_valid_test_split["train"]+train_valid_test_split["valid"]+train_valid_test_split["test"]
    for idx_table_path, table_name_full in enumerate(all_tables):
        table_name = table_name_full.split("/")[-1].split(".csv")[0]
        # Search for the metadata key contained in the table name; the last matching key wins
        table_metadata_key = None
        for key in metadata.keys():
            if key in table_name:
                table_metadata_key = key
        if table_metadata_key is None:
            print(f"CSV {table_name_full} not in metadata.json defined!")
            continue

        df = pd.read_csv(join(os.environ["SportsTables"], sport_domain, table_name_full))
        data_list = []
        column_list = list(range(len(df.columns)))
        if shuffle_cols:
            # NOTE(review): the shuffle is unseeded, so the column order differs between runs
            random.shuffle(column_list)

        for i in column_list:
            column_values = df.iloc[:, i].dropna().tolist()
            # skip columns that are completely empty
            if len(column_values) == 0:
                continue
            column_name = df.columns[i]
            # search for defined columns data type and semantic label in metadata
            if column_name in metadata[table_metadata_key]["textual_cols"].keys():
                column_data_type = 0 # => "textual"
                column_label = metadata[table_metadata_key]["textual_cols"][column_name]
            elif column_name in metadata[table_metadata_key]["numerical_cols"].keys():
                column_data_type = 1 # => "numerical"
                column_label = metadata[table_metadata_key]["numerical_cols"][column_name]
            else:
                print(f"Column {df.columns[i]} in {table_name} not labeled in metadata.json!")
                continue

            data_list.append([
                table_name,  # table name
                column_name,  # column name
                column_data_type,
                column_label,
                " ".join([str(x) for x in column_values]),
            ])
        df = pd.DataFrame(data_list, columns=[
                                "table_name", "column_name", "columns_data_type", "column_label", "data"])
        if len(df) == 0:
            print(f"Table {table_name} has no columns with assigned semantic types!")
            continue

        # build numerical features for each numerical table column
        num_features = []
        for idx, row in df.iterrows():
            if row['columns_data_type'] != 1: # only "numerical" columns carry features
                continue
            try:
                # Boolean masks instead of f-string queries: a quote inside a table or
                # column name would otherwise produce an invalid query expression.
                csv_name = f"{row['table_name']}.csv"
                sherlock_mask = (
                    (df_sherlock_features["locator"] == sport_domain)
                    & (df_sherlock_features["dataset_id"] == csv_name)
                    & (df_sherlock_features["header"] == row['column_name'])
                )
                sherlock_feature = df_sherlock_features.loc[sherlock_mask, selected_sherlock_feature_set].iloc[0].tolist()
                num_stats_mask = (
                    (df_num_stats_features["domain"] == sport_domain)
                    & (df_num_stats_features["table_name"] == csv_name)
                    & (df_num_stats_features["column_name"] == row['column_name'])
                )
                num_stats_feature = df_num_stats_features.loc[num_stats_mask, "features"].iloc[0]
            except Exception as err:
                # No matching row (IndexError from .iloc[0]) or a missing column
                print(err)
                print(f"No sherlock or num_feature for {row['column_name']} in table {row['table_name']}.csv")
                continue
            try:
                num_stats_feature = literal_eval(num_stats_feature)
            except (ValueError, SyntaxError, TypeError):
                # The stored string is not a valid Python literal (e.g. it contains "nan")
                print(f"No num_feature for {row['column_name']} in table {row['table_name']}.csv")
                data_list_num_stats.append([row['column_name'], row['table_name'] ])
                print("Using feature vector filled with zeros")
                num_stats_feature = [0]*NUM_STATS_FEATURE_LENGTH
            num_features.append(sherlock_feature+num_stats_feature)

pd.DataFrame(data_list_num_stats, columns=["col", "tab"]).to_csv("cols_no_num_stats_features_clean_empties.csv", index=False)

No num_feature for Attendance in table La-Liga_season_standings_2002.csv
Using feature vector filled with zeros
No num_feature for Min in table Ligue-1_player_stats_2020.csv
Using feature vector filled with zeros
No num_feature for Attendance in table Ligue-1_season_standings_2005.csv
Using feature vector filled with zeros
No num_feature for Attendance in table bundesliga_season_standings_2007.csv
Using feature vector filled with zeros
No num_feature for Min in table Premier-League_player_stats_2009.csv
Using feature vector filled with zeros
No num_feature for Attendance in table Ligue-1_season_standings_1999.csv
Using feature vector filled with zeros
No num_feature for Min in table Premier-League_player_stats_2005.csv
Using feature vector filled with zeros
No num_feature for Min in table La-Liga_player_stats_2009.csv
Using feature vector filled with zeros
No num_feature for Min in table Premier-League_player_stats_2021.csv
Using feature vector filled with zeros
No num_feature for Atte

In [None]:
# Load a single hockey table directly for manual inspection
df = pd.read_csv(join(os.environ["SportsTables"], "hockey", "nhl_playoff_player_goalie_stats_1989.csv"))
df

In [None]:
# Non-null values of the column at positional index 5
df.iloc[:, 5].dropna().tolist()

# Count Semantic Types in SportsTables

In [None]:
import json
import os  # os.environ is read below; previously only in scope through earlier cells
from os.path import join
from dotenv import load_dotenv
load_dotenv(override=True)
import pandas as pd
from collections import Counter

sport_domains = ["baseball", "basketball", "football", "hockey", "soccer"]
random_state = 1

# One entry per labeled column over all tables of all domains
semantic_types = []

for sport_domain in sport_domains:
    # load metadata.json containing semantic types of the columns
    with open(join(os.environ["SportsTables"], sport_domain, "metadata.json")) as f:
        metadata = json.load(f)
    with open(join(os.environ["SportsTables"], sport_domain, f"train_valid_test_split_{random_state}.json")) as f:
        train_valid_test_split = json.load(f)

    all_tables = train_valid_test_split["train"]+train_valid_test_split["valid"]+train_valid_test_split["test"]
    for table_name_full in all_tables:
        table_name = table_name_full.split("/")[-1].split(".csv")[0]
        # Search for the metadata key contained in the table name; the last matching key wins
        table_metadata_key = None
        for key in metadata.keys():
            if key in table_name:
                table_metadata_key = key
        if table_metadata_key is None:
            print(f"CSV {table_name_full} not in metadata.json defined!")
            continue

        df = pd.read_csv(join(os.environ["SportsTables"], sport_domain, table_name_full))

        for column_name in df.columns:
            # look up the semantic label among the textual, then the numerical columns
            if column_name in metadata[table_metadata_key]["textual_cols"].keys():
                column_label = metadata[table_metadata_key]["textual_cols"][column_name]
            elif column_name in metadata[table_metadata_key]["numerical_cols"].keys():
                column_label = metadata[table_metadata_key]["numerical_cols"][column_name]
            else:
                print(f"Column {column_name} in {table_name} not labeled in metadata.json!")
                continue
            semantic_types.append(column_label)

# Occurrence count per semantic type, most frequent first
df_sem_types = pd.DataFrame(Counter(semantic_types), index=["count"]).T.reset_index().sort_values(by="count", ascending=False).reset_index(drop=True)
df_sem_types.to_csv("SportsTable_semantic_type_occurrences.csv", index=False)

# Extract additional numerical features like min, max, mean, median ...

In [None]:
import random
import numpy as np
import json
import pandas as pd
from glob import glob
import os
from os.path import join
from dotenv import load_dotenv
load_dotenv(override=True)

sport_domains = ["baseball", "basketball", "football", "hockey", "soccer"]

# Order of the 15 statistics stored in the "features" list of every column
STAT_FEATURE_NAMES = ["mean", "std", "min", "10%", "20%", "30%", "40%", "50%",
                      "60%", "70%", "80%", "90%", "max", "kurtosis", "skewness"]


def _safe_column_stat(df_table, column, stat_name):
    """Return `df_table[column].<stat_name>()`, or NaN if it cannot be computed (e.g. text columns)."""
    try:
        return getattr(df_table[column], stat_name)()
    except Exception:
        return np.nan


result_data = []
for sport_domain in sport_domains:
    # Read the tables of the *current* domain (the directory used to be hard-coded to
    # "baseball", so every domain was filled with baseball tables). Sorted for a
    # deterministic row order in the output file.
    for table_path in sorted(glob(join(os.environ["SportsTables"], sport_domain, "*.csv"))):
        df_table = pd.read_csv(table_path)

        # Calculate numerical statistics of the columns (one row per column after .T)
        df_stats = df_table.describe(
            include="all", percentiles=[.1, .2, .3, .4, .5, .6, .7, .8, .9]).T
        df_stats["kurtosis"] = [_safe_column_stat(df_table, column, "kurtosis") for column in df_stats.index]
        df_stats["skewness"] = [_safe_column_stat(df_table, column, "skew") for column in df_stats.index]

        for id_, row in df_stats.iterrows():
            # reindex yields NaN for statistics that describe() did not produce
            # (tables without any numeric column have no "mean", "std", ...)
            result_data.append([sport_domain, os.path.basename(table_path), id_,
                                row.reindex(STAT_FEATURE_NAMES).values.tolist()])

df_result = pd.DataFrame(result_data, columns=["domain", "table_name", "column_name", "features"])
os.makedirs("./features", exist_ok=True)
df_result.to_csv("./features/SportsTables_numerical_statistic_features.csv", index=False)

In [None]:
# Display the per-column statistics frame built in the previous cell
df_result

# Building heterogeneous Graphs

<img src="https://data.dgl.ai/asset/image/user_guide_graphch_2.png" width="700">

In [None]:
import dgl
import torch as th
# Create a heterograph with 3 node types (drug, gene, disease) and 3 edge types.
# Each entry maps a (source type, edge type, target type) triple to a pair of
# source-node-id / target-node-id tensors.
graph_data = {
   ('drug', 'interacts', 'drug'): (th.tensor([0, 0, 1,2]), th.tensor([1, 2, 2,0])),
   #('drug', 'interacts', 'drug'): (th.tensor([2]), th.tensor([1])),
   ('drug', 'interacts', 'gene'): (th.tensor([0, 1]), th.tensor([2, 3])),
   ('drug', 'treats', 'disease'): (th.tensor([1]), th.tensor([2]))
}
g = dgl.heterograph(graph_data)
# Only the last expression of a cell is rendered, so the next two lines produce no output
g.ntypes
g.etypes
g

In [None]:
from SportsDB_data_loader import SportsTablesDGLDataset_enriched
from transformers import BertTokenizer, BertModel
import torch

# Pretrained BERT encoder and its matching tokenizer
model = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Enriched graph dataset restricted to the baseball domain, column order left unshuffled
train_dataset = SportsTablesDGLDataset_enriched(tokenizer=tokenizer, sport_domains=["baseball"], max_length=10, shuffle_cols=False)

In [None]:
# Display the first graph of the dataset
train_dataset[0]

In [None]:
# Length of the feature vector stored on the first "num_feature_node" of the first graph
len(train_dataset[0].nodes["num_feature_node"].data["data_tensor"][0])

In [None]:
## Heterogenous Graph of SportsTables
from sklearn.preprocessing import LabelEncoder
import dgl
from dgl.data import DGLDataset
import json
import pandas as pd
from glob import glob
import transformers
from torch.utils.data import Dataset
import torch
import os
from os.path import join
from dotenv import load_dotenv
load_dotenv(override=True)
from functools import reduce
import operator
import numpy as np
import random

from SportsDB_data_loader import get_LabelEncoder

label_enc = get_LabelEncoder()

from transformers import BertTokenizer

from ast import literal_eval

# Load the BERT tokenizer (no model is instantiated in this cell)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Load the sherlock features and keep only the identifying columns plus the selected feature set
# NOTE(review): absolute, user-specific path — only works on the original machine
df_sherlock_features = pd.read_csv("/home/slangenecker/sato/extract/out/features/type_SportsTables/SportsTables_type_SportsTables_sherlock_features.csv")
with open("valid_sherlock_features.json", "r") as f:
    selected_sherlock_feature_set = json.load(f)
df_sherlock_features = df_sherlock_features[["locator", "dataset_id", "field_id", "header"]+selected_sherlock_feature_set]

# Load the additional numerical statistics features written by the extraction cell
df_num_stats_features = pd.read_csv("./features/SportsTables_numerical_statistic_features.csv")

# Column metadata and the split definition (seed 1) of the basketball domain
with open(join(os.environ["SportsTables"], "basketball", "metadata.json")) as f:
    metadata = json.load(f)
with open(join(os.environ["SportsTables"], "basketball", f"train_valid_test_split_1.json")) as f:
    train_valid_test_split = json.load(f)
    
table_name = "nba_season_team_stats_2021"
# Find the metadata key contained in the table name; the last matching key wins.
# Initialise first so that a missing match is detected instead of raising NameError
# or silently reusing a key left over from an earlier cell.
table_metadata_key = None
for key in metadata.keys():
    if key in table_name:
        table_metadata_key = key
if table_metadata_key is None:
    # The rest of the cell indexes metadata[table_metadata_key], so fail with a clear message
    raise KeyError(f"CSV {table_name}.csv not in metadata.json defined!")

# Raw table; `df` is replaced further down by the per-column frame
df = pd.read_csv(join(os.environ["SportsTables"], "basketball", "nba_season_team_stats_2021.csv"))

table_metadata = metadata[table_metadata_key]
data_list = []

for i, column_name in enumerate(df.columns):
    # Resolve data type and semantic label; columns without a label are skipped
    if column_name in table_metadata["textual_cols"]:
        column_data_type = "textual"
        column_label = table_metadata["textual_cols"][column_name]
    elif column_name in table_metadata["numerical_cols"]:
        column_data_type = "numerical"
        column_label = table_metadata["numerical_cols"][column_name]
    else:
        print(f"Column {column_name} in {table_name} not labeled in metadata.json!")
        continue

    # All non-null cell values of the column, joined into one whitespace-separated string
    column_values = df.iloc[:, i].dropna().tolist()
    data_list.append([
        table_name,
        column_name,
        column_data_type,
        column_label,
        " ".join(map(str, column_values)),
    ])

# One row per labeled column
df = pd.DataFrame(
    data_list,
    columns=["table_name", "column_name", "columns_data_type", "column_label", "data"],
)
# Token ids per column (padded to a common length, truncated to at most 120 tokens)
df["data_tensor"] = tokenizer(df["data"].tolist(), padding=True, max_length=120, truncation=True)["input_ids"]
# Encoded semantic label of every column as a one-element LongTensor
df["label_tensor"] = df["column_label"].apply(
    lambda label: torch.LongTensor(label_enc.transform([label])))

tokenized_table_name = tokenizer(table_name)["input_ids"]

## build graph
column_ids = df.index.tolist()
textual_node_ids = df.index[df["columns_data_type"] == "textual"].tolist()
numerical_node_ids = df.index[df["columns_data_type"] == "numerical"].tolist()
graph_data = {}

# the single table node (id 0) points to every column node
graph_data[('table', 'to', 'column')] = (
    torch.tensor([0] * len(column_ids)),
    torch.tensor(column_ids),
)

# every textual column points to every numerical column
# (grouped by numerical target, same edge order as nested loops)
text_to_num_pairs = [
    (textual_node_id, numerical_node_id)
    for numerical_node_id in numerical_node_ids
    for textual_node_id in textual_node_ids
]
graph_data[('column', 'to', 'column')] = (
    torch.tensor([source for source, _ in text_to_num_pairs]),
    torch.tensor([target for _, target in text_to_num_pairs]),
)

# one feature node per numerical column; feature node ids run from 0 to
# len(numerical_node_ids) - 1 in the order of numerical_node_ids
graph_data[('num_feature_node', 'to', 'column')] = (
    torch.tensor(list(range(len(numerical_node_ids)))),
    torch.tensor(numerical_node_ids),
)

g = dgl.heterograph(graph_data)

# set features to the nodes
g.nodes["table"].data["data_tensor"] = torch.tensor([tokenized_table_name])
g.nodes["column"].data["data_tensor"] = torch.LongTensor(df["data_tensor"].tolist())
g.nodes["column"].data["label_tensor"] = torch.LongTensor(df["label_tensor"].tolist())

# Feature vector of every numerical column: selected sherlock features followed by the
# numerical statistics, in the same order as the numerical rows of df.
num_features = []
for idx, row in df.iterrows():
    if row['columns_data_type'] != "numerical":
        continue
    # Boolean masks instead of f-string queries: a quote inside a column name would
    # otherwise produce an invalid query expression.
    csv_name = f"{row['table_name']}.csv"
    sherlock_mask = (
        (df_sherlock_features["locator"] == "basketball")
        & (df_sherlock_features["dataset_id"] == csv_name)
        & (df_sherlock_features["header"] == row['column_name'])
    )
    sherlock_feature = df_sherlock_features.loc[sherlock_mask, selected_sherlock_feature_set].iloc[0].tolist()
    num_stats_mask = (
        (df_num_stats_features["domain"] == "basketball")
        & (df_num_stats_features["table_name"] == csv_name)
        & (df_num_stats_features["column_name"] == row['column_name'])
    )
    # The statistics are stored as the string form of a Python list
    num_stats_feature = literal_eval(df_num_stats_features.loc[num_stats_mask, "features"].iloc[0])
    num_features.append(sherlock_feature+num_stats_feature)
g.nodes["num_feature_node"].data["data_tensor"] = torch.tensor(num_features)


# MLP

In [None]:
import torch
import torch.nn as nn

# Toy MLP: a single linear layer mapping 10 inputs to 11 outputs, followed by ReLU
layers = [nn.Linear(10, 11), nn.ReLU()]
model = nn.Sequential(*layers)

# Input vector 0.0, 1.0, ..., 9.0
test = torch.arange(10, dtype=torch.float32)

model(test)

In [None]:
from transformers import BertModel, BertConfig

# Load the pretrained BERT encoder
bert = BertModel.from_pretrained("bert-base-uncased")

# Hidden size stored in the model configuration
bert.config.hidden_size

# GitTables Test

In [None]:
from GitTables_data_loader import get_LabelEncoder
import pyarrow.parquet as pq
import dgl

# Label encoder of the GitTables data loader (rebinds the SportsTables `label_enc` from earlier cells)
label_enc = get_LabelEncoder()

In [None]:
# Number of classes known to the GitTables label encoder
len(label_enc.classes_)

In [None]:
import pandas as pd
import os
from os.path import join
import json

import pyarrow.parquet as pq  # pq.read_schema is used below; was only in scope via an earlier cell
from transformers import BertTokenizer, BertModel
import torch

#model = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

random_state = 1
split="train"

## load train_valid_test split definitions
with open(join(os.environ["GitTables"], "data", f"train_valid_test_split_{random_state}.json")) as f:
    train_valid_test_split = json.load(f)

for idx_table_path, table_path in enumerate(train_valid_test_split[split]):
    # Exploratory cell: only the first table of the split is processed
    if idx_table_path > 0:
        break
    print(table_path)

    # read the GitTables metadata stored in the parquet schema
    table_metadata = json.loads(pq.read_schema(join(os.environ["GitTables"], table_path)).metadata[b"gittables"])
    dbpedia_types = table_metadata["dbpedia_embedding_column_types"]
    dbpedia_similarities = table_metadata["dbpedia_embedding_similarities"]

    # read the table in a DF
    df_table = pd.read_parquet(join(os.environ["GitTables"], table_path))

    data_list = []

    for i, column_name in enumerate(df_table.columns):
        try:
            # in case we want to filter assigned types regarding the similarity score later on
            has_type = dbpedia_similarities[column_name] > 0.0
            column_dtype = table_metadata["dtypes"][column_name]
            column_label = dbpedia_types[column_name]["cleaned_label"]
        except Exception as e:
            print(e)
            print(f"Not considering column: {column_name} from table: {table_path}")
            # Really skip the column; it used to be appended with the type and label
            # left over from the previous column (or raise NameError on the first one).
            continue
        if not has_type:
            continue

        # "object"/"string" dtypes count as textual, everything else as numerical
        column_data_type = "textual" if column_dtype in ("object", "string") else "numerical"

        data_list.append([
            table_path,
            column_name,
            column_data_type,
            column_label,
            " ".join([str(x) for x in df_table.iloc[:, i].dropna().tolist()])
        ])

    df = pd.DataFrame(data_list, columns=[
                            "table_name", "column_name", "columns_data_type", "column_label", "data"])
    if len(df) == 0:
        print(f"Table {table_path} has no columns with assigned semantic types!")
        continue

    df = df.dropna().reset_index(drop=True)

    # token ids per column, truncated to at most 128 + 2 tokens
    df["data_tensor"] = tokenizer([seq for seq in df["data"].tolist()], padding=True, max_length=128 + 2, truncation=True)["input_ids"]
    df["label_tensor"] = df["column_label"].apply(
        lambda x: torch.LongTensor(label_enc.transform([x])).to(torch.device("cpu")))

    # tokenize table name (file name without directory and ".parquet" suffix)
    tokenized_table_name = tokenizer(table_path.split("/")[-1].split(".parquet")[0])["input_ids"]

In [None]:
# Display the per-column frame of the table processed in the previous cell
df

In [None]:
from GitTables_data_loader import GitTablesDGLDataset_enriched, get_LabelEncoder, GitTablesDGLDataset_enriched_shadow_num_nodes
from transformers import BertTokenizer, BertModel
import torch

# Pretrained BERT encoder and its matching tokenizer
model = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# GitTables graph dataset (shadow-numerical-node variant); requests a CUDA device and
# passes force_reload=True
# NOTE(review): force_reload presumably bypasses a cached dataset — confirm in GitTables_data_loader
train_dataset = GitTablesDGLDataset_enriched_shadow_num_nodes(tokenizer=tokenizer, device=torch.device("cuda"), max_length=254, random_state=1, shuffle_cols=False, force_reload=True)

In [None]:
# Display the first graph of the GitTables dataset
train_dataset[0]

In [None]:
# Source and target node ids of the 'provide_info' edges between 'num_column' nodes in the first graph
train_dataset[0].edges(form="uv", etype=('num_column', 'provide_info', 'num_column'))

In [None]:
# "data_tensor" feature stored on the column nodes of the first graph
train_dataset[0].nodes["column"].data["data_tensor"]

In [None]:
import pandas as pd

# Per graph: number of column nodes, number of numerical feature nodes and their difference
results = []
for graph in train_dataset:
    n_columns = len(graph.nodes["column"].data["data_tensor"])
    n_numerical = len(graph.nodes["num_feature_node"].data["data_tensor"])
    results.append([n_columns, n_numerical, n_columns - n_numerical])

df = pd.DataFrame(results, columns=["#cols", "#num_cols", "#text_cols"])

In [None]:
# Tables ordered by their total number of columns
df.sort_values(by="#cols")

In [None]:
# Tables in which at least 80% of the columns are numerical
df[(df["#num_cols"]/df["#cols"] >= 0.8)]

In [None]:
# Same filter, additionally requiring at least one non-numerical column
df[(df["#num_cols"]/df["#cols"] >= 0.8) & (df["#text_cols"] > 0)]

In [None]:
# Tokenize a sample table name, padded/truncated to 10 token ids, and wrap it in a one-row tensor
torch.tensor([tokenizer("Tablename", padding="max_length", max_length=10, truncation=True)["input_ids"]])

In [None]:
def fill_tensor_with_zeros(tensor, desired_length, dim=0):
    """Pad `tensor` with zeros along `dim` up to `desired_length` entries.

    Args:
        tensor: tensor to pad; may have any number of dimensions.
        desired_length: minimum size along `dim` after padding.
        dim: dimension along which the zeros are appended.

    Returns:
        `tensor` itself if it already has at least `desired_length` entries along
        `dim`; otherwise a new tensor with zeros appended, keeping the dtype and
        device of the input.
    """
    current_length = tensor.size(dim)
    if current_length >= desired_length:
        return tensor
    # The padding must match the input in every dimension except `dim`; a shape of
    # zeros in the other dimensions cannot be concatenated to a multi-dimensional tensor.
    pad_shape = list(tensor.shape)
    pad_shape[dim] = desired_length - current_length
    padding = torch.zeros(pad_shape, dtype=tensor.dtype, device=tensor.device)
    return torch.cat((tensor, padding), dim=dim)


# Pad the label tensor of the first graph's column nodes to length 10
fill_tensor_with_zeros(train_dataset[0].nodes["column"].data["label_tensor"],10)

In [None]:
def fill_list_with_zeros(lst, desired_length):
    """Return a copy of `lst` padded with zeros to at least `desired_length` items.

    The input list is left untouched (the previous `lst += ...` extended the
    caller's list in place). Lists that are already long enough are returned as
    an unchanged copy.
    """
    missing = max(0, desired_length - len(lst))
    return list(lst) + [0] * missing

# Example usage
# Pad a three-element list to five elements and print the result
my_list = [1, 2, 3]
desired_length = 5
filled_list = fill_list_with_zeros(my_list, desired_length)
print(filled_list)

In [None]:
import torch

def fill_tensor_with_zeros(tensor, desired_length, dim=0):
    """Pad `tensor` with zeros along `dim` up to `desired_length` entries.

    Args:
        tensor: tensor to pad; may have any number of dimensions.
        desired_length: minimum size along `dim` after padding.
        dim: dimension along which the zeros are appended.

    Returns:
        `tensor` itself if it already has at least `desired_length` entries along
        `dim`; otherwise a new tensor with zeros appended, keeping the dtype and
        device of the input.
    """
    current_length = tensor.size(dim)
    if current_length >= desired_length:
        return tensor
    # The padding must match the input in every dimension except `dim`; a shape of
    # zeros in the other dimensions cannot be concatenated to a multi-dimensional tensor.
    pad_shape = list(tensor.shape)
    pad_shape[dim] = desired_length - current_length
    padding = torch.zeros(pad_shape, dtype=tensor.dtype, device=tensor.device)
    return torch.cat((tensor, padding), dim=dim)

# Example usage
# Pad a three-element tensor to five elements and print the result
my_tensor = torch.tensor([1, 2, 3])
desired_length = 5
filled_tensor = fill_tensor_with_zeros(my_tensor, desired_length)
print(filled_tensor)

In [None]:
# Print every graph whose column label tensor has more than 30 rows.
# Each `batch` is a single item of the dataset, not a collated batch.
for batch in train_dataset:
    if batch.nodes["column"].data["label_tensor"].shape[0] > 30:
        print(batch)

In [None]:
# Combine the first two graphs of the dataset's data_list into one batched graph
dgl.batch([train_dataset.data_list[0], train_dataset.data_list[1]])

## GitTables: inspect individual columns and their values

In [None]:
from GitTables_data_loader import get_LabelEncoder


label_enc = get_LabelEncoder()

## Load Sato's header_valid file to select tables that contain specific semantic types
import pandas as pd
from ast import literal_eval

# Set display options to show all rows and columns
# (global pandas settings: they stay in effect for every later cell)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# NOTE(review): absolute, machine-specific path
df_header_valid = pd.read_csv("/ext/daten-wi/slangenecker/sato/extract/out/headers/type_gittables/GitTables_type_gittables_header_valid.csv")

# Both columns are stored as the string form of Python lists; parse them back into lists
df_header_valid["field_list"] = df_header_valid["field_list"].apply(literal_eval)
df_header_valid["field_names"] = df_header_valid["field_names"].apply(literal_eval)
# Decode the encoded labels in field_names back to their semantic type names
df_header_valid["field_names"] = df_header_valid["field_names"].apply(label_enc.inverse_transform)

In [None]:
# Semantic type to look for (compared against the lower-cased type names)
search_semantic_type = "long name_nt"

# Convert the field_names column to lowercase to make the search case-insensitive
# (overwrites the column in place; re-running the cell is harmless)
df_header_valid['field_names'] = df_header_valid['field_names'].apply(lambda x: [item.lower() for item in x])

# Keep the rows whose field names contain the searched semantic type.
# The variable name is a leftover from an earlier search for 'death date'.
death_date_rows = df_header_valid[df_header_valid['field_names'].apply(lambda x: search_semantic_type in x)]

death_date_rows

In [None]:
# Lower-case the semantic type names so the lookup below is case-insensitive
df_header_valid['field_names'] = df_header_valid['field_names'].apply(
    lambda names: [name.lower() for name in names])

# For every table that contains the searched semantic type, remember the entry of
# field_list at the position where the type first occurs in field_names
filtered_rows = []
for index, row in df_header_valid.iterrows():
    type_names = row['field_names']
    if search_semantic_type not in type_names:
        continue
    matched_column = row['field_list'][type_names.index(search_semantic_type)]
    filtered_rows.append((row["locator"], row["dataset_id"], matched_column))

# One row per matching table
filtered_df = pd.DataFrame(filtered_rows, columns=['locator', "dataset_id", 'column'])

filtered_df.head(5)

In [None]:
import pandas as pd
# Show full cell contents and all rows when frames are displayed
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
import pyarrow.parquet as pq
import json
from os.path import join

# For every matching table: dtype, semantic type, similarity score and values of the matched column
data_list = []
for index, row in filtered_df.iterrows():
    table_file = join("/ext/daten-wi/slangenecker/gittables/data/", row["locator"], row["dataset_id"])

    # GitTables metadata is stored in the parquet schema
    table_metadata = json.loads(pq.read_schema(table_file).metadata[b"gittables"])
    dbpedia_types = table_metadata["dbpedia_embedding_column_types"]
    dbpedia_similarities = table_metadata["dbpedia_embedding_similarities"]

    df_table = pd.read_parquet(table_file)

    # row["column"] is a positional column index; resolve it to the column name once
    column_idx = row["column"]
    column_name = df_table.columns[column_idx]
    data_list.append([
        row["locator"],
        row["dataset_id"],
        column_idx,
        table_metadata["dtypes"][column_name],
        dbpedia_types[column_name]["cleaned_label"],
        dbpedia_similarities[column_name],
        df_table.iloc[:, column_idx].tolist(),
    ])

df_result = pd.DataFrame(
    data_list,
    columns=["locator", "table_name", "column", "dtype", "semantic_type", "sem_type_sim" , "data"],
)

In [None]:
# Display the collected columns (includes the full value list of every matched column)
df_result

In [None]:
# dtype recorded in the metadata for the matched column of the last table processed by the loop above
table_metadata["dtypes"][df_table.columns[row["column"]]]

In [None]:
# Cleaned DBpedia label of that same column
dbpedia_types[df_table.columns[row["column"]]]["cleaned_label"]

In [None]:
# Similarity score recorded for that same column
dbpedia_similarities[df_table.columns[row["column"]]]

In [None]:
# Read the GitTables metadata of one specific parquet file and list its top-level keys
# NOTE(review): absolute, machine-specific path
table_metadata = json.loads(pq.read_schema("/ext/daten-wi/slangenecker/gittables//data/object_tables/query_51_9.parquet").metadata[b"gittables"])
table_metadata.keys()

In [None]:
import torch

In [None]:
textual_node_ids = [0,1]
numerical_node_ids = [2,3,4]

num_textual = len(textual_node_ids)
num_numerical = len(numerical_node_ids)

# Edges from every textual column to every numerical column, expressed with local
# indices (0..n-1 within each group). The source pattern is tiled while the target
# pattern is repeated element-wise, which yields the full cross product for any
# group sizes; tiling both sides only covers all pairs when the sizes are coprime.
textual_to_numerical_source = torch.arange(num_textual, dtype=torch.int64).repeat(num_numerical)
textual_to_numerical_target = torch.arange(num_numerical, dtype=torch.int64).repeat_interleave(num_textual)


In [None]:
# Display the source indices of the textual-to-numerical edges
textual_to_numerical_source

In [None]:
import torch
numerical_node_ids = [2, 3, 4]

# num to num columns: every ordered pair of distinct numerical nodes
# (a node is never connected to itself)
numerical_pairs = [
    (source_id, target_id)
    for source_id in numerical_node_ids
    for target_id in numerical_node_ids
    if source_id != target_id
]
numerical_to_numerical_source = torch.tensor([source_id for source_id, _ in numerical_pairs]).to(torch.int64)
numerical_to_numerical_target = torch.tensor([target_id for _, target_id in numerical_pairs]).to(torch.int64)

In [None]:
# Display the source node ids of the numerical-to-numerical edges
numerical_to_numerical_source

In [None]:
# Display the target node ids of the numerical-to-numerical edges
numerical_to_numerical_target

In [6]:
from dotenv import load_dotenv
load_dotenv(override=True)
from os.path import join
import pandas as pd
import json
import random
from transformers import BertTokenizer, BertModel
import torch
from random import randrange
import os

#model = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Configuration for this exploratory run.
sport_domains = ["basketball"]
random_state = 1
split = "train"
shuffle_cols = True

# Load the GPT-generated column names. The mapping is keyed by semantic label and
# is sampled from in a later cell, so it is always loaded here.
with open(join(os.environ["MAIN_DIR"],"gpt","SportsTables_semantic_type_abbreviations.json")) as f:
    gpt_colnames = json.load(f)


# Load the sherlock features and keep only the identifying columns plus the
# feature names listed in valid_sherlock_features.json.
# NOTE(review): the CSV path below is an absolute, user-specific location.
df_sherlock_features = pd.read_csv("/home/slangenecker/sato/extract/out/features/type_SportsTables/SportsTables_type_SportsTables_sherlock_features.csv")
with open(join(os.environ["MAIN_DIR"], "data_loader","valid_sherlock_features.json"), "r") as f:
    selected_sherlock_feature_set = json.load(f)
df_sherlock_features = df_sherlock_features[["locator", "dataset_id", "field_id", "header"]+selected_sherlock_feature_set]

# Load the additional numerical statistics features.
df_num_stats_features = pd.read_csv(join(os.environ["MAIN_DIR"],"data_loader", "features","SportsTables_numerical_statistic_features.csv"))

# Walk the tables of the selected split and build, per table, a frame with one row
# per labeled column (table name, header, data type, semantic label, serialized values).
# The two index guards below restrict the run to the first sport domain and the
# first table of the split.
data_list = []
for idx_sport_domain, sport_domain in enumerate(sport_domains):
    if idx_sport_domain > 0:
        break
    # metadata.json holds the semantic types of the columns; the split file lists
    # the tables that belong to each of train/valid/test for this random_state.
    with open(join(os.environ["SportsTables"], sport_domain, "metadata.json")) as f:
        metadata = json.load(f)
    with open(join(os.environ["SportsTables"], sport_domain, f"train_valid_test_split_{random_state}.json")) as f:
        train_valid_test_split = json.load(f)

    for idx_table_path, table_name_full in enumerate(train_valid_test_split[split]):
        if idx_table_path != 0:
            continue
        table_name = table_name_full.split("/")[-1].split(".csv")[0]

        # Find the metadata key for this table: a key matches when it is a substring
        # of the table name. If several keys match, the last one iterated wins.
        table_metadata_key = None
        for key in metadata.keys():
            if key in table_name:
                table_metadata_key = key
        if table_metadata_key is None:
            print(f"CSV {table_name_full} not in metadata.json defined!")
            continue
        table_meta = metadata[table_metadata_key]

        # The raw table gets its own name so that `df` only ever refers to the
        # derived per-column frame that the following cells work on.
        df_raw = pd.read_csv(join(os.environ["SportsTables"], sport_domain, table_name_full))
        data_list = []

        # Column positions to visit, optionally in random order. The shuffle is
        # intentionally left unseeded here, so the order differs between runs.
        column_list = list(range(len(df_raw.columns)))
        if shuffle_cols:
            #random.seed(random_state)
            random.shuffle(column_list)

        for i in column_list:
            column_name = df_raw.columns[i]
            # Look up the column's data type and semantic label in the metadata;
            # columns that are not labeled are reported and skipped.
            if column_name in table_meta["textual_cols"]:
                column_data_type = 0 # => "textual"
                column_label = table_meta["textual_cols"][column_name]
            elif column_name in table_meta["numerical_cols"]:
                column_data_type = 1 # => "numerical"
                column_label = table_meta["numerical_cols"][column_name]
            else:
                print(f"Column {column_name} in {table_name} not labeled in metadata.json!")
                continue

            # assign a GPT-generated column name if enabled: randomly select one name out of the 10 provided
            # if True:
            #     column_name = gpt_colnames[column_label][randrange(10)]

            # Serialize the column: non-null values as strings, joined by single spaces.
            serialized_values = " ".join(
                str(x) for x in df_raw.iloc[:, i].dropna().tolist())
            data_list.append([
                table_name,  # table name
                column_name,  # column name
                column_data_type,
                column_label,
                serialized_values,
            ])

        df = pd.DataFrame(data_list, columns=[
                                "table_name", "column_name", "columns_data_type", "column_label", "data"])
        if len(df) == 0:
            print(f"Table {table_name} has no columns with assigned semantic types!")
            continue
        # df["data_tensor"] = df["data"].apply(
        #     lambda x: torch.LongTensor(
        #         tokenizer.encode(
        #             x, return_tensors="pt", add_special_tokens=True, max_length=max_length + 2, truncation=True)).to(
        #                 device)).tolist()

        # Drop rows with missing entries and renumber the remaining rows from 0.
        df = df.dropna().reset_index(drop=True)

        ## assign numerical feature set to the columns

        #df["data_tensor"] = tokenizer([seq for seq in df["data"].tolist()], padding=True, max_length=max_length + 2, truncation=True)["input_ids"]
        #df["data_tensor"] = tokenizer([seq for seq in df["data"].tolist()], padding="max_length", max_length=max_length + 2, truncation=True)["input_ids"]
        #df["label_tensor"] = df["column_label"].apply(
        #    lambda x: torch.LongTensor(label_enc.transform([x])).to(device))

        # tokenize table name
        #tokenized_table_name = tokenizer(table_name)["input_ids"]
        #tokenized_table_name = tokenizer(table_name, padding="max_length", max_length=20, truncation=True)["input_ids"]


In [7]:
# Display the per-column frame produced by the loading cell above.
df

Unnamed: 0,table_name,column_name,columns_data_type,column_label,data
0,nba_season_team_stats_2021,Team,0,basketball.team.name,New York Knicks* Los Angeles Lakers* Utah Jazz...
1,nba_season_team_stats_2021,FG%,1,basketball.team.field_goals_percentage,0.44 0.46 0.447 0.464 0.459 0.453 0.467 0.469 ...
2,nba_season_team_stats_2021,DRB,1,basketball.team.defensive_rebounds_per_game,34.4 33.2 32.8 32.0 33.4 33.2 33.3 32.1 34.2 3...
3,nba_season_team_stats_2021,PF,1,basketball.team.fouls_per_game,17.9 21.3 19.0 18.1 19.6 21.0 18.0 19.2 20.1 2...
4,nba_season_team_stats_2021,MP,1,basketball.team.minutes_per_game,242.1 242.4 241.0 240.0 241.4 242.1 242.8 242....
5,nba_season_team_stats_2021,TRB,1,basketball.team.rebounds_per_game,44.1 42.0 42.6 41.2 42.9 42.9 42.7 41.2 43.9 4...
6,nba_season_team_stats_2021,3P,1,basketball.team.3-point_field_goals_per_game,12.0 11.4 10.9 11.6 14.3 11.9 11.6 13.0 12.8 1...
7,nba_season_team_stats_2021,FT%,1,basketball.team.free_throws_percentage,0.761 0.776 0.768 0.777 0.782 0.764 0.778 0.77...
8,nba_season_team_stats_2021,TOV,1,basketball.team.turnovers_per_game,12.8 15.2 11.5 12.8 15.1 15.6 13.6 14.1 12.6 1...
9,nba_season_team_stats_2021,FT,1,basketball.team.free_throws_per_game,16.9 15.9 14.5 16.1 15.6 17.4 16.8 17.1 17.2 1...


In [11]:
# Scratch check: repeating a one-element list 25 times yields a list of 25 zeros.
25*[0]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [14]:
# Attach one randomly chosen GPT-generated name to every column, looked up by the
# column's semantic label. random.choice draws from however many candidates a label
# actually has, instead of assuming exactly 10 entries (fewer would raise an
# IndexError for some draws, more would never be selected).
df["gpt_colname"] = [random.choice(gpt_colnames[collabel]) for collabel in df["column_label"].tolist()]

In [29]:
# Prefix each column's serialized values with its header, separated by one space.
# NOTE: this overwrites df["data"] in place, so running the cell a second time
# prepends the header again.
df["data"] = df["column_name"].str.cat(df["data"], sep=" ")
df

Unnamed: 0,table_name,column_name,columns_data_type,column_label,data
0,nba_season_team_stats_2021,FG,1,basketball.team.field_goals_per_game,FG 37.9 39.8 40.9 40.1 39.1 39.4 40.5 40.0 40....
1,nba_season_team_stats_2021,FGA,1,basketball.team.field_goals_attempts_per_game,FGA 86.2 86.4 91.4 86.4 85.2 86.9 86.8 85.4 87...
2,nba_season_team_stats_2021,FT,1,basketball.team.free_throws_per_game,FT 16.9 15.9 14.5 16.1 15.6 17.4 16.8 17.1 17....
3,nba_season_team_stats_2021,3PA,1,basketball.team.3-point_field_goals_attempts_p...,3PA 35.6 32.2 31.8 32.5 39.1 33.0 32.8 35.7 35...
4,nba_season_team_stats_2021,MP,1,basketball.team.minutes_per_game,MP 242.1 242.4 241.0 240.0 241.4 242.1 242.8 2...
5,nba_season_team_stats_2021,TOV,1,basketball.team.turnovers_per_game,TOV 12.8 15.2 11.5 12.8 15.1 15.6 13.6 14.1 12...
6,nba_season_team_stats_2021,AST,1,basketball.team.assists_per_game,AST 23.6 24.7 22.3 22.9 25.8 23.5 22.9 25.9 22...
7,nba_season_team_stats_2021,FG%,1,basketball.team.field_goals_percentage,FG% 0.44 0.46 0.447 0.464 0.459 0.453 0.467 0....
8,nba_season_team_stats_2021,ORB,1,basketball.team.offensive_rebounds_per_game,ORB 9.7 8.8 9.8 9.2 9.5 9.8 9.4 9.1 9.8 10.0 9...
9,nba_season_team_stats_2021,FTA,1,basketball.team.three_throw_attempts_per_game,FTA 22.2 20.5 18.9 20.7 19.9 22.8 21.7 22.1 22...


In [31]:
# Tokenize every serialized column in one batch, truncating to the token budget and
# padding shorter sequences to the longest one in the batch; keep only the input ids.
token_budget = 10 + 2  # 10 tokens plus 2 extra, presumably for the special tokens; confirm
encoded_columns = tokenizer(
    df["data"].tolist(),
    padding=True,
    truncation=True,
    max_length=token_budget,
)
df["data_tensor"] = encoded_columns["input_ids"]
df

Unnamed: 0,table_name,column_name,columns_data_type,column_label,data,data_tensor
0,nba_season_team_stats_2021,FG,1,basketball.team.field_goals_per_game,FG 37.9 39.8 40.9 40.1 39.1 39.4 40.5 40.0 40....,"[101, 1042, 2290, 4261, 1012, 1023, 4464, 1012..."
1,nba_season_team_stats_2021,FGA,1,basketball.team.field_goals_attempts_per_game,FGA 86.2 86.4 91.4 86.4 85.2 86.9 86.8 85.4 87...,"[101, 1042, 3654, 6564, 1012, 1016, 6564, 1012..."
2,nba_season_team_stats_2021,FT,1,basketball.team.free_throws_per_game,FT 16.9 15.9 14.5 16.1 15.6 17.4 16.8 17.1 17....,"[101, 3027, 2385, 1012, 1023, 2321, 1012, 1023..."
3,nba_season_team_stats_2021,3PA,1,basketball.team.3-point_field_goals_attempts_p...,3PA 35.6 32.2 31.8 32.5 39.1 33.0 32.8 35.7 35...,"[101, 1017, 4502, 3486, 1012, 1020, 3590, 1012..."
4,nba_season_team_stats_2021,MP,1,basketball.team.minutes_per_game,MP 242.1 242.4 241.0 240.0 241.4 242.1 242.8 2...,"[101, 6131, 22431, 1012, 1015, 22431, 1012, 10..."
5,nba_season_team_stats_2021,TOV,1,basketball.team.turnovers_per_game,TOV 12.8 15.2 11.5 12.8 15.1 15.6 13.6 14.1 12...,"[101, 2000, 2615, 2260, 1012, 1022, 2321, 1012..."
6,nba_season_team_stats_2021,AST,1,basketball.team.assists_per_game,AST 23.6 24.7 22.3 22.9 25.8 23.5 22.9 25.9 22...,"[101, 2004, 2102, 2603, 1012, 1020, 2484, 1012..."
7,nba_season_team_stats_2021,FG%,1,basketball.team.field_goals_percentage,FG% 0.44 0.46 0.447 0.464 0.459 0.453 0.467 0....,"[101, 1042, 2290, 1003, 1014, 1012, 4008, 1014..."
8,nba_season_team_stats_2021,ORB,1,basketball.team.offensive_rebounds_per_game,ORB 9.7 8.8 9.8 9.2 9.5 9.8 9.4 9.1 9.8 10.0 9...,"[101, 19607, 1023, 1012, 1021, 1022, 1012, 102..."
9,nba_season_team_stats_2021,FTA,1,basketball.team.three_throw_attempts_per_game,FTA 22.2 20.5 18.9 20.7 19.9 22.8 21.7 22.1 22...,"[101, 3027, 2050, 2570, 1012, 1016, 2322, 1012..."
