In [4]:
import json

# Path of the JSON file to inspect.
# The commented-out assignments below are presumably alternative inputs kept
# for quick switching; only the uncommented assignment takes effect.
# file_path = "Spider-Syn-main/Spider-Syn/dev.json"
# file_path = "Spider-Syn-main/preprocessed_dataset/tables.json"
# file_path = "Spider-Syn-main/preprocessed_dataset/train_spider.json"
file_path = "Spider-Syn-main/Spider-Syn/dev.json"


# Load the JSON file; the parsed content is kept in `data`.
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Analyse the structure (recursively collect all keys and value types).
def extract_structure(obj):
    """Return a skeleton describing the shape of a parsed JSON value.

    - A dict becomes a dict with the same keys, each value replaced by its
      own skeleton.
    - A non-empty list becomes a one-element list holding the skeleton of
      its first item only (the remaining items are not inspected).
    - Anything else, including an empty list, becomes the name of its type
      as a string (e.g. ``"str"``, ``"int"``, ``"list"``).
    """
    if isinstance(obj, dict):
        skeleton = {}
        for key, value in obj.items():
            skeleton[key] = extract_structure(value)
        return skeleton

    if isinstance(obj, list) and obj:
        # Only the first element is used as the representative of the list.
        representative = obj[0]
        return [extract_structure(representative)]

    return type(obj).__name__
    

# Compute the structure summary of the loaded data and show it as the cell's
# result (bare last expression).
structure = extract_structure(data)
structure


[{'db_id': 'str',
  'SpiderQuestion': 'str',
  'SpiderSynQuestion': 'str',
  'query': 'str'}]

In [5]:
from src.flemmings_classes import QuestionAnswerMapping, TableMapping, ColumnMapping, MappingDB

In [6]:
from src.flemmings_functions import load_dev_mappings, custom_encoder

In [7]:
# Load the mappings from dev.json via load_dev_mappings and report how many
# objects were returned.
# NOTE(review): load_dev_mappings is imported from src.flemmings_functions and
# its implementation is not visible here.
if __name__ == "__main__":
    file_path_qa = "Spider-Syn-main/Spider-Syn/dev.json"
    question_answer_mappings = load_dev_mappings(file_path_qa)
    print(f"Loaded {len(question_answer_mappings)} QuestionAnswerMapping objects.")


Loaded 1034 QuestionAnswerMapping objects.


In [8]:
# Load the mappings from dev.json again and pretty-print one element as JSON.
# Note: this rebinds the notebook-level names `file_path` and
# `question_answer_mappings`; the path is the same dev.json file that is
# assigned to `file_path_qa` elsewhere in this notebook.
if __name__ == "__main__":
    file_path = "Spider-Syn-main/Spider-Syn/dev.json"
    question_answer_mappings = load_dev_mappings(file_path)
    print(f"Loaded {len(question_answer_mappings)} QuestionAnswerMapping objects.")

    # The element at index 19 is used as the example.
    example_mapping = question_answer_mappings[19]

    # Serialize the example object to a JSON formatted string;
    # custom_encoder is the fallback for values json cannot encode natively.
    json_str = json.dumps(example_mapping, default=custom_encoder, indent=2, ensure_ascii=False)
    print("Example QuestionAnswerMapping object:")
    print(json_str)


Loaded 1034 QuestionAnswerMapping objects.
Example QuestionAnswerMapping object:
{
  "db_id": "concert_singer",
  "spider_question": "What is the name and capacity for the stadium with the highest average attendance?",
  "spider_syn_question": "What is the name and number of seats for the stadium with the highest average attendance?",
  "query": "SELECT name ,  capacity FROM stadium ORDER BY average DESC LIMIT 1",
  "generated_query": null,
  "spider_syn_embedding": null
}


In [9]:
import json
from typing import Dict, List

# Assuming these classes are already defined:
# TableMapping, ColumnMapping, MappingDB

from src.flemmings_functions import load_mapping_dbs


# Build `mapping_db_dict` from tables.json via load_mapping_dbs.
# NOTE(review): load_mapping_dbs is imported from src.flemmings_functions and
# not visible here; other cells call .keys()/.values() on the result and index
# it by key, so it is presumably a dict keyed by db_id — confirm.
if __name__ == "__main__":
    file_path = "Spider-Syn-main/preprocessed_dataset/tables.json"
    mapping_db_dict = load_mapping_dbs(file_path)


Erstellte MappingDB-Objekte: 166


In [11]:
import json

# Pick an example object: the first value yielded by mapping_db_dict.values().
example_db = next(iter(mapping_db_dict.values()))

# Custom fallback function for the JSON encoder.
def custom_encoder(obj):
    """Fallback for ``json.dumps(..., default=custom_encoder)``.

    Returns the object's ``__dict__`` when it has one, so that json can encode
    its attributes. Objects without a ``__dict__`` are rejected with a
    ``TypeError``; nothing else is converted by this function.
    """
    if not hasattr(obj, '__dict__'):
        raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable")
    return obj.__dict__

# Serialize the example object, using custom_encoder as the fallback for
# values that json cannot encode natively, and print the result.
json_str = json.dumps(example_db, default=custom_encoder, indent=2, ensure_ascii=False)
print(json_str)


{
  "db_id": "perpetrator",
  "table_mappings": [
    {
      "table_name": "perpetrator",
      "original_table_name": "perpetrator"
    },
    {
      "table_name": "people",
      "original_table_name": "people"
    }
  ],
  "column_mappings": [
    {
      "table_name": "",
      "column_name": "*",
      "original_column_name": "*"
    },
    {
      "table_name": "perpetrator",
      "column_name": "perpetrator id",
      "original_column_name": "Perpetrator_ID"
    },
    {
      "table_name": "perpetrator",
      "column_name": "people id",
      "original_column_name": "People_ID"
    },
    {
      "table_name": "perpetrator",
      "column_name": "date",
      "original_column_name": "Date"
    },
    {
      "table_name": "perpetrator",
      "column_name": "year",
      "original_column_name": "Year"
    },
    {
      "table_name": "perpetrator",
      "column_name": "location",
      "original_column_name": "Location"
    },
    {
      "table_name": "perpetrator",
     

In [12]:
# Print every key of mapping_db_dict.
# NOTE(review): the label says "Tabellen" (tables), but the keys look like
# database ids (compare the db_id values printed elsewhere) — confirm wording.
print(f"Es existieren folgende Tabellen:\n{list(mapping_db_dict.keys())}\n\n")

# Print the total number of keys.
print(f"Anzahl Tabellen insgesamt:\n{len(list(mapping_db_dict.keys()))}")

Es existieren folgende Tabellen:
['perpetrator', 'college_2', 'flight_company', 'icfp_1', 'body_builder', 'storm_record', 'pilot_record', 'race_track', 'academic', 'department_store', 'music_4', 'insurance_fnol', 'cinema', 'decoration_competition', 'phone_market', 'store_product', 'assets_maintenance', 'student_assessment', 'dog_kennels', 'music_1', 'company_employee', 'farm', 'solvency_ii', 'city_record', 'swimming', 'flight_2', 'election', 'manufactory_1', 'debate', 'network_2', 'local_govt_in_alabama', 'climbing', 'e_learning', 'scientist_1', 'ship_1', 'entertainment_awards', 'allergy_1', 'imdb', 'products_for_hire', 'candidate_poll', 'chinook_1', 'flight_4', 'pets_1', 'dorm_1', 'journal_committee', 'flight_1', 'medicine_enzyme_interaction', 'local_govt_and_lot', 'station_weather', 'shop_membership', 'driving_school', 'concert_singer', 'music_2', 'sports_competition', 'railway', 'inn_1', 'museum_visit', 'browser_web', 'baseball_1', 'architecture', 'csu_1', 'tracking_orders', 'insura

In [13]:
# Example: display the fields of one QuestionAnswerMapping object.
# Note: despite the name `first_qam`, this is the entry at index 912,
# not the first element of the list.
first_qam = question_answer_mappings[912]

print("=== QuestionAnswerMapping ===")
print(f"DB ID: {first_qam.db_id}")
print(f"Originalfrage (Spider): {first_qam.spider_question}")
print(f"Synthetisierte Frage (SpiderSyn): {first_qam.spider_syn_question}")
print(f"Query: {first_qam.query}")
print(f"Generated Query: {first_qam.generated_query}")
print(f"Embedding (SpiderSyn): {first_qam.spider_syn_embedding}")


=== QuestionAnswerMapping ===
DB ID: network_1
Originalfrage (Spider): How many likes does Kyle have?
Synthetisierte Frage (SpiderSyn): How many interests does Kyle have?
Query: SELECT count(*) FROM Likes AS T1 JOIN Highschooler AS T2 ON T1.student_id  =  T2.id WHERE T2.name  =  "Kyle"
Generated Query: None
Embedding (SpiderSyn): None


In [15]:
# Fetch one MappingDB object from the dictionary.
# Note: despite the `first_` prefix, the key at position 19 of the key list is
# used, not the first one.
first_db_id = list(mapping_db_dict.keys())[19]
first_mapping_db = mapping_db_dict[first_db_id]

print("=== MappingDB ===")
print(f"DB ID: {first_mapping_db.db_id}")

# One line per table mapping: table name plus its original name.
print("\nTabellen-Mappings:")
for table in first_mapping_db.table_mappings:
    print(f"- {table.table_name} (Original: {table.original_table_name})")

# One line per column mapping: column name, original name and owning table.
print("\nSpalten-Mappings:")
for column in first_mapping_db.column_mappings:
    print(f"- {column.column_name} (Original: {column.original_column_name}, Tabelle: {column.table_name})")

# Only the keys of the embedding containers are printed, not the values.
print("\nTable-Embeddings (nur Keys):")
print(list(first_mapping_db.table_embeddings.keys()))

print("\nColumn-Embeddings (nur Keys):")
print(list(first_mapping_db.column_embeddings.keys()))


=== MappingDB ===
DB ID: music_1

Tabellen-Mappings:
- genre (Original: genre)
- artist (Original: artist)
- files (Original: files)
- song (Original: song)

Spalten-Mappings:
- * (Original: *, Tabelle: )
- genre name (Original: g_name, Tabelle: genre)
- rating (Original: rating, Tabelle: genre)
- most popular in (Original: most_popular_in, Tabelle: genre)
- artist name (Original: artist_name, Tabelle: artist)
- country (Original: country, Tabelle: artist)
- gender (Original: gender, Tabelle: artist)
- preferred genre (Original: preferred_genre, Tabelle: artist)
- song id (Original: f_id, Tabelle: files)
- artist name (Original: artist_name, Tabelle: files)
- file size (Original: file_size, Tabelle: files)
- duration (Original: duration, Tabelle: files)
- formats (Original: formats, Tabelle: files)
- song name (Original: song_name, Tabelle: song)
- artist name (Original: artist_name, Tabelle: song)
- country (Original: country, Tabelle: song)
- song id (Original: f_id, Tabelle: song)
-