In [1]:
import json, os
# Copy every entry of the "Values" section of local.settings.json into the
# process environment so later cells can read them through os.environ.
# The file is opened in a `with` block so the handle is closed as soon as the
# JSON has been parsed (the bare open() call previously leaked it).
with open("local.settings.json") as settings_file:
    settings_values = json.load(settings_file)["Values"]

for k, v in settings_values.items():
    # os.environ only accepts str values; the settings are assumed to be
    # strings already -- TODO confirm if non-string values are ever stored.
    os.environ[k] = v

In [2]:
from libs.data import register_binding, from_bind
import os 
# One entry per data binding: (binding name, env var holding the URL, schemas).
binding_specs = (
    ("fsq", "DATABIND_SQL_FOURSQUARE", ["dbo"]),
    ("legato", "DATABIND_SQL_KEYSTONE", ["poi"]),
)

for bind_name, url_env_var, bind_schemas in binding_specs:
    # A truthy from_bind() result is taken to mean the binding already
    # exists, which keeps this cell safe to re-run in the same kernel.
    if from_bind(bind_name):
        continue
    # The environment variable is only read when registration is needed.
    register_binding(
        bind_name,
        "Structured",
        "sql",
        url=os.environ[url_env_var],
        schemas=bind_schemas,
    )

In [None]:
import json
import pandas as pd

# Source (fsq) and destination (legato) connections, obtained from the
# bindings registered earlier in the notebook.
fsq_conn = from_bind("fsq").connect().connection()
legato_conn = from_bind("legato").connect().connection()

# Query that feeds the copy; it must return every column to be transferred,
# including the unique id.
sql_query = "SELECT DISTINCT * FROM dbo.poi"

# Streaming state: rows per batch, running row count, and a flag marking the
# first batch.
chunksize = 100_000
total_records_inserted = 0
first_chunk = True

# Lookup tables accumulated across all batches:
#   categories_global: category_id -> category_label
#   chains_global:     chain_id    -> chain_name
categories_global, chains_global = {}, {}

# Many-to-many links accumulated across all batches:
#   poi_categories_set: {(poi_id, category_id), ...}
#   poi_chains_set:     {(poi_id, chain_id), ...}
poi_categories_set, poi_chains_set = set(), set()

# Stream the data in chunks
def _parse_json_list(raw):
    """Decode a JSON-encoded column value into a list.

    Returns the decoded value when it is a list. A bare JSON string or
    integer is wrapped in a one-element list instead of being iterated
    character by character (str) or raising inside zip() (int) --
    NOTE(review): this assumes a scalar means "exactly one entry"; confirm
    against the source data. Undecodable input and every other JSON type
    yield an empty list.
    """
    try:
        parsed = json.loads(raw)
    except (TypeError, ValueError):
        # Best effort: a value that is not valid JSON (json.JSONDecodeError
        # is a ValueError) or not str/bytes (TypeError) contributes nothing.
        return []
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, (str, int)) and not isinstance(parsed, bool):
        return [parsed]
    return []


def _collect_pairs(poi_id, raw_ids, raw_names, lookup, links):
    """Record the id/name pairs of one row into `lookup` and `links`.

    `lookup` keeps the first name seen for each id; `links` receives a
    (poi_id, id) tuple for every pair. Nothing is recorded when either raw
    value is null. zip() stops at the shorter list, so ids without a
    matching name (or the reverse) are dropped.
    """
    if not (pd.notnull(raw_ids) and pd.notnull(raw_names)):
        return
    for key, name in zip(_parse_json_list(raw_ids), _parse_json_list(raw_names)):
        lookup.setdefault(key, name)
        links.add((poi_id, key))


def process_row(row):
    """Collect the category and chain mappings of a single POI row.

    `row` must support row['id'] and row.get(...), as the pandas Series
    passed by DataFrame.apply(axis=1) does. Updates the module-level
    containers categories_global / poi_categories_set and
    chains_global / poi_chains_set in place and returns None.
    """
    poi_id = row['id']  # assuming the primary key is "id"
    _collect_pairs(
        poi_id,
        row.get('fsq_category_ids'),
        row.get('fsq_category_labels'),
        categories_global,
        poi_categories_set,
    )
    _collect_pairs(
        poi_id,
        row.get('fsq_chain_id'),
        row.get('fsq_chain_name'),
        chains_global,
        poi_chains_set,
    )


# Stream the data in chunks
for chunk in pd.read_sql(sql=sql_query, con=fsq_conn, chunksize=chunksize):
    chunk = chunk.rename(columns={"fsq_id": "id"})
    # Convert the date columns explicitly to Python date objects
    for col in ['date_created', 'date_refreshed', 'date_closed']:
        if col in chunk.columns:
            chunk[col] = pd.to_datetime(chunk[col]).dt.date

    # Accumulate category/chain lookups and POI links for this chunk.
    chunk.apply(process_row, axis=1)

    # Write the main POI data to SQL.
    # The first chunk will replace the table; subsequent chunks will append.
    chunk.to_sql(
        name="foursquare",
        con=legato_conn,
        schema="poi",
        if_exists="replace" if first_chunk else "append",
        index=False
    )

    # Print progress information
    records_in_chunk = len(chunk)
    total_records_inserted += records_in_chunk
    print(f"Inserted {records_in_chunk} records in this iteration. Total inserted so far: {total_records_inserted}.")

    first_chunk = False

# After processing all chunks, create DataFrames for the related tables.
# Every frame is built with explicit column names so that it still has the
# right columns when nothing was collected; a DataFrame built from an empty
# list of dicts has no columns at all.

# Categories table: unique category_id and category_label pairs
categories_df = pd.DataFrame(
    list(categories_global.items()),
    columns=["category_id", "category_label"],
)

# Many-to-many mapping for POI to categories
poi_categories_df = pd.DataFrame(list(poi_categories_set), columns=["poi_id", "category_id"])

# Chains table: unique chain_id and chain_name pairs
chains_df = pd.DataFrame(
    list(chains_global.items()),
    columns=["chain_id", "chain_name"],
)

# Many-to-many mapping for POI to chains
poi_chains_df = pd.DataFrame(list(poi_chains_set), columns=["poi_id", "chain_id"])

# Persist the lookup and link tables in the "poi" schema. Each table is
# written with if_exists="replace", in the order listed here.
related_tables = (
    ("foursquare_categories", categories_df),
    ("foursquare_poi_categories", poi_categories_df),
    ("foursquare_chains", chains_df),
    ("foursquare_poi_chains", poi_chains_df),
)

for table_name, frame in related_tables:
    frame.to_sql(
        name=table_name,
        con=legato_conn,
        schema="poi",
        if_exists="replace",
        index=False,
    )

# Commit on the destination connection once all four tables are written.
legato_conn.commit()
print("Completed processing related tables for categories and chains.")
