#### 1. Importing libraries and setting up initial configurations

In [9]:
import duckdb
import os

In [10]:
# Path to the DuckDB database file that is opened read-write below.
DB_FILE = '../../dataset/supervised_dataset.duckdb'

# Name of the table that will be created (or replaced) with sequence IDs attached.
NEW_TABLE_NAME = 'sequenced_feature_table'

# Gap between two consecutive events of the same user above which a new
# sequence starts. Interpolated into a SQL INTERVAL literal.
SESSION_TIMEOUT = '30 minutes'

# Directory DuckDB may spill to when the query does not fit in memory.
# exist_ok=True makes this idempotent on re-runs without a check-then-create
# race, and raises immediately if the path exists but is not a directory.
TEMP_DIR = 'duckdb_temp'
os.makedirs(TEMP_DIR, exist_ok=True)

#### 2. Creating a new feature table with sequence IDs

In [11]:
# Build `NEW_TABLE_NAME`: a copy of `feature_table` with an extra `sequence_id`
# column. A new sequence starts for a user whenever the gap to that user's
# previous event exceeds SESSION_TIMEOUT. Errors are reported via print; the
# connection is left open here and closed by the later `conn.close()` cell.
try:
    # Read-write connection: this cell creates/replaces a table in the database file.
    conn = duckdb.connect(database=DB_FILE, read_only=False)
    print(f"Successfully connected to {DB_FILE}")

    query = f"""
    -- Allow DuckDB to use disk if RAM is full
    SET temp_directory='{TEMP_DIR}';

    -- Optionally increase memory limit if you have more RAM available
    -- SET memory_limit='16GB'; -- Uncomment and adjust if you have more than 12.5GB of RAM

    -- Create the new table, replacing it if it already exists
    CREATE OR REPLACE TABLE {NEW_TABLE_NAME} AS
    WITH TimeDifferences AS (
        -- Only the key columns are carried through the window steps to keep
        -- the sorted/partitioned working set narrow.
        SELECT
            user_id,
            timestamp,
            -- Default of LAG is the row's own timestamp, so the first event of
            -- each user has a zero gap and never opens a new sequence.
            LAG(timestamp, 1, timestamp) OVER (PARTITION BY user_id ORDER BY timestamp) AS previous_timestamp
        FROM
            feature_table
    ),
    NewSequenceFlags AS (
        SELECT
            user_id,
            timestamp,
            CASE
                WHEN (timestamp - previous_timestamp) > INTERVAL '{SESSION_TIMEOUT}' THEN 1
                ELSE 0
            END AS is_new_sequence_flag
        FROM
            TimeDifferences
    ),
    Sequences AS (
        -- DISTINCT leaves exactly one row per (user_id, timestamp). Without it,
        -- k events sharing a user and timestamp would match k rows here and the
        -- join below would emit k*k rows. Rows with equal timestamps are peers
        -- under the default RANGE frame, so they share the same running sum and
        -- therefore the same sequence_id.
        SELECT DISTINCT
            user_id,
            timestamp,
            CONCAT(user_id, '_', SUM(is_new_sequence_flag) OVER (PARTITION BY user_id ORDER BY timestamp)) AS sequence_id
        FROM
            NewSequenceFlags
    )
    -- Final Step: Join the sequence_id back to the original table
    SELECT
        ft.*, -- Select all columns from the original feature_table
        s.sequence_id
    FROM
        feature_table AS ft
    JOIN
        Sequences AS s ON ft.user_id = s.user_id AND ft.timestamp = s.timestamp;
    """

    # --- Execute the Query ---
    print(f"Creating new table '{NEW_TABLE_NAME}' ")
    conn.execute(query)
    print("Table created successfully.")

    # --- Sanity check: the new table should have one row per source row ---
    # The inner join drops rows whose user_id or timestamp is NULL, so a
    # mismatch is reported rather than passing silently.
    source_row_count = conn.execute("SELECT COUNT(*) FROM feature_table;").fetchone()[0]
    new_row_count = conn.execute(f"SELECT COUNT(*) FROM {NEW_TABLE_NAME};").fetchone()[0]
    if new_row_count == source_row_count:
        print(f"Row count check passed: {new_row_count} rows.")
    else:
        print(
            f"WARNING: row count mismatch - feature_table has {source_row_count} rows, "
            f"'{NEW_TABLE_NAME}' has {new_row_count} rows."
        )

    # --- Verification Step ---
    print("\n--- Verifying the new table (showing first 10 rows) ---")
    verification_df = conn.execute(
        f"SELECT user_id, timestamp, sequence_id FROM {NEW_TABLE_NAME} ORDER BY user_id, timestamp LIMIT 10;"
    ).fetchdf()
    print(verification_df)

    print(f"\nData with sequence IDs is now in the table '{NEW_TABLE_NAME}'.")

except Exception as e:
    print(f"An error occurred: {e}")

Successfully connected to ../../dataset/supervised_dataset.duckdb
Creating new table 'sequenced_feature_table' 


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Table created successfully.

--- Verifying the new table (showing first 10 rows) ---
   user_id           timestamp sequence_id
0  AAE0190 2010-01-04 08:09:00   AAE0190_0
1  AAE0190 2010-01-04 08:10:50   AAE0190_0
2  AAE0190 2010-01-04 08:12:10   AAE0190_0
3  AAE0190 2010-01-04 08:14:04   AAE0190_0
4  AAE0190 2010-01-04 08:16:00   AAE0190_0
5  AAE0190 2010-01-04 08:19:15   AAE0190_0
6  AAE0190 2010-01-04 08:19:50   AAE0190_0
7  AAE0190 2010-01-04 08:20:14   AAE0190_0
8  AAE0190 2010-01-04 08:26:24   AAE0190_0
9  AAE0190 2010-01-04 08:29:44   AAE0190_0

Data with sequence IDs is now in the table 'sequenced_feature_table'.


In [12]:
# Close the DuckDB connection held in `conn` (opened with duckdb.connect in the table-building cell).
conn.close()