#### 1. Importing libraries and initial configurations

In [11]:
import duckdb

In [12]:
# Relative path to the DuckDB database file that holds the source and output tables.
DB_FILE = "../../dataset/supervised_dataset.duckdb"
# Table the labelling query reads from (one row per event, keyed by sequence_id).
SOURCE_TABLE = "sequenced_feature_table"
# Table the labelling query creates or replaces with one label per sequence_id.
LABELED_TABLE_NAME = "labeled_sequences"

#### 2. Joining `insiders` with `sequenced_feature_table` to create the labels table

In [13]:
# Build the per-sequence label table inside the DuckDB file and print two sanity
# checks (a 10-row preview and the label distribution).
#
# On success the connection is deliberately left open so that later cells can keep
# using `conn`. On failure the connection is closed and the exception is re-raised,
# so the cell is marked as failed instead of silently continuing.
conn = None
try:
    conn = duckdb.connect(database=DB_FILE, read_only=False)
    print(f"Successfully connected to {DB_FILE}")

    # --- SQL query to join and create the final labelled table. ---
    # A sequence gets label 1 if at least one of its rows falls inside an
    # `insiders` [start, end] window for the same user; every other sequence gets 0.
    query = f"""
    CREATE OR REPLACE TABLE {LABELED_TABLE_NAME} AS
    WITH MaliciousSequenceIDs AS (
        -- Step 1: Find all sequence_ids that have at least one malicious event.
        -- An event is malicious if its user and timestamp match an entry in the insiders table.
        SELECT DISTINCT
            sft.sequence_id
        FROM
            {SOURCE_TABLE} AS sft
        INNER JOIN
            insiders AS i ON sft.user_id = i.user
        WHERE
            sft.timestamp BETWEEN i.start AND i.end
    )
    -- Step 2: Assign a label to EVERY unique sequence.
    SELECT
        DISTINCT sequence_id,
        -- Use a CASE statement to assign the label.
        -- If the sequence_id is in our list of malicious IDs, label it 1, otherwise 0.
        CASE
            WHEN sequence_id IN (SELECT sequence_id FROM MaliciousSequenceIDs) THEN 1
            ELSE 0
        END AS label
    FROM
        {SOURCE_TABLE};
    """

    # --- Execute the Query ---
    print(f"Generating labels and creating new table '{LABELED_TABLE_NAME}'...")
    conn.execute(query)
    print("Table created successfully.")

    # --- Verification Step ---
    # Preview only; there is no ORDER BY, so which 10 rows appear is not guaranteed.
    print("\n--- Verifying the new table (showing first 10 rows) ---")
    verification_df = conn.execute(
        f"SELECT * FROM {LABELED_TABLE_NAME} LIMIT 10;"
    ).fetchdf()
    print(verification_df)

    # Count of sequences per label, to expose the class imbalance.
    print("\n--- Verifying the label distribution (IMPORTANT!) ---")
    label_distribution = conn.execute(
        f"SELECT label, COUNT(sequence_id) AS count FROM {LABELED_TABLE_NAME} GROUP BY label;"
    ).fetchdf()
    print(label_distribution)
    print("\nThis shows the class imbalance you will need to handle in your model.")

    print(f"\nThe labels are now available in the table '{LABELED_TABLE_NAME}'.")

except Exception as e:
    print(f"An error occurred: {e}")
    # Do not leave a read-write connection to the database file open after a failed run.
    if conn is not None:
        conn.close()
    # Re-raise so the failure is visible and "Run All" stops here rather than
    # carrying on with a missing or stale labels table.
    raise


Successfully connected to ../../dataset/supervised_dataset.duckdb
Generating labels and creating new table 'labeled_sequences'...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Table created successfully.

--- Verifying the new table (showing first 10 rows) ---
    sequence_id  label
0  MIM0712_2103      0
1  MJB0588_1187      0
2  MJB0588_1538      0
3   MJM0510_135      0
4   MJM0510_189      0
5  MJM0510_1364      0
6  MJM0510_1459      0
7  MJS0890_1086      0
8   MKL0941_148      0
9   MKL0941_425      0

--- Verifying the label distribution (IMPORTANT!) ---
   label    count
0      0  1437450
1      1     5738

This shows the class imbalance you will need to handle in your model.

The labels are now available in the table 'labeled_sequences'.


In [14]:
# Close the DuckDB connection opened by the labelling cell now that the table has been created and checked.
conn.close() 