In [77]:
import torch
import random
import numpy as np
from src.utils.config import get_small_classifier_config
from src.training.classifier_trainer import SimpleTextDataset, train_classifier, evaluate
import csv, random, time, datetime as dt
import pandas as pd
from pathlib import Path
from typing import Counter
from sklearn.model_selection import train_test_split
from src.utils.char_tokenizer import CharTokenizer
from torch.utils.data import DataLoader

### Model Config

In [33]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

cfg = get_small_classifier_config()
cfg.num_classes = 2  # binary

# Adjust hyper parameters
cfg.learning_rate = 1e-4
cfg.weight_decay = 0.01
cfg.max_epochs = 8

### Data Preprocessing

In [61]:
csv_path = Path("fake-6-weeks.csv")  # adjust if stored elsewhere
df = pd.read_csv(csv_path)

# ensure numeric types
df["rev_usd"] = df["rev_usd"].astype(float)
df["event_timestamp"] = df["event_timestamp"].astype("int64")

display(df.head())
print(df.dtypes)
print(f"Rows: {len(df)}")

Unnamed: 0,user_pseudo_id,session_id,date_formatted,event_timestamp,event_name,rev_usd,unique_items,qty,page_location,page_title
0,u_001,s_001_01_w1,2025-08-25,1756036801000,session_start,0.0,0,0,https://example.com/,Home
1,u_001,s_001_01_w1,2025-08-25,1756036807000,page_view,0.0,0,0,https://example.com/category/equipment,Gym Equipment
2,u_001,s_001_01_w1,2025-08-25,1756036813000,view_item,0.0,1,1,https://example.com/product/sku1001,Foam Roller
3,u_001,s_001_01_w1,2025-08-25,1756036819000,add_to_cart,0.0,1,1,https://example.com/product/sku1001,Foam Roller
4,u_001,s_001_01_w1,2025-08-25,1756036825000,begin_checkout,0.0,1,1,https://example.com/cart,Cart


user_pseudo_id      object
session_id          object
date_formatted      object
event_timestamp      int64
event_name          object
rev_usd            float64
unique_items         int64
qty                  int64
page_location       object
page_title          object
dtype: object
Rows: 900


In [79]:
processed_data = df.copy()

# Convert date_formatted to date time
processed_data["date_formatted"] = pd.to_datetime(processed_data["date_formatted"])

# grab unique user ids
unique_user_ids = df["user_pseudo_id"].unique()
train_data = []
print(unique_user_ids)

for user_id in unique_user_ids:
    user_data = processed_data[processed_data["user_pseudo_id"] == user_id]

    # Count unique mondays
    monday_count = user_data[user_data["date_formatted"].dt.dayofweek == 0]["date_formatted"].nunique()

    event_len = len(user_data)
    # print(f"User ID: {user_id}, Number of Events: {event_len}, Number of Mondays: {monday_count}")
    if event_len < 6:
        continue  # skip users with less than 10 events
    for i in range(6, event_len):
        main_event = user_data.iloc[i]
        # Get start of main_week(monday)
        main_start_of_week = main_event["date_formatted"] - pd.to_timedelta(main_event["date_formatted"].dayofweek, unit='d')
        main_end_of_week = main_start_of_week + pd.DateOffset(days=6)
        pred_start_of_week = main_end_of_week + pd.Timedelta(days=1)
        pred_end_of_week = pred_start_of_week + pd.DateOffset(days=6)
        # check if there is any data for next week to label
        if user_data[user_data["date_formatted"].between(pred_start_of_week, pred_end_of_week)].shape[0] == 0:
            continue  # skip if no data for next week

        context_events = user_data.iloc[:i]

        # Get tagged prediction: if purchase event occurs in the following week return 1, else 0
        get_tagged_prediction = 1 if user_data[user_data["date_formatted"].between(pred_start_of_week, pred_end_of_week) & (user_data["event_name"] == "purchase")].shape[0] > 0 else 0

        # Group context events by session using session_id, not using groupby
        context_sessions = []
        for session_id, group in context_events.groupby("session_id"):
            context_sessions.append({
                "session_id": session_id,
                "events": group[["event_name", "date_formatted", "event_timestamp", "rev_usd", "unique_items", "qty", "page_location", "page_title"]].values.tolist()
            })

        # Sort sessions by oldest to earliest date
        context_sessions = sorted(context_sessions, key=lambda x: x["events"][0][1])

        train_data_record = ""
        current_session_date = context_sessions[0]["events"][0][1]
        first_session = True
        for session in context_sessions:
            if session["events"][0][1] > current_session_date:
                current_session_date = session["events"][0][1]
            if first_session:
                train_data_record += f"Session-{session['session_id']}"
            else:
                train_data_record += f"\n\nSession-{session['session_id']}"
            train_data_record += f"\nDate-{current_session_date.strftime('%Y-%m-%d')}"
            for event in session["events"]:
                if event[1] > current_session_date:
                    current_session_date = event[1]
                    train_data_record += f"\nDate-{current_session_date.strftime('%Y-%m-%d')}"
                # Convert event_timestamp (milliseconds) to HH:MM
                event_time = dt.datetime.fromtimestamp(event[2] / 1000).strftime('%H:%M')
                train_data_record += f"\nevt: {event[0]}, tm: {event_time}, rev: ${event[3]}, uq_itms: {event[4]}, qty: {event[5]}, loc: {event[6]}, title: {event[7]}"

        train_data.append({
            "text": train_data_record,
            "label": get_tagged_prediction
        })

print(f"Training Data Len: {len(train_data)}")
print(f"Distribution Balance: {Counter([d['label'] for d in train_data])}")
print(train_data[0]["text"])
print(f"Label: {train_data[0]['label']}")

train_df = pd.DataFrame(train_data)

texts = train_df["text"].tolist()

char_tok = CharTokenizer()

cfg.vocab_size = char_tok.vocab_size

train_df["text"].apply(lambda x: char_tok.encode(x))

['u_001' 'u_002' 'u_003' 'u_004' 'u_005' 'u_006' 'u_007' 'u_008' 'u_009'
 'u_010' 'u_011' 'u_012' 'u_013' 'u_014' 'u_015' 'u_016' 'u_017' 'u_018'
 'u_019' 'u_020' 'u_021' 'u_022' 'u_023' 'u_024' 'u_025']
Training Data Len: 600
Distribution Balance: Counter({1: 450, 0: 150})
Session-s_001_01_w1
Date-2025-08-25
evt: session_start, tm: 00:00, rev: $0.0, uq_itms: 0, qty: 0, loc: https://example.com/, title: Home
evt: page_view, tm: 00:00, rev: $0.0, uq_itms: 0, qty: 0, loc: https://example.com/category/equipment, title: Gym Equipment
evt: view_item, tm: 00:00, rev: $0.0, uq_itms: 1, qty: 1, loc: https://example.com/product/sku1001, title: Foam Roller
evt: add_to_cart, tm: 00:00, rev: $0.0, uq_itms: 1, qty: 1, loc: https://example.com/product/sku1001, title: Foam Roller
evt: begin_checkout, tm: 00:00, rev: $0.0, uq_itms: 1, qty: 1, loc: https://example.com/cart, title: Cart
evt: purchase, tm: 00:00, rev: $29.49, uq_itms: 1, qty: 1, loc: https://example.com/checkout/complete, title: Order Co

0      [48, 8, 22, 22, 12, 18, 84, 73, 22, 1, 56, 56,...
1      [48, 8, 22, 22, 12, 18, 84, 73, 22, 1, 56, 56,...
2      [48, 8, 22, 22, 12, 18, 84, 73, 22, 1, 56, 56,...
3      [48, 8, 22, 22, 12, 18, 84, 73, 22, 1, 56, 56,...
4      [48, 8, 22, 22, 12, 18, 84, 73, 22, 1, 56, 56,...
                             ...                        
595    [48, 8, 22, 22, 12, 18, 84, 73, 22, 1, 56, 58,...
596    [48, 8, 22, 22, 12, 18, 84, 73, 22, 1, 56, 58,...
597    [48, 8, 22, 22, 12, 18, 84, 73, 22, 1, 56, 58,...
598    [48, 8, 22, 22, 12, 18, 84, 73, 22, 1, 56, 58,...
599    [48, 8, 22, 22, 12, 18, 84, 73, 22, 1, 56, 58,...
Name: text, Length: 600, dtype: object

### Train Test Split

In [80]:
train_ds, val_ds = train_test_split(train_df, test_size=0.2, random_state=42)

### Train and Test Model

In [81]:
model = train_classifier(cfg, train_ds, val_ds)
torch.save(model.state_dict(), "classifier_model.pt")

KeyError: 129