In [1]:
import os
import shutil
import subprocess

import numpy as np
import pandas as pd
import torch
import yaml

In [2]:
PROJECT_ROOT = "."
DATA_DIR = os.path.join(PROJECT_ROOT, "data")
CONFIG_DIR = os.path.join(PROJECT_ROOT, "configs")
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "outputs")

os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(CONFIG_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
n_sequences = 100
seq_len = 128

records = []
for seq_id in range(n_sequences):
    start_num = np.random.randint(0, 5)
    for t in range(seq_len):
        val = ((start_num + t) % 5) + 1
        records.append(
            {
                "sequenceId": seq_id,
                "itemPosition": t,
                "targetToken": val,  # The categorical variable we want to predict
            }
        )

df = pd.DataFrame(records)
# Sequifier expects a specific format, usually csv or parquet
file_path = os.path.join(DATA_DIR, "synthetic-sequences.parquet")
df.to_parquet(file_path)

In [4]:
def write_yaml(path, data):
    with open(path, "w") as f:
        yaml.dump(data, f, default_flow_style=False)

In [5]:
preprocess_config = {
    "project_root": PROJECT_ROOT,
    "data_path": file_path,
    "read_format": "parquet",
    "write_format": "parquet",
    "selected_columns": ["targetToken"],
    "split_ratios": [0.8, 0.1, 0.1],  # Train, Val, Test
    "seq_length": 10,  # Lookback window
    "stride_by_split": [1, 1, 1],  # Dense sampling
    "max_rows": None,
    "seed": 42,
}

pp_conf_path = os.path.join(CONFIG_DIR, "preprocess.yaml")
write_yaml(pp_conf_path, preprocess_config)

In [6]:
train_config = {
    "project_root": PROJECT_ROOT,
    "model_name": "demo-cycle-model",
    "read_format": "parquet",
    # Metadata path is automatically generated by preprocessing based on data filename
    "metadata_config_path": "configs/metadata_configs/synthetic-sequences.json",
    "input_columns": ["targetToken"],
    "target_columns": ["targetToken"],
    "target_column_types": {"targetToken": "categorical"},
    "seq_length": 10,
    "inference_batch_size": 10,
    "export_generative_model": True,
    "export_embedding_model": False,
    "export_onnx": False,  # Keep false for simple PyTorch demo
    "export_pt": True,
    "export_with_dropout": False,
    "model_spec": {
        "initial_embedding_dim": 16,
        # Explicitly map column to embedding size
        "feature_embedding_dims": {"targetToken": 16},
        "joint_embedding_dim": None,
        "dim_model": 16,
        "n_head": 2,
        "dim_feedforward": 32,
        "num_layers": 1,  # Tiny model for speed
        "prediction_length": 1,
    },
    "training_spec": {
        "device": "cuda" if torch.cuda.is_available() else "cpu",
        "epochs": 5,
        "save_interval_epochs": 5,
        "batch_size": 32,
        "log_interval": 10000,
        "learning_rate": 0.001,
        "criterion": {"targetToken": "CrossEntropyLoss"},
        "optimizer": {"name": "Adam"},
        "scheduler": {"name": "StepLR", "step_size": 1, "gamma": 0.99},
        "continue_training": False,
    },
}

tr_conf_path = os.path.join(CONFIG_DIR, "train.yaml")
write_yaml(tr_conf_path, train_config)

In [7]:
infer_config = {
    "project_root": PROJECT_ROOT,
    "metadata_config_path": "configs/metadata_configs/synthetic-sequences.json",
    "model_type": "generative",
    # Points to the model exported by the training step
    "model_path": "models/sequifier-demo-cycle-model-best-5.pt",
    # Use the 'test' split created during preprocessing
    "data_path": "data/synthetic-sequences-split2.parquet",
    "write_format": "parquet",
    "input_columns": ["targetToken"],
    "target_columns": ["targetToken"],
    "target_column_types": {"targetToken": "categorical"},
    "output_probabilities": False,
    "map_to_id": True,  # Map integer classes back to original values (e.g., "1", "2")
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "seq_length": 10,
    "inference_batch_size": 10,
    "autoregression": True,
    "autoregression_extra_steps": 5,  # Predict 5 steps into the future
}

inf_conf_path = os.path.join(CONFIG_DIR, "infer.yaml")
write_yaml(inf_conf_path, infer_config)

In [8]:
os.system(f"sequifier preprocess --config-path {CONFIG_DIR}/preprocess.yaml")

2025-11-24 18:54:28.501 | INFO     | sequifier.preprocess:preprocess:37 - --- Starting Preprocessing ---
2025-11-24 18:54:28.502 | INFO     | sequifier.preprocess:_load_and_preprocess_data:898 - Reading data from './data/synthetic-sequences.parquet'...
2025-11-24 18:54:32.223 | INFO     | sequifier.preprocess:combine_multiprocessing_outputs:2045 - writing to: ./data/temp/synthetic-sequences-split0-0.parquet
2025-11-24 18:54:32.252 | INFO     | sequifier.preprocess:combine_multiprocessing_outputs:2045 - writing to: ./data/temp/synthetic-sequences-split0-2.parquet
2025-11-24 18:54:32.255 | INFO     | sequifier.preprocess:combine_multiprocessing_outputs:2045 - writing to: ./data/temp/synthetic-sequences-split0-1.parquet
2025-11-24 18:54:32.321 | INFO     | sequifier.preprocess:combine_multiprocessing_outputs:2045 - writing to: ./data/temp/synthetic-sequences-split0-4.parquet
2025-11-24 18:54:32.334 | INFO     | sequifier.preprocess:combine_multiprocessing_outputs:2045 - writing to: ./data

0

In [9]:
os.system(f"sequifier train --config-path {CONFIG_DIR}/train.yaml")

2025-11-24 18:54:35.480 | INFO     | sequifier.io.sequifier_dataset_from_file:__init__:32 - [INFO] Loading training dataset into memory from './data/synthetic-sequences-split0.parquet'...
2025-11-24 18:54:35.484 | INFO     | sequifier.io.sequifier_dataset_from_file:__init__:67 - [INFO] Dataset loaded with 9200 samples.
2025-11-24 18:54:35.484 | INFO     | sequifier.io.sequifier_dataset_from_file:__init__:32 - [INFO] Loading training dataset into memory from './data/synthetic-sequences-split1.parquet'...
2025-11-24 18:54:35.486 | INFO     | sequifier.io.sequifier_dataset_from_file:__init__:67 - [INFO] Dataset loaded with 300 samples.
2025-11-24 18:54:35 | --- Starting Training for model: demo-cycle-model ---
2025-11-24 18:54:35 | self.categorical_columns = ['targetToken']
2025-11-24 18:54:35 | self.real_columns = []
2025-11-24 18:54:35 | [INFO] Initializing new model with  3.05e3 parameters.
2025-11-24 18:54:36 | --------------------------------------------------------------------------

0

In [10]:
os.system(f"sequifier infer --config-path {CONFIG_DIR}/infer.yaml")

2025-11-24 18:54:39.939 | INFO     | sequifier.infer:infer:50 - --- Starting Inference ---
2025-11-24 18:54:39.941 | INFO     | sequifier.infer:infer_worker:145 - [INFO] Reading data from './data/synthetic-sequences-split2.parquet'...
2025-11-24 18:54:39 | --- Starting Training for model: demo-cycle-model ---
2025-11-24 18:54:39 | self.categorical_columns = ['targetToken']
2025-11-24 18:54:39 | self.real_columns = []
2025-11-24 18:54:39 | [INFO] Initializing new model with  3.05e3 parameters.
2025-11-24 18:54:39 | --- Starting Training for model: demo-cycle-model ---
2025-11-24 18:54:39 | self.categorical_columns = ['targetToken']
2025-11-24 18:54:39 | self.real_columns = []
2025-11-24 18:54:39 | [INFO] Initializing new model with  3.05e3 parameters.
2025-11-24 18:54:39 | [INFO] Loading model weights from ./models/sequifier-demo-cycle-model-best-5.pt
2025-11-24 18:54:39 | [INFO] Inferring for sequifier-demo-cycle-model-best-5
2025-11-24 18:54:40 | [INFO] Writing predictions to './outpu

0

In [11]:
pred_path = os.path.join(
    OUTPUT_DIR, "predictions", "sequifier-demo-cycle-model-best-5-predictions.parquet"
)

preds = pd.read_parquet(pred_path)

preds["targetToken"] = preds["targetToken"].astype(int)

print(f"Loaded predictions from: {pred_path}")
print(preds.head(3))

Loaded predictions from: ./outputs/predictions/sequifier-demo-cycle-model-best-5-predictions.parquet
   sequenceId  itemPosition  targetToken
0           0            10            2
1           0            11            3
2           0            12            4


In [12]:
seq_vals = preds["sequenceId"].to_numpy()
token_vals = preds["targetToken"].to_numpy()

# We expect the predicted token to either be +1 the previous value, start at random if a new sequence
# has been reached, or be equal to 1 if the previous value was 5. Here, we check for each criterion
# if it applies
sequence_id_change = np.concatenate([[0], (seq_vals[1:] != seq_vals[:-1]).astype(int)])
correct_increment = np.concatenate(
    [[1], (token_vals[1:] == token_vals[:-1] + 1).astype(int)]
)
last_was_five = np.concatenate([[1], (token_vals[:-1] == 5).astype(int)])
is_one = token_vals == 1

In [13]:
# Lets see what percentage of *predicted* tokens conforms to this rule

mean_accuracy = (
    sequence_id_change | correct_increment | (last_was_five & is_one)
).mean()

print(f"In {mean_accuracy*100:.2f}% of cases, the correct id was predicted")

In 100.00% of cases, the correct id was predicted


In [14]:
print("Deleting project files and folders")

deletion_string = "rm -rf checkpoints configs data logs models outputs"
assert "." not in deletion_string, "DO NOT OVERRIDE"
os.system(deletion_string)

Deleting project files and folders


0