In [1]:
import pandas as pd
import numpy as np

#### Generate random sequences of length 10

In [16]:
np.random.seed(101)
seqs = np.random.choice(10, size=(20000, 10))

In [3]:
seqs2 = np.random.choice(3, size=(20000, 10))

#### Generate target based on the sum of fields 1-3 and 4-6, multiplied by fields 8-10. Field 7 is ignored

In [4]:
v = np.sum(((seqs[:,:3]*seqs2[:,:3]) + (seqs[:,:3:6]*seqs2[:,3:6])) * (seqs[:,-3:]*seqs2[:,-3:]), axis=1)
target = (v/(np.max(v)/15)).astype(int)
target[target>9] = 9
target = sum([[t, np.nan] for t in target ], [])

#### Check target label distribution

In [5]:
pd.Series(target).value_counts().sort_index()

0.0    9485
1.0    4282
2.0    2594
3.0    1528
4.0     924
5.0     555
6.0     281
7.0     179
8.0      82
9.0      90
Name: count, dtype: int64

#### Create synthetic dataset

In [6]:
data_dict = {
    "sequenceId":  np.repeat(np.arange(20000), 2),
    "subsequenceId":  np.repeat(np.arange(20000), 2),
    "inputCol": ["itemId", "sup1"] * 20000
}
for i in range(10):
    data_dict[str(10-i)] = [x for y in list(zip(list(seqs[:, i]), list(seqs2[:,i]))) for x in y]

data_dict["target"] = target
data = pd.DataFrame(data_dict)
data.head(5)

Unnamed: 0,sequenceId,subsequenceId,inputCol,10,9,8,7,6,5,4,3,2,1,target
0,0,0,itemId,1,6,7,9,8,4,8,5,0,5,1.0
1,0,0,sup1,1,1,2,1,1,1,2,1,1,1,
2,1,1,itemId,8,1,3,8,3,3,2,8,9,3,7.0
3,1,1,sup1,2,0,1,2,2,1,1,1,2,1,
4,2,2,itemId,7,0,9,7,9,8,4,3,3,7,0.0


#### Write train and test data to separate files

In [7]:
project_path = "/Users/leon.luithlen/psprojects/test-sequifier"
data.iloc[:10000,:].to_csv(f"{project_path}/train_data.csv", sep=",", decimal=".", index=None)
data.iloc[10000:,:].to_csv(f"{project_path}/test_data.csv", sep=",", decimal=".", index=None)

#### Train a model on train data using the sequifier cli and infer on test data

In [8]:
import os
command = f"sequifier --train --on-unprocessed --config-path={project_path}/configs/train.yaml"
print(command)

sequifier --train --on-unprocessed --config-path=/Users/leon.luithlen/psprojects/test-sequifier/configs/train.yaml


In [9]:
command = f"sequifier --infer --on-unprocessed --config-path={project_path}/configs/infer.yaml"
print(command)

sequifier --infer --on-unprocessed --config-path=/Users/leon.luithlen/psprojects/test-sequifier/configs/infer.yaml


#### Load predictions from inference with two input columns and evaluate test set accuracy

In [14]:
preds = pd.read_csv(f"{project_path}/outputs/predictions/sequifier-default-best-300_predictions.csv")
np.mean(np.array(list(target)[10000::2]) == preds["model_output"].values)

0.7459333333333333

#### Load predictions from inference with ONE input columns and evaluate test set accuracy

In [13]:
preds = pd.read_csv(f"{project_path}/outputs/predictions/sequifier-default-1-col-best-300_predictions.csv")
np.mean(np.array(list(target)[10000::2]) == preds["model_output"].values)

0.46673333333333333