In [1]:
import pandas as pd
import numpy as np

#### Generate random sequences of length 10

In [2]:
np.random.seed(101)
seqs = np.random.choice(10, size=(25000, 10))

In [3]:
seqs1 = np.random.choice(6, size=(25000, 10))
seqs2 = np.random.uniform(size=(25000, 10)) + 2.0

#### Generate target based on the sum of fields 1-3 and 4-6, multiplied by fields 8-10. Field 7 is ignored

In [4]:
v = np.sum(
    ((seqs[:, :3] * seqs2[:, :3]) + (seqs[:, :3:6] * seqs2[:, 3:6]))
    * (seqs[:, -3:] * seqs2[:, -3:]),
    axis=1,
)
target = (v / (np.max(v) / 15)).astype(int)
target[target > 9] = 9

v1 = np.sum(
    ((seqs1[:, :3] * seqs2[:, :3]) + (seqs1[:, :3:6] * seqs2[:, 3:6]))
    * (seqs1[:, -3:] * seqs2[:, -3:]),
    axis=1,
)
target1 = (v1 / (np.max(v) / 15)).astype(int)
target1[target1 > 9] = 9

target2 = list((np.array(target))/10 + np.abs(np.random.randn(len(target))) * 0.2)
target_all = sum([[t, t1, t2] for t, t1, t2 in zip(target, target1, target2)], [])

In [5]:
pd.Series(target).value_counts()

2    4494
1    4044
3    3929
4    3254
5    2524
0    2342
6    1807
7    1129
9     745
8     732
Name: count, dtype: int64

In [6]:
pd.Series(target1).value_counts()

0    12157
1     8998
2     3192
3      626
4       27
Name: count, dtype: int64

#### Create synthetic dataset

In [7]:
data_dict = {
    "sequenceId": np.repeat(np.arange(25000), 3),
    "subsequenceId": np.repeat(np.arange(25000), 3),
    "inputCol": ["itemId", "sup1", "sup2"] * 25000,
}
for i in range(10):
    data_dict[str(10 - i)] = [
        x for y in list(zip(list(seqs[:, i]), list(seqs1[:, i]), list(seqs2[:, i]))) for x in y
    ]

data_dict["target"] = target_all
print([(k, len(v)) for k, v in data_dict.items()])
data = pd.DataFrame(data_dict)
data.head(20)

[('sequenceId', 75000), ('subsequenceId', 75000), ('inputCol', 75000), ('10', 75000), ('9', 75000), ('8', 75000), ('7', 75000), ('6', 75000), ('5', 75000), ('4', 75000), ('3', 75000), ('2', 75000), ('1', 75000), ('target', 75000)]


Unnamed: 0,sequenceId,subsequenceId,inputCol,10,9,8,7,6,5,4,3,2,1,target
0,0,0,itemId,1.0,6.0,7.0,9.0,8.0,4.0,8.0,5.0,0.0,5.0,1.0
1,0,0,sup1,5.0,3.0,1.0,3.0,3.0,1.0,2.0,4.0,2.0,5.0,2.0
2,0,0,sup2,2.333833,2.846748,2.728355,2.170463,2.555819,2.519502,2.820865,2.087095,2.379054,2.352205,0.432944
3,1,1,itemId,8.0,1.0,3.0,8.0,3.0,3.0,2.0,8.0,9.0,3.0,7.0
4,1,1,sup1,0.0,0.0,5.0,5.0,4.0,3.0,2.0,5.0,3.0,4.0,0.0
5,1,1,sup2,2.714754,2.512596,2.497974,2.300398,2.256946,2.251153,2.909222,2.845747,2.374142,2.665901,0.758255
6,2,2,itemId,7.0,0.0,9.0,7.0,9.0,8.0,4.0,3.0,3.0,7.0,5.0
7,2,2,sup1,2.0,4.0,2.0,1.0,2.0,1.0,4.0,0.0,4.0,2.0,0.0
8,2,2,sup2,2.514265,2.546521,2.567427,2.892879,2.410716,2.304859,2.111743,2.998688,2.349367,2.621257,0.570808
9,3,3,itemId,4.0,8.0,7.0,6.0,9.0,4.0,2.0,7.0,7.0,7.0,7.0


#### Write train and test data to separate files

In [8]:
print(data.shape)
project_path = "/Users/leonluithlen/projects/test-sequifier"
data.iloc[:60000, :].to_csv(
    f"{project_path}/train_data.csv", sep=",", decimal=".", index=None
)
data.iloc[60000:, :].to_csv(
    f"{project_path}/test_data.csv", sep=",", decimal=".", index=None
)

(75000, 14)


#### Train a model on train data using the sequifier cli and infer on test data

In [9]:
import os

command = f"sequifier --train --on-unprocessed --config-path={project_path}/configs/train.yaml"
print(command)

sequifier --train --on-unprocessed --config-path=/Users/leonluithlen/projects/test-sequifier/configs/train.yaml


In [10]:
command = f"sequifier --infer --on-unprocessed --config-path={project_path}/configs/infer.yaml"
print(command)

sequifier --infer --on-unprocessed --config-path=/Users/leonluithlen/projects/test-sequifier/configs/infer.yaml


#### Load predictions from inference with two input columns and evaluate test set accuracy

In [11]:
data.shape

(75000, 14)

In [25]:
def asses_predictions(project_path, path, target, start, step, categorical):
    preds = pd.read_csv(
        f"{project_path}/{path}"
    )
    target_array = np.array(list(target)[start::step])
    preds_array = preds['model_output'].values
    if categorical:
        print(f"mean precision: {np.mean(target_array == preds_array )}")
    else:
        print(f"MSE: {(np.mean(((target_array - preds_array) ** 2)))}")

In [29]:
asses_predictions(project_path, "outputs/predictions/sequifier-multitarget-best-1_itemId-predictions.csv", target_all, 60000, 3, True)
asses_predictions(project_path, "outputs/predictions/sequifier-multitarget-best-1_sup1-predictions.csv", target_all, 60001, 3, True)
asses_predictions(project_path, "outputs/predictions/sequifier-multitarget-best-1_sup2-predictions.csv", target_all, 60002, 3, False)

mean precision: 0.1624
mean precision: 0.4976
MSE: 0.07580018095270716


In [27]:
asses_predictions(project_path, "outputs/predictions/sequifier-multitarget-best-10_itemId-predictions.csv", target_all, 60000, 3, True)
asses_predictions(project_path, "outputs/predictions/sequifier-multitarget-best-10_sup1-predictions.csv", target_all, 60001, 3, True)
asses_predictions(project_path, "outputs/predictions/sequifier-multitarget-best-10_sup2-predictions.csv", target_all, 60002, 3, False)

mean precision: 0.6538
mean precision: 0.8394
MSE: 0.01970423774336012


In [28]:
asses_predictions(project_path, "outputs/predictions/sequifier-multitarget-best-50_itemId-predictions.csv", target_all, 60000, 3, True)
asses_predictions(project_path, "outputs/predictions/sequifier-multitarget-best-50_sup1-predictions.csv", target_all, 60001, 3, True)
asses_predictions(project_path, "outputs/predictions/sequifier-multitarget-best-50_sup2-predictions.csv", target_all, 60002, 3, False)

mean precision: 0.6884
mean precision: 0.864
MSE: 0.017207961029796284
