In [1]:
import pandas as pd
import numpy as np

#### Generate random sequences of length 10

In [2]:
np.random.seed(101)
seqs = np.random.choice(10, size=(20000, 10))

#### Generate target based on the sum of fields 1-3 and 4-6, multiplied by fields 8-10. Field 7 is ignored

In [3]:
v = np.sum((seqs[:, :3] + seqs[:, :3:6]) * seqs[:, -3:], axis=1)
target = (v / (np.max(v) / 15)).astype(int)
target[target > 9] = 9

#### Check target label distribution

In [4]:
pd.Series(target).value_counts().sort_index()

0    1590
1    2860
2    3303
3    3120
4    2596
5    2129
6    1577
7    1200
8     710
9     915
Name: count, dtype: int64

#### Create synthetic dataset

In [5]:
data_dict = {
    "sequenceId": np.arange(20000),
    "subsequenceId": np.arange(20000),
    "inputCol": np.repeat("itemId", 20000),
}
for i in range(10):
    data_dict[str(10 - i)] = seqs[:, i]

data_dict["target"] = target
data = pd.DataFrame(data_dict)
data.head(5)

Unnamed: 0,sequenceId,subsequenceId,inputCol,10,9,8,7,6,5,4,3,2,1,target
0,0,0,itemId,1,6,7,9,8,4,8,5,0,5,1
1,1,1,itemId,8,1,3,8,3,3,2,8,9,3,8
2,2,2,itemId,7,0,9,7,9,8,4,3,3,7,5
3,3,3,itemId,4,8,7,6,9,4,2,7,7,7,7
4,4,4,itemId,0,4,1,8,3,1,8,4,3,2,0


In [6]:
data.shape

(20000, 14)

In [7]:
import os
import numpy as np

In [8]:
len(np.unique(data["target"]))

10

#### Write train and test data to separate files

In [9]:
project_path = "/Users/leon.luithlen/psprojects/test-sequifier"
data.iloc[:10000, :].to_csv(
    f"{project_path}/train_data.csv", sep=",", decimal=".", index=None
)
data.iloc[10000:, :].to_csv(
    f"{project_path}/test_data.csv", sep=",", decimal=".", index=None
)

#### Train a model on train data using the sequifier cli and infer on test data

In [10]:
import os

command = f"sequifier --train --on-unprocessed --config-path={project_path}/configs/train.yaml"
print(command)
os.system(command)

sequifier --train --on-unprocessed --config-path=/Users/leon.luithlen/psprojects/test-sequifier/configs/train.yaml


sh: sequifier: command not found


32512

In [42]:
command = f"sequifier --infer --config-path={project_path}/configs/infer.yaml"
print(command)
os.system(command)

sh: sequifier: command not found


32512

#### Load predictions from inference

In [35]:
preds = pd.read_csv(
    f"{project_path}/outputs/predictions/sequifier-default-best-300-predictions.csv"
)

#### Evaluate transformer test set accuracy

In [36]:
np.mean(target[10000:] == preds["model_output"].values)

0.8895

#### Train and evaluate random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=1000)
rf.fit(data.iloc[:10000, 3:-1], data.iloc[:10000, -1])
rf_preds = rf.predict(data.iloc[10000:, 3:-1])
np.mean(rf_preds == target[10000:])

#### Train and evaluate XGBoost classifier

In [None]:
import xgboost as xgb

D_train = xgb.DMatrix(
    data.iloc[:10000, 3:-1].values, label=data.iloc[:10000, -1].values
)
D_test = xgb.DMatrix(data.iloc[10000:, 3:-1].values, label=data.iloc[10000:, -1].values)
param = {"eta": 0.3, "max_depth": 5, "objective": "multi:softprob", "num_class": 10}

steps = 1000  # The number of training iterations
model = xgb.train(param, D_train, steps)

xg_preds = model.predict(D_test)
xg_best_preds = np.asarray([np.argmax(line) for line in xg_preds])
np.mean(xg_best_preds == target[10000:])

0.6967

#### Train and evaluate logistic regression

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(data.iloc[:, 3:-1])
lr = LogisticRegression(penalty="l2", max_iter=10000)
lr.fit(X_encoded[:10000, :], target[:10000])
lr_preds = lr.predict(X_encoded[10000:, :])
np.mean(lr_preds == target[10000:])

0.5663