In [1]:
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


#### Generate random sequences of length 10

In [2]:
np.random.seed(101)
seqs = np.random.choice(10, size=(20000, 10))

#### Generate target based on the sum of fields 1-3 and 4-6, multiplied by fields 8-10. Field 7 is ignored

In [3]:
v = np.sum((seqs[:,:3] + seqs[:,:3:6]) * seqs[:,-3:], axis=1)
target = (v/(np.max(v)/15)).astype(int)
target[target>9] = 9

#### Check target label distribution

In [4]:
pd.Series(target).value_counts().sort_index()

0    1590
1    2860
2    3303
3    3120
4    2596
5    2129
6    1577
7    1200
8     710
9     915
Name: count, dtype: int64

#### Create synthetic dataset

In [5]:
data_dict = {
    "sequenceId": np.arange(20000),
    "subsequenceId": np.arange(20000),
    "inputCol": np.repeat("itemId", 20000)
}
for i in range(10):
    data_dict[str(10-i)] = seqs[:,i]

data_dict["target"] = target
data = pd.DataFrame(data_dict)
data.head(5)

Unnamed: 0,sequenceId,subsequenceId,inputCol,10,9,8,7,6,5,4,3,2,1,target
0,0,0,itemId,1,6,7,9,8,4,8,5,0,5,1
1,1,1,itemId,8,1,3,8,3,3,2,8,9,3,8
2,2,2,itemId,7,0,9,7,9,8,4,3,3,7,5
3,3,3,itemId,4,8,7,6,9,4,2,7,7,7,7
4,4,4,itemId,0,4,1,8,3,1,8,4,3,2,0


#### Write train and test data to separate files

In [6]:
project_path = "/home/leon/projects/test-sequifier"
data.iloc[:10000,:].to_csv(f"{project_path}/train_data.csv", sep=",", decimal=".", index=None)
data.iloc[10000:,:].to_csv(f"{project_path}/test_data.csv", sep=",", decimal=".", index=None)

#### Train a model on train data using the sequifier cli and infer on test data

In [None]:
# train.yaml
# 
# project_path: "/home/leon/projects/test-sequifier"
# #metadata
# n_classes: 10 # number of classes
# training_data_path: "/home/leon/projects/test-sequifier/train_data.csv" # absolute path to training data
# validation_data_path: "/home/leon/projects/test-sequifier/train_data.csv" # absolute path to validation data
# 
# target_column: itemId
# target_column_type: categorical
# column_types: {
#   itemId: int64
# }
# n_classes: {
#   itemId: 10
# }
# categorical_columns: ["itemId"]
# real_columns: []
# 
# model_name: "default"  # model name to load from in case there are checkpoints of that model available, can be None
# #data specification
# seq_length: 10 # length of sequence used for classification, cannot be larger than sew_length in the preprocessing step
# log_interval: 100
# #model specification
# model_spec:
#   d_model: 50 # dimensionality of the token embedding system
#   nhead: 2 # number of attention heads within each transformer layer
#   d_hid: 50 # dimensionality of feedforward network inside transformer layer
#   nlayers: 2 # number of transformer layers
# 
# #training specification
# training_spec:
#   device: "cpu" # device for model training
#   epochs: 500 # number of epochs
#   iter_save: 50 # frequency of checkpointing
#   batch_size: 50 # batch size for training
#   lr: 0.0005  # learning rate
#   dropout: 0.3 # dropout rate during training
#   criterion: "CrossEntropyLoss" # loss function, can be any in torch.nn
#   optimizer: 
#     name: "Adam" # optimizer, can be any on torch.optim
#   scheduler:
#     name: "StepLR" # learning rate scheduler, can be any in torch.optim.lr_scheduler
#     step_size: 1.0
#     gamma: 0.993
#   continue_training: true

In [None]:
import os
os.system(
    f"sequifier --train --config-path={project_path}/configs/train.yaml"
)

In [49]:
# infer.yaml
# project_path: "/home/leon/projects/test-sequifier"
# #data driven config path
# target_column: itemId
# target_column_type: categorical
# column_types: {
#   itemId: int64
# }
# n_classes: {
#   itemId: 10
# }
# categorical_columns: ["itemId"]
# real_columns: []
# 
# inference_data_path: "/home/leon/projects/test-sequifier/test_data.csv" # path to validation data (within project folder)
# model_path: "models/sequifier-default-best.onnx" # path to model (within project folder)
# batch_size: 50
# device: "cpu" # device used for inference
# seq_length: 10 # sequence length for prediction (must be identical to training)
# output_probabilities: True # write out class probablities for further processing
# ddconfig_path: "configs/ddconfigs/chr8.json" # data driven config path, or path to any json that contains {{'id_map':{label1:index1, ..., }}}, can be none if map_to_id is false
# map_to_id: False # map predictions from indices to labels (requires ddconfig_path)


In [38]:
os.system(
f"sequifier --infer --config-path={project_path}/configs/infer.yaml"
)

Inferring for sequifier-default-best


  np.sum(np.exp(ort_outs), axis=1), ort_outs.shape[1]
  probs = np.exp(ort_outs) / normalizer
  probs = np.exp(ort_outs) / normalizer
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


Writing probabilities to /home/leon/projects/test-sequifier/outputs/probabilities/sequifier-default-best_probabilities.csv
Writing predictions to /home/leon/projects/test-sequifier/outputs/predictions/sequifier-default-best_predictions.csv
Inference complete


0

#### Load predictions from inference

In [39]:
preds = pd.read_csv(f"{project_path}/outputs/predictions/sequifier-default-best_predictions.csv")

#### Evaluate transformer test set accuracy

In [40]:
np.mean(target[10000:] == preds["0"].values)

0.8469

#### Train and evaluate random forest classifier

In [41]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000)
rf.fit(data.iloc[:10000,3:-1], data.iloc[:10000,-1])
rf_preds = rf.predict(data.iloc[10000:,3:-1])
np.mean(rf_preds==target[10000:])

0.6419

#### Train and evaluate XGBoost classifier

In [42]:
import xgboost as xgb
D_train = xgb.DMatrix(data.iloc[:10000,3:-1].values, label=data.iloc[:10000,-1].values)
D_test = xgb.DMatrix(data.iloc[10000:,3:-1].values, label=data.iloc[10000:,-1].values)
param = {
    'eta': 0.3, 
    'max_depth': 5,  
    'objective': 'multi:softprob',  
    'num_class': 10
} 

steps = 1000  # The number of training iterations
model = xgb.train(param, D_train, steps)

xg_preds = model.predict(D_test)
xg_best_preds = np.asarray([np.argmax(line) for line in xg_preds])
np.mean(xg_best_preds==target[10000:])

0.6967

#### Train and evaluate logistic regression

In [47]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(data.iloc[:,3:-1])
lr = LogisticRegression(penalty='l2', max_iter=10000)
lr.fit(X_encoded[:10000,:], target[:10000])
lr_preds = lr.predict(X_encoded[10000:,:])
np.mean(lr_preds==target[10000:])

0.5663