In [1]:
import pandas as pd
import numpy as np

#### Generate random sequences of length 10

In [2]:
np.random.seed(101)
seqs = np.random.choice(10, size=(20000, 10))

#### Generate target based on the sum of fields 1-3 and 4-6, multiplied by fields 8-10. Field 7 is ignored

In [3]:
v = np.sum((seqs[:,:3] + seqs[:,:3:6]) * seqs[:,-3:], axis=1)
target = (v/(np.max(v)/15)).astype(int)
target[target>9] = 9

#### Check target label distribution

In [4]:
pd.Series(target).value_counts().sort_index()

0    1590
1    2860
2    3303
3    3120
4    2596
5    2129
6    1577
7    1200
8     710
9     915
dtype: int64

In [5]:
sequenceId = np.arange(20000)

#### Create synthetic dataset

In [6]:
data_dict = {
    "sequenceId":sequenceId
}
for i in range(10):
    data_dict[str(10-i)] = seqs[:,i]

data_dict["target"] = target
data = pd.DataFrame(data_dict)
data.head(5)

Unnamed: 0,sequenceId,10,9,8,7,6,5,4,3,2,1,target
0,0,1,6,7,9,8,4,8,5,0,5,1
1,1,8,1,3,8,3,3,2,8,9,3,8
2,2,7,0,9,7,9,8,4,3,3,7,5
3,3,4,8,7,6,9,4,2,7,7,7,7
4,4,0,4,1,8,3,1,8,4,3,2,0


#### Write train and test data to separate files

In [7]:
project_path = "/home/leon/projects/test-sequifier"
data.iloc[:10000,:].to_csv(f"{project_path}/train_data.csv", sep=",", decimal=".", index=None)
data.iloc[10000:,:].to_csv(f"{project_path}/test_data.csv", sep=",", decimal=".", index=None)

#### Train a model on train data using the sequifier cli and infer on test data

#### Load predictions from inference

In [8]:
preds = pd.read_csv(f"{project_path}/outputs/predictions/sequifier-default-best_predictions.csv")

#### Evaluate transformer test set accuracy

In [9]:
np.mean(target[10000:] == preds["0"].values)

0.8503

#### Train and evaluate random forest classifier

In [10]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000)
rf.fit(data.iloc[:10000,1:-1], data.iloc[:10000,-1])
rf_preds = rf.predict(data.iloc[10000:,1:-1])
np.mean(rf_preds==target[10000:])

0.6419

#### Train and evaluate XGBoost classifier

In [11]:
import xgboost as xgb
D_train = xgb.DMatrix(data.iloc[:10000,1:-1].values, label=data.iloc[:10000,-1].values)
D_test = xgb.DMatrix(data.iloc[10000:,1:-1].values, label=data.iloc[10000:,-1].values)
param = {
    'eta': 0.3, 
    'max_depth': 5,  
    'objective': 'multi:softprob',  
    'num_class': 10
} 

steps = 1000  # The number of training iterations
model = xgb.train(param, D_train, steps)

xg_preds = model.predict(D_test)
xg_best_preds = np.asarray([np.argmax(line) for line in xg_preds])
np.mean(xg_best_preds==target[10000:])



0.6967

#### Train and evaluate logistic regression

In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(data.iloc[:,1:-1])
lr = LogisticRegression(penalty='none', max_iter=10000)
lr.fit(X_encoded[:10000,:], target[:10000])
lr_preds = lr.predict(X_encoded[10000:,:])
np.mean(lr_preds==target[10000:])

0.6359