In [3]:
!pip install pytorch_tabnet -q
!pip install tab_transformer_pytorch -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m101.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m94.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m835.8 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from pytorch_tabnet.tab_model import TabNetClassifier
from tab_transformer_pytorch import TabTransformer

import joblib

## Read and preprocess the data

In [5]:
# Colab-only: mount Google Drive at /content/drive so the dataset files read
# below are reachable from the notebook's filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Load the first parquet snapshot (file dated 2024-12-06) from the mounted Drive.
df = pd.read_parquet("/content/drive/MyDrive/datasets/reddit_parser_2024_12_06_prices.parquet")

In [8]:
# Load the second parquet snapshot (file dated 2025-03-10) from the mounted Drive.
df2 = pd.read_parquet('/content/drive/MyDrive/datasets/reddit_parser_2025_03_10_processed.parquet')

In [9]:
# Display (rows, columns) of both snapshots side by side.
df.shape, df2.shape

((8416, 21), (881, 21))

In [10]:
def preprocess_data(df):
    """Return a modelling frame with a binary ``target`` and the kept features.

    ``target`` is 1 where ``price_1d`` is strictly greater than
    ``created_price`` and 0 otherwise. Identifier, text, and look-ahead price
    columns are then dropped, ``processed_text_length`` is renamed to
    ``text_length`` when present, and remaining missing values become 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain ``price_1d`` and ``created_price``.
        The frame passed in is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the ``target`` column appended.
    """
    drop_cols = ['id', 'title', 'url', 'created_utc', 'parsed_utc',
                 'text', 'parent_id', 'clean_text', 'processed_text',
                 'entities', 'tickers', 'price_1d', 'doc_embedding']

    # NOTE: a comparison against a missing price evaluates to False, so rows
    # with a NaN in either price column are labelled 0.
    target = np.where(df['price_1d'] > df['created_price'], 1, 0)

    # `assign` returns a new frame, so the caller's DataFrame never gains a
    # `target` column as a side effect of calling this function.
    return (
        df.assign(target=target)
          .drop(columns=drop_cols, errors='ignore')
          .rename(columns={"processed_text_length": "text_length"}, errors='ignore')
          .fillna(0)
    )

# Run the same preprocessing on both snapshots and rebind each name to the result.
df = preprocess_data(df)
df2 = preprocess_data(df2)

In [11]:
# Preview the first rows of the preprocessed first snapshot.
df.head()

Unnamed: 0,subreddit,score,num_comments,type,text_length,sentiment_scores,ticker,created_price,target
0,investing,3,0.0,comment,594,-0.0772,PATH,14.94,0
1,investing,1,0.0,comment,12,0.0,KAR,20.25,0
2,investing,4,0.0,comment,433,-0.6652,QQQ,521.799988,1
3,investing,102,92.0,submission,1362,-0.9131,AI,37.490002,1
4,investing,1,0.0,comment,332,-0.128,AI,37.490002,1


In [12]:
# Print column dtypes and non-null counts of the preprocessed first snapshot.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8416 entries, 0 to 8415
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   subreddit         8416 non-null   object 
 1   score             8416 non-null   int64  
 2   num_comments      8416 non-null   float64
 3   type              8416 non-null   object 
 4   text_length       8416 non-null   int64  
 5   sentiment_scores  8416 non-null   float64
 6   ticker            8416 non-null   object 
 7   created_price     8416 non-null   float64
 8   target            8416 non-null   int64  
dtypes: float64(3), int64(3), object(3)
memory usage: 591.9+ KB


In [13]:
# Stack both snapshots row-wise. `ignore_index=True` rebuilds a 0..n-1 index;
# without it the row labels of `df2` would duplicate labels already in `df`.
df = pd.concat([df, df2], axis=0, ignore_index=True)

# MLP

In [None]:
class TabularDataset(Dataset):
    """In-memory dataset yielding ``(numeric, categorical, target)`` per row.

    The three NumPy arrays are converted to tensors once at construction:
    numeric features as float32, categorical codes as int64, and the target
    as float32 with an extra axis inserted at position 1.
    """

    def __init__(self, X_num: np.ndarray, X_cat: np.ndarray, y: np.ndarray):
        numeric = torch.from_numpy(X_num)
        categorical = torch.from_numpy(X_cat)
        target = torch.from_numpy(y)

        self.X_num = numeric.to(torch.float32)
        self.X_cat = categorical.to(torch.int64)
        # Indexing with None at position 1 inserts the same axis as unsqueeze(1).
        self.y = target.to(torch.float32)[:, None]

    def __len__(self):
        # Number of rows equals the size of the target's first axis.
        return self.y.shape[0]

    def __getitem__(self, idx):
        row = (self.X_num[idx], self.X_cat[idx], self.y[idx])
        return row

In [None]:
class TabularMLP(nn.Module):
    """MLP binary classifier over numeric features and embedded categoricals.

    Each categorical column gets its own ``nn.Embedding``; the embeddings are
    concatenated with the numeric features and fed through a stack of
    ``Linear -> BatchNorm1d -> ReLU -> Dropout`` blocks followed by a single
    output unit.

    Parameters
    ----------
    num_numeric : int
        Number of numeric input features.
    cat_cardinalities : list
        Number of distinct codes for each categorical column.
    embedding_dims : list
        Embedding width for each categorical column; must have the same
        length as ``cat_cardinalities``.
    hidden_layers : sequence of int
        Width of each hidden block, in order. Any iterable of ints works;
        the default is an immutable tuple so it cannot be shared and mutated
        between instances.
    dropout : float
        Dropout probability applied after every hidden block.
    """

    def __init__(self,
                 num_numeric: int,
                 cat_cardinalities: list,
                 embedding_dims: list,
                 hidden_layers: tuple = (64, 32, 16),
                 dropout: float = 0.4):
        super().__init__()

        assert len(cat_cardinalities) == len(embedding_dims), \
            "cat_cardinalities and embedding_dims must have the same length"

        # One embedding table per categorical column, in column order.
        self.embeddings = nn.ModuleList([
            nn.Embedding(card, dim)
            for card, dim in zip(cat_cardinalities, embedding_dims)
        ])
        input_dim = num_numeric + sum(embedding_dims)

        layers = []
        in_dim = input_dim
        for h_dim in hidden_layers:
            layers.extend([
                nn.Linear(in_dim, h_dim),
                nn.BatchNorm1d(h_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            in_dim = h_dim
        # Single output unit; `forward` squashes it into a probability.
        layers.append(nn.Linear(in_dim, 1))

        self.model = nn.Sequential(*layers)

    def forward(self, x_num, x_cat):
        """Return probabilities of shape ``(batch, 1)``.

        ``x_num`` is indexed as ``(batch, num_numeric)`` and ``x_cat`` as
        ``(batch, num_cats)`` with one integer code per categorical column.
        """
        embedded = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        # Embeddings first, numeric features last, along the feature axis.
        x = torch.cat(embedded + [x_num], dim=1)
        return torch.sigmoid(self.model(x))

In [None]:
num_features = ['score', 'num_comments', 'text_length', 'sentiment_scores', 'created_price']
cat_features = ['subreddit', 'type', 'ticker']


# Replace every categorical column with its integer category code and keep the
# code -> label lookup (the categories index) for each column in `cat_maps`.
# NOTE(review): this overwrites the original values in `df`; any later cell
# that reads these columns sees integer codes rather than the raw labels.
cat_maps = {}
for col in cat_features:
    as_category = df[col].astype('category')
    cat_maps[col] = as_category.cat.categories
    df[col] = as_category.cat.codes

In [None]:
# Standardise the numeric columns: fit the scaler on the numeric block of
# `df`, then transform that same block.
scaler = StandardScaler().fit(df[num_features].values)
X_num = scaler.transform(df[num_features].values)
# Categorical codes and the binary target as plain NumPy arrays.
X_cat = df.loc[:, cat_features].values
y = df['target'].to_numpy()

In [None]:
# Split row positions once (same stratified 80/20 partition as splitting the
# arrays directly with this random_state) so every array is indexed identically.
row_ids = np.arange(len(y))
train_rows, val_rows = train_test_split(
    row_ids, test_size=0.2, random_state=69, stratify=y)

# Fit the scaler on the training rows only: fitting on all rows would let the
# validation rows influence the mean/variance used to scale the training data.
X_num_raw = df[num_features].values
scaler = StandardScaler()
X_num_train = scaler.fit_transform(X_num_raw[train_rows])
X_num_val = scaler.transform(X_num_raw[val_rows])

X_cat_train, X_cat_val = X_cat[train_rows], X_cat[val_rows]
y_train, y_val = y[train_rows], y[val_rows]

In [None]:
# Wrap each split in a dataset; only the training loader shuffles its batches.
train_ds, val_ds = (
    TabularDataset(X_num_train, X_cat_train, y_train),
    TabularDataset(X_num_val, X_cat_val, y_val),
)
train_loader = DataLoader(dataset=train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=64, shuffle=False)

In [None]:
# Number of distinct codes per categorical column, in `cat_features` order.
cat_cardinalities = [len(cat_maps[col]) for col in cat_features]

# Embedding width heuristic: half the cardinality (rounded up), capped at 50.
embedding_dims = [min(50, (card+1)//2) for card in cat_cardinalities]

model = TabularMLP(
    num_numeric=len(num_features),
    cat_cardinalities=cat_cardinalities,
    embedding_dims=embedding_dims,
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model.to(device)

# The model's forward pass already applies a sigmoid, hence BCELoss on probabilities.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
# Halve the learning rate once the monitored loss stops improving.
# `verbose` is deprecated on PyTorch LR schedulers, so it is not passed; read
# `optimizer.param_groups[0]['lr']` when the current rate is needed.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=3)



In [None]:
NUM_EPOCHS = 50
VAL_LOG_EVERY = 10  # print validation metrics every N epochs

for epoch in range(1, NUM_EPOCHS + 1):

    # ---- training pass ----
    model.train()
    train_loss = 0.0
    all_preds, all_labels = [], []

    for xb_num, xb_cat, yb in train_loader:
        xb_num, xb_cat, yb = xb_num.to(device), xb_cat.to(device), yb.to(device)
        preds = model(xb_num, xb_cat)
        loss = criterion(preds, yb)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight each batch loss by its size so the epoch figure is a
        # per-sample average even when the last batch is smaller.
        train_loss += loss.item() * xb_num.size(0)
        preds_bin = (preds.detach().cpu().numpy() > 0.5).astype(int).flatten()
        all_preds.extend(preds_bin.tolist())
        all_labels.extend(yb.cpu().numpy().astype(int).flatten().tolist())

    train_loss /= len(train_ds)
    train_acc = accuracy_score(all_labels, all_preds)
    train_f1  = f1_score(all_labels, all_preds)
    print(f"Epoch {epoch} | Train Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}")

    # ---- validation pass (every epoch) ----
    # ReduceLROnPlateau counts `patience` in calls to `step`. Stepping it only
    # on every 10th epoch meant patience=3 needed 40 epochs without improvement
    # before the first reduction, so the scheduler is now fed the validation
    # loss once per epoch.
    model.eval()
    val_loss = 0.0
    val_preds, val_labels = [], []
    with torch.no_grad():
        for xb_num, xb_cat, yb in val_loader:
            xb_num, xb_cat, yb = xb_num.to(device), xb_cat.to(device), yb.to(device)
            preds = model(xb_num, xb_cat)
            val_loss += criterion(preds, yb).item() * xb_num.size(0)
            preds_bin = (preds.cpu().numpy() > 0.5).astype(int).flatten()
            val_preds.extend(preds_bin.tolist())
            val_labels.extend(yb.cpu().numpy().astype(int).flatten().tolist())

    val_loss /= len(val_ds)

    # Keep the log compact: validation metrics are printed every 10th epoch only.
    if epoch % VAL_LOG_EVERY == 0:
        val_acc = accuracy_score(val_labels, val_preds)
        val_f1  = f1_score(val_labels, val_preds)
        print(f"----- Validation @Epoch {epoch} | Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f} -----")

    scheduler.step(val_loss)


# Persist the weights together with the preprocessing objects needed to encode
# new rows. NOTE: the checkpoint holds pickled non-tensor objects (the scaler
# and the category indexes), not just tensors.
torch.save({
    'model_state_dict': model.state_dict(),
    'scaler': scaler,
    'cat_maps': cat_maps
}, 'tabular_mlp.pth')

Epoch 1 | Train Loss: 0.7022 | Acc: 0.5147 | F1: 0.4310
Epoch 2 | Train Loss: 0.6954 | Acc: 0.5233 | F1: 0.4644
Epoch 3 | Train Loss: 0.6884 | Acc: 0.5335 | F1: 0.4591
Epoch 4 | Train Loss: 0.6877 | Acc: 0.5362 | F1: 0.4721
Epoch 5 | Train Loss: 0.6856 | Acc: 0.5510 | F1: 0.4886
Epoch 6 | Train Loss: 0.6823 | Acc: 0.5634 | F1: 0.4943
Epoch 7 | Train Loss: 0.6818 | Acc: 0.5596 | F1: 0.5010
Epoch 8 | Train Loss: 0.6823 | Acc: 0.5564 | F1: 0.4984
Epoch 9 | Train Loss: 0.6790 | Acc: 0.5680 | F1: 0.5005
Epoch 10 | Train Loss: 0.6762 | Acc: 0.5735 | F1: 0.5148
----- Validation @Epoch 10 | Loss: 0.6855 | Acc: 0.5382 | F1: 0.4991 -----
Epoch 11 | Train Loss: 0.6749 | Acc: 0.5762 | F1: 0.5269
Epoch 12 | Train Loss: 0.6726 | Acc: 0.5826 | F1: 0.5263
Epoch 13 | Train Loss: 0.6656 | Acc: 0.5930 | F1: 0.5514
Epoch 14 | Train Loss: 0.6654 | Acc: 0.5932 | F1: 0.5521
Epoch 15 | Train Loss: 0.6586 | Acc: 0.6031 | F1: 0.5772
Epoch 16 | Train Loss: 0.6534 | Acc: 0.6097 | F1: 0.5798
Epoch 17 | Train Loss:

# TabNet


In [None]:
target_col = 'target'
categorical_features = ['subreddit', 'type', 'ticker']
numerical_features = ['score', 'num_comments', 'text_length', 'sentiment_scores', 'created_price']
# Numeric columns first, categorical columns last.
feature_cols = numerical_features + categorical_features

# One encoder per categorical column; each column in `df` is overwritten with
# the integer labels produced from its string representation.
label_encoders = {col: LabelEncoder() for col in categorical_features}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col].astype(str))

In [None]:
# Feature matrix and target as NumPy arrays, then a stratified 80/20 split.
X = df[feature_cols].values
y = df[target_col].values

X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=69
)

In [None]:
# Column positions of the numeric features inside `feature_cols`.
num_idxs = [feature_cols.index(col) for col in numerical_features]

# Scaling statistics come from the training split only.
scaler = StandardScaler().fit(X_train[:, num_idxs])
X_train_num = scaler.transform(X_train[:, num_idxs])
X_val_num = scaler.transform(X_val[:, num_idxs])

# Re-attach the untouched trailing columns (everything after the numeric ones).
X_train_pre = np.concatenate([X_train_num, X_train[:, len(numerical_features):]], axis=1)
X_val_pre = np.concatenate([X_val_num, X_val[:, len(numerical_features):]], axis=1)

# Categorical columns sit right after the numeric ones in the assembled matrix.
cat_idxs = list(range(len(numerical_features),
                      len(numerical_features) + len(categorical_features)))
cat_dims = [int(n_unique) for n_unique in df[categorical_features].nunique()]

In [None]:
# TabNet with 16-wide embeddings for each categorical column, Adam as the
# optimizer, and a StepLR schedule (step_size=10, gamma=0.5).
clf = TabNetClassifier(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=16,
    mask_type='entmax',
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr": 2e-2, "weight_decay": 1e-5},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params=dict(step_size=10, gamma=0.5),
)



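F1 is not available as a built-in TabNet evaluation metric, so accuracy and AUC are tracked during training instead; F1 is computed separately on the validation predictions afterwards.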

In [None]:
# Train with both splits registered as evaluation sets so accuracy and AUC are
# reported for each of them every epoch.
clf.fit(
    X_train_pre,
    y_train,
    eval_set=[(X_train_pre, y_train), (X_val_pre, y_val)],
    eval_name=['train', 'valid'],
    eval_metric=['accuracy', 'auc'],
    # schedule / stopping
    max_epochs=100,
    patience=10,
    # batching
    batch_size=256,
    virtual_batch_size=64,
    drop_last=False,
    num_workers=0,
)

epoch 0  | loss: 0.80084 | train_accuracy: 0.50758 | train_auc: 0.5033  | valid_accuracy: 0.49941 | valid_auc: 0.50226 |  0:00:01s
epoch 1  | loss: 0.70075 | train_accuracy: 0.50936 | train_auc: 0.50462 | valid_accuracy: 0.50772 | valid_auc: 0.51122 |  0:00:02s
epoch 2  | loss: 0.69562 | train_accuracy: 0.50787 | train_auc: 0.52121 | valid_accuracy: 0.5101  | valid_auc: 0.53032 |  0:00:04s
epoch 3  | loss: 0.69481 | train_accuracy: 0.51099 | train_auc: 0.53944 | valid_accuracy: 0.50713 | valid_auc: 0.52105 |  0:00:05s
epoch 4  | loss: 0.69406 | train_accuracy: 0.51411 | train_auc: 0.54411 | valid_accuracy: 0.50416 | valid_auc: 0.5346  |  0:00:07s
epoch 5  | loss: 0.69296 | train_accuracy: 0.53075 | train_auc: 0.55399 | valid_accuracy: 0.51781 | valid_auc: 0.54125 |  0:00:08s
epoch 6  | loss: 0.69061 | train_accuracy: 0.53491 | train_auc: 0.56092 | valid_accuracy: 0.51544 | valid_auc: 0.54207 |  0:00:10s
epoch 7  | loss: 0.69071 | train_accuracy: 0.55496 | train_auc: 0.5841  | valid_acc



In [None]:
# Score the fitted classifier on the validation split.
y_pred = clf.predict(X_val_pre)
acc, f1 = accuracy_score(y_val, y_pred), f1_score(y_val, y_pred)
print(f"Validation Accuracy: {acc:.4f}")
print(f"Validation F1 Score: {f1:.4f}")

# Serialise the whole classifier object to disk.
torch.save(clf, 'tabnet_model.pth')

Validation Accuracy: 0.5772
Validation F1 Score: 0.6118


# Tab Transformer

In [14]:
class TabularDataset(Dataset):
    """Dataset of pre-converted tensors: numeric, categorical and target.

    Numeric features are stored as float32, categorical codes as int64 and
    the target as float32 with an axis added at position 1.
    """

    def __init__(self, X_num, X_cat, y):
        self.X_num = torch.from_numpy(X_num).to(dtype=torch.float32)
        self.X_cat = torch.from_numpy(X_cat).to(dtype=torch.int64)
        self.y = torch.unsqueeze(torch.from_numpy(y).to(dtype=torch.float32), 1)

    def __len__(self):
        return self.y.size(0)

    def __getitem__(self, idx):
        numeric_row = self.X_num[idx]
        categorical_row = self.X_cat[idx]
        target_row = self.y[idx]
        return numeric_row, categorical_row, target_row

In [15]:
target_col = 'target'
categorical_features = ['subreddit','type','ticker']
numerical_features = ['score','num_comments','text_length','sentiment_scores','created_price']
# Numeric columns first, categorical columns last.
feature_cols = numerical_features + categorical_features

# One encoder per categorical column; each column in `df` is overwritten with
# the integer labels produced from its string representation.
label_encoders = {col: LabelEncoder() for col in categorical_features}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col].astype(str))

# Arrays for modelling, followed by a stratified 80/20 split.
y = df[target_col].values
X = df[feature_cols].values
X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [16]:
# Standardise numeric columns using statistics from the training split only.
num_idxs = [feature_cols.index(c) for c in numerical_features]
scaler = StandardScaler().fit(X_train[:, num_idxs])
X_train_num = scaler.transform(X_train[:, num_idxs])
X_val_num = scaler.transform(X_val[:, num_idxs])

# Categorical columns are pulled out separately and cast to integers.
cat_idxs = [feature_cols.index(c) for c in categorical_features]
X_train_cat = X_train[:, cat_idxs].astype(int)
X_val_cat = X_val[:, cat_idxs].astype(int)

# Per-feature (mean, std) of the scaled training numerics, one row per
# feature, which is the layout produced by stacking along dim=1.
train_cont_tensor = torch.tensor(X_train_num, dtype=torch.float)
cont_mean_std = torch.stack(
    (train_cont_tensor.mean(dim=0), train_cont_tensor.std(dim=0)),
    dim=1,
)


In [None]:
num_continuous = len(numerical_features)
# Number of distinct values per categorical column, in column order.
categories = tuple(int(n_unique) for n_unique in df[categorical_features].nunique())


# Build the TabTransformer; `model` and `transformer` refer to the same module.
model = transformer = TabTransformer(
    categories=categories,
    num_continuous=num_continuous,
    continuous_mean_std=cont_mean_std,
    # transformer trunk
    dim=32,
    depth=6,
    heads=8,
    attn_dropout=0.1,
    ff_dropout=0.1,
    # MLP head
    mlp_hidden_mults=(4,2),
    mlp_act=nn.ReLU(),
    dim_out=1,
)


# Prefer the GPU when one is visible to PyTorch.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

In [18]:
# Datasets for each split; only the training loader shuffles its batches.
train_ds = TabularDataset(X_num=X_train_num, X_cat=X_train_cat, y=y_train)
val_ds = TabularDataset(X_num=X_val_num, X_cat=X_val_cat, y=y_val)
train_loader = DataLoader(train_ds, shuffle=True, batch_size=64)
val_loader = DataLoader(val_ds, shuffle=False, batch_size=64)

In [19]:
# The model emits raw logits, so the loss applies the sigmoid internally.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
# Halve the learning rate once the monitored loss stops improving.
# `verbose` is deprecated on PyTorch LR schedulers, so it is not passed; read
# `optimizer.param_groups[0]['lr']` when the current rate is needed.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)



In [20]:
EPOCHS = 50
VAL_LOG_EVERY = 10  # print validation metrics every N epochs

for epoch in range(1, EPOCHS + 1):
    # ---- training pass ----
    model.train()
    train_loss, preds, labs = 0.0, [], []
    for xb_num, xb_cat, yb in train_loader:
        xb_num, xb_cat, yb = xb_num.to(device), xb_cat.to(device), yb.to(device)
        # The model is called with categorical inputs first, continuous second.
        logits = model(xb_cat, xb_num)
        loss = criterion(logits, yb)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight each batch loss by its size so the epoch figure is a
        # per-sample average even when the last batch is smaller.
        train_loss += loss.item() * xb_num.size(0)
        probs = torch.sigmoid(logits)
        preds.extend((probs.detach().cpu().numpy()>0.5).astype(int).flatten().tolist())
        labs.extend(yb.cpu().numpy().astype(int).flatten().tolist())
    train_loss /= len(train_ds)
    train_acc = accuracy_score(labs, preds)
    train_f1 = f1_score(labs, preds)
    print(f"Epoch {epoch} | Train Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}")

    # ---- validation pass (every epoch) ----
    # ReduceLROnPlateau counts `patience` in calls to `step`. Stepping it only
    # on every 10th epoch gave it five calls in 50 epochs, so patience=5 could
    # never trigger a reduction; it is now fed the validation loss each epoch.
    model.eval()
    val_loss, vpreds, vlabs = 0.0, [], []
    with torch.no_grad():
        for xb_num, xb_cat, yb in val_loader:
            xb_num, xb_cat, yb = xb_num.to(device), xb_cat.to(device), yb.to(device)
            logits = model(xb_cat, xb_num)
            val_loss += criterion(logits, yb).item() * xb_num.size(0)
            probs = torch.sigmoid(logits)
            vpreds.extend((probs.cpu().numpy()>0.5).astype(int).flatten().tolist())
            vlabs.extend(yb.cpu().numpy().astype(int).flatten().tolist())
    val_loss /= len(val_ds)

    # Keep the log compact: validation metrics are printed every 10th epoch only.
    if epoch % VAL_LOG_EVERY == 0:
        val_acc = accuracy_score(vlabs, vpreds)
        val_f1 = f1_score(vlabs, vpreds)
        print(f"--- Val @Epoch {epoch} | Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f} ---")

    scheduler.step(val_loss)

# Training is finished once the loop exits, so the artefacts are saved
# unconditionally (the former `if epoch == EPOCHS` guard was always true here).
torch.save(model.state_dict(), 'tabtransformer_lucidrains.pth')
joblib.dump(scaler, 'scaler_tt.pkl')
joblib.dump(label_encoders, 'label_encoders_tt.pkl')

Epoch 1 | Train Loss: 0.6997 | Acc: 0.5290 | F1: 0.4307
Epoch 2 | Train Loss: 0.6817 | Acc: 0.5592 | F1: 0.4623
Epoch 3 | Train Loss: 0.6728 | Acc: 0.5818 | F1: 0.5113
Epoch 4 | Train Loss: 0.6657 | Acc: 0.5888 | F1: 0.5088
Epoch 5 | Train Loss: 0.6608 | Acc: 0.5982 | F1: 0.5497
Epoch 6 | Train Loss: 0.6493 | Acc: 0.6048 | F1: 0.5548
Epoch 7 | Train Loss: 0.6422 | Acc: 0.6141 | F1: 0.5633
Epoch 8 | Train Loss: 0.6316 | Acc: 0.6177 | F1: 0.5702
Epoch 9 | Train Loss: 0.6205 | Acc: 0.6312 | F1: 0.5974
Epoch 10 | Train Loss: 0.6110 | Acc: 0.6368 | F1: 0.5892
--- Val @Epoch 10 | Loss: 0.6959 | Acc: 0.5785 | F1: 0.5063 ---
Epoch 11 | Train Loss: 0.6012 | Acc: 0.6460 | F1: 0.6066
Epoch 12 | Train Loss: 0.5962 | Acc: 0.6474 | F1: 0.6141
Epoch 13 | Train Loss: 0.5857 | Acc: 0.6538 | F1: 0.6076
Epoch 14 | Train Loss: 0.5763 | Acc: 0.6519 | F1: 0.6237
Epoch 15 | Train Loss: 0.5717 | Acc: 0.6577 | F1: 0.6332
Epoch 16 | Train Loss: 0.5705 | Acc: 0.6610 | F1: 0.6266
Epoch 17 | Train Loss: 0.5610 | A