## Classical ML on embeddings

In [16]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import classification_report


BASE = "../data/processed_data/embedded_data"

# Load embeddings
X_train = np.load(f"{BASE}/train_X.npy")
y_train = np.load(f"{BASE}/train_y.npy")

X_val = np.load(f"{BASE}/val_X.npy")
y_val = np.load(f"{BASE}/val_y.npy")

X_testA = np.load(f"{BASE}/testA_X.npy")
y_testA = np.load(f"{BASE}/testA_y.npy")

X_testB = np.load(f"{BASE}/testB_X.npy")
y_testB = np.load(f"{BASE}/testB_y.npy")

# Train XGBoost model
model = xgb.XGBClassifier(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    tree_method="hist"
)

model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

# Evaluate
print("\nVALIDATION RESULTS:")
print(classification_report(y_val, model.predict(X_val)))

print("\nTEST A RESULTS:")
print(classification_report(y_testA, model.predict(X_testA)))

print("\nTEST B RESULTS:")
print(classification_report(y_testB, model.predict(X_testB)))

[0]	validation_0-logloss:0.66294
[1]	validation_0-logloss:0.63541
[2]	validation_0-logloss:0.61016
[3]	validation_0-logloss:0.58638
[4]	validation_0-logloss:0.56460
[5]	validation_0-logloss:0.54421
[6]	validation_0-logloss:0.52501
[7]	validation_0-logloss:0.50733
[8]	validation_0-logloss:0.49051
[9]	validation_0-logloss:0.47485
[10]	validation_0-logloss:0.46033
[11]	validation_0-logloss:0.44650
[12]	validation_0-logloss:0.43334
[13]	validation_0-logloss:0.42066
[14]	validation_0-logloss:0.40896
[15]	validation_0-logloss:0.39783
[16]	validation_0-logloss:0.38736
[17]	validation_0-logloss:0.37758
[18]	validation_0-logloss:0.36822
[19]	validation_0-logloss:0.35922
[20]	validation_0-logloss:0.35071
[21]	validation_0-logloss:0.34257
[22]	validation_0-logloss:0.33464
[23]	validation_0-logloss:0.32708
[24]	validation_0-logloss:0.32006
[25]	validation_0-logloss:0.31343
[26]	validation_0-logloss:0.30703
[27]	validation_0-logloss:0.30106
[28]	validation_0-logloss:0.29536
[29]	validation_0-loglos

In [20]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


print("Shapes:")
print("Train:", X_train.shape, y_train.shape)
print("Val:", X_val.shape, y_val.shape)
print("TestA:", X_testA.shape, y_testA.shape)
print("TestB:", X_testB.shape, y_testB.shape)

# -----------------------------
# Train Random Forest
rf = RandomForestClassifier(
    n_estimators=200,    # more trees = stronger model
    max_depth=40,      # allow deep splits
    n_jobs=-1,           # use all CPU cores
    class_weight="balanced_subsample"
)

print("\nTraining Random Forest...")
rf.fit(X_train, y_train)


# Evaluation
print("\nVALIDATION RESULTS:")
val_pred = rf.predict(X_val)
print(classification_report(y_val, val_pred))

print("\nTEST A RESULTS:")
testA_pred = rf.predict(X_testA)
print(classification_report(y_testA, testA_pred))

print("\nTEST B RESULTS:")
testB_pred = rf.predict(X_testB)
print(classification_report(y_testB, testB_pred))


Shapes:
Train: (93838, 768) (93838,)
Val: (30000, 768) (30000,)
TestA: (55000, 768) (55000,)
TestB: (105000, 768) (105000,)

Training Random Forest...

VALIDATION RESULTS:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     25000
           1       0.92      1.00      0.96      5000

    accuracy                           0.98     30000
   macro avg       0.96      0.99      0.97     30000
weighted avg       0.99      0.98      0.99     30000


TEST A RESULTS:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     50000
           1       0.84      1.00      0.91      5000

    accuracy                           0.98     55000
   macro avg       0.92      0.99      0.95     55000
weighted avg       0.99      0.98      0.98     55000


TEST B RESULTS:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    100000
           1       0.73      1.

In [18]:
X_testB

array([[ 0.2244125 , -0.16956939,  0.27139345, ...,  0.1413871 ,
         0.4591037 , -0.07288239],
       [ 0.38217783, -0.24664646,  0.37465507, ...,  0.1938035 ,
         0.03496653,  0.23313862],
       [ 0.06724552, -0.2850793 ,  0.5696657 , ...,  0.16982377,
         0.6484569 ,  0.0658012 ],
       ...,
       [ 0.537127  , -0.26793212,  0.11271805, ...,  0.2048    ,
         0.2321334 ,  0.1289423 ],
       [ 0.05377357, -0.19055726,  0.3637885 , ...,  0.21040887,
         0.8144214 , -0.1019829 ],
       [ 0.08538476,  0.09404416,  0.60888296, ...,  0.2659546 ,
         0.7134516 , -0.06314323]], shape=(105000, 768), dtype=float32)