## Classical ML on embeddings

In [1]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import classification_report


BASE = "../data/processed_data/embedded_data"

# Load embeddings
X_train = np.load(f"{BASE}/train_X.npy")
y_train = np.load(f"{BASE}/train_y.npy")

X_val = np.load(f"{BASE}/val_X.npy")
y_val = np.load(f"{BASE}/val_y.npy")

X_testA = np.load(f"{BASE}/testA_X.npy")
y_testA = np.load(f"{BASE}/testA_y.npy")

#X_testB = np.load(f"{BASE}/testB_X.npy")
#y_testB = np.load(f"{BASE}/testB_y.npy")

# Train XGBoost model
model = xgb.XGBClassifier(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    tree_method="hist"
)

model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

# Evaluate
print("\nVALIDATION RESULTS:")
print(classification_report(y_val, model.predict(X_val)))

print("\nTEST A RESULTS:")
print(classification_report(y_testA, model.predict(X_testA)))

print("\nTEST B RESULTS:")
#print(classification_report(y_testB, model.predict(X_testB)))

[0]	validation_0-logloss:0.68015
[1]	validation_0-logloss:0.66858
[2]	validation_0-logloss:0.65766
[3]	validation_0-logloss:0.64727
[4]	validation_0-logloss:0.63788
[5]	validation_0-logloss:0.62862
[6]	validation_0-logloss:0.61994
[7]	validation_0-logloss:0.61252
[8]	validation_0-logloss:0.60496
[9]	validation_0-logloss:0.59744
[10]	validation_0-logloss:0.59038
[11]	validation_0-logloss:0.58349
[12]	validation_0-logloss:0.57710
[13]	validation_0-logloss:0.57066
[14]	validation_0-logloss:0.56481
[15]	validation_0-logloss:0.55857
[16]	validation_0-logloss:0.55325
[17]	validation_0-logloss:0.54768
[18]	validation_0-logloss:0.54316
[19]	validation_0-logloss:0.53858
[20]	validation_0-logloss:0.53400
[21]	validation_0-logloss:0.52918
[22]	validation_0-logloss:0.52454
[23]	validation_0-logloss:0.52029
[24]	validation_0-logloss:0.51593
[25]	validation_0-logloss:0.51209
[26]	validation_0-logloss:0.50818
[27]	validation_0-logloss:0.50464
[28]	validation_0-logloss:0.50088
[29]	validation_0-loglos

In [2]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


print("Shapes:")
print("Train:", X_train.shape, y_train.shape)
print("Val:", X_val.shape, y_val.shape)
print("TestA:", X_testA.shape, y_testA.shape)
#print("TestB:", X_testB.shape, y_testB.shape)

# -----------------------------
# Train Random Forest
rf = RandomForestClassifier(
    n_estimators=200,    # more trees = stronger model
    max_depth=40,      # allow deep splits
    n_jobs=-1,           # use all CPU cores
    class_weight="balanced_subsample"
)

print("\nTraining Random Forest...")
rf.fit(X_train, y_train)


# Evaluation
print("\nVALIDATION RESULTS:")
val_pred = rf.predict(X_val)
print(classification_report(y_val, val_pred))

print("\nTEST A RESULTS:")
testA_pred = rf.predict(X_testA)
print(classification_report(y_testA, testA_pred))

print("\nTEST B RESULTS:")
#testB_pred = rf.predict(X_testB)
#print(classification_report(y_testB, testB_pred))


Shapes:
Train: (294274, 768) (294274,)
Val: (28146, 768) (28146,)
TestA: (103202, 768) (103202,)

Training Random Forest...

VALIDATION RESULTS:
              precision    recall  f1-score   support

           0       0.99      0.89      0.94     23455
           1       0.64      0.95      0.76      4691

    accuracy                           0.90     28146
   macro avg       0.81      0.92      0.85     28146
weighted avg       0.93      0.90      0.91     28146


TEST A RESULTS:
              precision    recall  f1-score   support

           0       0.99      0.89      0.94     93820
           1       0.46      0.95      0.62      9382

    accuracy                           0.90    103202
   macro avg       0.73      0.92      0.78    103202
weighted avg       0.95      0.90      0.91    103202


TEST B RESULTS:
