In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit,GridSearchCV,RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import warnings
import joblib
warnings.filterwarnings('ignore')

In [5]:
#XGBoost
train_df = pd.read_csv("/Users/aashnaased/Desktop/TrexQuant_ML_Task/final_train_normalized.csv")
test_df = pd.read_csv("/Users/aashnaased/Desktop/TrexQuant_ML_Task/final_test_normalized.csv")

train_df = train_df.drop(columns=["Date", "patent_expiry"])
test_df = test_df.drop(columns=["Date", "patent_expiry"])

train_df['target'] = (train_df['target_return'] > 0).astype(int)
test_df['target'] = (test_df['target_return'] > 0).astype(int)

X = train_df.drop(columns=['target_return', 'target'])
y = train_df['target']
X_test = test_df.drop(columns=['target_return', 'target'])
y_test = test_df['target']

#Cross Validation
tscv = TimeSeriesSplit(n_splits=5)
val_scores = []

print("Time Series Cross-Validation with Early Stopping")
for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    X_train_cv, X_val_cv = X.iloc[train_idx], X.iloc[val_idx]
    y_train_cv, y_val_cv = y.iloc[train_idx], y.iloc[val_idx]

    model = XGBClassifier(
        n_estimators=500,
        max_depth=4,
        learning_rate=0.001,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=1,
        reg_lambda=1,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    )

    model.fit(
        X_train_cv, y_train_cv,
        eval_set=[(X_val_cv, y_val_cv)],
        verbose=False
    )

    preds_cv = model.predict(X_val_cv)
    probas_cv = model.predict_proba(X_val_cv)[:, 1]

    acc = accuracy_score(y_val_cv, preds_cv)
    auc = roc_auc_score(y_val_cv, probas_cv)
    val_scores.append((acc, auc))

    print(f"Fold {fold+1} - Accuracy: {acc:.4f}, AUC: {auc:.4f}")


final_model = XGBClassifier(
    n_estimators=500,
    max_depth=4,
    learning_rate=0.001,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1,
    reg_lambda=1,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)
final_model.fit(
    X, y,
    eval_set=[(X_test, y_test)],
    verbose=False
)


train_preds = final_model.predict(X)
train_acc = accuracy_score(y, train_preds)

test_preds = final_model.predict(X_test)
test_probas = final_model.predict_proba(X_test)[:, 1]
test_acc = accuracy_score(y_test, test_preds)
test_auc = roc_auc_score(y_test, test_probas)

print("\n=== Final Evaluation ===")
print("Training Accuracy:", train_acc)
print("Test Accuracy:", test_acc)
print("Test AUC:", test_auc)
print("Average Validation Accuracy:", np.mean([s[0] for s in val_scores]))
print("Average Validation AUC:", np.mean([s[1] for s in val_scores]))
print("\n=== Classification Report on Test Set ===")
print(classification_report(y_test, test_preds))
#joblib.dump(model, 'best_model_xgb.pkl')

Time Series Cross-Validation with Early Stopping
Fold 1 - Accuracy: 0.7227, AUC: 0.8063
Fold 2 - Accuracy: 0.7713, AUC: 0.8380
Fold 3 - Accuracy: 0.7429, AUC: 0.8369
Fold 4 - Accuracy: 0.7449, AUC: 0.8242
Fold 5 - Accuracy: 0.6923, AUC: 0.7799

=== Final Evaluation ===
Training Accuracy: 0.7807342539575615
Test Accuracy: 0.7375455650060754
Test AUC: 0.808479862183676
Average Validation Accuracy: 0.7348178137651822
Average Validation AUC: 0.8170813354323329

=== Classification Report on Test Set ===
              precision    recall  f1-score   support

           0       0.75      0.77      0.76       443
           1       0.73      0.69      0.71       380

    accuracy                           0.74       823
   macro avg       0.74      0.73      0.74       823
weighted avg       0.74      0.74      0.74       823



In [6]:

tscv = TimeSeriesSplit(n_splits=5)


xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)


param_dist = {
    'n_estimators': [500, 750, 1000],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.001, 0.01, 0.05],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.7, 0.8],
    'reg_alpha': [0, 1],
    'reg_lambda': [1, 5]
}


random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=30,  
    scoring='roc_auc',
    cv=tscv,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X, y)

best_model = random_search.best_estimator_

print("\n=== Best Hyperparameters ===")
print(random_search.best_params_)


best_model.fit(X, y)



test_preds = best_model.predict(X_test)
test_probas = best_model.predict_proba(X_test)[:, 1]

print("\n=== Evaluation on Test Set ===")
print("Test Accuracy:", accuracy_score(y_test, test_preds))
print("Test AUC:", roc_auc_score(y_test, test_probas))
print("\nClassification Report:")
print(classification_report(y_test, test_preds))


train_preds = best_model.predict(X)
train_acc = accuracy_score(y, train_preds)

print("Training Accuracy:", train_acc)


joblib.dump(best_model, '/Users/aashnaased/Desktop/best_model_xgb.pkl')

Fitting 5 folds for each of 30 candidates, totalling 150 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.



=== Best Hyperparameters ===
{'subsample': 0.7, 'reg_lambda': 5, 'reg_alpha': 1, 'n_estimators': 1000, 'max_depth': 4, 'learning_rate': 0.01, 'colsample_bytree': 0.8}

=== Evaluation on Test Set ===
Test Accuracy: 0.7739975698663426
Test AUC: 0.8635380776998931

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.80      0.79       443
           1       0.76      0.74      0.75       380

    accuracy                           0.77       823
   macro avg       0.77      0.77      0.77       823
weighted avg       0.77      0.77      0.77       823

Training Accuracy: 0.8976086224317952


['/Users/aashnaased/Desktop/best_model_xgb.pkl']

[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=5, n_estimators=1000, reg_alpha=0, reg_lambda=1, subsample=0.7; total time=   0.6s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=4, n_estimators=750, reg_alpha=0, reg_lambda=1, subsample=0.7; total time=   0.5s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=5, n_estimators=500, reg_alpha=1, reg_lambda=5, subsample=0.7; total time=   0.6s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=3, n_estimators=750, reg_alpha=0, reg_lambda=1, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=5, n_estimators=500, reg_alpha=1, reg_lambda=5, subsample=0.8; total time=   0.4s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=1000, reg_alpha=1, reg_lambda=5, subsample=0.7; total time=   0.8s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=1000, reg_alpha=1, reg_lambda=5, subsample=0.7; total time=   0