In [5]:
# ============================================
# 1. Imports & Global Settings
# ============================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import joblib
import json

plt.style.use("ggplot")
pd.set_option("display.max_columns", 200)


In [7]:
# ============================================
# 2. Load and Inspect Dataset
# ============================================

df = pd.read_csv("data/cac40_features.csv")

print("Shape:", df.shape)
df.head()


Shape: (4011, 14)


Unnamed: 0,symbol,date,open,high,low,close,volume,adjclose,Return,MA20,MA50,Volatility,RSI,Target
0,^FCHI,2010-03-12,3933.969971,3956.419922,3919.070068,3927.399902,114568600.0,3927.399902,-0.000395,3791.305493,3820.455796,0.01068,67.274531,0
1,^FCHI,2010-03-15,3921.0,3925.860107,3889.129883,3890.909912,94752800.0,3890.909912,-0.009291,3805.38999,3817.994595,0.01111,69.113996,1
2,^FCHI,2010-03-16,3914.389893,3951.469971,3910.810059,3938.949951,108838800.0,3938.949951,0.012347,3818.885486,3816.515396,0.010893,71.453827,1
3,^FCHI,2010-03-17,3954.300049,3967.51001,3943.169922,3957.889893,124548900.0,3957.889893,0.004808,3830.519482,3815.319795,0.010547,84.144428,0
4,^FCHI,2010-03-18,3936.939941,3964.780029,3923.26001,3938.179932,125638800.0,3938.179932,-0.00498,3840.036975,3813.587393,0.010671,77.565718,0


In [25]:
# ============================================
# 3. Features & Target
# ============================================

target = "Target"

# Remove non-numeric columns (ticker, date, etc.)
non_numeric = df.select_dtypes(exclude=["number"]).columns
print("Removed non-numeric columns:", non_numeric)

df = df.drop(columns=non_numeric)

features = [c for c in df.columns if c != target]

X = df[features]
y = df[target]


print("Number of features:", len(features))
features


Removed non-numeric columns: Index([], dtype='object')
Number of features: 11


['open',
 'high',
 'low',
 'close',
 'volume',
 'adjclose',
 'Return',
 'MA20',
 'MA50',
 'Volatility',
 'RSI']

In [27]:
# ============================================
# 4. Basic Preprocessing
# ============================================

# Drop missing rows
df = df.dropna()

# Target variable: assume "target" is binary: 0 = down, 1 = up
target = "Target"

# Features must contain only numeric columns
features = [col for col in df.columns if col != target]

X = df[features]
y = df[target]






In [29]:
# ============================================
# 5. Train / Validation / Test Split (60/20/20)
# ============================================

X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.25, random_state=42, stratify=y_train_full
)

print("Train:", X_train.shape)
print("Val:", X_val.shape)
print("Test:", X_test.shape)


Train: (2406, 11)
Val: (802, 11)
Test: (803, 11)


In [31]:
# ============================================
# 6. Scaling
# ============================================

scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_val_scaled   = scaler.transform(X_val)
X_test_scaled  = scaler.transform(X_test)

print("Scaling completed.")



Scaling completed.


In [34]:
# ============================================
# 7. Train & Tune Models
# ============================================

results = {}

# --------------------------
# 7.1 Logistic Regression
# --------------------------
log_params = {
    "C": [0.001, 0.01, 0.1],
    "penalty": ["l2"],
    "solver": ["lbfgs"]
}

log_reg = LogisticRegression(max_iter=5000)
grid_log = GridSearchCV(log_reg, log_params, cv=3, scoring="roc_auc")
grid_log.fit(X_train_scaled, y_train)

best_log = grid_log.best_estimator_
results["Logistic Regression"] = grid_log.best_score_


# --------------------------
# 7.2 Random Forest
# --------------------------
rf_params = {
    "n_estimators": [100, 300],
    "max_depth": [3, 5, 7],
    "min_samples_split": [5, 10],
    "min_samples_leaf": [1, 2]
}

rf = RandomForestClassifier()
grid_rf = GridSearchCV(rf, rf_params, cv=3, scoring="roc_auc")
grid_rf.fit(X_train_scaled, y_train)

best_rf = grid_rf.best_estimator_
results["Random Forest"] = grid_rf.best_score_


# --------------------------
# 7.3 XGBoost
# --------------------------
xgb_params = {
    "n_estimators": [100, 200],
    "max_depth": [2, 3],
    "learning_rate": [0.01, 0.05],
    "subsample": [0.5, 0.8],
    "colsample_bytree": [0.7, 1.0]
}

xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False
)

grid_xgb = GridSearchCV(xgb, xgb_params, cv=3, scoring="roc_auc")
grid_xgb.fit(X_train_scaled, y_train)

best_xgb = grid_xgb.best_estimator_
results["XGBoost"] = grid_xgb.best_score_

results


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

{'Logistic Regression': 0.5051173843690625,
 'Random Forest': 0.5181662956249806,
 'XGBoost': 0.5252482209217448}

In [36]:
# ============================================
# 8. Evaluation
# ============================================

def evaluate(model, name):
    print(f"\n========== {name} ==========")
    
    # Validation
    y_pred_val = model.predict(X_val_scaled)
    y_proba_val = model.predict_proba(X_val_scaled)[:,1]
    
    print("Validation Accuracy:", accuracy_score(y_val, y_pred_val))
    print("Validation F1:", f1_score(y_val, y_pred_val))
    print("Validation AUC:", roc_auc_score(y_val, y_proba_val))
    
    print("Validation Confusion Matrix:\n", confusion_matrix(y_val, y_pred_val))
    
    # Test
    y_pred_test = model.predict(X_test_scaled)
    y_proba_test = model.predict_proba(X_test_scaled)[:,1]
    
    print("\nTest Accuracy:", accuracy_score(y_test, y_pred_test))
    print("Test F1:", f1_score(y_test, y_pred_test))
    print("Test AUC:", roc_auc_score(y_test, y_proba_test))
    
    print("Test Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))


evaluate(best_log, "Logistic Regression")
evaluate(best_rf, "Random Forest")
evaluate(best_xgb, "XGBoost")



Validation Accuracy: 0.5236907730673317
Validation F1: 0.6873977086743044
Validation AUC: 0.4913799551234106
Validation Confusion Matrix:
 [[  0 382]
 [  0 420]]

Test Accuracy: 0.5242839352428393
Test F1: 0.6879084967320261
Test AUC: 0.5206563778587507
Test Confusion Matrix:
 [[  0 382]
 [  0 421]]

Validation Accuracy: 0.5286783042394015
Validation F1: 0.622
Validation AUC: 0.5164204687110446
Validation Confusion Matrix:
 [[113 269]
 [109 311]]

Test Accuracy: 0.5193026151930261
Test F1: 0.6147704590818364
Test AUC: 0.5098929250973125
Test Confusion Matrix:
 [[109 273]
 [113 308]]

Validation Accuracy: 0.5187032418952618
Validation F1: 0.6471663619744058
Validation AUC: 0.5203689852904512
Validation Confusion Matrix:
 [[ 62 320]
 [ 66 354]]

Test Accuracy: 0.5305105853051059
Test F1: 0.6630920464700626
Test AUC: 0.5303347800674036
Test Confusion Matrix:
 [[ 55 327]
 [ 50 371]]


In [38]:
# ============================================
# 9. Save Best Model + Scaler + Features
# ============================================

# Choose best model by AUC
test_aucs = {
    "log_reg": roc_auc_score(y_test, best_log.predict_proba(X_test_scaled)[:,1]),
    "rf": roc_auc_score(y_test, best_rf.predict_proba(X_test_scaled)[:,1]),
    "xgb": roc_auc_score(y_test, best_xgb.predict_proba(X_test_scaled)[:,1])
}

best_name = max(test_aucs, key=test_aucs.get)
print("Best model:", best_name)

if best_name == "log_reg":
    final_model = best_log
elif best_name == "rf":
    final_model = best_rf
else:
    final_model = best_xgb


# Save model
joblib.dump(final_model, "model.pkl")

# Save scaler
joblib.dump(scaler, "scaler.pkl")

# Save feature names
with open("features.json", "w") as f:
    json.dump(features, f)

print("Saved: model.pkl, scaler.pkl, features.json")


Best model: xgb
Saved: model.pkl, scaler.pkl, features.json


Best XGBoost: {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.5}



==== Logistic Regression ====

Validation AUC: 0.5027341914873626
Test AUC: 0.5155763239875389
Confusion Matrix (test):
[[303  72]
 [331  97]]

==== Random Forest ====

Validation AUC: 0.4997713677221335
Test AUC: 0.48225233644859816
Confusion Matrix (test):
[[340  35]
 [386  42]]

==== XGBoost ====

Validation AUC: 0.5157067242945285
Test AUC: 0.5012461059190031
Confusion Matrix (test):
[[369   6]
 [415  13]]


Unnamed: 0,name,val_auc,test_auc
0,Logistic Regression,0.502734,0.515576
1,Random Forest,0.499771,0.482252
2,XGBoost,0.515707,0.501246
