In [38]:
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))

In [39]:
import pandas as pd
import numpy as np


In [40]:
df = pd.read_csv('../data/df1.csv').drop('Unnamed: 0', axis=1)
df

Unnamed: 0,Date,Return,Log Return,Volatility,Distance_MA50,Distance_EMA20,Cumulated_Return_5d,RSI14,Trend
0,2010-01-04 00:00:00-05:00,,,,,0.000000,,,Bear
1,2010-01-05 00:00:00-05:00,0.002647,0.002644,,,0.001256,,,Bear
2,2010-01-06 00:00:00-05:00,0.000704,0.000703,,,0.001240,,,Bear
3,2010-01-07 00:00:00-05:00,0.004222,0.004213,,,0.003882,,,Bear
4,2010-01-08 00:00:00-05:00,0.003328,0.003322,,,0.005466,,,Bear
...,...,...,...,...,...,...,...,...,...
3768,2024-12-23 00:00:00-05:00,0.005988,0.005970,0.008463,0.008392,-0.002118,-0.016642,43.149037,Range
3769,2024-12-24 00:00:00-05:00,0.011115,0.011054,0.008789,0.018946,0.008112,-0.001599,46.098031,Range
3770,2024-12-26 00:00:00-05:00,0.000067,0.000067,0.008719,0.018203,0.007394,0.029140,46.981964,Range
3771,2024-12-27 00:00:00-05:00,-0.010526,-0.010582,0.009024,0.006985,-0.002905,0.018619,40.972868,Range


In [41]:
df1 = df.copy()
df1['Trend'] = df1['Trend'].map(lambda x: 0 if x == 'Range' else 1 if x == 'Bear' else 2)


days = np.arange(1, 3774, 1)
df1['Date'] = days
df1.head()

Unnamed: 0,Date,Return,Log Return,Volatility,Distance_MA50,Distance_EMA20,Cumulated_Return_5d,RSI14,Trend
0,1,,,,,0.0,,,1
1,2,0.002647,0.002644,,,0.001256,,,1
2,3,0.000704,0.000703,,,0.00124,,,1
3,4,0.004222,0.004213,,,0.003882,,,1
4,5,0.003328,0.003322,,,0.005466,,,1


In [42]:
from sklearn.impute import KNNImputer

#handles missing values et rename the columns accordingly
KNNI = KNNImputer()
new = KNNI.fit_transform(df1)
df1 = pd.DataFrame(new)

df1.columns = [
    "Date", "Return", "Log Return",
    "Volatility", "Distance_MA50", "Distance_EMA20",
    "Cumulated_Return_5d", "RSI14", "Trend"
]
df1

Unnamed: 0,Date,Return,Log Return,Volatility,Distance_MA50,Distance_EMA20,Cumulated_Return_5d,RSI14,Trend
0,1.0,0.002459,0.002455,0.011635,0.045560,0.000000,0.003797,32.310590,1.0
1,2.0,0.002647,0.002644,0.011635,0.045560,0.001256,0.003797,32.310590,1.0
2,3.0,0.000704,0.000703,0.011635,0.045560,0.001240,0.003797,32.310590,1.0
3,4.0,0.004222,0.004213,0.011635,0.045560,0.003882,0.003797,32.310590,1.0
4,5.0,0.003328,0.003322,0.011635,0.045560,0.005466,0.003797,32.310590,1.0
...,...,...,...,...,...,...,...,...,...
3768,3769.0,0.005988,0.005970,0.008463,0.008392,-0.002118,-0.016642,43.149037,0.0
3769,3770.0,0.011115,0.011054,0.008789,0.018946,0.008112,-0.001599,46.098031,0.0
3770,3771.0,0.000067,0.000067,0.008719,0.018203,0.007394,0.029140,46.981964,0.0
3771,3772.0,-0.010526,-0.010582,0.009024,0.006985,-0.002905,0.018619,40.972868,0.0


In [43]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

In [44]:
X = df1.drop('Trend', axis=1)
y = df1['Trend']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()

scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Baseline

In [62]:
from collections import Counter
print("Distribution des classes:")
print(Counter(y_test))

# Baseline
baseline = max(Counter(y_test).values()) / len(y_test)
print(f"Baseline accuracy: {baseline:.2%}")

Distribution des classes:
Counter({0.0: 449, 2.0: 173, 1.0: 133})
Baseline accuracy: 59.47%


### Gaussian Naive Bayes model



In [45]:
NB1 = GaussianNB()
NB1.fit(X_train_scaled, y_train)

y_pred_NB = NB1.predict(X_test_scaled)

conf_matrix = confusion_matrix(y_test, y_pred_NB)
f1 = f1_score(y_test, y_pred_NB, average = 'macro')

accuracy = accuracy_score(y_test, y_pred_NB)

print(f'Confusion matrix for NB1: {conf_matrix}')
print(f'f1 score for NB1: {f1}')
print(f'Accuracy score for NB1: {accuracy}')

Confusion matrix for NB1: [[388   6  55]
 [103   4  26]
 [101  12  60]]
f1 score for NB1: 0.39307186268412525
Accuracy score for NB1: 0.5986754966887418


### Results

The metrics this model gives highlights that it is unable to capture the differences between the three regimes we try to predict. Another model may be able to be more precise adnd accurate based on the features the dataset has. 


### Random Forest Classifier

In [55]:
from sklearn.ensemble import RandomForestClassifier

In [56]:

df_ts = df.copy()
df_ts['Trend'] = df_ts['Trend'].map(lambda x: 0 if x == 'Range' else 1 if x == 'Bear' else 2)


days = np.arange(1, 3774, 1)
df_ts['Date'] = days
df_ts

Unnamed: 0,Date,Return,Log Return,Volatility,Distance_MA50,Distance_EMA20,Cumulated_Return_5d,RSI14,Trend
0,1,,,,,0.000000,,,1
1,2,0.002647,0.002644,,,0.001256,,,1
2,3,0.000704,0.000703,,,0.001240,,,1
3,4,0.004222,0.004213,,,0.003882,,,1
4,5,0.003328,0.003322,,,0.005466,,,1
...,...,...,...,...,...,...,...,...,...
3768,3769,0.005988,0.005970,0.008463,0.008392,-0.002118,-0.016642,43.149037,0
3769,3770,0.011115,0.011054,0.008789,0.018946,0.008112,-0.001599,46.098031,0
3770,3771,0.000067,0.000067,0.008719,0.018203,0.007394,0.029140,46.981964,0
3771,3772,-0.010526,-0.010582,0.009024,0.006985,-0.002905,0.018619,40.972868,0


In [57]:
0.8 * 3772

3017.6000000000004

In [58]:
#train test split 0.8 respecting temporality

X_timed = df_ts.drop(['Trend', 'Date'], axis=1)
y_timed = df_ts['Trend']

X_train_timed = X_timed[:3017]
X_test_timed = X_timed[3017:]
y_train_timed = y_timed[:3017]
y_test_timed = y_timed[3017:]

print(len(X_train_timed) == len(y_train_timed))
print(len(X_test_timed) == len(y_test_timed))
X_train_timed

True
True


Unnamed: 0,Return,Log Return,Volatility,Distance_MA50,Distance_EMA20,Cumulated_Return_5d,RSI14
0,,,,,0.000000,,
1,0.002647,0.002644,,,0.001256,,
2,0.000704,0.000703,,,0.001240,,
3,0.004222,0.004213,,,0.003882,,
4,0.003328,0.003322,,,0.005466,,
...,...,...,...,...,...,...,...
3012,-0.010633,-0.010690,0.012202,-0.007125,-0.014841,-0.021415,50.746202
3013,0.017759,0.017603,0.012910,0.009191,0.002401,0.002864,59.950144
3014,0.009999,0.009949,0.013109,0.017703,0.011228,-0.002692,58.642710
3015,0.006222,0.006203,0.013168,0.022391,0.015824,0.012441,63.852806


In [59]:


KNNI2 = KNNImputer()
KNNI2.fit(X_train_timed)
train_Imp = KNNI2.transform(X_train_timed)
test_Imp = KNNI2.transform(X_test_timed)



In [60]:
scaler_timed = StandardScaler()

scaler_timed.fit(train_Imp)
X_train_timed_scaled = scaler_timed.transform(train_Imp)
X_test_timed_scaled = scaler_timed.transform(test_Imp)

X_train_timed_scaled = pd.DataFrame(X_train_timed_scaled)
X_test_timed_scaled = pd.DataFrame(X_test_timed_scaled)


X_train_timed_scaled.columns = [
    "Return", "Log Return",
    "Volatility", "Distance_MA50", "Distance_EMA20",
    "Cumulated_Return_5d", "RSI14"
]
X_test_timed_scaled.columns = [
    "Return", "Log Return",
    "Volatility", "Distance_MA50", "Distance_EMA20",
    "Cumulated_Return_5d", "RSI14"
]


In [61]:
RFC2 = RandomForestClassifier(n_estimators= 100, 
                              min_samples_leaf=1, 
                              max_features= 0.4, 
                              random_state=0)


RFC2.fit(X_train_timed_scaled, y_train_timed)

y_pred = RFC2.predict(X_test_timed_scaled)

conf_matrix_timed = confusion_matrix(y_test_timed, y_pred)
f1_timed = f1_score(y_test_timed, y_pred, average = 'macro')

accuracy_timed = accuracy_score(y_test_timed, y_pred)

print(f'Confusion matrix for RFC1: {conf_matrix_timed}')
print(f'f1 score for RFC1: {f1_timed}')
print(f'Accuracy score for RFC1: {accuracy_timed}')

Confusion matrix for RFC1: [[256  16  53]
 [147  11  56]
 [139   7  71]]
f1 score for RFC1: 0.34564479875310944
Accuracy score for RFC1: 0.4470899470899471


When evaluated under a strictly time-ordered split, model performance
drops substantially, highlighting the impact of temporal leakage in
random splits and confirming the limited out-of-sample predictability
of market regimes.


### End of V0

Under a strictly time-ordered evaluation protocol, none of the tested models
were able to extract stable and generalizable patterns from the available
technical indicators. This suggests that, in its current formulation,
short-horizon multi-class regime classification is intrinsically difficult,
likely due to the weak and non-stationary nature of the underlying signal.