In [102]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_text
import json
import os
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

In [103]:
# Load the EURUSD bars (with precomputed indicators and signal labels) and
# parse the "datetime" column into timestamps.
df = pd.read_csv(
    "../data/EURUSD_with_signals_with_indicators_and_combos",
    parse_dates=["datetime"],
)

In [104]:
# Feature combinations to evaluate; iterated later as
# {combo id: list of feature column names}.
with open("../data/selected_combos.json", "r") as combos_file:
    combos = json.loads(combos_file.read())

# NOTE(review): rule_map is loaded here but not referenced by any cell
# visible in this notebook section; confirm whether it is still needed.
with open("../data/cluster_rule_map.json", "r") as rule_map_file:
    rule_map = json.loads(rule_map_file.read())

In [105]:
# Accumulators filled by the per-combo loop below:
#   all_rules    -> exported decision-tree rule text, keyed by combo id
#   all_clusters -> one label slot per row of df (NaN until a label is assigned)
all_rules = dict()
all_clusters = pd.Series(dtype="object", index=df.index)

# Display only: dropna() returns a new frame and the result is not assigned,
# so df itself still contains its NaN rows after this cell.
df.dropna()

Unnamed: 0,datetime,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<TICKVOL>,ema_5,ema_10,ema_20,ema_50,...,bb_lower,bb_mid,bb_bandwidth,stoch_k,stoch_d,atr,adx,adx_pos,adx_neg,signal
199,2024-02-16 04:00:00,1.07648,1.07651,1.07572,1.07602,6534,1.075951,1.075136,1.074794,1.076143,...,1.068727,1.073715,0.929060,72.777778,79.222222,0.001970,26.150316,23.374136,15.772366,Sell_Hold
200,2024-02-16 08:00:00,1.07603,1.07717,1.07585,1.07661,13414,1.076171,1.075404,1.074967,1.076162,...,1.068731,1.073712,0.927751,79.333333,76.666667,0.001924,26.025017,24.679268,14.999245,Sell_Hold
201,2024-02-16 12:00:00,1.07661,1.07800,1.07321,1.07460,15180,1.075647,1.075258,1.074932,1.076100,...,1.068796,1.073608,0.896504,57.000000,69.703704,0.002128,24.290867,20.711790,21.448288,Buy
202,2024-02-16 16:00:00,1.07465,1.07779,1.07409,1.07777,24181,1.076355,1.075715,1.075202,1.076166,...,1.068688,1.073664,0.926928,92.081448,76.138260,0.002241,22.680584,18.268694,18.918317,Buy_Hold
203,2024-02-16 20:00:00,1.07778,1.07875,1.07728,1.07762,10346,1.076776,1.076061,1.075433,1.076223,...,1.068918,1.074002,0.946774,86.049383,78.376944,0.002185,21.527459,20.528565,18.009403,Buy_Hold
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2304,2025-06-25 00:00:00,1.16072,1.16267,1.16034,1.16251,4985,1.160998,1.158859,1.156128,1.152706,...,1.144631,1.154753,1.753035,91.325173,88.540004,0.003703,22.527830,28.086640,13.722630,Sell
2305,2025-06-25 04:00:00,1.16251,1.16313,1.16109,1.16137,7430,1.161122,1.159316,1.156627,1.153046,...,1.145075,1.155327,1.774832,85.258116,86.712790,0.003584,23.477471,27.861516,13.164756,Sell_Hold
2306,2025-06-25 08:00:00,1.16138,1.16226,1.15899,1.16099,9715,1.161078,1.159620,1.157043,1.153358,...,1.145327,1.155763,1.805919,83.235764,86.606351,0.003562,23.399018,26.034469,16.512767,Sell_Hold
2307,2025-06-25 12:00:00,1.16098,1.16133,1.15901,1.15928,8906,1.160479,1.159558,1.157256,1.153590,...,1.145648,1.156092,1.806768,74.135178,80.876353,0.003473,23.326169,24.792284,15.724891,Hold


In [106]:
# Fit one shallow decision tree per feature combination, store its textual
# rules in all_rules, and tag the rows that were usable for that combination
# in all_clusters.
for combo_id, features in combos.items():
    print(f"\n🔍 Clustering for {combo_id}: {features}")

    # Drop rows with a NaN in any of this combo's features or in the target,
    # so the tree is never fitted on missing values.
    valid_df = df[features + ["signal"]].dropna()
    if valid_df.empty:
        print(f"⚠️  Skipping {combo_id} due to insufficient data after dropna.")
        continue

    X = valid_df[features]
    y = valid_df["signal"]

    # The tree is trained on integer-encoded signal labels, so the exported
    # rules refer to encoded classes; le.classes_ holds the original labels.
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    # Fixed depth and seed keep the rule set small and reproducible.
    tree = DecisionTreeClassifier(max_depth=4, random_state=42)
    tree.fit(X, y_encoded)

    rules = export_text(tree, feature_names=features)
    all_rules[combo_id] = rules

    # Tag every row usable for this combo with the combo id (e.g. "C10").
    # The tree's predictions were previously computed here but never used,
    # so that dead call has been removed.
    # NOTE(review): this assignment overwrites labels written by earlier
    # combos, so each row ends up with the id of the LAST combo that had
    # complete data for it, not a per-row cluster derived from the tree.
    # Confirm whether a leaf- or prediction-based label was intended.
    all_clusters.loc[valid_df.index] = combo_id




🔍 Clustering for C10: ['bb_bandwidth', 'rsi', 'stoch_k']

🔍 Clustering for C11: ['macd_diff', 'rsi', 'atr']

🔍 Clustering for C12: ['stoch_k', 'rsi', '<TICKVOL>']

🔍 Clustering for C13: ['macd_diff', 'ema_50', 'bb_bandwidth']

🔍 Clustering for C14: ['bb_upper', 'rsi']

🔍 Clustering for C15: ['bb_lower', 'rsi']

🔍 Clustering for C16: ['macd', 'macd_signal']


In [107]:
# Attach the per-row labels collected in all_clusters as a new "cluster" column;
# both share df's index, so rows that never received a label stay NaN.
df["cluster"] = all_clusters

In [108]:
# Dump the exported tree rules to a text file, one "--- <combo id> ---"
# section per combo, each followed by a blank line.
with open("../data/cluster_rules.txt", "w") as rules_file:
    for cid, rule_str in all_rules.items():
        rules_file.write(f"--- {cid} ---\n{rule_str}\n\n")

In [109]:
# Persist the frame, including the new "cluster" column, without writing
# the row index as an extra column.
df.to_csv(
    "../data/EURUSD_with_signals_with_clusters.csv",
    index=False,
)