In [2]:
import os
import sys
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema
import numpy as np
import pandas as pd

project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
    
from data_handle import(
    ForexDataClean,
    ForexDataLoad
)

from features import ForexFeauturesExtractor

from plots import ForexPlotter
from models import TimeSeriesAutoencoder
import tensorflow as tf

import warnings
warnings.filterwarnings('ignore')

data = ForexDataLoad(file_path = '../data/usdjpy.csv').data

cleaner = ForexDataClean(data = data)
data = cleaner.fast_cleaner()

features = ForexFeauturesExtractor(
    data = data,
    momentum_parameters = {
        'rsi_periods' : [4, 6, 8, 10, 12, 14]*10,
        'stoch_fk_sk_sd_periods' : [[4, 1, 1], [6, 1, 1], [8, 2, 2], [10, 2, 2], [12, 3, 3], [14, 3, 3]]*10,
        'williams_periods' :  [4, 6, 8, 10, 12, 14]*10,
        'cci_periods' :  [4, 6, 8, 10, 12, 14]*10,
        'momentum_periods' : [4, 6, 8, 10, 12, 14]*10
    },
)

indicators_data, signals_data, extreme_data = features.extract_all_features()

FOREX DATA LOADER
 Available Fuctions 
1 load_csv 
2 load_from_database
Data loaded successfully!
Shape: (41476, 5)

FOREX DATA CLEANER
 Available Fuctions 
1 remove_duplicates 
2 handle_missing_values 
3 validate_ohlc_integrity 
4 handle_outliers 
5 fast_cleaner
Keep = first and subset = None
Removed 0 duplicate entries
OHLC DATA INTEGRITY VALIDATION
No OHLC integrity violations found
All OHLC values are consistent
No missing values found
OHLC DATA INTEGRITY VALIDATION
No OHLC integrity violations found
All OHLC values are consistent


In [3]:
print(signals_data['divergence'].head())

                       close  rsi_4_divergence  rsi_6_divergence  \
datetime                                                           
2018-01-12 08:00:00  111.327                 0                 0   
2018-01-12 09:00:00  111.368                 0                 0   
2018-01-12 10:00:00  111.223                 0                 0   
2018-01-12 11:00:00  111.107                 0                 0   
2018-01-12 12:00:00  111.078                 0                 0   

                     rsi_8_divergence  rsi_10_divergence  rsi_12_divergence  \
datetime                                                                      
2018-01-12 08:00:00                 0                  0                  0   
2018-01-12 09:00:00                 0                  0                  0   
2018-01-12 10:00:00                 0                  0                  0   
2018-01-12 11:00:00                 0                  0                  0   
2018-01-12 12:00:00                 0            

In [4]:
print(extreme_data.head())

                       close  Label_p5_o1  Label_p5_o2  Label_p5_o3  \
datetime                                                              
2018-01-12 08:00:00  111.327            0            0            0   
2018-01-12 09:00:00  111.368            2            2            2   
2018-01-12 10:00:00  111.223            0            0            0   
2018-01-12 11:00:00  111.107            0            0            0   
2018-01-12 12:00:00  111.078            1            1            1   

                     Label_p5_o4  Label_p5_o5  Label_p5_o10  Label_p5_o20  \
datetime                                                                    
2018-01-12 08:00:00            0            0             0             0   
2018-01-12 09:00:00            2            2             0             0   
2018-01-12 10:00:00            0            0             0             0   
2018-01-12 11:00:00            0            0             0             0   
2018-01-12 12:00:00            1        

In [5]:
merged_data = pd.merge(
    signals_data['divergence'],
    extreme_data, 
    left_index=True, 
    right_index=True, 
    how='inner', 
    suffixes=('_signals', '_extreme')
)

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.multioutput import MultiOutputClassifier
import matplotlib.pyplot as plt

In [11]:
def find_high_probability_features(merged_data, min_probability=0.7):

    results = []
    
    feature_columns = [col for col in merged_data.columns if not col.startswith('Label_')]
    label_columns = [col for col in merged_data.columns if col.startswith('Label_')]
    
    print(f"Αναλύοντας {len(feature_columns)} features vs {len(label_columns)} labels...")
    
    for feature in feature_columns:
        for label in label_columns:
            total_cases = len(merged_data)
            feature_1_cases = len(merged_data[merged_data[feature] == 1])
            
            if feature_1_cases > 0:
                label_1_given_feature_1 = len(
                    merged_data[(merged_data[feature] == 1) & (merged_data[label] == 1)]
                ) / feature_1_cases
                
                baseline_prob = len(merged_data[merged_data[label] == 1]) / total_cases
                
                if label_1_given_feature_1 >= min_probability:
                    results.append({
                        'feature': feature,
                        'label': label,
                        'p(label=1|feature=1)': label_1_given_feature_1,
                        'baseline_prob': baseline_prob,
                        'improvement': label_1_given_feature_1 - baseline_prob,
                        'feature_frequency': feature_1_cases / total_cases
                    })
    
    results_df = pd.DataFrame(results)
    
    if len(results_df) == 0:
        print(f"Δεν βρέθηκαν features με πιθανότητα >= {min_probability:.0%}")
        return pd.DataFrame()  
    
    sorted_results = results_df.sort_values(['p(label=1|feature=1)', 'improvement'], ascending=False)
    
    print(f"Βρέθηκαν {len(sorted_results)} κανόνες με πιθανότητα >= {min_probability:.0%}")
    return sorted_results

def comprehensive_analysis(merged_data):

    thresholds = [0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]
    
    for threshold in thresholds:
        print(f"\n{'='*60}")
        print(f"ΑΝΑΛΥΣΗ ΜΕ ΠΙΘΑΝΟΤΗΤΑ >= {threshold:.0%}")
        print(f"{'='*60}")
        
        results = find_high_probability_features(merged_data, min_probability=threshold)
        
        if len(results) > 0:
            print(f"\n TOP 10 ΚΑΝΟΝΕΣ ({threshold:.0%}+ πιθανότητα):")
            display(results.head(10))
            
            print(f"\nΑΝΑΛΥΣΗ ΑΝΑ FEATURE ({threshold:.0%}+):")
            feature_summary = results.groupby('feature').agg({
                'p(label=1|feature=1)': 'mean',
                'label': 'count'
            }).rename(columns={'label': 'num_labels'}).sort_values('num_labels', ascending=False)
            
            print(f"Πιο σημαντικά features:")
            display(feature_summary.head(10))
            
            break 
        else:
            print(f"Δεν βρέθηκαν κανόνες με πιθανότητα >= {threshold:.0%}")

# Τρέξε την ολοκληρωμένη ανάλυση
comprehensive_analysis(merged_data)


ΑΝΑΛΥΣΗ ΜΕ ΠΙΘΑΝΟΤΗΤΑ >= 80%
Αναλύοντας 34 features vs 35 labels...
Δεν βρέθηκαν features με πιθανότητα >= 80%
Δεν βρέθηκαν κανόνες με πιθανότητα >= 80%

ΑΝΑΛΥΣΗ ΜΕ ΠΙΘΑΝΟΤΗΤΑ >= 70%
Αναλύοντας 34 features vs 35 labels...
Δεν βρέθηκαν features με πιθανότητα >= 70%
Δεν βρέθηκαν κανόνες με πιθανότητα >= 70%

ΑΝΑΛΥΣΗ ΜΕ ΠΙΘΑΝΟΤΗΤΑ >= 60%
Αναλύοντας 34 features vs 35 labels...
Δεν βρέθηκαν features με πιθανότητα >= 60%
Δεν βρέθηκαν κανόνες με πιθανότητα >= 60%

ΑΝΑΛΥΣΗ ΜΕ ΠΙΘΑΝΟΤΗΤΑ >= 50%
Αναλύοντας 34 features vs 35 labels...
Δεν βρέθηκαν features με πιθανότητα >= 50%
Δεν βρέθηκαν κανόνες με πιθανότητα >= 50%

ΑΝΑΛΥΣΗ ΜΕ ΠΙΘΑΝΟΤΗΤΑ >= 40%
Αναλύοντας 34 features vs 35 labels...
Δεν βρέθηκαν features με πιθανότητα >= 40%
Δεν βρέθηκαν κανόνες με πιθανότητα >= 40%

ΑΝΑΛΥΣΗ ΜΕ ΠΙΘΑΝΟΤΗΤΑ >= 30%
Αναλύοντας 34 features vs 35 labels...
Δεν βρέθηκαν features με πιθανότητα >= 30%
Δεν βρέθηκαν κανόνες με πιθανότητα >= 30%

ΑΝΑΛΥΣΗ ΜΕ ΠΙΘΑΝΟΤΗΤΑ >= 20%
Αναλύοντας 34 features vs 35 labels...
Βρέθηκα

Unnamed: 0,feature,label,p(label=1|feature=1),baseline_prob,improvement,feature_frequency
45,cci_4_divergence,Label_p5_o1,0.283403,0.254434,0.028969,0.110791
46,cci_4_divergence,Label_p10_o1,0.283403,0.254434,0.028969,0.110791
47,cci_4_divergence,Label_p20_o1,0.283403,0.254434,0.028969,0.110791
48,cci_4_divergence,Label_p50_o1,0.283403,0.254434,0.028969,0.110791
49,cci_4_divergence,Label_p100_o1,0.283403,0.254434,0.028969,0.110791
15,williams_r_4_divergence,Label_p5_o1,0.276202,0.254434,0.021769,0.115345
16,williams_r_4_divergence,Label_p10_o1,0.276202,0.254434,0.021769,0.115345
17,williams_r_4_divergence,Label_p20_o1,0.276202,0.254434,0.021769,0.115345
18,williams_r_4_divergence,Label_p50_o1,0.276202,0.254434,0.021769,0.115345
19,williams_r_4_divergence,Label_p100_o1,0.276202,0.254434,0.021769,0.115345



ΑΝΑΛΥΣΗ ΑΝΑ FEATURE (20%+):
Πιο σημαντικά features:


Unnamed: 0_level_0,p(label=1|feature=1),num_labels
feature,Unnamed: 1_level_1,Unnamed: 2_level_1
bb_20_2.0_2.0_bullish_bearish,0.249411,5
cci_10_divergence,0.248586,5
williams_r_6_divergence,0.271078,5
williams_r_4_divergence,0.276202,5
williams_r_14_divergence,0.262484,5
williams_r_12_divergence,0.258007,5
williams_r_10_divergence,0.253635,5
stoch_slowk_3_divergence,0.222502,5
stoch_slowk_2_divergence,0.229112,5
stoch_slowk_1_divergence,0.271078,5
