#pre-run

In [1]:
!pip install tslearn
!pip install mplfinance
!pip install dtw
!pip install pandas-ta
!pip install numba
!pip install tqdm
!pip install Backtesting


import zipfile
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mplfinance as mpf
import math
from io import StringIO
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import os
from joblib import load
import csv
import shutil
from sklearn.neighbors import NearestNeighbors
import glob
import re
import pandas_ta as ta
import random
import pickle
from sklearn.neighbors import KDTree
import ast






#main

In [2]:
#functions



#preprocess data to points

def pips(data: np.array, n_pips: int, dist_measure: int):

    # dist_measure
    # 1 = Euclidean Distance
    # 2 = Perpindicular Distance
    # 3 = Vertical Distance




    #first phase converting
    n_pips = n_pips + 2


    pips_x = [0, len(data) - 1]  # Index
    pips_y = [data[0], data[-1]] # Price

    for curr_point in range(2, n_pips):

        md = 0.0 # Max distance
        md_i = -1 # Max distance index
        insert_index = -1

        for k in range(0, curr_point - 1):

            # Left adjacent, right adjacent indices
            left_adj = k
            right_adj = k + 1

            time_diff = pips_x[right_adj] - pips_x[left_adj]
            price_diff = pips_y[right_adj] - pips_y[left_adj]
            slope = price_diff / time_diff
            intercept = pips_y[left_adj] - pips_x[left_adj] * slope;

            for i in range(pips_x[left_adj] + 1, pips_x[right_adj]):

                d = 0.0 # Distance
                if dist_measure == 1: # Euclidean distance
                    d =  ( (pips_x[left_adj] - i) ** 2 + (pips_y[left_adj] - data[i]) ** 2 ) ** 0.5
                    d += ( (pips_x[right_adj] - i) ** 2 + (pips_y[right_adj] - data[i]) ** 2 ) ** 0.5
                elif dist_measure == 2: # Perpindicular distance
                    d = abs( (slope * i + intercept) - data[i] ) / (slope ** 2 + 1) ** 0.5
                else: # Vertical distance
                    d = abs( (slope * i + intercept) - data[i] )

                if d > md:
                    md = d
                    md_i = i
                    insert_index = right_adj

        pips_x.insert(insert_index, md_i)
        pips_y.insert(insert_index, data[md_i])



     #second phase converting
    points_no = len(pips_x)  #number of points
    points = [] #list of new converting data


    #the rate of change of y over the rate of change of x for every point
    for i in range(2,points_no) :

      x = ((pips_x[i] - pips_x[i-1])/ pips_x[i-1] ) * 100
      y = ((pips_y[i] - pips_y[i-1])/ pips_y[i-1] ) * 100

      z = y / x

      points.append(z)





    return points



#creating points dataset
def create_dataset(csv_file,no_of_points_in_raw,window_size=24,step=5,name="dataset.csv"):

    #load data
    data = pd.read_csv(csv_file)
    data['date'] = data['date'].astype('datetime64[s]')
    data = data.set_index('date')


    #dataset
    x = data['close'].to_numpy()



    #create dataset
    dataset = pd.DataFrame()


    # Iterate through the data
    for i in range(0, len(x) - window_size + 1, step):
        window_data = x[i:i+window_size]


        #get points
        points = pips(window_data, no_of_points_in_raw, 2)


        #creating the new record
        new_record = {
            'points': points

        }

        # Convert the new record to a DataFrame
        new_df = pd.DataFrame([new_record])

        # Concatenate the existing DataFrame with the new record DataFrame
        dataset = pd.concat([dataset, new_df], ignore_index=True)

    # Save the updated DataFrame to CSV
    dataset.to_csv(name, index=False)

    #return
    return dataset


def generate_unique_random_numbers(start, end, count):
    if end - start + 1 < count:
        raise ValueError("Count must be less than or equal to the range of numbers.")
    return random.sample(range(start, end + 1), count)


#clustering and saving clusters data
def cluster_and_save(dataset, n_clusters,folder="clusters"):
    # Preprocess data (scaling)
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(dataset)

    # Apply K-means clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(scaled_data)

    # Create a folder to save clusters if it doesn't exist
    if not os.path.exists(folder):
        os.makedirs(folder)

    # Convert numpy array to DataFrame
    dataset_df = pd.DataFrame(dataset, columns=[f'Feature_{i+1}' for i in range(dataset.shape[1])])

    # Iterate over each cluster
    for cluster_num in range(n_clusters):
        # Filter data points belonging to the current cluster
        cluster_data = dataset_df[cluster_labels == cluster_num]

        # Convert each row to a list and save as a single row containing a list of features
        cluster_data_list = cluster_data.values.tolist()

        #id = generate_unique_random_numbers(0,999,cluster_num+1)
        # Save the cluster data to a file
        #file_name = f'cluster_{cluster_num + 1}_{id}.csv'

        # Save the cluster data to a file
        file_name = f'cluster_{cluster_num + 1}.csv'
        file_path = os.path.join(folder, file_name)
        with open(file_path, 'w') as file:
            for row in cluster_data_list:
                file.write(','.join(map(str, row)) + '\n')
        print(f"Cluster {cluster_num + 1} saved to: {file_path}")


#determine the best number of clusters
def determine_optimal_clusters(dataset, cluster_range):
    # Preprocess data (scaling)
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(dataset)

    # Initialize variables to store silhouette scores
    silhouette_scores = []

    # Iterate over different number of clusters
    for n_clusters in cluster_range:
        # Apply K-means clustering
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        cluster_labels = kmeans.fit_predict(scaled_data)

        # Calculate silhouette score
        silhouette_avg = silhouette_score(scaled_data, cluster_labels)
        silhouette_scores.append(silhouette_avg)

    # Find the number of clusters with the highest silhouette score
    optimal_clusters = cluster_range[np.argmax(silhouette_scores)]

    return optimal_clusters




#full clustering sequence
def clustering(data_file,row_points,window,step,range_clusters=0,no_of_clusters=0,name_dataset="dataset.csv",folder="clusters"):

  #preprocessing
  create_dataset(data_file,row_points,window,step,name_dataset)

  print("preprocessing done !")

  # Load dataset from CSV
  dataset = pd.read_csv(name_dataset)

  # Convert string representation of points to lists of floats
  dataset['points'] = dataset['points'].apply(eval)

  # Convert points to numpy array
  points_array = np.array(dataset['points'].tolist())

  print("load points done !")


  if no_of_clusters == 0 :

    # Load dataset from CSV
    dataset = pd.read_csv(name_dataset)

    # Convert string representation of points to lists of floats
    dataset['points'] = dataset['points'].apply(eval)



    # Convert points to numpy array
    points_array = np.array(dataset['points'].tolist())



    # Define range of cluster numbers to test
    cluster_range = range_clusters

    # Determine the optimal number of clusters using the Silhouette Method
    optimal_clusters = determine_optimal_clusters(points_array, cluster_range)

    print("Optimal number of clusters:", optimal_clusters)



    # Define the number of clusters
    n_clusters = int(optimal_clusters)


  elif no_of_clusters != 0 :
    print("start clustering to the chosen clusters number")
    n_clusters = no_of_clusters

  else:
    print("error please choose a range or a number of cluster to start clustering !")

  print("start clustering")

  # Perform clustering and save clusters
  cluster_and_save(points_array, n_clusters,folder)

  print("clustering done !")


#to plot clusters and visualize patterns
def plot_clusters(cluster_folder='clusters',points=5):
    # Get list of cluster files
    cluster_files = [file for file in os.listdir(cluster_folder) if file.startswith('cluster_')]

    # Plot each cluster separately
    for file in cluster_files:
        # Read cluster data from CSV
        cluster_data = pd.read_csv(os.path.join(cluster_folder, file))

        # Plot cluster data
        plt.figure(figsize=(10, points+1))
        for i in range(len(cluster_data)):
            plt.plot(cluster_data.iloc[i].values, label=f'Record {i+1}')  # Plot each record separately

        plt.xlabel('Feature Index')
        plt.ylabel('Feature Value')
        plt.title(f'Cluster {file[:-4]} Records Overlay Plot')

        plt.grid(True)
        plt.show()


#to save plots

def save_cluster_plots(cluster_folder='clusters', points=5, save_folder='cluster_plots'):



    #clear old images
    clear_clusters(save_folder)


    if not os.path.isdir(save_folder):
        os.makedirs(save_folder)

    # Get list of cluster files
    cluster_files = [file for file in os.listdir(cluster_folder) if file.startswith('cluster_')]

    # Plot each cluster separately
    for file in cluster_files:
        # Read cluster data from CSV
        cluster_data = pd.read_csv(os.path.join(cluster_folder, file))

        # Plot cluster data
        plt.figure(figsize=(10, points+1))
        for i in range(len(cluster_data)):
            plt.plot(cluster_data.iloc[i].values, label=f'Record {i+1}')  # Plot each record separately

        plt.xlabel('Feature Index')
        plt.ylabel('Feature Value')
        plt.title(f'Cluster {file[:-4]} Records Overlay Plot')

        plt.grid(True)

        plt.savefig(os.path.join(save_folder, f'cluster_{file[:-4]}.png'))  # Save plot as PNG file
        plt.close()




def load_clusters(folder_path, no_of_clusters):
    """Load cluster data from CSV files in the specified folder."""
    cluster_data = []
    # Get a list of files in the folder
    files = os.listdir(folder_path)
    # Iterate over the files
    for file_name in files:
        # Check if the file is a CSV file and matches the cluster naming convention
        if file_name.endswith('.csv') and file_name.startswith('cluster_'):
            # Extract the cluster number from the file name
            cluster_number = int(file_name.split('_')[1].split('.')[0])
            # Check if the cluster number is within the specified range
            if cluster_number <= no_of_clusters:
                file_path = os.path.join(folder_path, file_name)
                cluster_df = pd.read_csv(file_path)
                cluster_data.append(cluster_df)
    return cluster_data



#count clusters

def count_cluster(folder_path="clusters"):
    if not os.path.isdir(folder_path):
        return "Invalid folder path"

    file_count = 0
    for file in os.listdir(folder_path):
        if file.startswith("cluster_") and file.endswith(".csv"):
            file_count += 1


    print(f'number of cluster = {file_count}')
    return file_count


#delete the empty clusters

def delete_empty(folder_path="clusters"):
    if not os.path.isdir(folder_path):
        return "Invalid folder path"

    files_to_rename = []
    for file in os.listdir(folder_path):
        if file.endswith(".csv"):
            file_path = os.path.join(folder_path, file)
            with open(file_path, 'r') as csv_file:
                csv_reader = csv.reader(csv_file)
                num_records = sum(1 for _ in csv_reader)  # Count the number of records
                if num_records < 2:
                    os.remove(file_path)
                    print(f"Deleted {file} due to less than 2 records.")
                else:
                    files_to_rename.append(file)


#to organize if needed

def organize(folder_path="clusters"):
  data = os.path.abspath(folder_path)
  for i, f in enumerate(os.listdir(data)):
      src = os.path.join(data, f)
      dst = os.path.join(data, (f"cluster_{str(i + 1)}.csv"))
      os.rename(src, dst)







#to delete empty

def clear_clusters(folder_path="clusters"):
    if not os.path.isdir(folder_path):
        print("Folder does not exist.")
        return

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)  # Delete file
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)  # Delete directory and its contents
        except Exception as e:
            print(f"Failed to delete {file_path}. Reason: {e}")





#to download plots directly

def download_plots(folder="cluster_plots"):
  files.download(f"{folder}")



#to get elements list in some folder

def list_files_in_folder(folder_path):
    files_list = []
    for file in os.listdir(folder_path):
        if os.path.isfile(os.path.join(folder_path, file)):
            files_list.append(file)
    return files_list





#second stage clustering
def secondary_clusering (main_clusters_folder="clusters",folder_name= "secondary clusters",clusters_range=range(1,30) ):

  main_clusters = list_files_in_folder(main_clusters_folder)

  for i in main_clusters :


    data = i


    n_clusters = determine_optimal_clusters(data,clusters_range)

    print(f"number of clusters : {n_clusters} for data : {data}")

    #clustering
    cluster_and_save(data,n_clusters,folder_name)

    print(f"clustering done for : {data}")

    #cleaning empty clusters
    delete_empty(folder_name)


    #get the number of clusters after cleaning
    clusters = count_cluster(folder_name)





#dtermine clusters number for second stage
def determine_optimal_clusters2(dataset, cluster_range):
    """
    Determine the optimal number of clusters using silhouette score.

    Parameters:
        dataset (DataFrame): Input data for clustering.
        cluster_range (range): Range of number of clusters to consider.

    Returns:
        int: Optimal number of clusters.
    """
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(dataset)
    silhouette_scores = []

    for n_clusters in cluster_range:
        if n_clusters < len(dataset):
            kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
            cluster_labels = kmeans.fit_predict(scaled_data)
            silhouette_avg = silhouette_score(scaled_data, cluster_labels)
            silhouette_scores.append(silhouette_avg)
        else:
            silhouette_scores.append(-1)  # Flag indicating the number of clusters is too large

    optimal_clusters = cluster_range[np.argmax(silhouette_scores)]
    return optimal_clusters if optimal_clusters < len(dataset) else len(dataset) - 1


#saving clusters in second stage
def cluster_and_save2(dataset, n_clusters, folder="clusters", file_name="",row_points=5):
    """
    Cluster the data and save clusters to CSV files.

    Parameters:
        dataset (DataFrame): Input data for clustering.
        n_clusters (int): Number of clusters.
        folder (str): Folder path to save the cluster CSV files.
        file_name (str): Name of the input file.
    """
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(dataset)
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(scaled_data)

    if not os.path.exists(folder):
        os.makedirs(folder)

    dataset_df = pd.DataFrame(dataset)

    for cluster_num in range(n_clusters):
        cluster_data = dataset_df[cluster_labels == cluster_num]
        cluster_data_list = cluster_data.values.tolist()
        file_name_out = f"{folder}/{os.path.basename(file_name)}_cluster_{cluster_num + 1}.csv"
        with open(file_name_out, 'w') as file:
            for row in cluster_data_list:
                file.write(','.join(map(str, row)) + '\n')
        print(f"Cluster {cluster_num + 1} saved to: {file_name_out}")


#second stage clustering
def secondary_clustering(main_clusters_folder="clusters", secondary_clusters_folder="secondary_clusters", clusters_range=range(1, 30),row_points=5):
    """
    Perform secondary clustering on files in a folder.

    Parameters:
        main_clusters_folder (str): Path to the folder containing main cluster files.
        secondary_clusters_folder (str): Path to the folder to save secondary cluster files.
        clusters_range (range): Range of number of clusters to consider.
    """
    main_clusters = os.listdir(main_clusters_folder)

    for file_name in main_clusters:
        file_path = os.path.join(main_clusters_folder, file_name)
        data = pd.read_csv(file_path, header=None)


        length_data = len(data)


        print(f"Data shape for file {file_name}: {data.shape}")
        print(f"Data content for file {file_name}:\n{data.head()}")




        # dynamic range
        if length_data > 500 :
          print("data length is big , dynamic range activated")
          clusters_range = range(clusters_range.start+5,clusters_range.stop+10)
        elif length_data > 1500 :
          clusters_range = range(clusters_range.start+10,clusters_range.stop+20)
        elif length_data > 2500 :
          clusters_range = range(clusters_range.start+20,clusters_range.stop+20)
        elif length_data > 3500 :
          clusters_range = range(clusters_range.start+30,clusters_range.stop+30)

        print(f"clusters number range : {clusters_range}")


        n_clusters = determine_optimal_clusters2(data, clusters_range)
        print(f"Optimal number of clusters: {n_clusters} for data: {file_name}")



        if n_clusters < data.shape[0]:
            cluster_and_save2(data, n_clusters, secondary_clusters_folder, file_name,row_points)
            print(f"Clustering done for: {file_name}")
        else:
            print(f"Error: Number of clusters is equal to or greater than the number of samples.")


#combine several clusters in dataset
def combine_secondary_clusters(folder_path, output_file):
    """
    Combine secondary cluster CSV files into a single dataframe.

    Parameters:
        folder_path (str): Path to the folder containing secondary cluster CSV files.
        output_file (str): Name of the output CSV file.
    """
    # List to hold dataframes
    dfs = []

    # Iterate over each file in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.csv'):

            # Extract cluster numbers from the file name using regular expressions
            numbers = re.findall(r'\d+', filename)
            cluster_number = '_'.join(numbers)

            # Read CSV file into a dataframe
            df = pd.read_csv(os.path.join(folder_path, filename), header=None)

            # Add a new column to identify the cluster
            df['cluster'] = cluster_number

            # Append the dataframe to the list
            dfs.append(df)

        else:

          print(f"error in file {filename}")


    # Concatenate all dataframes into a single dataframe
    combined_df = pd.concat(dfs, ignore_index=True)

    # Reorder columns to have the Cluster column as the last column
    cols = combined_df.columns.tolist()
    cols.remove('cluster')
    cols.append('cluster')
    combined_df = combined_df[cols]

    # Save the combined dataframe to a new CSV file
    combined_df.to_csv(output_file, index=False)

    # Print the first few rows of the combined dataframe
    #print(combined_df.head())



#detection using algorithm
#calculate clusters

def calculate_unique_clusters(data_file):
    # Load the data from CSV
    data = pd.read_csv(data_file)

    # Extract clusters
    clusters = data['cluster'].values

    # Get the unique clusters present in the data
    unique_clusters = np.unique(clusters)

    return len(unique_clusters)


#full clustering
def pattern_clustering (data_file,row_points,window_size,step_size,range_main,range_secondary):

  #clear old clusters
  clear_clusters("clusters")

  #clustering
  clustering(data_file ,row_points ,window_size ,step_size ,range_main )

  #cleaning empty clusters
  delete_empty("clusters")


  #get the number of clusters after cleaning
  num_main_clusters = count_cluster("clusters")


  #clear old clusters
  clear_clusters("secondary_clusters")

  #making secondary clusters:
  secondary_clustering("clusters", "secondary_clusters", range_secondary,row_points)


  #cleaning empty clusters
  delete_empty("secondary_clusters")



  #get the number of clusters after cleaning
  num_secondary_clusters = count_cluster("secondary_clusters")


  print(f"""

  number of main cluster : {num_main_clusters} \n
  number of secondary cluster : {num_secondary_clusters}


  """)

  return num_secondary_clusters




#to delete files
def delete(file_path):

  # Check if the file exists before deleting
  if os.path.exists(file_path):
      # Delete the file
      os.remove(file_path)
      print(f"File {file_path} deleted successfully.")
  else:
      print(f"File {file_path} does not exist.")



#for analysing clusters behaviors
def create_future_dataset (crypto_data_file,finder,num):

  # Load crypto data
  df = pd.read_csv(crypto_data_file)
  data = df["close"]



  # Load or initialize last index processed
  try:
      with open('last_index.txt', 'r') as file:
          last_index = int(file.read())
  except FileNotFoundError:
      last_index = 24  # Start from the beginning if last_index.txt does not exist



  # Initialize or load result DataFrame
  try:
      result = pd.read_csv('future_result.csv')
  except FileNotFoundError:
      result = pd.DataFrame()


  long = range(last_index, len(data))

  # Start processing from the last index
  for i in long:
      # Window
      window = data[i - 24:i]
      window = np.array(window).tolist()


      # Next data
      next_data = data[i:i + 25]
      next_data = np.array(next_data).tolist()




      # Get cluster
      cluster = finder.find_cluster_for_record(window)




      # New record to the dataframe
      new_records = {
          'window': [window],
          'cluster': [cluster],
          'next data': [next_data]
      }



      result = pd.concat([result, pd.DataFrame(new_records)], ignore_index=True)

      # Save last index processed
      with open('last_index.txt', 'w') as file:
          file.write(str(i))

      # Save progress after processing each record
      result.to_csv('future_result.csv', index=False)

      # Print loading bar
      #print_loading_bar(i + 1, len(long),103)


  delete('last_index.txt')
  print("Processing complete, future result created")

  return result


#to create analysis data frame
def create_clusters_analysis(future_dataset,pom=-1,window=24):

    #the point of measuring
    if pom == -1 :
      change = pom
    elif pom > window :
      chnage = -1
    else :
      change = pom - 1

    # Read the CSV file into a DataFrame
    if isinstance(future_dataset, pd.DataFrame):
        df = future_dataset
    else:
        df = pd.read_csv(future_dataset)

    # Initialize dictionaries to store calculations for each cluster
    cluster_analysis = {}

    # Iterate over unique clusters and calculate analysis
    for cluster_id, group in df.groupby('cluster'):
        window_changes = []
        next_data_changes = []

        # Iterate over rows in the group
        for index, row in group.iterrows():
            # Convert string representations of lists to actual lists
            window = ast.literal_eval(row['window'])
            next_data = ast.literal_eval(row['next data'])

            # Check if window and next_data arrays are not empty
            if len(window) > 0 and len(next_data) > 0:


                if len(window) > change:
                  # Calculate changes in window records
                  window_change = window[change] - window[0]
                  window_changes.append(window_change)
                else :
                  # Calculate changes in window records
                  window_change = window[-1] - window[0]
                  window_changes.append(window_change)


                if len(next_data) > change:
                  # Calculate changes in next data
                  next_data_change = next_data[change] - next_data[0]
                  next_data_changes.append(next_data_change)
                else :
                  # Calculate changes in next data
                  next_data_change = next_data[-1] - next_data[0]
                  next_data_changes.append(next_data_change)






        if len(window_changes) > 0 and len(next_data_changes) > 0:
            # Calculate required statistics for window records changes
            avg_window_change = np.mean(window_changes)
            pos_window_percentage = (sum(1 for change in window_changes if change > 0) / len(window_changes)) * 100
            neg_window_percentage = (sum(1 for change in window_changes if change < 0) / len(window_changes)) * 100
            max_window_change = max(window_changes)
            min_window_change = min(window_changes)

            # Calculate required statistics for next data changes
            avg_next_data_change = np.mean(next_data_changes)
            pos_next_data_percentage = (sum(1 for change in next_data_changes if change > 0) / len(next_data_changes)) * 100
            neg_next_data_percentage = (sum(1 for change in next_data_changes if change < 0) / len(next_data_changes)) * 100
            max_next_data_change = max(next_data_changes)
            min_next_data_change = min(next_data_changes)

            # Calculate percentage of appearance of the cluster
            appearance_percentage = (len(group) / len(df)) * 100

            # Store analysis in dictionary
            cluster_analysis[cluster_id] = {
                'Average Change in Window': avg_window_change,
                'Average Change in Next Data': avg_next_data_change,
                'Positive Change Percentage in Window': pos_window_percentage,
                'Positive Change Percentage in Next Data': pos_next_data_percentage,
                'Negative Change Percentage in Window': neg_window_percentage,
                'Negative Change Percentage in Next Data': neg_next_data_percentage,
                'Max Change in Window': max_window_change,
                'Max Change in Next Data': max_next_data_change,
                'Min Change in Window': min_window_change,
                'Min Change in Next Data': min_next_data_change,
                'Appearance Percentage': appearance_percentage
            }

    # Convert the dictionary to a DataFrame
    df_analysis = pd.DataFrame.from_dict(cluster_analysis, orient='index')

    # Save DataFrame to CSV
    df_analysis.to_csv('clusters_analysis.csv', index_label='cluster')

    print("Clusters analysis created successfully")

    return df_analysis



#to get behaiors data frame
def get_clusters_behavior(threshold,average, analysis_file):

     # Read the CSV file into a DataFrame
    if isinstance(analysis_file, pd.DataFrame):
        df = analysis_file
    else:
        df = pd.read_csv(analysis_file, index_col='cluster')


    # Initialize lists to store cluster behaviors
    window_behavior = []
    next_data_behavior = []

    # Iterate over each row in the DataFrame
    for index, row in df.iterrows():


        # Determine behavior in window based on threshold
        if row['Positive Change Percentage in Window'] > threshold and row['Average Change in Window'] > average:
            window_behavior.append('Up Trend')
        elif row['Negative Change Percentage in Window'] > threshold and row['Average Change in Window'] < (- average) :
            window_behavior.append('Down Trend')
        else:
            window_behavior.append('Mixed Trend')

        # Determine behavior in next data based on threshold
        if row['Positive Change Percentage in Next Data'] > threshold and row['Average Change in Next Data'] > average:
            next_data_behavior.append('Up Trend')
        elif row['Negative Change Percentage in Next Data'] > threshold and row['Average Change in Next Data'] < (- average) :
            next_data_behavior.append('Down Trend')
        else:
            next_data_behavior.append('Mixed Trend')

    # Create a new DataFrame for cluster behavior
    behavior_df = pd.DataFrame({
        'Cluster': df.index,
        'Window Behavior': window_behavior,
        'Next Data Behavior': next_data_behavior,
        'Appearance Percentage': df['Appearance Percentage']
    })

    # Save the DataFrame to a CSV file
    behavior_df.to_csv('clusters_behavior.csv', index=False)

    return behavior_df



#group clusters
def cluster_groups_nextdata(cluster_behavior_file):

    # Read the CSV file into a DataFrame
    if isinstance(cluster_behavior_file, pd.DataFrame):
        cluster_behavior_df = cluster_behavior_file
    else:
        cluster_behavior_df = pd.read_csv(cluster_behavior_file)

    # Filter clusters based on behavior in next data
    positive_clusters_next = cluster_behavior_df[cluster_behavior_df['Next Data Behavior'] == 'Up Trend']['Cluster'].tolist()
    negative_clusters_next = cluster_behavior_df[cluster_behavior_df['Next Data Behavior'] == 'Down Trend']['Cluster'].tolist()

    return positive_clusters_next, negative_clusters_next

def cluster_groups_window(cluster_behavior_file):

    # Read the CSV file into a DataFrame
    if isinstance(cluster_behavior_file, pd.DataFrame):
        cluster_behavior_df = cluster_behavior_file
    else:
        cluster_behavior_df = pd.read_csv(cluster_behavior_file)

    # Filter clusters based on behavior in window
    positive_clusters_window = cluster_behavior_df[cluster_behavior_df['Window Behavior'] == 'Up Trend']['Cluster'].tolist()
    negative_clusters_window = cluster_behavior_df[cluster_behavior_df['Window Behavior'] == 'Down Trend']['Cluster'].tolist()

    return positive_clusters_window, negative_clusters_window


#get signal
def get_signal(window,positive_clusters,negative_clusters,finder):


  # Get cluster
  cluster = finder.find_cluster_for_record(window)

  if cluster in positive_clusters:
    decision = "buy"
  elif cluster in negative_clusters :
    decision = "sell"
  else :
    decision = "hold"

  return decision , cluster


#creating sequence
def create_sequence (crypto_data_file,clusters_dataset,row_points,finder):


  # Load crypto data
  df = pd.read_csv(crypto_data_file)
  data = df["close"]



  # Load or initialize last index processed
  try:
      with open('last_index_sequence.txt', 'r') as file:
          last_index = int(file.read())
  except FileNotFoundError:
      last_index = 24  # Start from the beginning if last_index.txt does not exist



  # Initialize or load result DataFrame
  try:
      result = pd.read_csv('sequence.csv')
  except FileNotFoundError:
      result = pd.DataFrame()


  # Get number of clusters
  num = calculate_unique_clusters(clusters_dataset)


  # Start processing from the last index
  for i in range(last_index, len(data)):
      # Window
      window = data[i - 24:i].values

      # Get cluster
      cluster = finder.find_cluster_for_record(window)

      # New record to the dataframe
      new_records = {
          'cluster': [cluster],
      }

      #print(cluster)

      result = pd.concat([result, pd.DataFrame(new_records)], ignore_index=True)

      # Save last index processed
      with open('last_index_sequence.txt', 'w') as file:
          file.write(str(i))

      # Save progress after processing each record
      result.to_csv('sequence.csv', index=False)


  delete('last_index_sequence.txt')
  print("Processing complete, sequence file created")

  return result



#behavior chart
def plot_behavior_chart(cluster_behavior_file):


    # Read the CSV file into a DataFrame
    if isinstance(cluster_behavior_file, str):
        cluster_behavior_df = pd.read_csv(cluster_behavior_file)
    else:
        cluster_behavior_df = cluster_behavior_file

    # Read the clusters behavior CSV file into a DataFrame
    cluster_behavior_df = pd.read_csv(cluster_behavior_file)

    # Separate window and next data behavior
    window_behavior_df = cluster_behavior_df[['Window Behavior', 'Appearance Percentage']]
    next_data_behavior_df = cluster_behavior_df[['Next Data Behavior', 'Appearance Percentage']]

    # Calculate the total appearance for each behavior in window and next data
    window_up_trend = window_behavior_df[window_behavior_df['Window Behavior'] == 'Up Trend']['Appearance Percentage'].sum()
    window_down_trend = window_behavior_df[window_behavior_df['Window Behavior'] == 'Down Trend']['Appearance Percentage'].sum()
    window_mixed_trend = window_behavior_df[window_behavior_df['Window Behavior'] == 'Mixed Trend']['Appearance Percentage'].sum()

    next_data_up_trend = next_data_behavior_df[next_data_behavior_df['Next Data Behavior'] == 'Up Trend']['Appearance Percentage'].sum()
    next_data_down_trend = next_data_behavior_df[next_data_behavior_df['Next Data Behavior'] == 'Down Trend']['Appearance Percentage'].sum()
    next_data_mixed_trend = next_data_behavior_df[next_data_behavior_df['Next Data Behavior'] == 'Mixed Trend']['Appearance Percentage'].sum()

    # Plot pie chart for window behavior
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    labels = ['Up Trend', 'Down Trend', 'Mixed Trend']
    sizes = [window_up_trend, window_down_trend, window_mixed_trend]
    plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140)
    plt.title('Window Behavior')

    # Plot pie chart for next data behavior
    plt.subplot(1, 2, 2)
    sizes = [next_data_up_trend, next_data_down_trend, next_data_mixed_trend]
    plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140)
    plt.title('Next Data Behavior')

    plt.tight_layout()
    plt.show()


#to delete unneeded files

def delete_files(items):
    for item in items:
        if os.path.exists(item):
            if os.path.isfile(item):
                os.remove(item)
                print(f"File '{item}' deleted successfully.")
            elif os.path.isdir(item):
                delete_files_in_directory(item)
                os.rmdir(item)
                print(f"Directory '{item}' deleted successfully.")
            else:
                print(f"'{item}' is neither a file nor a directory.")
        else:
            print(f"'{item}' does not exist.")

def delete_files_in_directory(directory):
    for root, dirs, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            os.remove(file_path)
            print(f"File '{file_path}' deleted successfully.")


#check folder

def folder_check(path):
  if os.path.exists(path):
      return True
  else:
      return False




#backup data

def create_backup_zip(file_list, zip_file_name):
    # Create a ZipFile object in write mode
    with zipfile.ZipFile(zip_file_name, 'w') as zipf:
        # Iterate through the list of files and directories
        for item in file_list:
            # Check if the item exists
            if os.path.exists(item):
                # If the item is a directory, add its contents recursively
                if os.path.isdir(item):
                    for foldername, subfolders, filenames in os.walk(item):
                        for filename in filenames:
                            file_path = os.path.join(foldername, filename)
                            zipf.write(file_path, os.path.relpath(file_path, os.path.dirname(item)))
                else:
                    # If the item is a file, add it directly
                    zipf.write(item, os.path.basename(item))
            else:
                print(f"Warning: '{item}' not found.")

    print(f"Zip file '{zip_file_name}' created successfully.")



#extract sequence
def extract_sequence(data):
    # If data is a DataFrame, directly extract the 'cluster' column
    if isinstance(data, pd.DataFrame):
        return data['cluster'].tolist()
    # If data is a CSV file path, read the CSV file and then extract the 'cluster' column
    elif isinstance(data, str) and data.endswith('.csv'):
        df = pd.read_csv(data)
        return df['cluster'].tolist()
    else:
        raise ValueError("Input data must be a DataFrame or a CSV file path ending with '.csv'")


#info
def get_cluster_info(dataframe, cluster_name):


    # If data is a CSV file path, read the CSV file and then extract the 'cluster' column
    if isinstance(dataframe, str) and data.endswith('.csv'):
       dataframe = pd.read_csv(dataframe)

    # Search for the cluster name in the 'Cluster' column
    cluster_row = dataframe[dataframe['Cluster'] == cluster_name]

    # Check if the cluster exists
    if len(cluster_row) == 0:
        print(f"Cluster '{cluster_name}' not found.")
        return None

    # Extract information from the row
    window_behavior = cluster_row['Window Behavior'].iloc[0]
    next_data_behavior = cluster_row['Next Data Behavior'].iloc[0]
    appearance_percentage = cluster_row['Appearance Percentage'].iloc[0]

    return window_behavior, next_data_behavior, appearance_percentage




def search_cluster(dataframe, cluster_id):
    # Filter the DataFrame based on the cluster ID
    cluster_data = dataframe[dataframe.index.str.startswith(cluster_id)]

    # Return the columns of the cluster
    return cluster_data





def print_loading_bar(progress, total, bar_length=50):
    """
    Print a loading bar based on the progress and total steps.

    Parameters:
        progress (int): The current progress.
        total (int): The total steps.
        bar_length (int): The length of the loading bar.
    """
    percent = progress / total
    arrow = '>' * int(round(percent * bar_length) - 1)
    spaces = ' ' * (bar_length - len(arrow))
    sys.stdout.write(f'\r[{arrow + spaces}] {int(percent * 100)}%')
    sys.stdout.flush()

















#classes


#creating tree from sequence
class FBTree:
    def __init__(self):
        self.tree = {}

    def build_tree(self, sequence):

        if isinstance(sequence, list):
          print("sequence loaded")
        else:
          print("converting dataframe into sequence")

          try :
            sequence = pd.read_csv(sequence)
            sequence = sequence['cluster'].tolist()
          except:
            try :
              sequence = sequence['cluster'].tolist()
            except :
              print("error converting dataframe to sequence, pleasae check the datafram")


        for i in range(len(sequence) - 1):
            current = self.tree
            current = current.setdefault(sequence[i], {})
            current.setdefault(sequence[i+1], 0)
            current[sequence[i+1]] += 1

    def save_tree(self, filename):
        with open(filename, 'w') as file:
            json.dump(self.tree, file)

        return filename

    def load_tree(self, filename):
        with open(filename, 'r') as file:
            self.tree = json.load(file)

    def get_next_steps(self, data):
        if data not in self.tree:
            return {}
        current = self.tree[data]
        total_count = sum(current.values())
        next_steps = {key: count / total_count for key, count in current.items()}

        df = pd.DataFrame(list(next_steps.items()), columns=['name', 'percentage'])
        df['percentage'] *= 100
        return df

    def display_tree(self):
        self._display_tree_recursive(self.tree, '', '')

    def _display_tree_recursive(self, node, prefix, last_prefix):
        if isinstance(node, int):  # Handle case where node is an integer value
            return
        keys = list(node.keys())
        for i, key in enumerate(keys):
            is_last = i == len(keys) - 1
            print(prefix + ("└── " if is_last else "├── ") + str(key))
            new_prefix = last_prefix + ("    " if is_last else "│   ")
            new_last_prefix = last_prefix + ("    " if is_last else "│   ")
            self._display_tree_recursive(node[key], new_prefix, new_last_prefix)








class ClusterFinder:
    def __init__(self, data_file, num_clusters, save_file):
        self.data = pd.read_csv(data_file)
        self.features = self.data.iloc[:, :-1].values
        self.clusters = self.data['cluster'].values
        self.tree = KDTree(self.features)
        self.save_file = save_file

    def save_tree(self):
        with open(self.save_file, 'wb') as f:
            pickle.dump(self.tree, f)

    def load_tree(self,file_name):
        with open(file_name, 'rb') as f:
            self.tree = pickle.load(f)


    def find_cluster_for_record(self, record):

        #get points
        data = pips(record, row_points, 2)

        #find cluster
        _, indices = self.tree.query([data], k=1)
        closest_cluster = self.clusters[indices[0][0]]
        return closest_cluster

    def find_cluster_for_pips(self, record):

        _, indices = self.tree.query([record], k=1)
        closest_cluster = self.clusters[indices[0][0]]
        return closest_cluster








#main class

class cluster_analysis:

    def __init__(self,name,data_file,row_points,window_size,step_size,range_main,range_secondary,threshold = 70,average=0.5,pom=-1 ):
        #name
        self.name = name
        #variable
        self.data_file = data_file
        self.row_points = row_points
        self.window_size = window_size
        self.step_size = step_size
        self.range_main = range_main
        self.range_secondary = range_secondary
        self.threshold = threshold
        self.ave_threshold = average
        self.pom = pom
        #defult variables
        self.clusters_folder= "secondary_clusters"
        self.clusters_dataset = "clusters_dataset.csv"
        self.future_dataset = "future_result.csv"
        self.treefile = "clusters_tree.json"
        self.finder_file = f"{self.name}_cluster_finder.pkl"
        self.behavior_file = "clusters_behavior.csv"
        #variable init
        self.num = 0
        self.cluster_finder = ""
        self.future_dataset_data = ""
        self.clusters_analysis = ""
        self.clusters_behavior = ""
        self.clusters_sequence = []
        self.fb_tree = ""
        #output
        self.positive_clusters_next = []
        self.negative_clusters_next = []
        self.positive_clusters_window = []
        self.negative_clusters_window = []
        #time
        self.train_time = 0
        #backtest results
        self.backtest_results = 0

      # to save instance _________________________________________________
    def save(self):
        # Save the instance to a file
        with open(f"{self.name}.pkl", "wb") as file:
            pickle.dump(self, file)
        print(f"{self.name} saved successfully" )

    def load(self,obj_file):
        # Load the instance from the file
        with open(obj_file, "rb") as file:
            loaded_instance = pickle.load(file)
        self.__dict__.update(loaded_instance.__dict__)
        print(f"{obj_file} loaded successfully" )

    # core functions  _________________________________________________
    def train(self):
        #get time
        start_time = time.time()
        #to cluster data
        self.num = pattern_clustering(self.data_file,self.row_points,self.window_size,self.step_size,self.range_main,self.range_secondary)
        print("clutsering done successfully")
        # create the dataframe for training the model for secondary clusters:
        combine_secondary_clusters(self.clusters_folder, self.clusters_dataset)
        print("dataset created successfully")
        # Get number of clusters
        self.num = calculate_unique_clusters(self.clusters_dataset)
        # create instance from ClusterFinder
        self.cluster_finder = ClusterFinder(self.clusters_dataset, self.num, self.finder_file)
        print("cluster finder created successfully")
        # check the future results for every cluster
        print("backtest started")
        self.future_dataset_data = create_future_dataset(self.data_file,self.cluster_finder,self.num)
        print("backtesting done successfully")
        # create clusters analysis
        self.clusters_analysis = create_clusters_analysis(self.future_dataset,self.pom,self.window_size)
        print("backtest analysed successfully")
        #get cluster behavior
        self.clusters_behavior = get_clusters_behavior(self.threshold,self.ave_threshold, self.clusters_analysis)
        print("behaviors extracted successfully")
        #group clusters for strategy
        self.positive_clusters_next, self.negative_clusters_next = cluster_groups_nextdata(self.clusters_behavior)
        self.positive_clusters_window, self.negative_clusters_window = cluster_groups_window(self.clusters_behavior)
        print("clusters groups created successfully")
        #create clusters sequence
        self.clusters_sequence = extract_sequence(self.future_dataset_data)
        #creating the fb tree of clusters
        self.fb_tree = FBTree()
        self.fb_tree.build_tree(self.clusters_sequence)
        print("sequence fb tree created successfully")
        #print
        print("**********training done successfully**********")
        #get time
        end_time = time.time()
        self.train_time = end_time - start_time
        print(f"train time : {self.train_time/60} min")

    def get_signals(self,records):
        #to use into the backtest
        signal , current_cluster = get_signal(records,self.positive_clusters_next,self.negative_clusters_next,self.cluster_finder)
        #print results
        print(f"signal : {signal} , current cluster {current_cluster} ")
        #return
        return signal , current_cluster


    def get_info(self,cluster):
      #get cluster info
      info = search_cluster(self.clusters_analysis,cluster)
      return info


    def clean_files(self):
        #delete files
        file_list=["last_index.txt","clusters","clusters_analysis.csv" ,  "dataset.csv" , "future_result.csv" ,"clusters_dataset.csv","clusters_behavior.csv","secondary_clusters" ]
        delete_files(file_list)


    def plot_behavior(self):
        #plot behavior
        plot_behavior_chart(self.behavior_file)

    def save_cluster_finder(self):
        # Save the KDTree to a file
        self.cluster_finder.save_tree()
        #print
        print("finder saved succesfully")

    def load_cluster_finder(self,file_name):
        #to load cluster finder
        self.cluster_finder.load_tree(file_name)
        print(f"{file_name} loaded successfully" )

    def plot_patterns(self):
        #plot patterns
        plot_clusters(self.clusters_folder,self.row_points)


    def save_patterns_plots(self):
        #save plots
        save_cluster_plots(self.clusters_folder,self.row_points)
        print(f"plots saved successfully" )

    def continue_future_test(self):
        # check the future results for every cluster
        self.future_dataset_data = create_future_dataset(self.data_file,self.cluster_finder,self.num)
        # create clusters analysis
        self.clusters_analysis = create_clusters_analysis(self.future_dataset)
        #get cluster behavior
        self.clusters_behavior = get_clusters_behavior(self.threshold,self.ave_threshold, self.clusters_analysis)
        #group clusters for strategy
        self.positive_clusters_next, self.negative_clusters_next = cluster_groups_nextdata(self.clusters_behavior)
        self.positive_clusters_window, self.negative_clusters_window = cluster_groups_window(self.clusters_behavior)
        #create clusters sequence
        self.clusters_sequence = extract_sequence(self.future_dataset_data)
        #creating the fb tree of clusters
        self.fb_tree = FBTree()
        self.fb_tree.build_tree(self.clusters_sequence)
        print("sequence fb tree created successfully")
        #print
        print("**********training done successfully**********")


    def backup(self):
      file_list = ["last_index.txt", "clusters", "clusters_analysis.csv", "dataset.csv", "future_result.csv", "clusters_dataset.csv", "clusters_behavior.csv", "secondary_clusters"]
      zip_file_name = f"{self.name}_backup.zip"
      create_backup_zip(file_list, zip_file_name)




    #tree functions _________________________________________________
    def save_tree(self):
      self.fb_tree.save_tree(f"{self.name}_sequence_tree.json")
      print(f"{self.name}_sequence_tree saved successfully" )

    def load_tree(self,file_name):
      self.fb_tree.load_tree(file_name)
      print(f"{file_name}_sequence_tree loaded successfully" )

    def next_cluster(self,cluster):
      df = self.fb_tree.get_next_steps(cluster)
      df[['window_behavior', 'next_data_behavior', 'appearance_percentage']] = df['name'].apply(lambda x: pd.Series(get_cluster_info(self.clusters_behavior, x)))
      return df

    def display_tree(self):
      self.fb_tree.display_tree()


    #backtest functions _________________________________________________
    def set_new(self,ave,threshold):
        #setting variables
        self.threshold = threshold
        self.ave_threshold = ave
        #get cluster behavior
        self.clusters_behavior = get_clusters_behavior(self.threshold,self.ave_threshold, self.clusters_analysis)
        #group clusters for strategy
        self.positive_clusters_next, self.negative_clusters_next = cluster_groups_nextdata(self.clusters_behavior)
        self.positive_clusters_window, self.negative_clusters_window = cluster_groups_window(self.clusters_behavior)
        #create clusters sequence
        self.clusters_sequence = extract_sequence(self.future_dataset_data)
        #creating the fb tree of clusters
        self.fb_tree = FBTree()
        self.fb_tree.build_tree(self.clusters_sequence)
        #print
        print("**********changing variables done successfully**********")


















#documentation

alo alo alo ,,,, Ahmed Sleem here, just finished up version one of this module. It's April 9th, 2024, 9:52 am. Oh, and heads up, I haven't slept since yesterday at 10 am. So, there might be some hiccups to sort out in the next versions. But hey, we'll iron those out as we go.

Overview:
This module helps you analyze historical data for a particular currency, giving you insights into its clustering patterns. With this analysis, you can do a bunch of stuff—from making decisions to plotting patterns or understanding its behaviors through numbers. Just make sure to read the comments carefully!

In [None]:
#first you need to your analysis object instance

#the name of the object
name = "sol_analysis"
#the historical data of the currency
data_file = "sol.csv"
#the number of point that will be in every pattern after simplification (you can fine tune it, defult = 5)
row_points = 5
#the window size for each pattern before simplification (you can fine tune it, defult = 24)
window_size = 24
#the steps to skip between every pattern and the next one (you can fine tune it, defult = 10)
step_size = 5
#the object will detect the most suitable number of cluster inside the range you give here (you can fine tune it, defult = (25,45))
range_main = range(25,40)
#for every pattern the object will find internal variations inside it , this is the range of vaiation for every pattern and the object will change the range dynamicly and find the most suitable number of clusters insde the range
range_secondary = range(10,20)
#if the percentage of possitve trend after some pattern is greater than the threshold it will considerd up trend pattern and the opposite for down trend
threshold = 70
#the minimum average of up trend sizes of the clusters for up trend clusters and with negative for down trend ones
min_average = 1
#the point of measuring the change after the cluster detecter (length of trade)
pom = 10

#creating the instance
sol_analysis = cluster_analysis(name,data_file,row_points,window_size,step_size,range_main,range_secondary,threshold,min_average,pom)

In [None]:
#main functionalities :

#train the system
sol_analysis.train()

#if you stoped the training process in the backtesting phase (because it might need sometime to complete) you can continue from the points you stopped in
sol_analysis.continue_future_test()

#to plot the discovered patterns after training
sol_analysis.plot_patterns()

#to save patterns as images in plots folder
sol_analysis.save_patterns_plots()

#to plot the pie chart that represent the percentage of up trend patterns to down trend patterns to mixed trend patterns in the patterns discovered from the historical data
#it is important to predict the trading frequency depend on this analysis
sol_analysis.plot_behavior()

#to save the instance to load it later and dont have to start train it again
sol_analysis.save()

#to delete unwanted files after training and saving the instance you might need to delete unwanted files that created due to training
sol_analysis.clean_files()

#to get signal depend on the result of the analysis simply follow the this example
window = [
    107.57, 107.42, 107.76, 107.75, 107.42, 108.16, 108.09, 108.42, 108.85,
    109.39, 108.87, 108.58, 108.31, 108.07, 107.99, 107.72, 107.39, 107.96,
    107.69, 107.33, 107.1, 106.85, 106.65, 107.02
]
signal , cluster = sol_analysis.get_signals(window)

#get the analysis of certain cluster
info = sol_analysis.get_info(cluster)
#max up trend
max = float(info["Max Change in Next Data"].values[0])
#max down trend
min = float(info["Min Change in Next Data"].values[0])


#to know the future possiblities you should this function , depend on an internal fb tree inside the instance it should tells you the patterns that most likely to happen after the current one and what is its behaviors and the percentage of each of them to be the next one
possible_clusters = sol_analysis.next_cluster(cluster)
print(possible_clusters)

#to load any saved pretrained instances and work with it directly
sol_analysis.load("sol_analysis.pkl")


#backtest data
data_test = "sol test.csv"
#to backtest the strategy based on the result
sol_analysis.backtest(data_test)


#to change the average and threshold after training
new_threshold = 60
new_average = 0.5

sol_analysis.set_new(new_average , new_threshold)



In [None]:
#additional functionalities :

#to save only the finder object that can detect patterns without any other data
sol_analysis.save_cluster_finder()

#backup analysis files like analysis csvs that secure trianing process
sol_analysis.backup()


#to display the fb tree that represent the sequence of patterns that happen after each other
sol_analysis.display_tree()


#to save the fb tree
sol_analysis.save_tree()


#to load saved fb tree
sol_analysis.load_tree()


#to load saved finder file
sol_analysis.load_cluster_finder("sol_analysis_cluster_finder.pkl")


#step by step

In [None]:
#first you need to your analysis object instance

#the name of the object
name = "sol_analysis"
#the historical data of the currency
data_file = "sol.csv"
#the number of point that will be in every pattern after simplification (you can fine tune it, defult = 5)
row_points = 5
#the window size for each pattern before simplification (you can fine tune it, defult = 24)
window_size = 24
#the steps to skip between every pattern and the next one (you can fine tune it, defult = 10)
step_size = 5
#the object will detect the most suitable number of cluster inside the range you give here (you can fine tune it, defult = (25,45))
range_main = range(25,40)
#for every pattern the object will find internal variations inside it , this is the range of vaiation for every pattern and the object will change the range dynamicly and find the most suitable number of clusters insde the range
range_secondary = range(10,20)
#if the percentage of possitve trend after some pattern is greater than the threshold it will considerd up trend pattern and the opposite for down trend
threshold = 70
#the minimum average of up trend sizes of the clusters for up trend clusters and with negative for down trend ones
min_average = 1
#the point of measuring the change after the cluster detecter (length of trade)
pom = 10


#backtest data
data_test = "sol test.csv"

#creating the instance
sol_analysis = cluster_analysis(name,data_file,row_points,window_size,step_size,range_main,range_secondary,threshold,min_average,pom)

In [None]:
#to train the system
sol_analysis.train()

In [None]:
#if the train doesnt finished in one session you can contine from the point you stopped in
sol_analysis.continue_future_test()

In [None]:
#to backtest the strategy based on the result
sol_analysis.backtest(data_test)

In [None]:
#to plot clusters
sol_analysis.plot_patterns()

In [None]:
#to save patterns as images
sol_analysis.save_patterns_plots()

In [None]:
#to plot the pie chart
sol_analysis.plot_behavior()

In [None]:
#to change the average and threshold after training
new_threshold = 60
new_average = 0.5

sol_analysis.set_new(new_average , new_threshold)

In [None]:
#to save finder
sol_analysis.save_cluster_finder()

In [None]:
#to save analysis instance
sol_analysis.save()

In [None]:
#backup analysis files
sol_analysis.backup()

In [None]:
#to display tree
sol_analysis.display_tree()

In [None]:
#to save tree
sol_analysis.save_tree()

In [None]:
#to load tree
sol_analysis.load_tree()

In [None]:
#to delete unwanted files
sol_analysis.clean_files()

In [None]:
#to get signals
window = [
    107.57, 107.42, 107.76, 107.75, 107.42, 108.16, 108.09, 108.42, 108.85,
    109.39, 108.87, 108.58, 108.31, 108.07, 107.99, 107.72, 107.39, 107.96,
    107.69, 107.33, 107.1, 106.85, 106.65, 107.02
]
signal , cluster = sol_analysis.get_signals(window)

In [None]:
#get the analysis of certain cluster
info = sol_analysis.get_info(cluster)


#max up trend
take_profit = float(info["Max Change in Next Data"].values[0])
#max down trend
stop_loss = float(info["Min Change in Next Data"].values[0])

print(take_profit)
print(stop_loss)

9.099999999999994
-4.549999999999997


In [None]:
# to get next possible clusters
possible_clusters = sol_analysis.next_cluster(cluster)
print(possible_clusters)



In [None]:
#to load old analysis file
sol_analysis.load("sol_analysis.pkl")

In [None]:
#to load old finder file
sol_analysis.load_cluster_finder("sol_analysis_cluster_finder.pkl")

#run

In [None]:
# for experment

name = "sol_analysis"
data_file = "TRAIN.csv" #must have close with small c
data_test = "TEST.csv" #  must be a pandas.DataFrame with columns 'Open', 'High', 'Low', 'Close', and (optionally) 'Volume'


row_points = 5
window_size = 24
step_size = 5
range_main = range(45,55)
range_secondary = range(30,50)
threshold = 70
min_average = 0.1
pom = 24




#creating the instance
sol_analysis = cluster_analysis(name,data_file,row_points,window_size,step_size,range_main,range_secondary,threshold,min_average,pom)






sol_analysis.train()
sol_analysis.save()







In [None]:
sol_analysis.continue_future_test()

In [None]:
#to plot the pie chart
sol_analysis.plot_behavior()

In [None]:
#to plot clusters
sol_analysis.plot_patterns()

In [None]:
#to backup all model files
sol_analysis.backup()

In [None]:
#to clean the directory
sol_analysis.clean_files()

In [None]:
#to change the average and threshold after training
new_threshold = 60
new_average = 0.5

sol_analysis.set_new(new_average , new_threshold)

#backtesting

In [None]:
from backtesting import Backtest, Strategy
from backtesting.lib import crossover



class str(Strategy):

    window_len = 24

    def init(self):
        close = self.data.Close


    def next(self):

        #signal
        if len(self.data) >= self.window_len:
          window = np.array(self.data.Close[-self.window_len:])
          signal , cluster = sol_analysis.get_signals(window)

          if "buy" in signal:
              self.buy()
          elif "sell" in signal:
              self.sell()





data = pd.read_csv(data_test)

bt = Backtest(data, str,
              cash=1000, commission=.002,
              exclusive_orders=True)

output = bt.run()
bt.plot()
print(output)

In [None]:
#with stop loss and take profit
from backtesting import Backtest, Strategy
from backtesting.lib import crossover
from backtesting.lib import TrailingStrategy



class str(TrailingStrategy):

    window_len = 24

    trailing = 2
    tpr = 0.2

    def init(self):
        close = self.data.Close


    def next(self):

        #signal
        if len(self.data) >= self.window_len:
          window = np.array(self.data.Close[-self.window_len:])
          signal , cluster = sol_analysis.get_signals(window)

          self.set_trailing_sl((self.trailing))


          if "buy" in signal:

              self.tp = self.data.Close *(1+self.tpr/100)
              self.buy(tp = self.tp)
          elif "sell" in signal:
              self.sell()





data = pd.read_csv(data_test)

bt = Backtest(data, str,
              cash=1000, commission=.002,
              exclusive_orders=True)

output = bt.run()
bt.plot()
print(output)

#whats new

##ahmed sleem

11-april-2024 :
*   faster train
*   reduced code
*   get_info() function
*   time calculation (time of train)

15-april-2024 :
*   custimazed pom (point of measure)

16-april-2024:
*   add backtest
*   fix some issues

17-april-2024:
*   adding average threshold
*   fix sum bugs + fix backtest

18-april-2024:
*   exact pom calculation

19-april-2024:
*   fix long name problem
*   fix naming (soon)
*   calculate possibilities for clusters less than 2 records (soon)
*   fix some bugs in backtest

20-april-2024:
*   reduce training time by 20%
*   add set() to reduce the number of trains
*   adding stop loss and take profite features

















##omar karim

17-april-2024:
best values til now:
*   create our model with parameters = (
  row_points = 6
  window_size = 24
  step_size = 4
  range_main = range(40,50)
  range_secondary = range(20,40)
  threshold = 70
  min_average = 0.25
  pom = 5
  number of main cluster : 39
  number of secondary cluster : 281
)
*   create our model with results = (7.7 up, 6.3 down, 86 mixed)

18-april-2024:
*   multi-layered system (multible freqencies) -> current focus: short frequency
*   backtest and analysis (soon)

