In [174]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import bisect
from sklearn.preprocessing import *

# dist_measure
# 1 = Euclidean Distance
# 2 = Perpendicular Distance
# 3 = Vertical Distance
def find_pips(data: np.ndarray, n_pips: int, dist_measure: int) -> tuple:
    """Extract perceptually important points (PIPs) from a 1-D series.

    The first and last points are always selected. Every further PIP is the
    not-yet-selected point with the largest distance (as computed by
    ``distance``) to the two already-selected PIPs that bracket it.

    Args:
        data: Indexable sequence of values (e.g. closing prices). ``data[0]``
            and ``data[-1]`` are read unconditionally, so it must be non-empty;
            at least two points are needed for a meaningful result.
        n_pips: Desired number of PIPs. Values below 2 still yield the two
            end points.
        dist_measure: Metric selector forwarded to ``distance``
            (1 = Euclidean, 2 = perpendicular, 3 = vertical).

    Returns:
        ``(pips_x, pips_y)``: the selected indices in ascending order and the
        matching values of ``data``. Fewer than ``n_pips`` entries are
        returned when no unselected point is left to choose from.
    """
    pips_x = [0, len(data) - 1]   # Index
    pips_y = [data[0], data[-1]]  # Price

    for _ in range(2, n_pips):
        # Distances are non-negative, so -inf lets a point that sits exactly
        # on its segment (distance 0) still be chosen instead of falling
        # through to the "nothing found" sentinel.
        best_dist = float("-inf")
        best_index = -1
        insert_index = -1

        # Only points strictly between two neighbouring PIPs are candidates.
        # Scoring an existing PIP again could select it a second time, which
        # duplicates an index and leaves a zero-width segment for the next
        # slope computation in ``distance``.
        for left_adj in range(len(pips_x) - 1):
            right_adj = left_adj + 1
            for i in range(pips_x[left_adj] + 1, pips_x[right_adj]):
                d = distance(data, pips_x, pips_y, i, left_adj, right_adj, dist_measure)
                # Strict '>' keeps the earliest point on ties; a NaN distance
                # never compares greater and is therefore never selected.
                if d > best_dist:
                    best_dist = d
                    best_index = i
                    insert_index = right_adj

        if best_index < 0:
            # Every point is already a PIP (or none had a comparable
            # distance): stop rather than insert a bogus index.
            break

        # Insert the new pip between its two neighbours to keep pips_x sorted
        pips_x.insert(insert_index, best_index)
        pips_y.insert(insert_index, data[best_index])
    return pips_x, pips_y

# Define a helper function to calculate the distance
def distance(data, pips_x, pips_y, i, left_adj, right_adj, dist_measure):
    """Distance from the point ``(i, data[i])`` to a pair of neighbouring PIPs.

    Args:
        data: Indexable series of values.
        pips_x: Indices of the PIPs selected so far.
        pips_y: Values of the PIPs selected so far.
        i: Index of the point being scored.
        left_adj: Position in ``pips_x``/``pips_y`` of the left PIP.
        right_adj: Position in ``pips_x``/``pips_y`` of the right PIP.
        dist_measure: 1 = sum of the straight-line distances to both PIPs,
            2 = perpendicular distance to the line through both PIPs,
            3 = vertical distance to that line at ``x = i``.

    Returns:
        The distance under the requested measure.

    Raises:
        KeyError: If ``dist_measure`` is not one of 1, 2, 3.

    Note:
        The slope of the line through both PIPs is computed for every
        measure, so the two PIPs must not share the same x position.
    """
    x_right, x_left = pips_x[right_adj], pips_x[left_adj]
    y_right, y_left = pips_y[right_adj], pips_y[left_adj]

    # Line through the two PIPs: y = slope * x + intercept
    slope = (y_right - y_left) / (x_right - x_left)
    intercept = y_left - x_left * slope

    def euclidean(px, py):
        to_left = ((x_left - px) ** 2 + (y_left - py) ** 2) ** 0.5
        to_right = ((x_right - px) ** 2 + (y_right - py) ** 2) ** 0.5
        return to_left + to_right

    def perpendicular(px, py):
        return abs((slope * px + intercept) - py) / (slope ** 2 + 1) ** 0.5

    def vertical(px, py):
        return abs((slope * px + intercept) - py)

    metrics = {1: euclidean, 2: perpendicular, 3: vertical}
    # Resolve the metric before touching data[i], as a plain dict lookup would.
    metric = metrics[dist_measure]
    return metric(i, data[i])

In [176]:

# Load the raw candles and index them by their timestamp column.
data = pd.read_csv("/Users/newuser/Projects/robust_algo_trader/data/gen_oanda_data/GBP_USD_M15_raw_data.csv", parse_dates=['time'])
data = data.set_index('time')

# Keep 2007-01-01 through 2008-01-01 (label slicing includes the end date).
# Copy the slice so the column assignments below write to an independent
# frame instead of a slice of the full data set (avoids SettingWithCopyWarning).
data = data.loc['2007-01-01':'2008-01-01'].copy()

# Break the timestamp index out into calendar/time-of-day columns.
data['year'] = data.index.year
data['month'] = data.index.month
data['day'] = data.index.day
data['hour'] = data.index.hour
data['minute'] = data.index.minute
data

Unnamed: 0_level_0,open,high,low,close,volume,year,month,day,hour,minute
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2007-01-01 21:30:00+00:00,1.95840,1.95860,1.95840,1.95840,18,2007,1,1,21,30
2007-01-01 21:45:00+00:00,1.95840,1.95965,1.95830,1.95920,47,2007,1,1,21,45
2007-01-01 22:00:00+00:00,1.95910,1.96005,1.95890,1.95978,60,2007,1,1,22,0
2007-01-01 22:15:00+00:00,1.96003,1.96190,1.96003,1.96156,91,2007,1,1,22,15
2007-01-01 22:30:00+00:00,1.96138,1.96174,1.96083,1.96174,82,2007,1,1,22,30
...,...,...,...,...,...,...,...,...,...,...
2008-01-01 22:45:00+00:00,1.98707,1.98725,1.98661,1.98670,45,2008,1,1,22,45
2008-01-01 23:00:00+00:00,1.98670,1.98905,1.98670,1.98905,71,2008,1,1,23,0
2008-01-01 23:15:00+00:00,1.98868,1.98965,1.98868,1.98940,90,2008,1,1,23,15
2008-01-01 23:30:00+00:00,1.98940,1.98975,1.98846,1.98876,70,2008,1,1,23,30


In [None]:
n_close_points = 24   # number of consecutive closes in each window
n_perc_points = 4     # number of PIPs extracted per window
dist_measure = 1      # 1 = Euclidean (see find_pips)
pips_y_list = []

# Pull the close column out once; looking it up and slicing the Series inside
# the loop repeats the same work for every window.
close_prices = data['close'].to_numpy()

# Slide over the data: each window holds the n_close_points closes that
# precede row `index`.
# NOTE(review): the window ending at the very last row is never produced
# because `index` stops at len(data) - 1 — confirm this is intended.
for index in range(n_close_points, len(data)):
    x = close_prices[index-n_close_points:index]
    pips_x, pips_y = find_pips(x, n_perc_points, dist_measure)
    # Standardise each window's PIP values on their own.
    scaled_pips_y = StandardScaler().fit_transform(np.array(pips_y).reshape(-1, 1)).reshape(-1)
    pips_y_list.append(scaled_pips_y)
    # Debug plot for a single window (uncomment to inspect):
    # print(scaled_pips_y)
    # pd.Series(x).plot()
    # for i in range(n_perc_points):
    #     plt.plot(pips_x[i], pips_y[i], marker='o', color='red')
    # plt.plot(pips_x, pips_y, color='black')
    # plt.show()

# One row per window, one column per PIP.
pips_y_df = pd.DataFrame(pips_y_list, columns=[f'pip_{i}' for i in range(n_perc_points)])
pips_y_df_np = pips_y_df.to_numpy()

In [None]:
# Plot one histogram per pip column of the scaled values, 100 bins each
pips_y_df.hist(bins=100)

In [None]:
# Show the table of scaled pip values (one row per window)
pips_y_df

In [None]:
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d 
import numpy as np
from sklearn import datasets
from sklearn.cluster import KMeans

np.random.seed(5)

# Only the first 700 pip patterns are clustered and plotted here.
# X = pips_y_df_np
X = pips_y_df_np[:700]

estimators = [
    ("k_means_24", KMeans(n_clusters=24)),
    ("k_means_3", KMeans(n_clusters=3)),
    ("k_means_bad_init", KMeans(n_clusters=3, n_init=1, init="random")),
]

fig = plt.figure(figsize=(13, 10))
titles = ["24 clusters", "3 clusters", "3 clusters, bad init"]
for idx, ((name, est), title) in enumerate(zip(estimators, titles)):
    # Three panels laid out on a 2x2 grid.
    ax = fig.add_subplot(2, 2, idx + 1, projection="3d", elev=48, azim=134)
    est.fit(X)
    labels = est.labels_
    # Columns 3, 0 and 2 of X are pip_3, pip_0 and pip_2; colour by cluster.
    ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float), edgecolor="k")
    ax.xaxis.set_ticklabels([])
    ax.yaxis.set_ticklabels([])
    ax.zaxis.set_ticklabels([])
    # Axis labels name the columns that are actually plotted on each axis.
    ax.set_xlabel("pip_3")
    ax.set_ylabel("pip_0")
    ax.set_zlabel("pip_2")
    ax.set_title(title)

# plt.subplots_adjust(wspace=0.25, hspace=0.25)
plt.show()

In [None]:
# Re-seed NumPy's global RNG; KMeans is created without random_state below,
# so its initialisation draws from this global generator.
np.random.seed(5)

# Cluster every scaled pip pattern (the full array, not a subset) into 24 groups.
est = KMeans(n_clusters=24)
est.fit(pips_y_df_np)
# Cluster id assigned to each row of pips_y_df_np.
labels = est.labels_


In [None]:
# Number of windows (rows) in the pip table
len(pips_y_df)

In [None]:
# Attach the cluster ids in `labels` to pips_y_df as a new 'k_label' column.
# `labels` must hold one entry per row of pips_y_df.
# NOTE(review): this mutates pips_y_df in place, so any cell re-run after this
# one (e.g. the histogram or a fresh .to_numpy()) will also see 'k_label'.

pips_y_df['k_label'] = labels
pips_y_df