In [52]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix


In [53]:
# Load the labeled data produced by your friend's algorithm
df = pd.read_csv("patterns.csv")

# Ensure date is a datetime (sorting is optional if you are sure it's already sorted)
df["date"] = pd.to_datetime(df["date"])
df = df.sort_values("date")  # harmless even if already sorted

# Clean up the pattern column (replace missing with 'no-pattern', ensure string type)
df["pattern"] = df["pattern"].fillna("no-pattern").astype(str)

# Just to understand the distribution of labels (not used for grouping)
pattern_counts = df["pattern"].value_counts()
print(pattern_counts)


pattern
no-pattern       11909
BearButterfly      910
BearShark          782
BullBat            734
BullButterfly      703
BearCrab           569
BullShark          544
BullGartley        430
BullCrab           310
BullCypher         254
BearGartley        197
BearCypher         131
BearBat             23
Name: count, dtype: int64


In [54]:
# Create a shifted version of pattern to compare with previous row
df["pattern_shift"] = df["pattern"].shift(1)

# Whenever pattern changes from previous row, we start a new group
df["new_group"] = (df["pattern"] != df["pattern_shift"]).astype(int)

# Cumulative sum of new_group gives us a unique group id
df["group_id"] = df["new_group"].cumsum()

# Let's see how many groups there are
num_groups = df["group_id"].nunique()
print("Number of groups (pattern segments):", num_groups)


Number of groups (pattern segments): 201


In [55]:
FEATURE_COLS = ["price", "close", "high", "low", "open", "r"]
SEQ_LEN = 64

X_sequences = []
y_labels = []

for gid, group in df.groupby("group_id"):
    label = group["pattern"].iloc[0]
    
    # Extract selected features
    feat = group[FEATURE_COLS].values  # shape: (group_len, num_features)

    # Handle missing r if there are any:
    # (optional) you can fill NaN in r with 0 before this loop:
    # df["r"] = df["r"].fillna(0)

    # Make fixed-length sequence
    if len(feat) >= SEQ_LEN:
        # take last 64 rows of this group
        seq = feat[-SEQ_LEN:]
    else:
        # pad at the beginning by repeating the first row
        pad_len = SEQ_LEN - len(feat)
        pad = np.repeat(feat[0:1, :], pad_len, axis=0)
        seq = np.concatenate([pad, feat], axis=0)

    X_sequences.append(seq)
    y_labels.append(label)

X = np.stack(X_sequences)  # shape: (num_groups, SEQ_LEN, num_features)
y = np.array(y_labels)

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (201, 64, 6)
y shape: (201,)


In [56]:
# 1. Drop "no-pattern" sequences
mask = y != "no-pattern"
X_hp = X[mask]      # harmonic pattern sequences only
y_hp = y[mask]      # corresponding labels

print("X_hp shape:", X_hp.shape)
print("y_hp shape:", y_hp.shape)

# 2. Label encode the harmonic pattern names
le = LabelEncoder()
y_encoded = le.fit_transform(y_hp)

print("Classes:", le.classes_)
print("Encoded labels example:", y_encoded[:10])
num_classes = len(le.classes_)


X_hp shape: (116, 64, 6)
y_hp shape: (116,)
Classes: ['BearBat' 'BearButterfly' 'BearCrab' 'BearCypher' 'BearGartley'
 'BearShark' 'BullBat' 'BullButterfly' 'BullCrab' 'BullCypher'
 'BullGartley' 'BullShark']
Encoded labels example: [ 7  2  2  2  7 10  3  8  7  4]


In [57]:
print("Label counts at SEQUENCE level:")
print(Counter(y_hp))

print("\nEncoded label counts at SEQUENCE level:")
print(Counter(y_encoded))


Label counts at SEQUENCE level:
Counter({'BullButterfly': 18, 'BearShark': 17, 'BullBat': 16, 'BearButterfly': 15, 'BullGartley': 10, 'BearCrab': 9, 'BullShark': 9, 'BullCrab': 8, 'BearGartley': 6, 'BearCypher': 4, 'BullCypher': 3, 'BearBat': 1})

Encoded label counts at SEQUENCE level:
Counter({7: 18, 5: 17, 6: 16, 1: 15, 10: 10, 2: 9, 11: 9, 8: 8, 4: 6, 3: 4, 9: 3, 0: 1})


In [58]:
# Since there is only 1 sequence of BearBat, we cannot use stratify this time. So we will skip it for now and use it when we try Oversampling
X_train, X_temp, y_train, y_temp = train_test_split(
    X_hp, y_encoded,
    test_size=0.3,
    random_state=42,
    shuffle=True
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=42,
    shuffle=True
)

print("Train:", X_train.shape, y_train.shape)
print("Val:  ", X_val.shape, y_val.shape)
print("Test: ", X_test.shape, y_test.shape)


Train: (81, 64, 6) (81,)
Val:   (17, 64, 6) (17,)
Test:  (18, 64, 6) (18,)


In [59]:
# Flatten sequences so RandomOverSampler can work: (samples, features_flat)
nsamples, seqlen, nfeat = X_train.shape
X_train_flat = X_train.reshape(nsamples, seqlen * nfeat)

ros = RandomOverSampler(random_state=42)
X_train_res_flat, y_train_res = ros.fit_resample(X_train_flat, y_train)

# Reshape back to (samples, timesteps, features)
X_train_res = X_train_res_flat.reshape(-1, seqlen, nfeat)

print("Original train shape:", X_train.shape, y_train.shape)
print("Resampled train shape:", X_train_res.shape, y_train_res.shape)


Original train shape: (81, 64, 6) (81,)
Resampled train shape: (165, 64, 6) (165,)


In [60]:
#standartization step

# X_train, X_val, X_test currently have shape: (samples, 64, 6)
num_features = X_train.shape[2]
SEQ_LEN = X_train.shape[1]

scaler = StandardScaler()

# Fit scaler on TRAIN ONLY (important! no data leakage)
X_train_2d = X_train.reshape(-1, num_features)  # (81*64, 6)
scaler.fit(X_train_2d)

# Transform train/val/test
def transform_with_scaler(X, scaler):
    s, t, f = X.shape  # samples, timesteps, features
    X_2d = X.reshape(-1, f)
    X_scaled_2d = scaler.transform(X_2d)
    return X_scaled_2d.reshape(s, t, f)

X_train_scaled = transform_with_scaler(X_train, scaler)
X_val_scaled   = transform_with_scaler(X_val, scaler)
X_test_scaled  = transform_with_scaler(X_test, scaler)


In [61]:
#Keeping the LSTM simple because our dataset is small

num_classes = len(le.classes_)   # should be 12

model = Sequential([
    Input(shape=(SEQ_LEN, num_features)),
    LSTM(64, return_sequences=False),
    Dropout(0.3),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(num_classes, activation="softmax")
])

model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()


In [62]:
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True
)

history = model.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=50,
    batch_size=16,
    callbacks=[early_stop],
    verbose=1
)


Epoch 1/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 62ms/step - accuracy: 0.0864 - loss: 2.5268 - val_accuracy: 0.1176 - val_loss: 2.4412
Epoch 2/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.1605 - loss: 2.4449 - val_accuracy: 0.2941 - val_loss: 2.4275
Epoch 3/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.1728 - loss: 2.4188 - val_accuracy: 0.2353 - val_loss: 2.4146
Epoch 4/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.1605 - loss: 2.3882 - val_accuracy: 0.2353 - val_loss: 2.4093
Epoch 5/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.2099 - loss: 2.3600 - val_accuracy: 0.2941 - val_loss: 2.4069
Epoch 6/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.2222 - loss: 2.3328 - val_accuracy: 0.2353 - val_loss: 2.3995
Epoch 7/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━

In [63]:
y_pred_probs = model.predict(X_test_scaled)
y_pred = np.argmax(y_pred_probs, axis=1)

labels = np.arange(num_classes)  # [0, 1, ..., 11]

print("Classification report:")
print(classification_report(
    y_test,
    y_pred,
    labels=labels,
    target_names=le.classes_,
    zero_division=0   # avoid warnings for classes with no samples
))

print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred, labels=labels))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step
Classification report:
               precision    recall  f1-score   support

      BearBat       0.00      0.00      0.00         1
BearButterfly       0.00      0.00      0.00         5
     BearCrab       0.00      0.00      0.00         2
   BearCypher       0.00      0.00      0.00         0
  BearGartley       0.00      0.00      0.00         0
    BearShark       0.10      0.50      0.17         2
      BullBat       0.00      0.00      0.00         0
BullButterfly       0.00      0.00      0.00         4
     BullCrab       0.00      0.00      0.00         2
   BullCypher       0.00      0.00      0.00         0
  BullGartley       0.00      0.00      0.00         2
    BullShark       0.00      0.00      0.00         0

     accuracy                           0.06        18
    macro avg       0.01      0.04      0.01        18
 weighted avg       0.01      0.06      0.02        18

Confusion matrix:
[[0