In [None]:
import pandas as pd

def load_sequences_from_csv(filename, sequence_length=10):
  """Load a transaction CSV and slice it into fixed-length per-customer windows.

  The file is read with pandas, `debit_credit` is label-encoded to integer
  codes, `amount_cad` is min-max scaled using the range found in this file,
  and rows are ordered by customer and then by time. For every customer a
  window of `sequence_length` consecutive rows is emitted at each possible
  start position (stride 1), so consecutive windows overlap.

  Note: this function uses `np`, `LabelEncoder` and `MinMaxScaler`, which must
  already be imported when it is called (the visible notebook imports them in
  the cell that calls this function, not in the cell that defines it).

  Args:
    filename: Path of the CSV file. It must contain the columns `customer_id`,
      `amount_cad`, `debit_credit` and `transaction_date`; `transaction_time`
      is optional.
    sequence_length: Number of consecutive transactions per window.

  Returns:
    Array of shape (n_windows, sequence_length, 2); the last axis holds
    (`amount_cad`, `debit_credit`). When no customer has at least
    `sequence_length` rows the result has shape (0, sequence_length, 2).

  Raises:
    ValueError: If `sequence_length` is smaller than 1.
  """
  if sequence_length < 1:
    raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")

  # Load dataset
  data = pd.read_csv(filename)

  # Encode categorical variables as integer codes
  categorical_columns = ['debit_credit']
  for col in categorical_columns:
      data[col] = LabelEncoder().fit_transform(data[col])

  # Normalize numerical variables; the scaler is fitted on this file only
  scaler = MinMaxScaler()
  data['amount_cad'] = scaler.fit_transform(data[['amount_cad']])

  # Order each customer's rows chronologically before windowing
  if 'transaction_time' in data.columns:
    # Combine transaction_date and transaction_time into a single datetime feature
    data['transaction_datetime'] = pd.to_datetime(data['transaction_date'] + ' ' + data['transaction_time'])
    data = data.sort_values(by=['customer_id', 'transaction_datetime'])
  else:
    # NOTE(review): transaction_date is sorted as read from the CSV; this is
    # chronological only if the column is ISO-formatted text or already a date type.
    data = data.sort_values(by=['customer_id', 'transaction_date'])

  # Keep only the grouping key and the model features
  columns_to_keep = ['customer_id', 'amount_cad', 'debit_credit']
  feature_columns = [col for col in columns_to_keep if col != 'customer_id']
  data = data[columns_to_keep]

  # Group by customer_id and create sliding windows within each customer
  sequences = []
  for _, group in data.groupby('customer_id'):
      values = group[feature_columns].values
      # Customers with fewer rows than sequence_length yield an empty range
      for i in range(len(values) - sequence_length + 1):
          sequences.append(values[i:i+sequence_length])

  if not sequences:
    # np.array([]) would have shape (0,); keep the result 3-D so callers can
    # still read shape[1] and shape[2].
    return np.empty((0, sequence_length, len(feature_columns)))
  return np.array(sequences)


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Number of timesteps per window; the model below is built for exactly this length.
sequence_length = 10
sequences = load_sequences_from_csv("abm.csv", sequence_length)

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable across kernel restarts.
tf.keras.utils.set_random_seed(42)

# Train-test split
# NOTE(review): the loader emits overlapping stride-1 windows and this split
# shuffles them, so test windows share timesteps with training windows.
X_train, X_test = train_test_split(sequences, test_size=0.2, random_state=42)

# Number of features per timestep (last axis of the window array)
n_features = X_train.shape[2]

# LSTM Autoencoder Model
# The input shape is declared with an explicit Input layer instead of the
# `input_shape=` layer argument, which Keras warns about.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(sequence_length, n_features)),
    # Encoder: compress the whole window into a single 32-dimensional vector
    tf.keras.layers.LSTM(64, activation='relu', return_sequences=True),
    tf.keras.layers.LSTM(32, activation='relu', return_sequences=False),
    # Repeat the encoding once per timestep so the decoder can unroll it
    tf.keras.layers.RepeatVector(sequence_length),
    # Decoder: reconstruct the window timestep by timestep
    tf.keras.layers.LSTM(32, activation='relu', return_sequences=True),
    tf.keras.layers.LSTM(64, activation='relu', return_sequences=True),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(n_features))
])
model.compile(optimizer='adam', loss='mse')
model.summary()

# Train the model to reproduce its own input (input and target are identical)
history = model.fit(X_train, X_train, epochs=20, batch_size=32, validation_split=0.2, shuffle=True)

# Reconstruction errors: mean squared error per window over timesteps and features
X_test_pred = model.predict(X_test)
test_loss = np.mean(np.power(X_test - X_test_pred, 2), axis=(1, 2))

# Anomaly detection threshold
# The threshold is the 95th percentile of the test errors themselves, so about
# 5% of the test windows are flagged by construction.
threshold = np.percentile(test_loss, 95)  # Adjust as needed
anomalies = test_loss > threshold

print(f"Threshold: {threshold}")
print(f"Number of anomalies detected: {np.sum(anomalies)}")


  super().__init__(**kwargs)


[[[7.04247683e-04 1.00000000e+00]
  [3.98995739e-03 1.00000000e+00]
  [9.93139829e-03 1.00000000e+00]
  ...
  [8.14482228e-03 1.00000000e+00]
  [1.00365350e-02 1.00000000e+00]
  [1.87675600e-02 0.00000000e+00]]

 [[7.67516793e-03 0.00000000e+00]
  [9.92438294e-04 1.00000000e+00]
  [2.08496226e-04 1.00000000e+00]
  ...
  [1.70903299e-03 1.00000000e+00]
  [5.95247839e-03 1.00000000e+00]
  [9.58932978e-03 1.00000000e+00]]

 [[9.83271569e-03 1.00000000e+00]
  [7.97130927e-04 1.00000000e+00]
  [4.94535463e-03 1.00000000e+00]
  ...
  [6.90310519e-03 1.00000000e+00]
  [4.94582232e-03 1.00000000e+00]
  [4.90242068e-03 1.00000000e+00]]

 ...

 [[1.89095507e-02 0.00000000e+00]
  [9.94945112e-03 1.00000000e+00]
  [4.03897130e-03 1.00000000e+00]
  ...
  [4.78727539e-04 1.00000000e+00]
  [4.84162097e-03 0.00000000e+00]
  [3.00135415e-03 1.00000000e+00]]

 [[9.66958540e-03 1.00000000e+00]
  [4.80027717e-03 1.00000000e+00]
  [3.84721838e-04 1.00000000e+00]
  ...
  [3.94262715e-04 1.00000000e+00]
  [2

In [None]:
# Continue training the existing autoencoder on the cheque transactions.
# sequence_length is passed explicitly so the windows always have the timestep
# count the model was built with, instead of relying on the loader's default.
# NOTE: this cell rebinds sequences, X_train, X_test, history, X_test_pred,
# test_loss, threshold and anomalies from the previous cell.
# NOTE(review): the loader fits its scaler per file, so amounts in this file
# are scaled independently of the file the model was first trained on.
sequences = load_sequences_from_csv("cheque.csv", sequence_length)
X_train, X_test = train_test_split(sequences, test_size=0.2, random_state=42)

# Train the model more (weights carry over from the previous fit)
history = model.fit(X_train, X_train, epochs=20, batch_size=32, validation_split=0.2, shuffle=True)

# Reconstruction errors: mean squared error per window over timesteps and features
X_test_pred = model.predict(X_test)
test_loss = np.mean(np.power(X_test - X_test_pred, 2), axis=(1, 2))

# Anomaly detection threshold
# The threshold is the 95th percentile of this test set's own errors, so about
# 5% of its windows are flagged by construction.
threshold = np.percentile(test_loss, 95)  # Adjust as needed
anomalies = test_loss > threshold

print(f"Threshold: {threshold}")
print(f"Number of anomalies detected: {np.sum(anomalies)}")


Epoch 1/20
[1m4086/4086[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 18ms/step - loss: 4.9758e-04 - val_loss: 4.6399e-04
Epoch 2/20
[1m4086/4086[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 18ms/step - loss: 9.5539e-04 - val_loss: 2.3122e-04
Epoch 3/20
[1m4086/4086[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 18ms/step - loss: 3.6410e-04 - val_loss: 1.6483e-04
Epoch 4/20
[1m4086/4086[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 18ms/step - loss: 6.1018e-04 - val_loss: 1.8335e-04
Epoch 5/20
[1m4086/4086[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 19ms/step - loss: 4.4134e-04 - val_loss: 0.0015
Epoch 6/20
[1m4086/4086[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 18ms/step - loss: 7.0897e-04 - val_loss: 1.4931e-04
Epoch 7/20
[1m4086/4086[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 18ms/step - loss: 2.0021e-04 - val_loss: 2.1210e-04
Epoch 8/20
[1m4086/4086[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 