In [None]:
import pandas as pd

def load_sequences_from_csv(filename, sequence_length=80):
    """Load a transaction CSV and cut it into fixed-length per-customer windows.

    The CSV must contain the columns ``customer_id``, ``amount_cad``,
    ``debit_credit`` and ``transaction_date``; ``transaction_time`` is used
    for ordering when it is present.

    Processing steps:

    1. ``debit_credit`` is replaced by integer codes (``LabelEncoder``).
    2. ``amount_cad`` is rescaled with ``MinMaxScaler``.
    3. Rows are sorted by customer and then by time.
    4. For every customer, each run of ``sequence_length`` consecutive rows
       (stride 1, overlapping) becomes one window.

    Customers with fewer than ``sequence_length`` rows contribute no window.

    NOTE: the encoder and the scaler are fitted on the rows of this one file
    only, so values produced from two different files are not guaranteed to
    share the same coding or scale.

    NOTE: ``np``, ``LabelEncoder`` and ``MinMaxScaler`` are looked up as
    module-level names when the function is called; they must have been
    imported before the first call.

    Args:
        filename: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
        sequence_length: Number of consecutive transactions per window.
            Must be at least 1.

    Returns:
        ``numpy.ndarray`` of shape ``(n_windows, sequence_length, 2)`` whose
        last axis is ordered ``(amount_cad, debit_credit)``. When no customer
        has enough rows, an empty array of shape ``(0, sequence_length, 2)``
        is returned so callers can still read ``.shape[2]``.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError(
            f"sequence_length must be at least 1, got {sequence_length}"
        )

    data = pd.read_csv(filename)

    # Encode categorical variables as integer codes.
    categorical_columns = ['debit_credit']
    for col in categorical_columns:
        data[col] = LabelEncoder().fit_transform(data[col])

    # Rescale the amounts using the min/max observed in this file.
    scaler = MinMaxScaler()
    data['amount_cad'] = scaler.fit_transform(data[['amount_cad']])

    # Order each customer's rows chronologically. Prefer a combined
    # date+time stamp when a time column exists; otherwise fall back to the
    # date column alone.
    if 'transaction_time' in data.columns:
        data['transaction_datetime'] = pd.to_datetime(
            data['transaction_date'] + ' ' + data['transaction_time']
        )
        data = data.sort_values(by=['customer_id', 'transaction_datetime'])
    else:
        data = data.sort_values(by=['customer_id', 'transaction_date'])

    # Only the customer key and the two model features are needed from here.
    columns_to_keep = ['customer_id', 'amount_cad', 'debit_credit']
    feature_columns = [col for col in columns_to_keep if col != 'customer_id']
    data = data[columns_to_keep]

    # Slide a window of `sequence_length` rows over every customer's history.
    windows = []
    for _, group in data.groupby('customer_id'):
        values = group[feature_columns].values
        # Zero or negative for short histories, which yields an empty range.
        n_windows = len(values) - sequence_length + 1
        windows.extend(
            values[start:start + sequence_length] for start in range(n_windows)
        )

    if not windows:
        # np.array([]) would be 1-D; keep the 3-D contract for callers.
        return np.empty((0, sequence_length, len(feature_columns)))
    return np.array(windows)


In [ ]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Window length (number of consecutive transactions per training sample).
sequence_length = 75
sequences = load_sequences_from_csv("abm.csv", sequence_length)

# Train-test split (fixed seed so the split is reproducible)
X_train, X_test = train_test_split(sequences, test_size=0.2, random_state=42)

# Number of features per time step (size of the last axis of the windows).
n_features = X_train.shape[2]

# LSTM Autoencoder Model
# Encoder: two LSTM layers compress each window into a single 32-unit vector.
# Decoder: that vector is repeated once per time step and expanded back to
# the original feature width by two LSTM layers and a per-step Dense layer.
# The input shape is declared with an explicit Input layer instead of the
# `input_shape=` layer argument, which current Keras versions warn about.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(sequence_length, n_features)),
    tf.keras.layers.LSTM(64, activation='relu', return_sequences=True),
    tf.keras.layers.LSTM(32, activation='relu', return_sequences=False),
    tf.keras.layers.RepeatVector(sequence_length),
    tf.keras.layers.LSTM(32, activation='relu', return_sequences=True),
    tf.keras.layers.LSTM(64, activation='relu', return_sequences=True),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(n_features))
])
model.compile(optimizer='adam', loss='mse')
model.summary()

# Train the model to reproduce its own input (input and target are the same).
history = model.fit(X_train, X_train, epochs=20, batch_size=32, validation_split=0.2, shuffle=True)

# Reconstruction errors: mean squared error per window, averaged over both
# the time axis and the feature axis.
X_test_pred = model.predict(X_test)
test_loss = np.mean(np.power(X_test - X_test_pred, 2), axis=(1, 2))

# Anomaly detection threshold
# The cut-off is the 95th percentile of the test errors themselves, so
# roughly 5% of the test windows are flagged by construction. Adjust as needed.
threshold = np.percentile(test_loss, 95)
anomalies = test_loss > threshold

print(f"Threshold: {threshold}")
print(f"Number of anomalies detected: {np.sum(anomalies)}")

# Persist the trained model to an HDF5 file.
model.save('model_lstm.h5')



  super().__init__(**kwargs)


Epoch 1/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 407ms/step - loss: 0.4087 - val_loss: 0.3585
Epoch 2/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 160ms/step - loss: 0.3245 - val_loss: 0.1365
Epoch 3/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 157ms/step - loss: 0.1505 - val_loss: 0.1216
Epoch 4/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 155ms/step - loss: 0.1323 - val_loss: 0.1371
Epoch 5/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 151ms/step - loss: 0.1300 - val_loss: 0.0915
Epoch 6/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 158ms/step - loss: 0.0865 - val_loss: 0.0965
Epoch 7/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 158ms/step - loss: 0.0852 - val_loss: 0.0824
Epoch 8/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 159ms/step - loss: 0.0853 - val_loss: 0.0814
Epoch 9/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3



Threshold: 0.12796705262631766
Number of anomalies detected: 4


In [15]:
# Build windows from the cheque transactions and split them with the same
# ratio and seed as the previous dataset.
sequences = load_sequences_from_csv("cheque.csv", sequence_length)
X_train, X_test = train_test_split(
    sequences,
    test_size=0.2,
    random_state=42,
)

# Keep training the already-built model on the new windows
# (autoencoder setup: the input is also the target).
history = model.fit(
    X_train,
    X_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    shuffle=True,
)

# Per-window reconstruction error: squared difference averaged over the
# time-step and feature axes.
X_test_pred = model.predict(X_test)
test_loss = ((X_test - X_test_pred) ** 2).mean(axis=(1, 2))

# Flag every window whose error lies above the 95th percentile of the
# test errors (adjust the percentile as needed).
threshold = np.percentile(test_loss, 95)
anomalies = np.greater(test_loss, threshold)

print(f"Threshold: {threshold}")
print(f"Number of anomalies detected: {np.sum(anomalies)}")


Epoch 1/20
[1m1737/1737[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m267s[0m 154ms/step - loss: 18161286.0000 - val_loss: 0.1121
Epoch 2/20
[1m1737/1737[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 147ms/step - loss: 0.1121 - val_loss: 0.1120
Epoch 3/20
[1m1737/1737[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m254s[0m 146ms/step - loss: 0.1120 - val_loss: 0.1119
Epoch 4/20
[1m1737/1737[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 150ms/step - loss: 0.1115 - val_loss: 0.1105
Epoch 5/20
[1m1737/1737[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m255s[0m 147ms/step - loss: 0.1103 - val_loss: 0.1097
Epoch 6/20
[1m1737/1737[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m252s[0m 145ms/step - loss: 0.1099 - val_loss: 0.1096
Epoch 7/20
[1m1737/1737[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m251s[0m 145ms/step - loss: 0.1096 - val_loss: 0.1093
Epoch 8/20
[1m1737/1737[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m257s[0m 148ms/step - loss: 0.1091 - va