In [3]:
import pandas as pd
import numpy as np

def load_sequences_from_csv(filename, sequence_length=80):
  # Load dataset
  data = pd.read_csv(filename)  # Replace with your dataset path

  # Fill missing values
  # data['province'] = data['province'].fillna('unknown')
  #data['city'] = data['city'].fillna('unknown')

  # Encode categorical variables
  categorical_columns = ['debit_credit']
  for col in categorical_columns:
      data[col] = LabelEncoder().fit_transform(data[col])

  # Normalize numerical variables
  scaler = MinMaxScaler()
  data['amount_cad'] = scaler.fit_transform(data[['amount_cad']])

  # Combine transaction_date and transaction_time into a single datetime feature
  if 'transaction_time' in data.columns:
    data['transaction_datetime'] = pd.to_datetime(data['transaction_date'] + ' ' + data['transaction_time'])
    data = data.sort_values(by=['customer_id', 'transaction_datetime'])
  else:
    data = data.sort_values(by=['customer_id', 'transaction_date'])

  # Drop unused columns
  columns_to_keep = ['customer_id', 'amount_cad', 'debit_credit']
  data = data[columns_to_keep]

  # Group by customer_id and create sequences
  grouped = data.groupby('customer_id')
  sequences = []
  for customer_id, group in grouped:
      group = group.drop(columns=['customer_id']).values
      for i in range(len(group) - sequence_length + 1):
          sequences.append(group[i:i+sequence_length])
  return sequences


In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

sequence_length = 75
sequences = load_sequences_from_csv("abm.csv", sequence_length)

# Train-test split
X_train, X_test = train_test_split(np.array(sequences), test_size=0.2, random_state=42)

# LSTM Autoencoder Model
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(64, activation='relu', input_shape=(sequence_length, 2), return_sequences=True),
    tf.keras.layers.LSTM(32, activation='relu', return_sequences=False),
    tf.keras.layers.RepeatVector(sequence_length),
    tf.keras.layers.LSTM(32, activation='relu', return_sequences=True),
    tf.keras.layers.LSTM(64, activation='relu', return_sequences=True),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(2))
])
model.compile(optimizer='adam', loss='mse')
model.summary()

# # Train the model
history = model.fit(X_train, X_train, epochs=20, batch_size=32, validation_split=0.2, shuffle=True)

# # Reconstruction errors
X_test_pred = model.predict(X_test)
test_loss = np.mean(np.power(X_test - X_test_pred, 2), axis=(1, 2))

# Anomaly detection threshold
threshold = np.percentile(test_loss, 95)  # Adjust as needed
anomalies = test_loss > threshold

print(f"Threshold: {threshold}")
print(f"Number of anomalies detected: {np.sum(anomalies)}")

# Suppose your model is defined as 'model'
model.save('model_lstm.h5')



  super().__init__(**kwargs)


Epoch 1/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 357ms/step - loss: 0.3963 - val_loss: 0.2745
Epoch 2/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 132ms/step - loss: 0.2139 - val_loss: 0.0888
Epoch 3/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 130ms/step - loss: 0.1008 - val_loss: 0.1312
Epoch 4/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 130ms/step - loss: 0.1260 - val_loss: 0.0948
Epoch 5/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 129ms/step - loss: 0.0879 - val_loss: 0.0927
Epoch 6/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 131ms/step - loss: 0.0883 - val_loss: 0.0813
Epoch 7/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 129ms/step - loss: 0.0814 - val_loss: 0.0800
Epoch 8/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 129ms/step - loss: 0.0811 - val_loss: 0.0798
Epoch 9/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3



Threshold: 0.12721845371860338
Number of anomalies detected: 4


In [42]:
import glob
from sklearn.model_selection import train_test_split

# Load all CSV files in the local directory
csv_files = ["cheque.csv", "eft.csv", "emt.csv","card.csv", "wire.csv"]

for file in csv_files:
  print(file)
  sequences = load_sequences_from_csv(file, sequence_length)

  # Train/test split
  X_train, X_test = train_test_split(np.array(sequences), test_size=0.2, random_state=42)

  # Train the model
  history = model.fit(X_train, X_train, epochs=20, batch_size=32, validation_split=0.2, shuffle=True)

  # Reconstruction errors
  X_test_pred = model.predict(X_test)
  test_loss = np.mean(np.power(X_test - X_test_pred, 2), axis=(1, 2))

  # Anomaly detection threshold
  threshold = np.percentile(test_loss, 95)  # Adjust as needed
  anomalies = test_loss > threshold

  model.save('model_lstm.h5')

cheque.csv
Epoch 1/20
[1m1737/1737[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 118ms/step - loss: nan - val_loss: nan
Epoch 2/20
[1m1737/1737[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 118ms/step - loss: nan - val_loss: nan
Epoch 3/20
[1m1737/1737[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 118ms/step - loss: nan - val_loss: nan
Epoch 4/20
[1m1737/1737[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 117ms/step - loss: nan - val_loss: nan
Epoch 5/20
[1m1737/1737[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 117ms/step - loss: nan - val_loss: nan
Epoch 6/20
[1m1737/1737[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 117ms/step - loss: nan - val_loss: nan
Epoch 7/20
[1m1737/1737[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 117ms/step - loss: nan - val_loss: nan
Epoch 8/20
[1m1737/1737[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 117ms/step - loss: nan - val_loss: nan
Epoch 9/20
[1m1737/1737[0m 



eft.csv
Epoch 1/20
[1m4865/4865[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m578s[0m 119ms/step - loss: nan - val_loss: nan
Epoch 2/20
[1m4865/4865[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m567s[0m 117ms/step - loss: nan - val_loss: nan
Epoch 3/20
[1m4865/4865[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m575s[0m 118ms/step - loss: nan - val_loss: nan
Epoch 4/20
[1m2168/4865[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m5:01[0m 112ms/step - loss: nanBuffered data was truncated after reaching the output size limit.