In [1]:
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
import numpy as np
import tensorflow as tf
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, RepeatVector, TimeDistributed
from tensorflow.keras.models import load_model
import yfinance as yf

In [2]:
# Folder that holds the per-ticker CSV files to evaluate.
# The default is the original author's local path; set the LSTM_GAN_DATASET_DIR
# environment variable to point the notebook at a different location without
# editing the code.
folder_path = os.environ.get(
    'LSTM_GAN_DATASET_DIR',
    r'C:\Users\AKHIL\OneDrive\Desktop\lstm_gan\dataset',
)

In [3]:
# Windowing helper used by train_model
def _create_sequences(X, y, time_steps):
    """Cut a frame into overlapping windows plus the value following each window.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns; window ``i`` is rows ``i .. i + time_steps - 1``.
    y : pandas.Series
        Target column; target ``i`` is the row at position ``i + time_steps``.
    time_steps : int
        Number of consecutive rows per window.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Windows of shape ``(n, time_steps, X.shape[1])`` and targets of shape
        ``(n,)`` where ``n = len(X) - time_steps``. Both are empty when the
        input has ``time_steps`` rows or fewer.
    """
    n_windows = len(X) - time_steps
    if n_windows <= 0:
        return np.empty((0, time_steps, X.shape[1])), np.empty((0,))

    values = X.to_numpy()
    windows = np.stack([values[i:i + time_steps] for i in range(n_windows)])
    targets = y.to_numpy()[time_steps:time_steps + n_windows]
    return windows, targets


# Function to preprocess one CSV file and score the saved model on it
def train_model(file_path, model_path='my_model_v2.h5', time_steps=30,
                threshold_scale=0.8, model=None):
    """Evaluate a saved Keras model as an anomaly detector on one CSV file.

    Despite its name, this function does not fit a network: it loads an
    already-saved model and only fits the ``StandardScaler`` used for
    preprocessing.

    Steps:
      1. Read ``Date``, ``Close`` and ``anomaly`` columns and sort by ``Date``.
      2. Split at the row 60% of the way through (rows with a ``Date`` up to
         and including that row's date form the train part).
      3. Standardise ``Close`` with statistics from the train part only.
      4. Build sliding windows of ``time_steps`` rows.
      5. Compute the mean absolute difference between the model output and its
         input window, and flag test windows whose error exceeds
         ``threshold_scale * max(train error)``.
      6. Compare those flags with the ``anomaly`` column and print/return the
         fraction that agree.

    Parameters
    ----------
    file_path : str
        CSV file with at least ``Date``, ``Close`` and ``anomaly`` columns.
    model_path : str, optional
        Saved model to load when ``model`` is not supplied.
    time_steps : int, optional
        Window length fed to the model (default 30).
    threshold_scale : float, optional
        Fraction of the maximum train error used as the anomaly threshold
        (default 0.8).
    model : optional
        An already-loaded model exposing ``predict``; pass it to avoid
        reloading from disk for every file.

    Returns
    -------
    float
        Fraction of evaluated test rows where the predicted flag equals the
        ground-truth ``anomaly`` value.

    Raises
    ------
    ValueError
        If the file is empty or either split has ``time_steps`` rows or fewer,
        so that no window can be built.
    """
    df = pd.read_csv(file_path)
    df = df[['Date', 'Close', 'anomaly']]  # 'anomaly' is treated as the ground-truth label
    # NOTE(review): 'Date' is sorted/compared as read from the CSV; if it is a
    # non-ISO string the order is lexicographic, not chronological - confirm format.
    df = df.sort_values(by='Date')

    if df.empty:
        raise ValueError(f"{file_path} contains no rows")

    # Date found 60% of the way through the sorted data; it marks the split.
    train_size = int(0.6 * len(df))
    dividing_date = df['Date'].iloc[train_size]

    # Explicit copies: the 'Close' column is overwritten below, and writing
    # into a slice of df is a chained assignment.
    train = df.loc[df['Date'] <= dividing_date].copy()
    test = df.loc[df['Date'] > dividing_date].copy()

    # Fit the scaler on train only so test statistics do not leak in.
    scaler = StandardScaler().fit(train[['Close']].to_numpy())
    train['Close'] = scaler.transform(train[['Close']].to_numpy()).ravel()
    test['Close'] = scaler.transform(test[['Close']].to_numpy()).ravel()

    X_train, _ = _create_sequences(train[['Close']], train['Close'], time_steps)
    X_test, _ = _create_sequences(test[['Close']], test['Close'], time_steps)
    if len(X_train) == 0 or len(X_test) == 0:
        raise ValueError(
            f"{file_path}: need more than {time_steps} rows in both the train "
            f"and test splits (got {len(train)} and {len(test)})"
        )

    # set seed to regenerate same sequence of random numbers.
    np.random.seed(21)
    tf.random.set_seed(21)
    if model is None:
        model = load_model(model_path)

    # Mean absolute error between model output and input, one value per window.
    # assumes model.predict returns an array shaped like its input
    # (n, time_steps, 1) - TODO confirm against the saved model.
    X_train_pred = model.predict(X_train)
    train_mae_loss = np.mean(np.abs(X_train_pred - X_train), axis=1).ravel()

    # Anomaly threshold: a fraction of the worst error seen on the train part.
    threshold = threshold_scale * np.max(train_mae_loss)

    X_test_pred = model.predict(X_test, verbose=1)
    test_mae_loss = np.mean(np.abs(X_test_pred - X_test), axis=1).ravel()

    # The first `time_steps` test rows have no complete window, so they are skipped.
    anomaly_df = test.iloc[time_steps:].copy()
    anomaly_df['loss'] = test_mae_loss
    anomaly_df['threshold'] = threshold
    anomaly_df['anomaly'] = anomaly_df['loss'] > anomaly_df['threshold']

    # Ground truth comes from the untouched `test` frame (aligned on index),
    # because the 'anomaly' column of anomaly_df now holds the predictions.
    anomaly_df['ground_truth'] = test['anomaly']

    # Share of rows where prediction and label agree
    accuracy = np.mean(anomaly_df['anomaly'] == anomaly_df['ground_truth'])
    print(f"Accuracy for {file_path}: {accuracy * 100:.2f}%")

    return accuracy


In [6]:
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

def evaluate_models(folder_path):
    """Score every CSV file in a folder and report the mean accuracy.

    Each ``*.csv`` file (extension matched case-insensitively) directly inside
    ``folder_path`` is passed to ``train_model``; the per-file accuracies are
    averaged with equal weight per file.

    Parameters
    ----------
    folder_path : str
        Directory containing the CSV files.

    Returns
    -------
    float or None
        Mean of the per-file accuracies, or ``None`` when the folder holds no
        CSV files (a message is printed instead of dividing by zero).
    """
    # os.listdir gives no ordering guarantee; sort so runs are reproducible.
    file_list = sorted(
        f for f in os.listdir(folder_path) if f.lower().endswith('.csv')
    )

    if not file_list:
        print(f"No CSV files found in {folder_path}")
        return None

    accuracies = []
    for file in file_list:
        file_path = os.path.join(folder_path, file)
        print(f"\nProcessing file: {file}")
        accuracies.append(train_model(file_path))

    overall_accuracy = sum(accuracies) / len(accuracies)
    print(f"\nOverall Accuracy: {overall_accuracy * 100:.2f}%")
    return overall_accuracy

# Example usage: evaluate every CSV in the dataset folder. `folder_path` is the
# value defined in the configuration cell near the top of the notebook, so the
# location is declared in one place only.
evaluate_models(folder_path)


Processing file: 7nr_upd.csv
Accuracy for C:\Users\AKHIL\OneDrive\Desktop\lstm_gan\dataset\7nr_upd.csv: 64.52%

Processing file: beeyu_upd.csv
Accuracy for C:\Users\AKHIL\OneDrive\Desktop\lstm_gan\dataset\beeyu_upd.csv: 13.74%

Processing file: dar_upd.csv
Accuracy for C:\Users\AKHIL\OneDrive\Desktop\lstm_gan\dataset\dar_upd.csv: 100.00%

Processing file: devk_upd.csv
Accuracy for C:\Users\AKHIL\OneDrive\Desktop\lstm_gan\dataset\devk_upd.csv: 89.55%

Processing file: gbr_upd.csv
Accuracy for C:\Users\AKHIL\OneDrive\Desktop\lstm_gan\dataset\gbr_upd.csv: 100.00%

Processing file: gsl_upd.csv
Accuracy for C:\Users\AKHIL\OneDrive\Desktop\lstm_gan\dataset\gsl_upd.csv: 64.52%

Processing file: haria_upd.csv
Accuracy for C:\Users\AKHIL\OneDrive\Desktop\lstm_gan\dataset\haria_upd.csv: 100.00%

Processing file: khy_upd.csv
Accuracy for C:\Users\AKHIL\OneDrive\Desktop\lstm_gan\dataset\khy_upd.csv: 64.52%

Processing file: lic_upd.csv
Accuracy for C:\Users\AKHIL\OneDrive\Desktop\lstm_gan\dataset