In [1]:
import pandas as pd

# Read the training CSV once; the 'timestamp' column is parsed to datetimes on load.
csv_path = 'dataset_train.csv'
raw_data = pd.read_csv(csv_path, parse_dates=['timestamp'])

In [2]:

# Work on a copy so raw_data stays untouched.
df = raw_data.copy()

# Gap between each sample and the one before it.
df['TimeDelta'] = df['timestamp'].diff()

# A gap longer than one hour marks the first sample of a new train.
gap_mask = df['TimeDelta'] > pd.Timedelta(hours=1)
train_start_index = df[gap_mask].index

# Each train runs up to the next start; the last one runs to the end of the frame.
segment_ends = list(train_start_index[1:]) + [len(df)]

# Count COMP == 1 (run) and COMP == 0 (idle) samples per train.
# The row at the start index itself is not counted (slice begins at start + 1).
T_run_list = []
T_idle_list = []
for seg_start, seg_end in zip(train_start_index, segment_ends):
    comp = df.iloc[seg_start + 1:seg_end]['COMP']
    T_run_list.append((comp == 1).sum())
    T_idle_list.append((comp == 0).sum())

# Store the counts on the first row of each train; all other rows stay NaN.
df.loc[train_start_index, 'T_run'] = T_run_list
df.loc[train_start_index, 'T_idle'] = T_idle_list



In [3]:
# Known anomaly intervals as (start, end) timestamps, both ends inclusive
# where they are used with Series.between.
# The interval starts are kept identical to the boundaries used when the
# 'label' column is built and when the normal reference windows are selected,
# so that "excluded from training" and "labelled as anomaly" describe the
# same rows.
anomalies_intervals = [
    (pd.Timestamp('2022-02-28 21:50'), pd.Timestamp('2022-03-01 02:00')),
    (pd.Timestamp('2022-03-23 14:54'), pd.Timestamp('2022-03-23 15:24')),
    (pd.Timestamp('2022-05-30 12:00'), pd.Timestamp('2022-06-02 06:18'))
]

In [4]:
import numpy as np
# Rows whose timestamp falls inside any of the three closed windows get label 1,
# everything else gets label 0.
ts = df['timestamp']
in_anomaly_window = (
    ts.between('2022-02-28 21:50', '2022-03-01 02:00')
    | ts.between('2022-03-23 14:54', '2022-03-23 15:24')
    | ts.between('2022-05-30 12:00', '2022-06-02 06:18')
)
df['label'] = np.where(in_anomaly_window, 1, 0)

# The GPS columns are not used further on.
gps_columns = ['gpsLong','gpsLat','gpsSpeed','gpsQuality']
df = df.drop(gps_columns, axis=1)

In [5]:
# Split the rows by label.
is_normal = df['label'] == 0
is_anomaly = df['label'] == 1
normal_data = df[is_normal]
anomaly_data_df = df[is_anomaly]

# Normal rows from two reference windows, each ending where an anomaly window starts.
ts = df['timestamp']
first_window = (ts >= '2022-02-25') & (ts < '2022-02-28 21:50')
second_window = (ts >= '2022-03-20 02:00') & (ts < '2022-03-23 14:54')
extra_normal_data_df = df[is_normal & (first_window | second_window)]

# Evaluation set: all anomaly rows first, followed by the selected normal rows.
X_true = pd.concat([anomaly_data_df, extra_normal_data_df], axis=0)
y_true = X_true['label'].values
X_predict = X_true.drop('label', axis=1)
print(X_predict.shape)

(740534, 20)


In [6]:


# The working frame continues without the label column.
df = df.drop(columns=['label'])
print(df.shape)


(10773588, 20)


In [7]:
import numpy as np
import pandas as pd

def segment_intervals(times, n_segments):
    """Return, for every duration in ``times``, the integer segment boundaries.

    Each result is an array of ``n_segments + 1`` evenly spaced values from 0
    to the duration (inclusive), converted to ``int``.
    """
    n_edges = n_segments + 1
    boundaries = []
    for total in times:
        edges = np.linspace(0, total, num=n_edges, endpoint=True, dtype=int)
        boundaries.append(edges)
    return boundaries

def compute_mean_and_multiply(data, intervals, cycle_duration):
    """Return one scaled mean per interval.

    For every entry of ``intervals`` the mean of ``data`` between the entry's
    first and last boundary (last one exclusive) is taken and multiplied by
    ``cycle_duration``.  The results are returned as a list in input order.
    """
    return [
        np.mean(data[edges[0]:edges[-1]]) * cycle_duration
        for edges in intervals
    ]



In [8]:
def extract_features_analog(df):
    """Build one 72-wide feature row per input row from the analog sensor columns.

    For each of the 8 columns that follow the first column of ``df`` (read by
    position), 9 values are written: 2 bin means over the run part of the
    cycle, 5 bin means over the idle part (all 7 multiplied by the cycle
    duration ``T_run + T_idle``), then ``T_run`` and ``T_idle`` themselves.

    Only rows where both ``T_run`` and ``T_idle`` are present are filled in;
    every other row of the result stays all-zero.

    The sample window of a cycle starts at the position of the row that
    carries its ``T_run``/``T_idle`` values.  Bins that end up empty (very
    short cycles, or a window running past the end of ``df``) contribute 0.0
    instead of NaN so that they cannot poison later scaling or training.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``T_run`` and ``T_idle`` columns; positional columns
        1..8 are used as the sensor signals.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(df), 72)``.
    """
    num_bins = 7
    num_run_bins = 2
    num_idle_bins = num_bins - num_run_bins
    num_analog_sensors = 8
    num_features_per_sensor = num_bins + 2  # num_bins plus T_run and T_idle
    features = np.zeros((len(df), num_analog_sensors * num_features_per_sensor))

    # Locate the rows that describe a cycle once, instead of re-scanning every
    # row of the frame for each sensor.
    has_cycle = (df['T_run'].notna() & df['T_idle'].notna()).to_numpy()
    cycle_rows = np.flatnonzero(has_cycle)
    if cycle_rows.size == 0:
        return features

    t_run_all = df['T_run'].to_numpy()
    t_idle_all = df['T_idle'].to_numpy()

    def _bin_means(window, n_parts):
        # Mean of each of n_parts nearly equal chunks; empty chunks count as 0.0.
        return [part.mean() if part.size else 0.0
                for part in np.array_split(window, n_parts)]

    for sensor_idx in range(num_analog_sensors):
        # Skip the first column (the timestamp in the frames used here).
        sensor_values = df.iloc[:, sensor_idx + 1].to_numpy(dtype=float)
        feature_idx = sensor_idx * num_features_per_sensor

        for idx in cycle_rows:
            T_run = t_run_all[idx]
            T_idle = t_idle_all[idx]
            cycle_duration = T_run + T_idle

            # Window boundaries are relative to this cycle's own row, not to
            # the start of the frame.
            run_end = idx + int(T_run)
            cycle_end = idx + int(cycle_duration)
            run_means = _bin_means(sensor_values[idx:run_end], num_run_bins)
            idle_means = _bin_means(sensor_values[run_end:cycle_end], num_idle_bins)

            # Bin means are weighted by the cycle duration.
            features[idx, feature_idx:feature_idx + num_bins] = (
                np.array(run_means + idle_means) * cycle_duration
            )

            # Add the T_run and T_idle values to the features
            features[idx, feature_idx + num_bins] = T_run
            features[idx, feature_idx + num_bins + 1] = T_idle

    return features


In [9]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
import datetime




# Initial sliding windows: one month of training data, followed directly by
# a one-week test window.
start_date_train, end_date_train = pd.Timestamp('2022-01-01'), pd.Timestamp('2022-02-01')
start_date_test = end_date_train
end_date_test = start_date_test + datetime.timedelta(weeks=1)

# Model / training hyper-parameters.
num_features = 72
num_epochs, batch_size = 5, 30

# Remaining scalar settings.
beta, lamda, rho = 5, 1e-5, 0.01
alpha = 0.04  # weight used by the error smoothing step



In [10]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Conv1D, MaxPooling1D, UpSampling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
import datetime

In [None]:

# Feature layout: 8 sensors with 9 features each.
num_analog_sensors = 8
num_features_per_sensor = 9
num_input_features = num_analog_sensors * num_features_per_sensor
num_features = 72

# Training settings (this cell overrides the earlier num_epochs value).
num_epochs, batch_size = 50, 30

# Remaining scalar settings.
beta, lamda, rho = 5, 1e-5, 0.01
alpha = 0.04

# 1-D convolutional autoencoder over the feature vector (one channel).
input_layer = Input(shape=(num_input_features,1), name='input_layer')

# Encoder: three Conv1D + MaxPooling1D stages; the last pooling layer is the bottleneck.
encoder = input_layer
for n_filters in (16, 8):
    encoder = Conv1D(n_filters, kernel_size=3, activation='relu', padding='same')(encoder)
    encoder = MaxPooling1D(pool_size=2, padding='same')(encoder)
encoder = Conv1D(8, kernel_size=3, activation='relu', padding='same')(encoder)
bottleneck = MaxPooling1D(pool_size=2, padding='same', name='bottleneck')(encoder)

# Decoder: three Conv1D + UpSampling1D stages mirroring the encoder filters.
decoder = bottleneck
for n_filters in (8, 8, 16):
    decoder = Conv1D(n_filters, kernel_size=3, activation='relu', padding='same')(decoder)
    decoder = UpSampling1D(size=2)(decoder)
output_layer = Conv1D(1, kernel_size=3, activation='sigmoid', padding='same', name='output_layer')(decoder)

# Reconstruction model trained with mean squared error.
sae = Model(input_layer, output_layer)
sae.compile(optimizer=Adam(), loss='mean_squared_error')
# Sliding-window training / detection.
# Each pass trains on the current training window (known anomaly intervals
# removed), scores the test window that follows it, and drops the rows
# flagged as anomalous from df before all windows move forward by one week.
detected_anomalies_indexes = []
while end_date_test <= pd.Timestamp('2022-06-02'):
    # Rows inside any known anomaly interval must not be used for training.
    in_known_anomaly = np.any(
        [df['timestamp'].between(start, end) for start, end in anomalies_intervals],
        axis=0)
    train_df = df[df['timestamp'].between(start_date_train, end_date_train) & ~in_known_anomaly]
    test_df = df[df['timestamp'].between(start_date_test, end_date_test)]

    if train_df.empty or test_df.empty:
        # Scaling, fitting and index mapping below cannot run on an empty
        # window, so skip it and just advance.
        print("Skipping window without data:", start_date_test, "to", end_date_test)
    else:
        # Extract features for the analog signals
        X_train_reshaped = extract_features_analog(train_df).reshape(-1, num_features)
        X_test_reshaped = extract_features_analog(test_df).reshape(-1, num_features)

        # Normalize with statistics from the training window only, then add
        # the channel axis so fit() and predict() both see the
        # (samples, num_features, 1) shape the model was built for.
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train_reshaped).reshape(-1, num_features, 1)
        X_test_scaled = scaler.transform(X_test_reshaped).reshape(-1, num_features, 1)

        # Train the SAE model (weights carry over from the previous window)
        sae.fit(X_train_scaled, X_train_scaled, epochs=num_epochs, batch_size=batch_size, shuffle=True)

        # Reconstruction error on the training data
        X_train_pred = sae.predict(X_train_scaled)
        er_train = np.abs(X_train_scaled - X_train_pred)

        # Boxplot rule on the training error: upper whisker = Q3 + 1.5 * IQR
        Q1, Q3 = np.percentile(er_train, [25, 75])
        IQR = Q3 - Q1
        threshold = Q3 + 1.5 * IQR

        # Reconstruction error on the test data
        X_test_pred = sae.predict(X_test_scaled)
        er_test = np.abs(X_test_scaled - X_test_pred)

        # Smooth the test error by combining each row with its successor;
        # the result has one row fewer than er_test.
        # NOTE(review): this is a two-tap weighted average, not a recursive
        # exponential filter - confirm this is the intended low-pass filter.
        er_filtered = alpha * er_test[:-1] + (1 - alpha) * er_test[1:]

        # Compare the filtered error with the threshold (element-wise)
        er_thresholding = er_filtered > threshold

        # A row is reported once if any of its features exceeds the threshold.
        # Positions are translated through test_df.index, which does not
        # assume the index labels are contiguous (rows are dropped from df
        # as the loop progresses).
        anomaly_rows = np.unique(np.where(er_thresholding)[0])
        anomalies_indexes = test_df.index[anomaly_rows]
        print("Detected anomalies on test data:", len(anomalies_indexes))
        detected_anomalies_indexes.extend(anomalies_indexes.tolist())

        # Remove detected anomalies from the original DataFrame
        df = df.drop(anomalies_indexes, axis=0)

    # Move the time window one week forward
    start_date_train += datetime.timedelta(weeks=1)
    end_date_train += datetime.timedelta(weeks=1)
    start_date_test += datetime.timedelta(weeks=1)
    end_date_test += datetime.timedelta(weeks=1)


In [None]:
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix


# Build features for the labelled evaluation rows and scale them with the
# scaler left over from the last training window.
X_pred_raw = extract_features_analog(X_predict)
X_pred = scaler.transform(X_pred_raw)

# Add the trailing channel axis expected by the model.
n_eval_rows = X_pred.shape[0]
X_pred = X_pred.reshape(n_eval_rows, num_features, 1)

# Reconstruct the evaluation rows with the SAE
X_pred_outputs = sae.predict(X_pred)
X_pred_outputs.shape




In [None]:

# Absolute reconstruction error of the evaluation rows
er_pred = np.abs(X_pred - X_pred_outputs)

# Smooth by combining each row with its successor (result is one row shorter)
previous_err, current_err = er_pred[:-1], er_pred[1:]
er_pred_filtered = alpha * previous_err + (1 - alpha) * current_err

# Element-wise comparison against the threshold from the last training window
er_pred_thresholding = er_pred_filtered > threshold

# 1 where the filtered error exceeds the threshold, 0 otherwise
y_pred = er_pred_thresholding.astype(int)

In [None]:
# y_pred has one row fewer than X_pred because the smoothing step pairs each
# row with its successor, so the row labels are trimmed the same way.
# The labels are sliced from X_true rather than from y_true itself, so
# re-running this cell does not shave off another element each time.
y_true = X_true['label'].values[:-1]

# Element-wise ground truth with exactly the shape of y_pred; building it
# from X_pred's shape would leave one extra row and make the metric calls
# fail on inconsistent sample counts.
y_true_binary = np.zeros(y_pred.shape, dtype=int)

# The anomaly rows were concatenated first, so they occupy the leading rows.
y_true_binary[:anomaly_data_df.shape[0], :, :] = 1

y_true_binary.shape

In [None]:
# Flatten predictions and ground truth so every (row, feature) element is
# scored as one sample.
y_pred_flat = y_pred.reshape(-1)
y_true_binary_flat = y_true_binary.reshape(-1)

# Overall accuracy
accuracy = accuracy_score(y_true_binary_flat, y_pred_flat)
print("Accuracy:", accuracy)

# Precision, recall and F1 score per class
report = classification_report(y_true_binary_flat, y_pred_flat)
print("Classification report:")
print(report)