In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential, Model
from keras.layers import Input, Dense
from sklearn.metrics import mean_squared_error

In [2]:
# Load the log dataset (replace with your dataset file)
data = pd.read_csv('../dataset/log_data.csv')


In [3]:

# Convert timestamp to datetime objects
data['timestamp'] = pd.to_datetime(data['timestamp'])

In [4]:
# Calculate time difference in seconds from the minimum timestamp
min_timestamp = data['timestamp'].min()
data['time_diff'] = (data['timestamp'] - min_timestamp).dt.total_seconds()

In [5]:
# Select relevant features for anomaly detection
# In this case, we'll use time_diff, activity, and username
features = data[['time_diff', 'activity', 'username']]

In [6]:
# Convert categorical features to numerical using label encoding
label_encoders = {}
for col in ['activity', 'username']:
    label_encoders[col] = LabelEncoder()
    features[col] = label_encoders[col].fit_transform(features[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features[col] = label_encoders[col].fit_transform(features[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features[col] = label_encoders[col].fit_transform(features[col])


In [7]:
# Standardize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

In [8]:
# Split the data into train and test sets
X_train, X_test = train_test_split(scaled_features, test_size=0.2, random_state=42)


In [9]:
# ... Build and train the autoencoder model ...
input_dim = X_train.shape[1]
encoding_dim = 10  # Adjust based on your data complexity
autoencoder = Sequential()
autoencoder.add(Dense(encoding_dim, input_shape=(input_dim,), activation='relu'))
autoencoder.add(Dense(input_dim, activation='linear'))
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

In [10]:
autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, shuffle=True, validation_data=(X_test, X_test))


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1dd67c43370>

In [11]:
# Reconstruct the data using the trained autoencoder
reconstructed_data = autoencoder.predict(scaled_features)
mse = np.mean(np.power(scaled_features - reconstructed_data, 2), axis=1)





In [12]:
# Set thresholds for anomaly detection
threshold_high = np.percentile(mse, 95)
threshold_medium = np.percentile(mse, 85)

In [13]:
# Identify anomalies
anomalies = data[mse > threshold_high]
# Ensure 'mse' has the same length as 'anomalies'
mse = mse[:len(anomalies)]

In [14]:
# Assign severity levels based on thresholds
anomalies['severity'] = np.where(mse > threshold_high, 'High',
                                np.where(mse > threshold_medium, 'Medium', 'Low'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anomalies['severity'] = np.where(mse > threshold_high, 'High',


In [15]:
# Define actionable insights templates
insight_templates = {
    'High': "High Severity Anomaly Detected: {details}",
    'Medium': "Medium Severity Anomaly Detected: {details}",
    'Low': "Low Severity Anomaly Detected: {details}"
}

In [16]:
# Generate actionable insights
actionable_insights = []
for _, anomaly_row in anomalies.iterrows():
    insight_template = insight_templates.get(anomaly_row['severity'])
    if insight_template:
        actionable_insights.append(insight_template.format(
            details=f"Timestamp: {anomaly_row['timestamp']}, User: {anomaly_row['username']}, Activity: {anomaly_row['activity']}"
        ))


In [17]:
# Print actionable insights
print("Actionable Insights:")
for insight in actionable_insights:
    print(insight)

Actionable Insights:
Low Severity Anomaly Detected: Timestamp: 2023-06-04 11:32:37, User: alexis64, Activity: ProductView
Medium Severity Anomaly Detected: Timestamp: 2023-04-01 23:54:14, User: elizabeth80, Activity: ProductView
Low Severity Anomaly Detected: Timestamp: 2023-04-16 16:18:44, User: dixonlisa, Activity: ProductView
Low Severity Anomaly Detected: Timestamp: 2023-06-15 01:45:05, User: christinastout, Activity: ProductView
Low Severity Anomaly Detected: Timestamp: 2023-06-08 15:45:42, User: gcastillo, Activity: ProductView
Low Severity Anomaly Detected: Timestamp: 2023-05-17 00:11:11, User: dixongabrielle, Activity: ProductView
Low Severity Anomaly Detected: Timestamp: 2023-07-09 20:56:42, User: blong, Activity: ProductView
Low Severity Anomaly Detected: Timestamp: 2023-07-16 11:06:45, User: brandon42, Activity: ProductView
Low Severity Anomaly Detected: Timestamp: 2023-05-03 18:17:46, User: beckykirby, Activity: ProductView
Low Severity Anomaly Detected: Timestamp: 2023-06-

In [18]:
# print(len(actionable_insights))
# print(type(actionable_insights))