In [29]:
import pandas as pd
import numpy as np
import seaborn as sns

In [30]:
# Read the dataset (path is relative to the notebook's working directory) into a DataFrame.
data_path = 'modified_UCI.csv'
df = pd.read_csv(data_path)

In [None]:
# Call head() so the first rows are returned and displayed; the bare attribute
# `df.head` only echoes the bound method object.
df.head()

In [None]:
# Count the null entries in every column of the frame.
missing_values = df.isna().sum()

# Show a heading followed by the per-column totals.
print("Total missing values in each column:", missing_values, sep="\n")

In [None]:
pip install tensorflow

In [None]:
# LSTM
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense


# Load the dataset.
df = pd.read_csv('modified_UCI.csv')

# Extract relevant columns (date, time, AH).
# An explicit copy is taken because new columns are added below; assigning to a
# bare df[[...]] selection raises pandas' SettingWithCopyWarning.
time_series_data = df[['Date', 'Time', 'AH']].copy()

# Join the date and time strings into one timestamp and use it as the index.
time_series_data['datetime'] = pd.to_datetime(time_series_data['Date'] + ' ' + time_series_data['Time'])
time_series_data = time_series_data.set_index('datetime')

# Drop the individual date and time columns
time_series_data = time_series_data.drop(columns=['Date', 'Time'])

# Normalize values to 0,1
# NOTE(review): the scaler is fit on the full series, before the train/test
# split done later in this cell, so the test rows influence the min/max used
# for scaling -- confirm whether that is acceptable for this evaluation.
scaler = MinMaxScaler()
time_series_data['normalized'] = scaler.fit_transform(time_series_data[['AH']])

# Function to create sequences for time-series data
def create_sequences(data, sequence_length, target_column='normalized'):
    """Build sliding windows and next-step targets from a DataFrame.

    Window ``i`` holds rows ``i .. i + sequence_length - 1`` of ``data`` (all
    columns); its target is the ``target_column`` value of row
    ``i + sequence_length``.

    Parameters
    ----------
    data : pandas.DataFrame
        Source rows, taken in their existing order.
    sequence_length : int
        Number of consecutive rows per window; must be at least 1.
    target_column : str, default 'normalized'
        Column whose value in the row following each window is the target.

    Returns
    -------
    sequences : numpy.ndarray
        Shape ``(n_windows, sequence_length, n_columns)``.
    targets : numpy.ndarray
        Shape ``(n_windows,)``.
        ``n_windows`` is ``len(data) - sequence_length``; when that is not
        positive, both arrays are empty but keep the ranks given above.

    Raises
    ------
    ValueError
        If ``sequence_length`` is smaller than 1.
    KeyError
        If ``target_column`` is not a column of ``data``.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be at least 1")

    # Convert once to a NumPy array; slicing it is far cheaper than a
    # DataFrame .iloc lookup on every loop iteration.
    values = data.to_numpy()
    target_idx = data.columns.get_loc(target_column)

    n_windows = len(values) - sequence_length
    if n_windows <= 0:
        # Keep the 3-D / 1-D ranks so callers can still read .shape[1] and .shape[2].
        empty_sequences = np.empty((0, sequence_length, values.shape[1]), dtype=values.dtype)
        empty_targets = np.empty((0,), dtype=values.dtype)
        return empty_sequences, empty_targets

    sequences = np.stack([values[start:start + sequence_length] for start in range(n_windows)])
    # The target of window i is row i + sequence_length, i.e. every row from
    # index sequence_length onwards.
    targets = values[sequence_length:, target_idx]
    return sequences, targets

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling -- and therefore the reported MSE -- are comparable between runs.
from tensorflow.keras.utils import set_random_seed
set_random_seed(42)

# create sequences and define length
sequence_length = 10
# Only the scaled column is windowed. Passing the whole frame would also feed
# the raw, unscaled 'AH' column into the network next to its normalised copy.
sequences, targets = create_sequences(time_series_data[['normalized']], sequence_length)

# Ordered 80/20 split without shuffling: the last 20% of windows (in row order)
# form the test set.
split = int(0.8 * len(sequences))
X_train, y_train = sequences[:split], targets[:split]
X_test, y_test = sequences[split:], targets[split:]

# LSTM model: one recurrent layer followed by a single-unit regression output
model = Sequential()
model.add(LSTM(units=50, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(units=1))
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)

# Predict on the test set
predictions = model.predict(X_test)

# Mean Squared Error, measured on the 0-1 normalised scale (the targets come
# from the 'normalized' column), not in the original AH units
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error (MSE): {mse}')



In [None]:
# Autoencoder
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense



# Seed Python, NumPy and TensorFlow so the trained weights and the reported
# reconstruction error are comparable between runs.
tf.keras.utils.set_random_seed(32)

# Select features (X)
# Each column is listed exactly once; a repeated column would be counted twice
# in the reconstruction loss.
X = df[['C6H6(GT)', 'CO(GT)', 'NOx(GT)', 'NO2(GT)']]

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Hold out 20% of the rows for evaluating reconstruction quality
X_train, X_test = train_test_split(X_scaled, test_size=0.2, random_state=32)

# autoencoder model: 64 -> 32 -> 64 hidden units, linear output with one unit
# per input feature
model = Sequential()
model.add(Dense(units=64, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=X_train.shape[1], activation='linear'))
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the autoencoder (the input is also the target)
model.fit(X_train, X_train, epochs=20, batch_size=22, validation_split=0.2, verbose=1)

# Reconstruct the held-out rows
test_predictions = model.predict(X_test)

# Calculate the overall reconstruction error on the test set
test_mse = mean_squared_error(X_test, test_predictions)
print(f'Test Mean Squared Error: {test_mse}')

# Per-row reconstruction error. Anomaly flags have to be computed per sample;
# comparing the single aggregate MSE with the threshold yields just one boolean.
reconstruction_errors = np.mean(np.square(X_test - test_predictions), axis=1)

# NOTE(review): this threshold is applied to errors on standardised features;
# confirm the value is appropriate, e.g. against the distribution of
# reconstruction errors on the training rows.
threshold = 0.000003

# Boolean mask over the test rows: True where the row's error exceeds the threshold
anomalies = reconstruction_errors > threshold
print(f'Anomalous test samples: {int(anomalies.sum())} of {len(anomalies)}')




In [None]:
# DBSCAN
from sklearn.cluster import DBSCAN
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report



# Columns used as clustering input.
features = df[['C6H6(GT)', 'CO(GT)', 'NOx(GT)', 'NO2(GT)']]

# Standardise the columns: fit the scaler, then apply it to the same data.
scaler = StandardScaler()
scaler.fit(features)
features_scaled = scaler.transform(features)

# Reference labels: a row is marked 1 when the mean of its scaled features
# is above the threshold, 0 otherwise.
threshold = 0.05
exceeds_threshold = features_scaled.mean(axis=1) > threshold
df['anomaly'] = exceeds_threshold.astype(int)

# Cluster the scaled rows and read the assigned labels back from the model.
dbscan_model = DBSCAN(eps=1.5, min_samples=5)
dbscan_model.fit(features_scaled)
dbscan_labels = dbscan_model.labels_

# Store the cluster label per row; rows labelled -1 (DBSCAN's noise label)
# are flagged as anomalies.
df['dbscan_label'] = dbscan_labels
df['is_anomaly_dbscan'] = df['dbscan_label'].eq(-1).astype(int)

# Evaluation: compare the DBSCAN flags with the threshold-derived labels
print(classification_report(y_true=df['anomaly'], y_pred=df['is_anomaly_dbscan']))

# Index values of the rows DBSCAN flagged
anomaly_indices_dbscan = df.index[df['is_anomaly_dbscan'].eq(1)]
print("Anomaly Indices (DBSCAN):", anomaly_indices_dbscan)


In [None]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report



# Columns used as classifier input.
features = df[['C6H6(GT)', 'CO(GT)', 'NOx(GT)', 'NO2(GT)']]

# Standardization
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Labels: a row is marked 1 when the mean of its scaled features exceeds the
# threshold. This overwrites any existing 'anomaly' column on df.
threshold = 0.03
df['anomaly'] = (features_scaled.mean(axis=1) > threshold).astype(int)

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(features_scaled, df['anomaly'], test_size=0.2, random_state=42)

# Create and fit the k-NN model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Predictions on the test set
y_pred = knn_model.predict(X_test)

# Evaluation
print(classification_report(y_test, y_pred))

# Index values of all rows carrying the threshold-derived label (independent
# of the fitted model)
anomaly_indices = df.index[df['anomaly'] == 1]
print("Anomaly Indices:", anomaly_indices)

# Index values of the held-out rows that the k-NN model itself predicted as
# anomalous; y_test keeps df's index, so these map back to rows of df.
anomaly_indices_knn = y_test.index[y_pred == 1]
print("Anomaly Indices (KNN, test set):", anomaly_indices_knn)
