In [1]:
!pip install numpy pandas matplotlib scikit-learn tensorflow




In [5]:
import pandas as pd

data = pd.read_csv('/content/data.csv', encoding='latin1')  # or ISO-8859-1
print(data.columns)  # Show available column names


Index(['stn_code', 'sampling_date', 'state', 'location', 'agency', 'type',
       'so2', 'no2', 'rspm', 'spm', 'location_monitoring_station', 'pm2_5',
       'date'],
      dtype='object')


  data = pd.read_csv('/content/data.csv', encoding='latin1')  # or ISO-8859-1


In [7]:
print("Original dataset shape:", data.shape)
data = data.dropna()
print("After dropping NaN:", data.shape)  # Should not be (0, X)


Original dataset shape: (0, 14)
After dropping NaN: (0, 14)


In [8]:
import pandas as pd

data = pd.read_csv('/content/data.csv', encoding='latin1', low_memory=False)
print(data.head())  # Show first 5 rows
print("Dataset shape:", data.shape)


  stn_code       sampling_date           state   location agency  \
0      150  February - M021990  Andhra Pradesh  Hyderabad    NaN   
1      151  February - M021990  Andhra Pradesh  Hyderabad    NaN   
2      152  February - M021990  Andhra Pradesh  Hyderabad    NaN   
3      150     March - M031990  Andhra Pradesh  Hyderabad    NaN   
4      151     March - M031990  Andhra Pradesh  Hyderabad    NaN   

                                 type  so2   no2  rspm  spm  \
0  Residential, Rural and other Areas  4.8  17.4   NaN  NaN   
1                     Industrial Area  3.1   7.0   NaN  NaN   
2  Residential, Rural and other Areas  6.2  28.5   NaN  NaN   
3  Residential, Rural and other Areas  6.3  14.7   NaN  NaN   
4                     Industrial Area  4.7   7.5   NaN  NaN   

  location_monitoring_station  pm2_5        date  
0                         NaN    NaN  1990-02-01  
1                         NaN    NaN  1990-02-01  
2                         NaN    NaN  1990-02-01  
3       

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load dataset with correct encoding
data = pd.read_csv('/content/data.csv', encoding='latin1', low_memory=False)

# Convert numeric columns safely
num_cols = ['so2', 'no2', 'rspm', 'spm', 'pm2_5']
for col in num_cols:
    data[col] = pd.to_numeric(data[col], errors='coerce')  # Convert to numbers

# Fill missing values using the median of each column
for col in num_cols:
    data[col] = data[col].fillna(data[col].median())

# Define anomaly labels (Top 5% of PM2.5 as anomalies)
threshold = data['pm2_5'].quantile(0.95)
data['target'] = (data['pm2_5'] > threshold).astype(int)

# Select Features (Dropping non-relevant columns)
X = data[num_cols]  # Only numeric features
y = data['target']

# Normalize data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Confirm shape
print(f"Train set: {X_train.shape}, Test set: {X_test.shape}")


Train set: (348593, 5), Test set: (87149, 5)


In [10]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Define Autoencoder Model
model = keras.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),  # Bottleneck (compressed representation)
    layers.Dense(64, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(X_train.shape[1], activation='sigmoid')  # Output same shape as input
])

model.compile(optimizer='adam', loss='mse')

# Train the model (unsupervised training)
history = model.fit(X_train, X_train, epochs=50, batch_size=32, validation_data=(X_test, X_test))


Epoch 1/50
[1m10894/10894[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 3ms/step - loss: 0.6518 - val_loss: 0.7176
Epoch 2/50
[1m10894/10894[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 2ms/step - loss: 0.6722 - val_loss: 0.7172
Epoch 3/50
[1m10894/10894[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 2ms/step - loss: 0.6794 - val_loss: 0.7172
Epoch 4/50
[1m10894/10894[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 2ms/step - loss: 0.6892 - val_loss: 0.7172
Epoch 5/50
[1m10894/10894[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 3ms/step - loss: 0.6778 - val_loss: 0.7173
Epoch 6/50
[1m10894/10894[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 3ms/step - loss: 0.6752 - val_loss: 0.7172
Epoch 7/50
[1m10894/10894[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 3ms/step - loss: 0.6563 - val_loss: 0.7171
Epoch 8/50
[1m10894/10894[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 3ms/step - loss: 0.6494 - val_loss: 0.7171


In [11]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict reconstruction error
train_pred = model.predict(X_train)
train_mse = np.mean(np.power(X_train - train_pred, 2), axis=1)

test_pred = model.predict(X_test)
test_mse = np.mean(np.power(X_test - test_pred, 2), axis=1)

# Set anomaly detection threshold (95th percentile)
threshold = np.percentile(train_mse, 95)

# Classify anomalies
y_test_pred = (test_mse > threshold).astype(int)

# Evaluate model
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

print(f"Model Performance:\nAccuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1-score: {f1}")


[1m10894/10894[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step
[1m2724/2724[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
Model Performance:
Accuracy: 0.9483757702325901
Precision: 0.09948580371115583
Recall: 0.48580786026200873
F1-score: 0.16515123399517537


In [13]:
import joblib

import keras.saving
keras.saving.save_model(model, 'anomaly_model.keras')
joblib.dump(scaler, 'scaler.pkl')

# Generate performance report
report = f"""
Anomaly Detection Model Performance:
------------------------------------
Accuracy: {accuracy}
Precision: {precision}
Recall: {recall}
F1-score: {f1}
"""

with open("report.txt", "w") as file:
    file.write(report)

print("✅ Model saved & report generated!")


✅ Model saved & report generated!


In [14]:
from tensorflow import keras

# Load the saved model
model = keras.models.load_model('anomaly_model.keras')

# Use it for predictions
predictions = model.predict(X_test)

[1m  46/2724[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 1ms/step   

  saveable.load_own_variables(weights_store.get(inner_path))


[1m2724/2724[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step


In [15]:
from google.colab import files

# List of files to download
files_to_download = ['anomaly_model.keras', 'scaler.pkl', 'report.txt']

# Download each file
for file in files_to_download:
    files.download(file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
import tensorflow as tf

# Load your trained model
model = tf.keras.models.load_model('anomaly_model.keras')

# Convert the model to TensorFlow Lite format
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# Save the .tflite model
with open("anomaly_model.tflite", "wb") as f:
    f.write(tflite_model)

print("✅ Model converted to TensorFlow Lite format.")


Saved artifact at '/tmp/tmpcamfc651'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 5), dtype=tf.float32, name='input_layer')
Output Type:
  TensorSpec(shape=(None, 5), dtype=tf.float32, name=None)
Captures:
  135868139345040: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135868139351952: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135868139347728: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135868139350224: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135868181811792: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135868181811024: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135868181810256: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135868181807376: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135868181812368: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135868181810448: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135868181809872: TensorSp

In [17]:
from google.colab import files
files.download('anomaly_model.tflite')  # Download the TensorFlow Lite model


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
from google.colab import files

# Download the Keras model
files.download('anomaly_model.keras')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>