<a href="https://colab.research.google.com/github/2303A52201/Gen-AI/blob/main/GNAI_PROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_regression, RFE
from sklearn.linear_model import LinearRegression
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt
import seaborn as sns


# Load the dataset
df = pd.read_csv('/content/archive.zip')

# Print the actual column names so the feature/target selection below can be
# checked against the file that was really loaded.
print(df.columns)

# Select features and target.
# Every column whose name starts with 'PM' (case-sensitive) is a prediction
# target; all remaining columns are candidate input features.  The mask is
# computed once so both selections are guaranteed to be complementary.
is_target_col = df.columns.str.startswith('PM')
if not is_target_col.any():
    # Fail here with a clear message rather than much later inside scaling or
    # model fitting, where an empty target frame produces an obscure error.
    raise ValueError(
        "No column name starts with 'PM'; check the printed column names."
    )
features = df.loc[:, ~is_target_col]
target = df.loc[:, is_target_col]

# NOTE(review): missing values are not handled anywhere in this script;
# confirm the dataset has none, or impute/drop them before training.

# One-hot encode categorical features
non_numeric_cols = features.select_dtypes(include=['object', 'category']).columns
features = pd.get_dummies(features, columns=non_numeric_cols)

# Train-test split (80/20, fixed seed so the split is reproducible)
X_train_full, X_test_full, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Build the ANN model
def build_ann(input_dim: int, output_dim: int = 3):
    """Build and compile a small fully connected regression network.

    Architecture: input -> Dense(64, relu) -> Dense(128, relu) ->
    Dense(output_dim) with no activation, compiled with the Adam optimizer
    and a mean-squared-error loss.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of values predicted per sample.  Defaults to 3
            (the original setup: PM2.5, PM10 and NO2).  Pass the number of
            target columns, e.g. ``y_train.shape[1]``, when it differs.

    Returns:
        The compiled, untrained ``Sequential`` model.
    """
    model = Sequential()
    # Declare the input shape with an explicit Input object; passing
    # ``input_dim`` to the first Dense layer is deprecated in recent Keras.
    model.add(tf.keras.Input(shape=(input_dim,)))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(output_dim))  # no activation: unbounded regression output
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Correlation Thresholding: flag every training feature whose absolute
# correlation with an earlier column exceeds 0.85.
cor_matrix = X_train_full.corr()
exceeds_threshold = cor_matrix.abs().to_numpy() > 0.85
# Look strictly below the diagonal so each pair is examined once and the
# later column of a correlated pair is the one that gets flagged.
flagged_rows = np.tril(exceeds_threshold, k=-1).any(axis=1)
high_corr_vars = set(cor_matrix.columns[flagged_rows])

# Remove the flagged columns and keep the test frame aligned with training.
X_train_corr = X_train_full.drop(columns=high_corr_vars)
X_test_corr = X_test_full[X_train_corr.columns]

# Standardise the targets; the scaler is fitted on the training rows only and
# reused for the test rows.
target_scaler = StandardScaler().fit(y_train)
y_train_scaled = target_scaler.transform(y_train)
y_test_scaled = target_scaler.transform(y_test)

# Standardise the retained features the same way.
scaler_corr = StandardScaler().fit(X_train_corr)
X_train_corr_scaled = scaler_corr.transform(X_train_corr)
X_test_corr_scaled = scaler_corr.transform(X_test_corr)

# Train model with correlation thresholding; the held-out split is passed as
# validation data so its loss is recorded every epoch.
model_corr = build_ann(X_train_corr_scaled.shape[1])
history_corr = model_corr.fit(
    X_train_corr_scaled,
    y_train_scaled,
    validation_data=(X_test_corr_scaled, y_test_scaled),
    epochs=100,
    batch_size=16,
    verbose=0,
)

# Mutual Information Feature Selection
# Score every feature against each target column separately and average the
# scores across targets.  random_state is fixed (same seed as the train/test
# split) because the estimator adds random noise to continuous variables;
# without it the selected feature set can change from run to run.
mi_scores = np.zeros(X_train_full.shape[1])
for target_pos in range(y_train.shape[1]):
    mi_scores += mutual_info_regression(
        X_train_full, y_train.iloc[:, target_pos], random_state=42
    )
mi_scores /= y_train.shape[1]

# Keep the 10 features with the highest mean score (fewer if the frame has
# fewer than 10 columns).
mi_idx = np.argsort(mi_scores)[::-1][:10]
X_train_mi = X_train_full.iloc[:, mi_idx]
X_test_mi = X_test_full[X_train_mi.columns]

# Standardise the selected features using training-set statistics only.
scaler_mi = StandardScaler()
X_train_mi_scaled = scaler_mi.fit_transform(X_train_mi)
X_test_mi_scaled = scaler_mi.transform(X_test_mi)

# Train model with Mutual Information features.
# epochs=100 matches the correlation-thresholding model so that the final
# RMSE values compare feature-selection methods, not training budgets.
model_mi = build_ann(X_train_mi_scaled.shape[1])
history_mi = model_mi.fit(X_train_mi_scaled, y_train_scaled, epochs=100, batch_size=16,
                          validation_data=(X_test_mi_scaled, y_test_scaled), verbose=0)

# Recursive Feature Elimination (RFE)
# RFE ranks features by the magnitude of the linear-regression coefficients,
# and those magnitudes depend on the units of each feature and target.  The
# ranking is therefore fitted on standardised inputs and on the already
# standardised targets, so the elimination order reflects predictive weight
# rather than raw scale.
rfe_ranking_scaler = StandardScaler()
X_train_rfe_ranking = rfe_ranking_scaler.fit_transform(X_train_full)
rfe_selector = RFE(estimator=LinearRegression(), n_features_to_select=10)
rfe_selector = rfe_selector.fit(X_train_rfe_ranking, y_train_scaled)

# support_ is a boolean mask over the columns of X_train_full.
X_train_rfe = X_train_full.loc[:, rfe_selector.support_]
X_test_rfe = X_test_full[X_train_rfe.columns]

# Standardise the selected features using training-set statistics only.
scaler_rfe = StandardScaler()
X_train_rfe_scaled = scaler_rfe.fit_transform(X_train_rfe)
X_test_rfe_scaled = scaler_rfe.transform(X_test_rfe)

# Train model with RFE features.
# epochs=100 matches the correlation-thresholding model so that the final
# RMSE values compare feature-selection methods, not training budgets.
model_rfe = build_ann(X_train_rfe_scaled.shape[1])
history_rfe = model_rfe.fit(X_train_rfe_scaled, y_train_scaled, epochs=100, batch_size=16,
                            validation_data=(X_test_rfe_scaled, y_test_scaled), verbose=0)

# Plot history for each model
def plot_history(history, title):
    """Plot the training and validation loss curves of one training run.

    Args:
        history: Object whose ``history`` attribute maps ``'loss'`` and
            ``'val_loss'`` to per-epoch values (as returned by ``model.fit``
            when validation data is supplied).
        title: Text shown as the figure title.

    Draws on the current pyplot figure and displays it; returns nothing.
    """
    curves = (
        ('loss', 'Train Loss'),
        ('val_loss', 'Validation Loss'),
    )
    for metric_key, legend_label in curves:
        plt.plot(history.history[metric_key], label=legend_label)

    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title(title)
    plt.grid(True)
    plt.legend()
    plt.show()

# Show one loss-curve figure per trained model, in training order.
for run_history, figure_title in (
    (history_corr, 'Model with Correlation Thresholding'),
    (history_mi, 'Model with Mutual Information'),
    (history_rfe, 'Model with RFE'),
):
    plot_history(run_history, figure_title)

# Evaluate RMSE for each model
def evaluate_rmse(model, X_test_scaled, y_test_scaled, title):
    """Print the test RMSE of ``model`` in the targets' original units.

    Predictions and reference values are both mapped back through the
    module-level ``target_scaler`` before the error is computed, and a single
    RMSE is taken over all samples and all target columns together.

    Args:
        model: Fitted model exposing ``predict``.
        X_test_scaled: Scaled test features passed to ``model.predict``.
        y_test_scaled: Test targets in the scaled space of ``target_scaler``.
        title: Label printed in front of the score.

    Returns nothing; the result is only printed.
    """
    predicted = target_scaler.inverse_transform(model.predict(X_test_scaled))
    actual = target_scaler.inverse_transform(y_test_scaled)
    residuals = predicted - actual
    rmse = np.sqrt(np.mean(np.square(residuals)))
    print(f"{title} - Final RMSE: {rmse:.4f}")

# Report the test RMSE of each model against the same scaled test targets.
for fitted_model, held_out_inputs, method_name in (
    (model_corr, X_test_corr_scaled, "Correlation Thresholding"),
    (model_mi, X_test_mi_scaled, "Mutual Information"),
    (model_rfe, X_test_rfe_scaled, "RFE"),
):
    evaluate_rmse(fitted_model, held_out_inputs, y_test_scaled, method_name)
