<a href="https://colab.research.google.com/github/28nahidhasan/ML_final-project/blob/main/ML_project(Nahid).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## DATA PREPROCESSING

In [None]:
!pip install kaggle scikit-learn tensorflow imbalanced-learn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from imblearn.over_sampling import SMOTE
from google.colab import files

# Load and merge datasets

In [None]:
# Upload the CSV files through the Colab file picker.
uploaded = files.upload()
# Keep only files whose name contains 'Dhaka_PM2.5'. Sorting makes the concatenation
# order (and therefore every seeded split downstream) independent of upload order.
file_list = sorted(f for f in uploaded.keys() if 'Dhaka_PM2.5' in f)
if not file_list:
    # pd.concat([]) would otherwise fail with an opaque "No objects to concatenate".
    raise FileNotFoundError(
        "No uploaded file name contains 'Dhaka_PM2.5'; upload the Dhaka PM2.5 CSV files."
    )
dfs = [pd.read_csv(f) for f in file_list]
# ignore_index=True gives the merged frame one continuous 0..n-1 index.
merged_df = pd.concat(dfs, ignore_index=True)


In [None]:
# Keep only rows with a usable AQI: rows equal to -999 are dropped, and NaN is dropped
# too because it would pass a plain `!= -999` comparison and later yield an undefined class.
valid_aqi = merged_df['AQI'].notna() & (merged_df['AQI'] != -999)
# .copy() yields an independent frame, so the column assignments below (and in later
# cells) do not raise SettingWithCopyWarning on a filtered slice.
merged_df = merged_df.loc[valid_aqi].copy()
merged_df['AQI Category'] = merged_df['AQI Category'].fillna('Unknown')

In [None]:
# Feature engineering: parse the local timestamp and derive calendar features from it.
merged_df['Date (LT)'] = pd.to_datetime(merged_df['Date (LT)'], format='%d/%m/%Y %H:%M')
merged_df['Hour'] = merged_df['Date (LT)'].dt.hour
merged_df['Month'] = merged_df['Date (LT)'].dt.month
merged_df['DayOfWeek'] = merged_df['Date (LT)'].dt.dayofweek

# Create classification target: bin the numeric AQI into six ordered categories.
bins = [0, 50, 100, 150, 200, 300, np.inf]
labels = ['Good','Moderate','USG','Unhealthy','VUnhealthy','Hazardous']
# include_lowest=True closes the first interval at 0. With the default, the bins are
# (0, 50], (50, 100], ... and an AQI of exactly 0 would become NaN instead of 'Good'.
merged_df['AQI_Class'] = pd.cut(
    merged_df['AQI'], bins=bins, labels=labels, include_lowest=True
)



In [None]:
# Encode labels: turn the AQI_Class category names into integer codes for the models.
# `le` is kept at module level; later cells read le.classes_ for the class count and
# for confusion-matrix tick labels.
# NOTE(review): LabelEncoder is expected to number classes in sorted label order, not
# in severity order — confirm that is acceptable for downstream interpretation.
le = LabelEncoder()
merged_df['AQI_Class_Encoded'] = le.fit_transform(merged_df['AQI_Class'])

# Save processed data: write the enriched frame to CSV (without the index column)
# and hand the file to the browser through Colab's download helper.
processed_filename = 'processed_air_quality.csv'
merged_df.to_csv(processed_filename, index=False)
files.download(processed_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# DATA SPLITTING

In [None]:
# Model inputs: the two concentration columns plus the derived calendar features.
features = ['NowCast Conc.', 'Raw Conc.', 'Hour', 'Month', 'DayOfWeek']
# Model output: the integer-encoded AQI class.
target = 'AQI_Class_Encoded'

X = merged_df[features]
y = merged_df[target]

# Split data (70% train, 15% validation, 15% test)
# Step 1: hold out 15% as the test set, stratified on the class label.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)
# Step 2: carve the validation set out of the remaining 85%.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1765, stratify=y_temp, random_state=42  # 0.15/0.85 ≈ 0.1765
)

# Handle class imbalance
# SMOTE is applied to the training split only; validation and test stay untouched.
# From here on X_train / y_train refer to the resampled data, so any "train" metric
# computed later is measured on the resampled set.
smote = SMOTE(random_state=42, k_neighbors=1)
X_train, y_train = smote.fit_resample(X_train, y_train)

# FOCAL LOSS IMPLEMENTATION

In [None]:
def focal_loss(gamma=2., alpha=4.):
    """Build a focal-loss function for one-hot targets and probability outputs.

    Args:
        gamma: Exponent applied to ``(1 - y_pred)``; larger values shrink the
            contribution of confidently predicted entries.
        alpha: Constant multiplier applied to every loss term.

    Returns:
        A callable ``(y_true, y_pred) -> loss`` suitable for ``model.compile``.
        The loss is averaged over the last (class) axis.
    """
    def focal_loss_fixed(y_true, y_pred):
        # One-hot labels may arrive as integers or float64; cast them to the
        # prediction dtype so the element-wise product below cannot fail on a
        # dtype mismatch.
        y_true = tf.cast(y_true, y_pred.dtype)
        # Clip predictions away from 0 and 1 so log() stays finite.
        epsilon = tf.keras.backend.epsilon()
        y_pred = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)
        cross_entropy = -y_true * tf.math.log(y_pred)
        # Modulating factor (1 - p)^gamma scales each cross-entropy term.
        loss = alpha * tf.pow(1. - y_pred, gamma) * cross_entropy
        return tf.reduce_mean(loss, axis=-1)
    return focal_loss_fixed


# Models

Neural Network

In [None]:
# NEURAL NETWORK MODEL
def create_nn_model(input_shape, n_classes):
    """Assemble and compile the dense classifier.

    Args:
        input_shape: Number of input features per sample.
        n_classes: Width of the softmax output layer.

    Returns:
        A compiled ``Sequential`` model using Adam (learning rate 0.001), the
        focal loss with its default parameters, and accuracy as the metric.
    """
    # (units, dropout rate) for each hidden stage, in order.
    hidden_spec = ((256, 0.4), (128, 0.3), (64, 0.2))

    network = Sequential()
    for position, (units, drop_rate) in enumerate(hidden_spec):
        # Only the first layer declares the input dimensionality.
        first_layer_kwargs = {'input_shape': (input_shape,)} if position == 0 else {}
        network.add(Dense(units, activation='relu', **first_layer_kwargs))
        network.add(Dropout(drop_rate))
    network.add(Dense(n_classes, activation='softmax'))

    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss=focal_loss(),
        metrics=['accuracy']
    )
    return network

# Preprocess data for NN: fit the scaler on the training split only, then apply the
# same transformation to validation and test so no statistics leak from them.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Train NN
n_classes = len(le.classes_)
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and shuffling
# are repeatable, matching the random_state=42 used for the splits and SMOTE.
tf.keras.utils.set_random_seed(42)
nn_model = create_nn_model(X_train_scaled.shape[1], n_classes)
# Stop once validation loss has not improved for 15 epochs and roll back to the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)

# num_classes is passed explicitly so the one-hot width always equals the output
# layer width, even if a split happens to lack the highest-numbered class.
history = nn_model.fit(
    X_train_scaled, tf.keras.utils.to_categorical(y_train, num_classes=n_classes),
    validation_data=(
        X_val_scaled,
        tf.keras.utils.to_categorical(y_val, num_classes=n_classes),
    ),
    epochs=200,
    batch_size=128,
    callbacks=[early_stop],
    verbose=1
)


TRADITIONAL MODELS

In [None]:
# Candidate estimators. The tree-based models are seeded so that the grid search and
# the reported scores are repeatable, consistent with random_state=42 used elsewhere.
models = {
    'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(class_weight='balanced', random_state=42)
}

# Hyper-parameter grid for each estimator, keyed by the same names as `models`.
param_grids = {
    'Random Forest': {
        'n_estimators': [200, 300],
        'max_depth': [None, 20],
        'min_samples_split': [2, 3]
    },
    'KNN': {
        'n_neighbors': [3, 5],
        'weights': ['distance']
    },
    'Decision Tree': {
        'max_depth': [None],
        'min_samples_split': [2]
    }
}

# NOTE(review): X_train was resampled with SMOTE before this cell, so the CV folds
# contain synthetic samples and the CV accuracy printed below is likely optimistic.
# KNN is also fitted on unscaled features here.
best_models = {}
for name, estimator in models.items():
    grid = GridSearchCV(
        estimator,
        param_grids[name],
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )
    grid.fit(X_train, y_train)
    # best_estimator_ is the winning configuration refit on all of X_train.
    best_models[name] = grid.best_estimator_
    print(f"{name} Best Params: {grid.best_params_}")
    print(f"{name} Best CV Accuracy: {grid.best_score_:.4f}\n")


# EVALUATION & VISUALIZATION

In [None]:
# Training History Plot: per-epoch accuracy on the training and validation data.
fig, ax = plt.subplots(figsize=(12, 6))
for history_key, curve_label in (
    ('accuracy', 'Train Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    ax.plot(history.history[history_key], label=curve_label)
ax.set_title('Neural Network Training History')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend()
plt.show()





In [None]:
# Model Comparison: score every tuned estimator on the train, validation and test splits.
results = {}
for name, model in best_models.items():
    # No model.fit() here: each entry is GridSearchCV's best_estimator_, which has
    # already been refit on all of X_train. Fitting again only costs time and, for
    # an unseeded estimator, would replace the selected model with a different one.
    val_pred = model.predict(X_val)
    test_pred = model.predict(X_test)

    results[name] = {
        # Train accuracy is measured on the SMOTE-resampled training data.
        'Train Accuracy': accuracy_score(y_train, model.predict(X_train)),
        'Validation Accuracy': accuracy_score(y_val, val_pred),
        'Test Accuracy': accuracy_score(y_test, test_pred)
    }

In [None]:
# Neural Network Evaluation: the predicted class is the index of the largest softmax
# output; the NN scores are added to the shared `results` table.
nn_val_pred = nn_model.predict(X_val_scaled).argmax(axis=1)
nn_test_pred = nn_model.predict(X_test_scaled).argmax(axis=1)
nn_train_pred = nn_model.predict(X_train_scaled).argmax(axis=1)

nn_scores = {}
nn_scores['Train Accuracy'] = accuracy_score(y_train, nn_train_pred)
nn_scores['Validation Accuracy'] = accuracy_score(y_val, nn_val_pred)
nn_scores['Test Accuracy'] = accuracy_score(y_test, nn_test_pred)
results['Neural Network'] = nn_scores

Results DataFrame

In [None]:
# Results DataFrame: one row per model, one column per split, best test accuracy first.
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values('Test Accuracy', ascending=False)

# Accuracy Comparison Plot
accuracy_columns = ['Train Accuracy', 'Validation Accuracy', 'Test Accuracy']
# DataFrame.plot opens its own figure unless it is given an Axes, so a preceding
# plt.figure(figsize=...) would be left empty and its size ignored. Create the Axes
# explicitly and pass it in.
fig, ax = plt.subplots(figsize=(14, 6))
results_df[accuracy_columns].plot(kind='bar', ax=ax, rot=45)
ax.set_title('Model Accuracy Comparison')
ax.set_ylabel('Accuracy')
# Zoom in on the top of the range, but never cut off a bar: lower the floor when any
# score falls below 0.95.
lowest_score = results_df[accuracy_columns].min().min()
ax.set_ylim(min(0.95, lowest_score - 0.01), 1.005)
plt.show()



# Confusion Matrix for Best Model

In [None]:
# Confusion Matrix for Best Model
# Select the model on validation accuracy; choosing it by test accuracy would make
# the test set part of model selection and bias the reported test performance.
best_model_name = results_df['Validation Accuracy'].idxmax()
if best_model_name == 'Neural Network':
    y_pred = nn_test_pred
else:
    y_pred = best_models[best_model_name].predict(X_test)

# Fix the label set so the matrix always has one row/column per encoded class and
# lines up with le.classes_, even if a class is absent from y_test and y_pred.
class_ids = np.arange(len(le.classes_))

plt.figure(figsize=(12, 10))
sns.heatmap(confusion_matrix(y_test, y_pred, labels=class_ids),
            annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_)
plt.title(f'Best Model Confusion Matrix ({best_model_name})')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


# Feature Importance

In [None]:
# Feature Importance (For Tree-based Models): horizontal bars, most important on top.
if 'Random Forest' in best_models:
    importances = best_models['Random Forest'].feature_importances_
    # Feature positions ordered from highest to lowest importance.
    indices = importances.argsort()[::-1]
    bar_positions = list(range(len(indices)))

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.set_title("Feature Importances (Random Forest)")
    ax.barh(bar_positions, importances[indices], align='center')
    ax.set_yticks(bar_positions)
    ax.set_yticklabels([features[i] for i in indices])
    # Flip the axis so the first (largest) bar is drawn at the top.
    ax.invert_yaxis()
    plt.show()

# AQI Distribution Plot

In [None]:
# AQI Distribution Plot: number of rows in each AQI class.
class_counts = merged_df['AQI_Class'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
class_counts.plot(kind='bar', color='teal', ax=ax)
ax.set_title('AQI Class Distribution')
ax.set_xlabel('Air Quality Category')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

# FUTURE PREDICTION

Main entry point: an interactive, date-based AQI lookup that prints a health advisory.

In [None]:
# First ensure these imports are present
import pandas as pd
import numpy as np
from datetime import datetime

#  AQI CALCULATION & HEALTH ADVISORY
def calculate_aqi_class(aqi_value):
    """Return the AQI category name for a numeric AQI value.

    The value is compared against inclusive upper bounds in ascending order
    (50, 100, 150, 200, 300); anything that satisfies none of them is
    reported as 'Hazardous'.
    """
    upper_bounds = (
        (50, 'Good'),
        (100, 'Moderate'),
        (150, 'Unhealthy for Sensitive Groups'),
        (200, 'Unhealthy'),
        (300, 'Very Unhealthy'),
    )
    for ceiling, category in upper_bounds:
        if aqi_value <= ceiling:
            return category
    return 'Hazardous'

def health_status(aqi_class, aqi_value):
    """Format a multi-line health advisory for an AQI reading.

    Args:
        aqi_class: Category name as produced by ``calculate_aqi_class``.
        aqi_value: Numeric AQI, shown with no decimal places.

    Returns:
        A string with the AQI line, the immediate action, and a bulleted list
        of recommended steps. An unrecognised category yields the action
        'Unknown' and an empty step list.
    """
    # category -> (immediate action, recommended steps)
    guidance = {
        'Good': (
            "Enjoy outdoor activities",
            ["No restrictions needed"],
        ),
        'Moderate': (
            "Sensitive people should reduce prolonged exertion",
            ["Consider outdoor activity timing", "Monitor air quality changes"],
        ),
        'Unhealthy for Sensitive Groups': (
            "Children & elderly should limit outdoor activity",
            ["Close windows during peak hours", "Use air purifiers"],
        ),
        'Unhealthy': (
            "Everyone should limit outdoor exertion",
            ["Avoid strenuous activities", "Wear N95 masks outdoors"],
        ),
        'Very Unhealthy': (
            "Avoid all outdoor activities",
            ["Seal windows and doors", "Run air purifiers continuously"],
        ),
        'Hazardous': (
            "Remain indoors with filtered air",
            ["Seek alternative shelter if needed", "Follow emergency advisories"],
        ),
    }

    action, steps = guidance.get(aqi_class, ('Unknown', []))
    report_lines = [
        "AQI: {:.0f} ({})".format(aqi_value, aqi_class),
        "Immediate Action: " + action,
        "Recommended Steps:",
        "- " + '\n- '.join(steps),
    ]
    return '\n'.join(report_lines)

# ENHANCED PREDICTION INTERFACE
def get_date_prediction():
    """Prompt for a date/time and print an AQI estimate with a health advisory.

    The estimate is not a model forecast: it looks up a historical record with
    the same month, weekday and hour, and falls back to the overall median
    concentrations from ``merged_df`` when no such record exists. The PM2.5
    value is then converted with ``official_aqi_calculation`` and described
    with ``calculate_aqi_class`` / ``health_status``.

    Historical data is taken from a module-level ``historical_df`` if one has
    been defined; otherwise ``merged_df`` is used.

    Returns:
        None. Results, and any error, are printed.
    """
    print("\n=== Date-Based AQI Forecast ===")
    try:
        # Day-first parsing to match the DD/MM/YYYY prompt.
        date_str = input("Enter prediction date (DD/MM/YYYY HH:MM): ")
        dt = pd.to_datetime(date_str, dayfirst=True, format='mixed')
        if pd.isna(dt):
            # Blank input can parse to NaT, which would only fail later in strftime.
            raise ValueError("No date entered")

        # Validate date range
        if dt < pd.to_datetime('2016-01-01') or dt > pd.to_datetime('2025-12-31'):
            raise ValueError("Date out of model range (2016-2025)")

        # `historical_df` is not created anywhere in this notebook, so referencing it
        # directly raised NameError on every call. Use it when the user has defined it,
        # otherwise fall back to merged_df, which carries the same columns.
        history = globals().get('historical_df')
        if history is None:
            history = merged_df

        mask = (
            (history['Month'] == dt.month) &
            (history['DayOfWeek'] == dt.weekday()) &
            (history['Hour'] == dt.hour)
        )

        if mask.any():
            # First matching record (no averaging across matches).
            concentrations = history[mask].iloc[0][['NowCast Conc.', 'Raw Conc.']]
        else:
            concentrations = pd.Series({
                'NowCast Conc.': merged_df['NowCast Conc.'].median(),
                'Raw Conc.': merged_df['Raw Conc.'].median()
            })

        # Official AQI calculation (using PM2.5 conversion)
        pm25 = concentrations['NowCast Conc.']
        aqi_value = official_aqi_calculation(pm25)
        aqi_class = calculate_aqi_class(aqi_value)

        # Display results
        print(f"\n=== Prediction for {dt.strftime('%a %d %b %Y %H:%M')} ===")
        print(f"Estimated PM2.5: {pm25:.1f} µg/m³")
        print(health_status(aqi_class, aqi_value))
        print("="*50)

    except Exception as e:
        # Best-effort interactive helper: report the problem instead of raising.
        print(f"\n⚠️ Error: {str(e)}")

#  OFFICIAL AQI CALCULATION
def official_aqi_calculation(pm25):
    """Convert a PM2.5 concentration (µg/m³) to an AQI value by linear interpolation.

    The concentration is truncated to one decimal place before the breakpoint
    lookup, so values that fall between two table rows (e.g. 12.05) are
    assigned to the lower row instead of matching nothing.

    Args:
        pm25: PM2.5 concentration; any value convertible to ``float``.

    Returns:
        The interpolated AQI (not rounded), or 500 for concentrations above
        the last breakpoint.

    Raises:
        ValueError: If ``pm25`` is NaN or negative. Such readings previously
            fell through to 500 and were reported as hazardous.
    """
    import math  # local import: the notebook's import cell does not provide math

    concentration = float(pm25)
    if math.isnan(concentration) or concentration < 0:
        raise ValueError(
            f"PM2.5 concentration must be a non-negative number, got {pm25!r}"
        )
    if math.isinf(concentration):
        return 500

    # Truncate to 0.1 µg/m³. The tiny offset keeps values such as 12.1, whose
    # product with 10 may land just below an integer in floating point, on the
    # intended step.
    truncated = math.floor(concentration * 10 + 1e-9) / 10

    # (conc_low, conc_high, aqi_low, aqi_high)
    breakpoints = [
        (0.0, 12.0, 0, 50),
        (12.1, 35.4, 51, 100),
        (35.5, 55.4, 101, 150),
        (55.5, 150.4, 151, 200),
        (150.5, 250.4, 201, 300),
        (250.5, 500.4, 301, 500)
    ]

    for conc_low, conc_high, aqi_low, aqi_high in breakpoints:
        if conc_low <= truncated <= conc_high:
            return ((aqi_high - aqi_low) / (conc_high - conc_low)) * (truncated - conc_low) + aqi_low
    # Above the top breakpoint: cap at the maximum index value.
    return 500

# Example usage: run one interactive lookup. This blocks until a date is typed at
# the prompt, and prints either the advisory or an error message.
get_date_prediction()