In [None]:
# Mount Google Drive at /content/drive/ so files stored there are reachable
# from this Colab session.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Work from the Drive root so the relative paths used below
# (e.g. 'city_day.csv') resolve against it.
import os
os.chdir('/content/drive/My Drive/')

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the dataset from the current working directory and print a
# truncated view of it.
df = pd.read_csv('city_day.csv')
print(df)

                City        Date  PM2.5   PM10     NO    NO2    NOx    NH3  \
0          Ahmedabad  2015-01-01    NaN    NaN   0.92  18.22  17.15    NaN   
1          Ahmedabad  2015-01-02    NaN    NaN   0.97  15.69  16.46    NaN   
2          Ahmedabad  2015-01-03    NaN    NaN  17.40  19.30  29.70    NaN   
3          Ahmedabad  2015-01-04    NaN    NaN   1.70  18.48  17.97    NaN   
4          Ahmedabad  2015-01-05    NaN    NaN  22.10  21.42  37.76    NaN   
...              ...         ...    ...    ...    ...    ...    ...    ...   
29526  Visakhapatnam  2020-06-27  15.02  50.94   7.68  25.06  19.54  12.47   
29527  Visakhapatnam  2020-06-28  24.38  74.09   3.42  26.06  16.53  11.99   
29528  Visakhapatnam  2020-06-29  22.91  65.73   3.45  29.53  18.33  10.71   
29529  Visakhapatnam  2020-06-30  16.64  49.97   4.05  29.26  18.80  10.03   
29530  Visakhapatnam  2020-07-01  15.00  66.00   0.40  26.85  14.05   5.20   

          CO    SO2      O3  Benzene  Toluene  Xylene   AQI    

In [None]:
# Summarize each column's dtype and non-null count to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29531 entries, 0 to 29530
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        29531 non-null  object 
 1   Date        29531 non-null  object 
 2   PM2.5       24933 non-null  float64
 3   PM10        18391 non-null  float64
 4   NO          25949 non-null  float64
 5   NO2         25946 non-null  float64
 6   NOx         25346 non-null  float64
 7   NH3         19203 non-null  float64
 8   CO          27472 non-null  float64
 9   SO2         25677 non-null  float64
 10  O3          25509 non-null  float64
 11  Benzene     23908 non-null  float64
 12  Toluene     21490 non-null  float64
 13  Xylene      11422 non-null  float64
 14  AQI         24850 non-null  float64
 15  AQI_Bucket  24850 non-null  object 
dtypes: float64(13), object(3)
memory usage: 3.6+ MB


In [None]:
# Count missing values in each column
print("Missing Values Count:")
print(df.isnull().sum())

Missing Values Count:
City              0
Date              0
PM2.5          4598
PM10          11140
NO             3582
NO2            3585
NOx            4185
NH3           10328
CO             2059
SO2            3854
O3             4022
Benzene        5623
Toluene        8041
Xylene        18109
AQI            4681
AQI_Bucket     4681
dtype: int64


In [None]:
# Drop rows where the target variable 'PM2.5' is missing (df is modified
# in place), then re-check the remaining missing counts per column.
df.dropna(subset=['PM2.5'], inplace=True)
print(df.isnull().sum())

City              0
Date              0
PM2.5             0
PM10           7301
NO              351
NO2             377
NOx            1470
NH3            6644
CO              370
SO2             505
O3              752
Benzene        3151
Toluene        5555
Xylene        15273
AQI             761
AQI_Bucket      761
dtype: int64


In [None]:
# Build the list of columns to impute: every numeric column except the
# target 'PM2.5' (object-typed columns are excluded by select_dtypes).
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
numerical_cols.remove('PM2.5')
print(numerical_cols)

['PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene', 'AQI']


In [None]:
# Impute each remaining numerical column with its own median.
# The result is assigned back to df[col] instead of calling
# fillna(inplace=True) on the selected column: the chained in-place form
# raises a FutureWarning and will no longer update df in pandas 3.0.
# NOTE(review): the medians are computed on the full frame, before the
# train/test split performed later in this notebook, so test rows influence
# the imputed values. Consider fitting the imputation on the training split.
for col in numerical_cols:
    median_val = df[col].median()
    df[col] = df[col].fillna(median_val)
print(df.isnull().sum())

City            0
Date            0
PM2.5           0
PM10            0
NO              0
NO2             0
NOx             0
NH3             0
CO              0
SO2             0
O3              0
Benzene         0
Toluene         0
Xylene          0
AQI             0
AQI_Bucket    761
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)


In [None]:
# Convert 'Date' column from object (string) to datetime so the .dt
# accessor can be used on it
df['Date'] = pd.to_datetime(df['Date'])

# Confirm the dtype change
print("\nData types after converting 'Date' column:")
df.info()


Data types after converting 'Date' column:
<class 'pandas.core.frame.DataFrame'>
Index: 24933 entries, 27 to 29530
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   City        24933 non-null  object        
 1   Date        24933 non-null  datetime64[ns]
 2   PM2.5       24933 non-null  float64       
 3   PM10        24933 non-null  float64       
 4   NO          24933 non-null  float64       
 5   NO2         24933 non-null  float64       
 6   NOx         24933 non-null  float64       
 7   NH3         24933 non-null  float64       
 8   CO          24933 non-null  float64       
 9   SO2         24933 non-null  float64       
 10  O3          24933 non-null  float64       
 11  Benzene     24933 non-null  float64       
 12  Toluene     24933 non-null  float64       
 13  Xylene      24933 non-null  float64       
 14  AQI         24933 non-null  float64       
 15  AQI_Bucket  24172 non-null  ob

In [None]:
# Derive calendar components from 'Date' as separate numeric feature columns
# (appended in the order Year, Month, Day).
df = df.assign(
    Year=df['Date'].dt.year,
    Month=df['Date'].dt.month,
    Day=df['Date'].dt.day,
)

In [None]:
# Preview the 'City' column (still raw strings at this point).
df['City']

Unnamed: 0,City
27,Ahmedabad
28,Ahmedabad
29,Ahmedabad
30,Ahmedabad
31,Ahmedabad
...,...
29526,Visakhapatnam
29527,Visakhapatnam
29528,Visakhapatnam
29529,Visakhapatnam


In [None]:
# One-Hot Encode the 'City' column.
# drop_first=True omits the indicator for the first city category, so that
# city is represented by all remaining City_* columns being False.
df = pd.get_dummies(df, columns=['City'], drop_first=True)

In [None]:

# Drop columns that are not needed or cause data leakage:
# 'Date' has already been expanded into Year/Month/Day above; 'AQI' and
# 'AQI_Bucket' are removed as leakage risks for the PM2.5 target.
# NOTE: this mutates df in place, so re-running the cell raises a KeyError
# because the columns are already gone.
df.drop(['Date', 'AQI', 'AQI_Bucket'], axis=1, inplace=True)

print("\nDataFrame after Feature Engineering:")
print(df.head())


DataFrame after Feature Engineering:
     PM2.5    PM10     NO    NO2    NOx    NH3     CO    SO2      O3  Benzene  \
27   73.24  95.595   5.72  21.11  25.84  16.53   5.72  36.52   62.42     0.03   
28   83.13  95.595   6.93  28.71  33.72  16.53   6.93  49.52   59.76     0.02   
29   79.84  95.595  13.85  28.68  41.08  16.53  13.85  48.49   97.07     0.04   
30   94.52  95.595  24.39  32.66  52.61  16.53  24.39  67.39  111.33     0.24   
31  135.99  95.595  43.48  42.08  84.57  16.53  43.48  75.23  102.70     0.40   

    ...  City_Jorapokhar  City_Kochi  City_Kolkata  City_Lucknow  City_Mumbai  \
27  ...            False       False         False         False        False   
28  ...            False       False         False         False        False   
29  ...            False       False         False         False        False   
30  ...            False       False         False         False        False   
31  ...            False       False         False         False      

In [None]:
# Separate the regression target (y) from the predictor columns (X):
# y is the 'PM2.5' series, X is every other column of df.
y = df['PM2.5']
X = df.drop(columns='PM2.5')

In [None]:
# Split the data into training and testing sets (80% train, 20% test).
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is applied to every column of X,
# including the one-hot City_* indicators and the Year/Month/Day columns.
scaler = StandardScaler()

# Fit the scaler on the training data only and transform it, so no
# statistics from the test set enter the scaling parameters
X_train_scaled = scaler.fit_transform(X_train)

In [None]:
# Transform the test data using the SAME scaler that was fitted on the
# training data (transform only; it is not re-fitted here)
X_test_scaled = scaler.transform(X_test)

# Sanity check: both arrays should have the same number of feature columns
print(f"\nTraining data shape: {X_train_scaled.shape}")
print(f"Testing data shape: {X_test_scaled.shape}")


Training data shape: (19946, 39)
Testing data shape: (4987, 39)


In [None]:
# Inspect the scaled training matrix (NumPy prints a truncated view).
print(X_train_scaled)

[[-0.72498815 -0.68265466 -0.73996923 ... -0.17776592 -0.2129156
  -0.22731347]
 [-0.20480822  2.65488744  3.67108744 ... -0.17776592 -0.2129156
  -0.22731347]
 [ 0.69935018  1.00378218  1.58090758 ... -0.17776592 -0.2129156
  -0.22731347]
 ...
 [-0.20480822  2.61007665  2.32249194 ... -0.17776592 -0.2129156
  -0.22731347]
 [-1.00478539 -0.24531859 -0.35179297 ... -0.17776592 -0.2129156
  -0.22731347]
 [-1.05324485 -0.4986719  -0.75224035 ... -0.17776592  4.69669666
  -0.22731347]]


In [None]:
import json
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Baseline model: ordinary least-squares linear regression on the
# standardized features. It has essentially no hyperparameters to tune, so
# it serves as a reference point for the neural network trained later.
# To reduce the error further, consider Ridge, Lasso, Elastic Net, tree-based
# regressors (Decision Tree, Random Forest, Gradient Boosting), additional
# feature engineering, or hyperparameter tuning.
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Make predictions on the held-out test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model using regression metrics.
mse = mean_squared_error(y_test, y_pred)
# RMSE is derived from MSE directly: the `squared=False` argument of
# mean_squared_error is deprecated since scikit-learn 1.4 and removed in 1.6.
# float() keeps the report made of plain Python floats.
rmse = float(np.sqrt(mse))
r2 = r2_score(y_test, y_pred)

# Create an evaluation report dictionary for regression
evaluation_report = {
    "mean_squared_error": mse,
    "rmse": rmse,
    "r2_score": r2,
}

# Export the evaluation report to a JSON file in the current working directory
with open('linear_regression_evaluation_report.json', 'w') as f:
    json.dump(evaluation_report, f, indent=4)

print("Evaluation report exported to linear_regression_evaluation_report.json")

Evaluation report exported to linear_regression_evaluation_report.json


In [None]:
# Show the linear-regression metrics (MSE, RMSE, R²) collected above.
print(evaluation_report)

{'mean_squared_error': 2108.3971242074217, 'rmse': np.float64(45.91728567987683), 'r2_score': 0.5078555964030003}


In [50]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import keras_tuner as kt

# We'll still use these for the final evaluation
from sklearn.metrics import mean_squared_error, r2_score

In [49]:
!pip install keras_tuner

Collecting keras_tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras_tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras_tuner
Successfully installed keras_tuner-1.4.7 kt-legacy-1.0.5


In [51]:
def build_model(hp):
    """Build and compile a two-hidden-layer regression network for KerasTuner.

    The search space declared here consists of:
      * 'units'         - width of the first hidden layer (32..128, step 16)
      * 'units_2'       - width of the second hidden layer (16..64, step 16)
      * 'learning_rate' - Adam learning rate (1e-2, 1e-3 or 1e-4)

    Args:
        hp: The hyperparameter object supplied by KerasTuner; used to declare
            and sample the searchable values above.

    Returns:
        A compiled ``keras.Sequential`` model with a single-unit output layer,
        using mean squared error as both loss and reported metric.

    Note:
        The number of input features is read from the notebook-level
        ``X_train_scaled`` array, which must exist before this is called.
    """
    model = keras.Sequential()

    # Declare the input explicitly. Passing `input_shape` to the first Dense
    # layer is discouraged by Keras for Sequential models and emits a
    # UserWarning; an Input object is the recommended form.
    model.add(keras.Input(shape=(X_train_scaled.shape[1],)))

    # 1. Tune the number of units (neurons) in the first hidden layer.
    hp_units = hp.Int('units', min_value=32, max_value=128, step=16)
    model.add(layers.Dense(units=hp_units, activation='relu'))

    # Add a second, independently tuned hidden layer for a deeper model.
    hp_units_2 = hp.Int('units_2', min_value=16, max_value=64, step=16)
    model.add(layers.Dense(units=hp_units_2, activation='relu'))

    # Output layer: 1 neuron with no activation, because we are predicting a
    # single continuous value (PM2.5).
    model.add(layers.Dense(1))

    # 2. Tune the learning rate for the Adam optimizer.
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    # Compile the model with MSE loss, a common choice for regression.
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  loss='mean_squared_error',
                  metrics=['mean_squared_error'])

    return model

In [52]:
# Instantiate the tuner: random search over the space declared in build_model.
# NOTE(review): no seed is passed here, so the sampled trials presumably
# differ between runs — confirm and consider fixing a seed.
# NOTE(review): results are stored under my_dir/pm25_tuning; KerasTuner may
# reuse them on a later run unless overwrite is requested — verify.
tuner = kt.RandomSearch(
    build_model,
    objective='val_loss', # The metric to minimize
    max_trials=10,        # The total number of hyperparameter combinations to test
    executions_per_trial=2, # The number of models to train and evaluate for each trial
    directory='my_dir',   # A directory to store the results
    project_name='pm25_tuning'
)

# A callback to stop training early if the validation loss has not improved
# for 5 consecutive epochs
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# Start the search. Only the training split is used; the test set stays
# untouched until the final evaluation.
print("Starting hyperparameter search...")
tuner.search(X_train_scaled, y_train,
             epochs=50,
             validation_split=0.2, # Use 20% of training data for validation
             callbacks=[stop_early])

# Get the optimal hyperparameters (the single best configuration found)
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
Search complete! 🎉
The optimal number of units in the first layer is {best_hps.get('units')}.
The optimal number of units in the second layer is {best_hps.get('units_2')}.
The optimal learning rate for the optimizer is {best_hps.get('learning_rate')}.
""")

Trial 10 Complete [00h 01m 45s]
val_loss: 1312.8131713867188

Best val_loss So Far: 1237.4491577148438
Total elapsed time: 00h 15m 30s

Search complete! 🎉
The optimal number of units in the first layer is 80.
The optimal number of units in the second layer is 32.
The optimal learning rate for the optimizer is 0.001.



In [53]:
# Build a fresh (untrained) model with the optimal hyperparameters
final_model = tuner.hypermodel.build(best_hps)

# Train the final model. The `stop_early` callback defined alongside the
# tuner is reused, so training can end before the 100-epoch cap.
print("\nTraining the final model with the best hyperparameters...")
history = final_model.fit(X_train_scaled, y_train,
                          epochs=100, # Train for more epochs on the final model
                          validation_split=0.2,
                          callbacks=[stop_early],
                          verbose=0) # verbose=0 silences the output for each epoch

print("✅ Final model trained successfully!")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Training the final model with the best hyperparameters...
✅ Final model trained successfully!


In [54]:
# Make predictions on the held-out test set
y_pred_nn = final_model.predict(X_test_scaled)

# Calculate performance metrics with the same scikit-learn functions used
# for the linear-regression baseline, so the numbers are comparable
mse_nn = mean_squared_error(y_test, y_pred_nn)
r2_nn = r2_score(y_test, y_pred_nn)

print(f"\n--- Final Model Evaluation on Test Data ---")
print(f"Mean Squared Error (MSE): {mse_nn:.2f}")
print(f"R-squared (R²): {r2_nn:.2f}")

# Compare with the previous Linear Regression model.
# `r2` is the notebook-level variable set in the linear-regression cell, so
# that cell must have been run first.
print(f"\n--- Comparison ---")
print(f"Simple Linear Regression R²: {r2:.2f}") # 'r2' is from your previous code
print(f"Fine-Tuned Neural Network R²: {r2_nn:.2f}")

[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 941us/step

--- Final Model Evaluation on Test Data ---
Mean Squared Error (MSE): 1064.79
R-squared (R²): 0.75

--- Comparison ---
Simple Linear Regression R²: 0.51
Fine-Tuned Neural Network R²: 0.75
