In [None]:
import os
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

def parse(x):
    """Convert a 'YYYY MM DD HH' string into a ``datetime``.

    Args:
        x: Text with year, month, day and hour separated by single spaces.

    Returns:
        The ``datetime`` parsed from ``x``.

    Raises:
        ValueError: If ``x`` does not match the expected layout.
    """
    timestamp_layout = '%Y %m %d %H'
    parsed = datetime.strptime(x, timestamp_layout)
    return parsed

# Check if running in Google Colab.
# Only a failed import means "not in Colab"; a bare `except:` would also hide
# unrelated problems such as KeyboardInterrupt, so catch ImportError explicitly.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

# Resolve where the CSV lives for the current environment
file_name = 'polution for students.csv'

if IN_COLAB:
    # Colab: the data is read from the mounted Google Drive
    drive.mount('/content/gdrive')
    dir_path = '/content/gdrive/My Drive'
    data_subdir = 'DeepLearning/student pollution'
else:
    # Local run: the data folder sits next to the working directory
    dir_path = os.getcwd()
    data_subdir = 'student pollution'

data_dir = os.path.join(dir_path, data_subdir)
file_path = os.path.join(data_dir, file_name)

# Load the dataset, failing fast when the CSV is missing.
# Every later cell needs `dataset`; merely printing a message here would let the
# notebook continue and fail further down with an unrelated NameError (or
# silently reuse a stale `dataset` left in the kernel).
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

dataset = pd.read_csv(file_path)

print(dataset.head())

In [None]:
from pandas import to_datetime

# Build a datetime index from the separate year / month / day / hour columns
timestamp_cols = ['year', 'month', 'day', 'hour']
dataset['date'] = to_datetime(dataset[timestamp_cols])

# Index by timestamp, discard the bookkeeping columns, and fill gaps with zeros.
# NOTE(review): fillna(0) runs before 'wd' is encoded, so a missing wind
# direction becomes its own category value rather than staying missing — confirm
# this is intended.
dataset = (
    dataset
    .set_index('date')
    .drop(columns=['No', *timestamp_cols, 'station'])
    .fillna(0)
)

# Replace the categorical 'wd' column with its integer category codes
dataset['wd'] = dataset['wd'].astype('category').cat.codes

# Scale every column into [0, 1].
# NOTE(review): the scaler is fitted on all rows, i.e. before any
# train/validation/test split, so later-held-out rows influence the scaling.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1)).fit(dataset)
scaled = scaler.transform(dataset)

# Show the cleaned (unscaled) frame
print(dataset.head())


In [None]:
from pandas import DataFrame, concat

def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Reframe a time series as a supervised-learning table.

    Each output row holds ``n_in`` lagged copies of every variable followed by
    ``n_out`` current/future copies, built with ``DataFrame.shift``.

    Args:
        data: Sequence of observations: a list of scalars, a list of rows,
            a 1-D/2-D array, or a DataFrame. Rows are time steps.
        n_in: Number of lag steps (t-n_in ... t-1) used as inputs.
        n_out: Number of forecast steps (t ... t+n_out-1) used as outputs.
        dropnan: When true, drop the rows that contain NaN introduced by
            shifting.

    Returns:
        DataFrame whose columns are named ``var<k>(t-i)``, ``var<k>(t)`` and
        ``var<k>(t+i)`` with ``k`` counted from 1.
    """
    df = DataFrame(data)
    # Take the variable count from the constructed frame so that lists of rows
    # and 1-D arrays are handled, not only 2-D arrays.
    n_vars = df.shape[1]
    cols, names = [], []

    # Input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [f'var{j + 1}(t-{i})' for j in range(n_vars)]

    # Forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        step = 't' if i == 0 else f't+{i}'
        names += [f'var{j + 1}({step})' for j in range(n_vars)]

    # Put it all together
    agg = concat(cols, axis=1)
    agg.columns = names

    # Drop rows with NaN values created at the edges by shifting
    if dropnan:
        agg = agg.dropna()

    return agg



In [None]:
# Using series_to_supervised for 2-day ahead prediction
reframed_2_days = series_to_supervised(scaled, n_in=1, n_out=2)

# Drop columns not needed for 2-day prediction
pm25_col_index = 0
n_features = scaled.shape[1]
columns_to_drop = [f'var{j}(t+1)' for j in range(1, n_features + 1) if j != pm25_col_index]

reframed_2_days.drop(columns_to_drop, axis=1, inplace=True)


# Drop columns we don't need for 3-day prediction

reframed_3_days = series_to_supervised(scaled, n_in=2, n_out=3)

columns_to_drop_3_days = [f'var{j}(t+{i})' for i in [1, 2] for j in range(1, n_features + 1) if j != pm25_col_index]

reframed_3_days.drop(columns_to_drop_3_days, axis=1, inplace=True)


print(reframed_2_days.head())
print(reframed_3_days.head())


In [None]:
# Splitting the data
values_2_days = reframed_2_days.values
n_train_hours = int(round(len(values_2_days) * 0.6))
n_valid_hours = int(round(len(values_2_days) * 0.8))

train_2_days = values_2_days[:n_train_hours, :]
valid_2_days = values_2_days[n_train_hours:n_valid_hours, :]
test_2_days = values_2_days[n_valid_hours:, :]

# Split into input and output
n_obs = n_features * 1  # Number of features times number of input time steps

train_X_2_days, train_y_2_days = train_2_days[:, :n_obs], train_2_days[:, -n_features]
valid_X_2_days, valid_y_2_days = valid_2_days[:, :n_obs], valid_2_days[:, -n_features]
test_X_2_days, test_y_2_days = test_2_days[:, :n_obs], test_2_days[:, -n_features]

# Reshape input to be 3D [samples, timesteps, features]
train_X_2_days = train_X_2_days.reshape((train_X_2_days.shape[0], 1, train_X_2_days.shape[1]))
valid_X_2_days = valid_X_2_days.reshape((valid_X_2_days.shape[0], 1, valid_X_2_days.shape[1]))
test_X_2_days = test_X_2_days.reshape((test_X_2_days.shape[0], 1, test_X_2_days.shape[1]))


In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense


# Define the LSTM model
model_2_days = Sequential()
model_2_days.add(LSTM(50, input_shape=(train_X_2_days.shape[1], train_X_2_days.shape[2])))
model_2_days.add(Dense(1))
model_2_days.compile(loss='mae', optimizer='adam', metrics=['mse'])

# Fit the model
history_2_days = model_2_days.fit(train_X_2_days, train_y_2_days, epochs=50, batch_size=90,
                                  validation_data=(valid_X_2_days, valid_y_2_days), verbose=2, shuffle=False)

# Evaluate the model
test_loss_2_days = model_2_days.evaluate(test_X_2_days, test_y_2_days, verbose=0)
print('Test Loss for 2-day prediction:', test_loss_2_days)



In [None]:
# Splitting the data for 3-day prediction
values_3_days = reframed_3_days.values
n_train_hours = int(round(len(values_3_days) * 0.6))
n_valid_hours = int(round(len(values_3_days) * 0.8))

train_3_days = values_3_days[:n_train_hours, :]
valid_3_days = values_3_days[n_train_hours:n_valid_hours, :]
test_3_days = values_3_days[n_valid_hours:, :]

# Split into input and output
n_obs = n_features * 2  # Number of features times number of input time steps for 3-day prediction

train_X_3_days, train_y_3_days = train_3_days[:, :n_obs], train_3_days[:, -n_features]
valid_X_3_days, valid_y_3_days = valid_3_days[:, :n_obs], valid_3_days[:, -n_features]
test_X_3_days, test_y_3_days = test_3_days[:, :n_obs], test_3_days[:, -n_features]

# Reshape input to be 3D [samples, timesteps, features]
train_X_3_days = train_X_3_days.reshape((train_X_3_days.shape[0], 2, int(train_X_3_days.shape[1]/2)))
valid_X_3_days = valid_X_3_days.reshape((valid_X_3_days.shape[0], 2, int(valid_X_3_days.shape[1]/2)))
test_X_3_days = test_X_3_days.reshape((test_X_3_days.shape[0], 2, int(test_X_3_days.shape[1]/2)))


In [None]:
# Build the 3-day network: one 50-unit LSTM feeding a single-output dense layer
model_3_days = Sequential([
    LSTM(50, input_shape=train_X_3_days.shape[1:]),
    Dense(1),
])
model_3_days.compile(optimizer='adam', loss='mae', metrics=['mse'])

# Train in chronological order (no shuffling), validating after each epoch
history_3_days = model_3_days.fit(
    train_X_3_days,
    train_y_3_days,
    validation_data=(valid_X_3_days, valid_y_3_days),
    epochs=50,
    batch_size=90,
    shuffle=False,
    verbose=2,
)

# Score the trained model on the held-out test split
test_loss_3_days = model_3_days.evaluate(test_X_3_days, test_y_3_days, verbose=0)
print('Test Loss for 3-day prediction:', test_loss_3_days)


In [None]:
import matplotlib.pyplot as plt

# Learning curves for the 2-day model: MAE on the left panel, MSE on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
curves_2_days = history_2_days.history

# (metric label, training-history key, validation-history key) for each panel
panels = [('MAE', 'loss', 'val_loss'), ('MSE', 'mse', 'val_mse')]
for ax, (metric, train_key, valid_key) in zip(axes, panels):
    ax.plot(curves_2_days[train_key], label=f'Train {metric}')
    ax.plot(curves_2_days[valid_key], label=f'Validation {metric}')
    ax.set_title(f'2-Day Prediction Model {metric}')
    ax.set_xlabel('Epochs')
    ax.set_ylabel(metric)
    ax.legend()

plt.show()


In [None]:
# Learning curves for the 3-day model: MAE on the left panel, MSE on the right
fig, (mae_ax, mse_ax) = plt.subplots(1, 2, figsize=(12, 5))
curves_3_days = history_3_days.history

# MAE panel (MAE is the compiled loss, hence the 'loss' keys)
mae_ax.plot(curves_3_days['loss'], label='Train MAE')
mae_ax.plot(curves_3_days['val_loss'], label='Validation MAE')
mae_ax.set_title('3-Day Prediction Model MAE')
mae_ax.set_xlabel('Epochs')
mae_ax.set_ylabel('MAE')
mae_ax.legend()

# MSE panel
mse_ax.plot(curves_3_days['mse'], label='Train MSE')
mse_ax.plot(curves_3_days['val_mse'], label='Validation MSE')
mse_ax.set_title('3-Day Prediction Model MSE')
mse_ax.set_xlabel('Epochs')
mse_ax.set_ylabel('MSE')
mse_ax.legend()

plt.show()


In [None]:
from keras.metrics import mean_squared_error
import numpy as np

# Run each trained model on its own test inputs (2-day first, then 3-day)
test_pred_2_days, test_pred_3_days = (
    model.predict(test_inputs)
    for model, test_inputs in (
        (model_2_days, test_X_2_days),
        (model_3_days, test_X_3_days),
    )
)

def calculate_rmse(actual, predicted):
    """Return the root-mean-square error between two sets of values.

    Both inputs are flattened first, so a 1-D target vector can be compared
    with an (n, 1) prediction column without the two being broadcast against
    each other into an (n, n) difference matrix.

    Args:
        actual: Ground-truth values (any array-like holding n numbers).
        predicted: Predicted values (any array-like holding n numbers).

    Returns:
        The RMSE as a Python float.

    Raises:
        ValueError: If the inputs are empty or hold different numbers of values.
    """
    actual = np.asarray(actual, dtype=float).ravel()
    predicted = np.asarray(predicted, dtype=float).ravel()
    if actual.size == 0 or actual.size != predicted.size:
        raise ValueError(
            f"actual and predicted must be non-empty and equally sized, "
            f"got {actual.size} and {predicted.size} values"
        )
    return float(np.sqrt(np.mean((actual - predicted) ** 2)))

# RMSE for the scaled test set.
# The targets are 1-D while the predictions are a single-column 2-D array, so
# flatten the predictions before scoring; otherwise the two shapes can be
# broadcast against each other instead of being compared element by element.
rmse_scaled_2_days = calculate_rmse(test_y_2_days, test_pred_2_days.ravel())
rmse_scaled_3_days = calculate_rmse(test_y_3_days, test_pred_3_days.ravel())

print('Test RMSE (Scaled) - 2 days:', rmse_scaled_2_days)
print('Test RMSE (Scaled) - 3 days:', rmse_scaled_3_days)


def _invert_target_scaling(column_2d):
    """Map scaled values of the scaler's first column back to original units.

    The scaler was fitted on ``n_features`` columns, so the single column is
    padded with zeros to that width, inverse-transformed, and only the first
    column of the result is kept.
    """
    padding = np.zeros((len(column_2d), n_features - 1))
    return scaler.inverse_transform(np.concatenate((column_2d, padding), axis=1))[:, 0]


# inverse transform: the actual values need a column shape first
test_y_2_days_2d = test_y_2_days.reshape(len(test_y_2_days), 1)
test_y_3_days_2d = test_y_3_days.reshape(len(test_y_3_days), 1)

# Invert scaling for actual values
test_y_2_days_inv = _invert_target_scaling(test_y_2_days_2d)
test_y_3_days_inv = _invert_target_scaling(test_y_3_days_2d)

# Invert scaling for predicted values
test_pred_2_days_inv = _invert_target_scaling(test_pred_2_days)
test_pred_3_days_inv = _invert_target_scaling(test_pred_3_days)

# Calculate RMSE for absolute values
rmse_absolute_2_days = calculate_rmse(test_y_2_days_inv, test_pred_2_days_inv)
rmse_absolute_3_days = calculate_rmse(test_y_3_days_inv, test_pred_3_days_inv)

print('Test RMSE (Absolute) - 2 days:', rmse_absolute_2_days)
print('Test RMSE (Absolute) - 3 days:', rmse_absolute_3_days)
