# Question 2

In [20]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [21]:
# Read the monthly natural gas consumption table from the CSV file
# (resolved relative to the working directory) and display it.
df = pd.read_csv(
    filepath_or_buffer=r"canada_natural_gas_consumption.csv",
)
df

Unnamed: 0,Month,Consumptions
0,16-Nov,15829899.5
1,16-Dec,17682496.4
2,17-Jan,17248195.1
3,17-Feb,15385845.5
4,17-Mar,17165011.8
...,...,...
81,23-Aug,18920804.9
82,23-Sep,17898675.1
83,23-Oct,19115237.6
84,23-Nov,19136435.2


## 1. Data Gathering
This work uses Canada's national monthly natural gas consumption from November 2016 to December 2023. The data were downloaded from the Statistics Canada website:
*Statistics Canada. Table 25-10-0055-01 Supply and disposition of natural gas, monthly (data in thousands) (x 1,000)*

## 2. Data preprocessing
## Task 1: Remove invalid data points

In [22]:
# Remove data points with an invalid value (missing, non-numeric, or equal to zero).
# pandas reads blank CSV cells as NaN, and NaN compares unequal to both 0 and "",
# so a plain `!= 0` / `!= ""` filter lets missing values through. Coerce to
# numeric and treat anything missing like zero before filtering.
# `.copy()` makes the result an independent frame so that later column
# assignments do not raise SettingWithCopyWarning.
df = df[
    pd.to_numeric(df["Consumptions"], errors="coerce")  # blanks / non-numeric -> NaN
    .fillna(0)                                          # missing handled like zero
    .ne(0)                                              # keep real, non-zero values only
].copy()
print(df.shape)

(86, 2)


## Task 2: Data Normalization

In [23]:
# Rescale the consumption column to the interval [-1, 1].
# NOTE(review): the scaler is fitted on the whole series before the
# train/validation/test split of Task 4 - confirm this is intended.
scaler = MinMaxScaler(feature_range=(-1, 1))
df['Consumptions'] = scaler.fit_transform(
    df['Consumptions'].to_numpy().reshape(-1, 1)  # scaler is fed a 2-D column vector
)

In [24]:
# Sanity check: print the maximum and then the minimum of the scaled column,
# one value per line.
print(np.max(df['Consumptions']), np.min(df['Consumptions']), sep="\n")

1.0
-1.0


## Task 3: Convert Year-Month to numeric value and build the sequence

In [25]:
# Convert year-month string to numerical values
from datetime import datetime


def year_month_to_numeric(year_month, date_format='%y-%b'):
    """Convert a year-month label into a single increasing integer.

    The label is parsed with ``datetime.strptime`` and mapped to
    ``year * 12 + month``, so consecutive months differ by exactly 1
    (for example ``'16-Nov'`` -> ``2016 * 12 + 11 = 24203``).

    Parameters
    ----------
    year_month : str
        Label to convert, e.g. ``'16-Nov'``. Leading and trailing
        whitespace is ignored.
    date_format : str, optional
        ``strptime`` format describing the label. Defaults to ``'%y-%b'``
        (two-digit year, abbreviated month name), which keeps existing
        single-argument calls working unchanged.

    Returns
    -------
    int
        ``year * 12 + month`` of the parsed date.

    Raises
    ------
    ValueError
        If the label does not match ``date_format``.
    TypeError
        If ``year_month`` is not a string.

    Notes
    -----
    ``%b`` is matched against the current locale's month abbreviations,
    and ``%y`` maps 00-68 to 2000-2068 and 69-99 to 1969-1999.
    """
    # Tolerate padding that often survives CSV parsing; non-strings are passed
    # through untouched so strptime still raises its usual TypeError.
    if isinstance(year_month, str):
        year_month = year_month.strip()
    parsed = datetime.strptime(year_month, date_format)
    return parsed.year * 12 + parsed.month


# Replace every year-month label in 'Month' with its integer month index,
# then display the updated frame.
df['Month'] = df['Month'].map(year_month_to_numeric)

df

Unnamed: 0,Month,Consumptions
0,24203,-0.723326
1,24204,0.015767
2,24205,-0.157498
3,24206,-0.900481
4,24207,-0.190683
...,...,...
81,24284,0.509789
82,24285,0.102011
83,24286,0.587358
84,24287,0.595815


In [26]:
# Length of each sliding window cut from the series. Further down, the last
# step of every window is used as the target and the preceding
# seq_length - 1 steps as the model input.
seq_length = 5

# Two-column (Month, Consumptions) NumPy array that the windows are cut from
dataset = df.loc[:, ['Month', 'Consumptions']].to_numpy()


# Create sequences
def create_sequences(data, seq_length):
    """Cut ``data`` into every overlapping window of ``seq_length`` rows.

    Window ``k`` is ``data[k:k + seq_length]``. All
    ``len(data) - seq_length + 1`` full windows are returned, including the
    one that ends on the final row. Iterating only up to
    ``len(data) - seq_length`` would drop that last window, so the most
    recent observation could never appear as the final step of a window.

    Parameters
    ----------
    data : array-like
        Sequence of rows ordered in time; converted with ``np.asarray``.
    seq_length : int
        Number of consecutive rows per window; must be at least 1.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_windows, seq_length, *data.shape[1:])``.
        When ``data`` has fewer than ``seq_length`` rows, ``n_windows`` is 0
        but the trailing dimensions are kept, so slicing such as
        ``result[:, :-1]`` still works.

    Raises
    ------
    ValueError
        If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError("seq_length must be a positive integer")

    data = np.asarray(data)
    n_windows = len(data) - seq_length + 1  # +1 keeps the window ending on the last row
    if n_windows <= 0:
        # Not enough rows for a single window: return an empty, correctly shaped array.
        return np.empty((0, seq_length) + data.shape[1:], dtype=data.dtype)

    return np.stack([data[start:start + seq_length] for start in range(n_windows)])


# Build the windows, then split each one into model input and target:
#   X -> every time step except the last, all columns
#   y -> last column ('Consumptions') of the final time step
sequences = create_sequences(dataset, seq_length)
X, y = sequences[:, :-1, :], sequences[:, -1, -1]

## Task 4: Train, test, validation dataset splitting
We split the dataset into training, validation, and test sets in the proportions (0.8, 0.1, 0.1), corresponding to (X_train, X_val, X_test).