In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras

# Load the data set and get the parameters and labels 

In [2]:
weather_data = pd.read_csv('./data/SortedWeatherData_TorontoLabel.csv')

In [3]:
weather_data.head(5)

Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,visibility,dew_point,feels_like,...,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id_x,weather_main,weather_description,weather_icon,weather_id_y
0,1684281600,2023-05-17 00:00:00 +0000 UTC,-14400,Barrie,44.389356,-79.690332,10.0,,3.73,7.03,...,,,,,78,803,Clouds,broken clouds,04d,800
1,1684281600,2023-05-17 00:00:00 +0000 UTC,-14400,Niagara Falls,43.089558,-79.084944,15.11,10000.0,7.41,14.24,...,,,,,75,803,Clouds,broken clouds,04d,800
2,1684281600,2023-05-17 00:00:00 +0000 UTC,-14400,Kitchener,43.451639,-80.492534,12.52,10000.0,7.41,11.68,...,,,,,0,800,Clear,sky is clear,01d,800
3,1684281600,2023-05-17 00:00:00 +0000 UTC,-14400,Peterborough,44.304706,-78.319961,10.39,10000.0,4.32,9.21,...,,,,,40,802,Clouds,scattered clouds,03d,800
4,1684285200,2023-05-17 01:00:00 +0000 UTC,-14400,Peterborough,44.304706,-78.319961,8.14,10000.0,2.18,4.49,...,,,,,75,803,Clouds,broken clouds,04n,800


# Get the unique characteristics of the climate

In [4]:
# There are different IDs for the climate condition
weather_category = weather_data.get(['weather_id_x', 'weather_main', 'weather_description']).drop_duplicates(subset=['weather_id_x', 'weather_main', 'weather_description'])
#Later we can use the IDs to confirm if the weather was correctly predicted. 
weather_category.head(30)

Unnamed: 0,weather_id_x,weather_main,weather_description
0,803,Clouds,broken clouds
2,800,Clear,sky is clear
3,802,Clouds,scattered clouds
6,804,Clouds,overcast clouds
10,801,Clouds,few clouds
264,500,Rain,light rain
289,501,Rain,moderate rain
293,721,Haze,haze
303,701,Mist,mist
331,502,Rain,heavy intensity rain


In [5]:
weather_category.to_csv('./data/unique_weather_category.csv', index=False ,sep=',', header=True, encoding='utf-8')

# Preprocessing data and removing unnecessary features

In [6]:
X = weather_data.drop(['dt_iso','weather_main', 'weather_id_y', 'weather_description','weather_icon', 'timezone', 'sea_level', 'grnd_level'], axis=1)
y = weather_data["weather_id_y"]

In [7]:
X.head(10)

Unnamed: 0,dt,city_name,lat,lon,temp,visibility,dew_point,feels_like,temp_min,temp_max,...,humidity,wind_speed,wind_deg,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id_x
0,1684281600,Barrie,44.389356,-79.690332,10.0,,3.73,7.03,8.06,10.81,...,65,6.69,10,9.77,,,,,78,803
1,1684281600,Niagara Falls,43.089558,-79.084944,15.11,10000.0,7.41,14.24,13.33,16.12,...,60,3.6,350,,,,,,75,803
2,1684281600,Kitchener,43.451639,-80.492534,12.52,10000.0,7.41,11.68,11.87,13.12,...,71,5.66,10,8.75,,,,,0,800
3,1684281600,Peterborough,44.304706,-78.319961,10.39,10000.0,4.32,9.21,9.74,11.01,...,66,6.17,10,10.8,,,,,40,802
4,1684285200,Peterborough,44.304706,-78.319961,8.14,10000.0,2.18,4.49,7.52,9.22,...,66,7.2,10,9.77,,,,,75,803
5,1684285200,Kitchener,43.451639,-80.492534,9.28,10000.0,5.27,6.27,8.21,11.34,...,76,6.17,350,,,,,,0,800
6,1684285200,Barrie,44.389356,-79.690332,8.01,,2.05,4.48,7.1,8.88,...,66,6.69,360,,,,,,95,804
7,1684285200,Niagara Falls,43.089558,-79.084944,12.21,10000.0,7.11,11.34,11.28,12.68,...,71,6.17,340,9.77,,,,,40,802
8,1684288800,Kitchener,43.451639,-80.492534,7.65,10000.0,3.32,4.81,6.87,9.12,...,74,4.63,350,7.72,,,,,0,800
9,1684288800,Barrie,44.389356,-79.690332,6.72,,0.82,3.21,5.84,7.99,...,66,5.66,360,8.23,,,,,92,804


## Convert City_name into city_id

In [8]:
city_to_id = {city: idx for idx, city in enumerate(X["city_name"].unique())}
X["city_id"] = X["city_name"].map(city_to_id)
city_categories = X.get(["city_id", "city_name"]).drop_duplicates(subset=["city_id", "city_name"])

In [9]:
city_categories.to_csv('./data/cities_category.csv', index=False ,sep=',', header=True, encoding='utf-8')

# Drop city name

In [10]:
X = X.drop(["city_name"], axis=1)
X.head(10)

Unnamed: 0,dt,lat,lon,temp,visibility,dew_point,feels_like,temp_min,temp_max,pressure,...,wind_speed,wind_deg,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id_x,city_id
0,1684281600,44.389356,-79.690332,10.0,,3.73,7.03,8.06,10.81,1008,...,6.69,10,9.77,,,,,78,803,0
1,1684281600,43.089558,-79.084944,15.11,10000.0,7.41,14.24,13.33,16.12,1004,...,3.6,350,,,,,,75,803,1
2,1684281600,43.451639,-80.492534,12.52,10000.0,7.41,11.68,11.87,13.12,1006,...,5.66,10,8.75,,,,,0,800,2
3,1684281600,44.304706,-78.319961,10.39,10000.0,4.32,9.21,9.74,11.01,1006,...,6.17,10,10.8,,,,,40,802,3
4,1684285200,44.304706,-78.319961,8.14,10000.0,2.18,4.49,7.52,9.22,1007,...,7.2,10,9.77,,,,,75,803,3
5,1684285200,43.451639,-80.492534,9.28,10000.0,5.27,6.27,8.21,11.34,1007,...,6.17,350,,,,,,0,800,2
6,1684285200,44.389356,-79.690332,8.01,,2.05,4.48,7.1,8.88,1009,...,6.69,360,,,,,,95,804,0
7,1684285200,43.089558,-79.084944,12.21,10000.0,7.11,11.34,11.28,12.68,1005,...,6.17,340,9.77,,,,,40,802,1
8,1684288800,43.451639,-80.492534,7.65,10000.0,3.32,4.81,6.87,9.12,1008,...,4.63,350,7.72,,,,,0,800,2
9,1684288800,44.389356,-79.690332,6.72,,0.82,3.21,5.84,7.99,1010,...,5.66,360,8.23,,,,,92,804,0


In [11]:
features = ["dt", "lat", "lon", "temp", "visibility","dew_point","feels_like","temp_min","temp_max","pressure","humidity","wind_speed","wind_deg","wind_gust","rain_1h","rain_3h","snow_1h","snow_3h","clouds_all","weather_id_x", "city_id"]
X.fillna(X.mean(), inplace=True)

In [12]:
numerical_transformer = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[('num', numerical_transformer, features)])

In [13]:
X_processed = preprocessor.fit_transform(X)

In [14]:
y_tensor = tf.convert_to_tensor(y)

# Split the data into training, test 

In [15]:
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full)

In [16]:
leaky_relu = keras.layers.LeakyReLU(negative_slope=0.3)
#normalizer = layers.Normalization(input_shape = [1,], axis=None)
#normalizer.adapt(X_train)

model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape=(21,)))
model.add(tf.keras.layers.Dense(30, activation='relu'))
model.add(tf.keras.layers.Dense(30, activation=leaky_relu, kernel_initializer="he_normal"))
model.add(tf.keras.layers.Dense(1))


In [17]:
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [18]:
model.summary()

In [19]:
checkpoint_cb = keras.callbacks.ModelCheckpoint("TorontoWeatherModel.keras",
 save_best_only=True)
early_stopping_cb = keras.callbacks.EarlyStopping(patience=10,
 restore_best_weights=True)

In [20]:
history = model.fit(X_train, y_train, epochs=100,
 validation_data=(X_valid, y_valid),
 callbacks=[checkpoint_cb, early_stopping_cb])

Epoch 1/100
[1m665/665[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 572741405114368.0000 - mae: 8836067.0000 - val_loss: 12921.9043 - val_mae: 87.8449
Epoch 2/100
[1m665/665[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 13235.8574 - mae: 90.4082 - val_loss: 12982.2334 - val_mae: 83.1342
Epoch 3/100
[1m665/665[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 947us/step - loss: 13450.3867 - mae: 90.0052 - val_loss: 13149.4082 - val_mae: 93.5932
Epoch 4/100
[1m665/665[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 997us/step - loss: 13506.0654 - mae: 89.9540 - val_loss: 12942.1504 - val_mae: 85.8868
Epoch 5/100
[1m665/665[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 14109.7832 - mae: 91.6018 - val_loss: 13515.2080 - val_mae: 100.3424
Epoch 6/100
[1m665/665[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 15768.5410 - mae: 97.6272 - val_loss: 37879.5508 - val_mae: 180.8911
E