# Importing libraries

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import operator
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from numpy import unique
from numpy import reshape
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Conv2D, Dense, Dropout, BatchNormalization, Flatten, MaxPooling1D, TimeDistributed
from tensorflow.keras.utils import to_categorical

# Importing data

In [7]:
path = r'C:\Users\thoma\OneDrive\Dokumente\data analytics\ML_Ach\ClimateWins'

In [8]:
path

'C:\\Users\\thoma\\OneDrive\\Dokumente\\data analytics\\ML_Ach\\ClimateWins'

In [9]:
Weather = pd.read_csv(os.path.join(path, 'Data', 'Prepared Data', 'Weather_unsc_clean.csv'))
Pleasant_weather = pd.read_pickle(os.path.join(path, 'Data', 'Prepared Data', 'Pleasant_weather.pkl'))

In [10]:
Weather.head()

Unnamed: 0.1,Unnamed: 0,BASEL_cloud_cover,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,BASEL_temp_max,...,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max,KASSEL_cloud_cover,MUNCHENB_pressure,STOCKHOLM_humidity
0,0,7,0.85,1.018,0.32,0.09,0.7,6.5,0.8,10.9,...,1.0003,0.45,0.34,4.7,8.5,6.0,10.9,8,1.0304,0.98
1,1,6,0.84,1.018,0.36,1.05,1.1,6.1,3.3,10.1,...,1.0007,0.25,0.84,0.7,8.9,5.6,12.1,6,1.0292,0.62
2,2,8,0.9,1.018,0.18,0.3,0.0,8.5,5.1,9.9,...,1.0096,0.17,0.08,0.1,10.5,8.1,12.9,8,1.032,0.69
3,3,3,0.92,1.018,0.58,0.0,4.1,6.3,3.8,10.6,...,1.0184,0.13,0.98,0.0,7.4,7.3,10.6,6,1.0443,0.98
4,4,6,0.95,1.018,0.65,0.14,5.4,3.0,-0.7,6.0,...,1.0328,0.46,0.0,5.7,5.7,3.0,8.4,7,1.043,0.96


In [11]:
Weather = Weather.drop(columns=['Unnamed: 0'], errors='ignore')

In [12]:
Weather.head()

Unnamed: 0,BASEL_cloud_cover,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,BASEL_temp_max,BELGRADE_cloud_cover,...,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max,KASSEL_cloud_cover,MUNCHENB_pressure,STOCKHOLM_humidity
0,7,0.85,1.018,0.32,0.09,0.7,6.5,0.8,10.9,1,...,1.0003,0.45,0.34,4.7,8.5,6.0,10.9,8,1.0304,0.98
1,6,0.84,1.018,0.36,1.05,1.1,6.1,3.3,10.1,6,...,1.0007,0.25,0.84,0.7,8.9,5.6,12.1,6,1.0292,0.62
2,8,0.9,1.018,0.18,0.3,0.0,8.5,5.1,9.9,6,...,1.0096,0.17,0.08,0.1,10.5,8.1,12.9,8,1.032,0.69
3,3,0.92,1.018,0.58,0.0,4.1,6.3,3.8,10.6,8,...,1.0184,0.13,0.98,0.0,7.4,7.3,10.6,6,1.0443,0.98
4,6,0.95,1.018,0.65,0.14,5.4,3.0,-0.7,6.0,8,...,1.0328,0.46,0.0,5.7,5.7,3.0,8.4,7,1.043,0.96


In [13]:
# Creating an index
station_cols = Weather.columns

In [14]:
# Creating a DataFrame for stations & obs types
col_info = (pd.Series(station_cols).str.split('_', n=1, expand=True).rename(columns={0: 'station', 1: 'obs'}))

In [15]:
# Defining the correct obs order
obs_order = sorted(col_info['obs'].unique())
station_order = sorted(col_info['station'].unique())

In [16]:
obs_order

['cloud_cover',
 'global_radiation',
 'humidity',
 'precipitation',
 'pressure',
 'sunshine',
 'temp_max',
 'temp_mean',
 'temp_min']

In [17]:
# Rebuilding the column order
ordered_cols = [f"{station}_{obs}"
    for station in station_order
    for obs in obs_order
        ]
Weather_final = Weather[ordered_cols]

In [18]:
# Creating a NumPy array for X
X = Weather_final.to_numpy()
X.shape

(22950, 135)

In [19]:
# Reshaping
X = X.reshape(-1, 15, 9)
X.shape

(22950, 15, 9)

# -> y (labels) doesn't need to be transformed or reshaped

In [20]:
# Creating a NumPy array for y
y = Pleasant_weather.to_numpy()
y.shape

(22950, 15)

In [21]:
# Creating a 3D object
X = X.reshape(-1, 15, 9)

In [22]:
# Splitting the data, while preserving the temporal aspect/dependancy by 'not shuffling'
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [23]:
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(18360, 15, 9) (4590, 15, 9)
(18360, 15) (4590, 15)


In [24]:
len(X_train[0])

15

In [25]:
len(X_train[0][0])

9

# Convolutional Neural Network - setting up parameters

In [26]:
# Modelling and setting up parameters
epochs = 10
batch_size = 16
n_hidden = 8

# Input shape
timesteps = X_train.shape[1]
input_dim = X_train.shape[2]
n_classes = len(y_train[0])

# Building the model
model = Sequential()
model.add(Conv1D(filters=n_hidden, kernel_size=2, activation='relu', padding='same', input_shape=(timesteps, input_dim)))
model.add(MaxPooling1D(pool_size=1))
model.add(TimeDistributed(Dense(16, activation='relu')))
model.add(TimeDistributed(Dense(n_classes, activation='softmax')))

# Computing loss, choosing gradient descent type & metric
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [27]:
model.fit(X_train, y_train, batch_size=16, epochs=10, verbose=1)
acc = model.evaluate(X_test, y_test)
print('Loss:', acc[0], 'Accuracy', acc[1])

Epoch 1/10
[1m1148/1148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 8ms/step - accuracy: 0.8519 - loss: 0.4419
Epoch 2/10
[1m1148/1148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - accuracy: 0.9206 - loss: 0.1761
Epoch 3/10
[1m1148/1148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - accuracy: 0.9387 - loss: 0.1394
Epoch 4/10
[1m1148/1148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - accuracy: 0.9500 - loss: 0.1164
Epoch 5/10
[1m1148/1148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - accuracy: 0.9570 - loss: 0.1031
Epoch 6/10
[1m1148/1148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - accuracy: 0.9631 - loss: 0.0920
Epoch 7/10
[1m1148/1148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - accuracy: 0.9682 - loss: 0.0807
Epoch 8/10
[1m1148/1148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - accuracy: 0.9730 - loss: 0.0705
Epoch 9/10
[1m1148/114

# -> The model is converging - accuracy is increasing to about 98% in epoche 10, which seems pretty good, while loss kept decreasing from 0.44 to about 0.06

In [28]:
# Defining function 'confusion_matrix()'
def confusion_matrix(Y_true, Y_pred):
    
    Y_true_series = pd.Series(Y_true, name='True')
    Y_pred_series = pd.Series(Y_pred, name='Pred')
    
    return pd.crosstab(Y_true_series, Y_pred_series)

In [29]:
# Predicting
y_pred = np.argmax(model.predict(X_test), axis=2)

# Flattening across stations
y_true_flat = y_test.reshape(-1)
y_pred_flat = y_pred.reshape(-1)

# Global confusion matrix
cm = confusion_matrix(y_true_flat, y_pred_flat)
print(cm)

[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step
Pred      0      1
True              
0     52322    788
1       696  15044


# Recall rate: about 94% for '1' & about 99% for '0'
# Precision rate: about 98% for '1' & about 98% for '0'

In [30]:
# Confusion matrices for each station
station_confusion_matrices = {}

for station_idx, station_name in enumerate(station_order):
    y_true_station = y_test[:, station_idx]
    y_pred_station = y_pred[:, station_idx]

    cm = confusion_matrix(y_true_station, y_pred_station)
    station_confusion_matrices[station_name] = cm

    print(f"\nConfusion matrix for {station_name}:")
    print(cm)


Confusion matrix for BASEL:
Pred     0     1
True            
0     3222    76
1       44  1248

Confusion matrix for BELGRADE:
Pred     0     1
True            
0     2757    86
1       52  1695

Confusion matrix for BUDAPEST:
Pred     0     1
True            
0     3093    45
1       63  1389

Confusion matrix for DEBILT:
Pred     0    1
True           
0     3502   44
1       67  977

Confusion matrix for DUSSELDORF:
Pred     0     1
True            
0     3403    55
1       89  1043

Confusion matrix for HEATHROW:
Pred     0     1
True            
0     3380    70
1       69  1071

Confusion matrix for KASSEL:
Pred     0    1
True           
0     4270   14
1       19  287

Confusion matrix for LJUBLJANA:
Pred     0     1
True            
0     3087    80
1       49  1374

Confusion matrix for MAASTRICHT:
Pred     0     1
True            
0     3389    49
1       54  1098

Confusion matrix for MADRID:
Pred     0     1
True            
0     2280    61
1       38  2211

Confusion m

# → Valentia & Sonnblick seem to only have '0' values, something I’d mention to managers/stakeholders and verify