In [83]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout
from tensorflow.keras.optimizers import RMSprop
import matplotlib.pyplot as plt

#Step 1: Load your unscaled weather observations and "pleasant weather" data
path = '/Users/andrewfearney27/Downloads/'
output_dir = os.path.join(path, '01 Data')
cleaned_data_path = os.path.join(output_dir, 'climate_nodate_cleaned.csv')

#Load data
weather_unscaled_path = os.path.join(path, 'Dataset-weather-prediction-dataset-scaled.csv')
pleasant_weather_path = os.path.join(path, 'Dataset-Answers-Weather_Prediction_Pleasant_Weather.csv')

weather_data = pd.read_csv(weather_unscaled_path)
pleasant_weather = pd.read_csv(pleasant_weather_path)

#Convert all columns to lowercase and strip leading/trailing spaces to avoid errors
weather_data.columns = weather_data.columns.str.lower().str.strip()

# Step 2: Clean the data
# Define columns to drop
columns_to_drop = ['date', 'month', 'gdansk_cloud_cover', 'gdansk_humidity', 'gdansk_precipitation', 'gdansk_snow_depth',
                   'roma_cloud_cover', 'roma_wind_speed', 'tours_pressure', 'tours_humidity', 'tours_snow_depth']

#Drop columns present in the dataset 
columns_to_drop = [col for col in columns_to_drop if col in weather_data.columns]
weather_cleaned = weather_data.drop(columns=columns_to_drop, axis=1)

#Drop any remaining snow depth columns if present
weather_cleaned = weather_cleaned.loc[:, ~weather_cleaned.columns.str.endswith('_snow_depth')]


In [84]:
#Step 3: Handle missing data
# Fill missing data from nearby stations, using stations like Ljubljana, Sonnblick, and Oslo as substitutes
if 'kassel_wind_speed' in weather_cleaned.columns and 'ljubljana_cloud_cover' in weather_cleaned.columns:
    position1 = weather_cleaned.columns.get_loc('kassel_wind_speed') - 1
    weather_cleaned.insert(position1, 'kassel_cloud_cover', weather_cleaned['ljubljana_cloud_cover'])

if 'munchenb_humidity' in weather_cleaned.columns and 'sonnblick_wind_speed' in weather_cleaned.columns:
    position2 = weather_cleaned.columns.get_loc('munchenb_humidity') - 1
    weather_cleaned.insert(position2, 'munchenb_wind_speed', weather_cleaned['sonnblick_wind_speed'])

if 'stockholm_pressure' in weather_cleaned.columns and 'oslo_humidity' in weather_cleaned.columns:
    position3 = weather_cleaned.columns.get_loc('stockholm_pressure') - 1
    weather_cleaned.insert(position3, 'stockholm_humidity', weather_cleaned['oslo_humidity'])


In [85]:
#Step 4: Export the cleaned data
weather_cleaned.to_csv(cleaned_data_path, index=False)

In [86]:
#Step 5: Create 'X' and 'y' matrix
#Ensure columns are cleaned again
weather_cleaned.columns = weather_cleaned.columns.str.lower().str.strip()

In [87]:
# Drop 'date' column safely if it exists in pleasant_weather
pleasant_weather.columns = pleasant_weather.columns.str.lower().str.strip()
if 'date' in pleasant_weather.columns:
    y = pleasant_weather.drop(columns=['date'], axis=1).values
else:
    y = pleasant_weather.values

# Use cleaned weather data for X matrix
X = weather_cleaned.values

# Ensure X and y have the same length
min_length = min(len(X), len(y))
X = X[:min_length]
y = y[:min_length]

# Reshape X into (n_samples, 15, 9) and y into (n_samples, 15)
X = X.reshape(-1, 15, 9)
y = y.reshape(-1, 15)

In [88]:
#Step 6
#Ensure X and y have the same length
min_length = min(len(X), len(y))
X = X[:min_length]
y = y[:min_length]

print(f"Adjusted X shape: {X.shape}")
print(f"Adjusted y shape: {y.shape}")

# Reshape X into (n_samples, 15, 9) and y into (n_samples, 15)
X = X.reshape(-1, 15, 9)
y = y.reshape(-1, 15)

# Now proceed with splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Adjusted X shape: (22950, 15, 9)
Adjusted y shape: (22950, 15)


In [89]:
#Step 7: Build the CNN model
epochs = 5
batch_size = 16
n_hidden = 2

timesteps = len(X_train[0])
input_dim = len(X_train[0][0])
n_classes = len(y_train[0])

model = Sequential()
model.add(Conv1D(n_hidden, kernel_size=2, activation='relu', input_shape=(timesteps, input_dim)))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(n_classes, activation='sigmoid'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [90]:
#Step 8: Train the model
model.fit(X_train, y_train, batch_size=batch_size, validation_data=(X_test, y_test), epochs=epochs)


Epoch 1/5
[1m1148/1148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 330us/step - accuracy: 0.0975 - loss: 11.7364 - val_accuracy: 0.1028 - val_loss: 13.2078
Epoch 2/5
[1m1148/1148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 305us/step - accuracy: 0.0919 - loss: 18.2314 - val_accuracy: 0.0865 - val_loss: 17.1887
Epoch 3/5
[1m1148/1148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 278us/step - accuracy: 0.0769 - loss: 20.9504 - val_accuracy: 0.0776 - val_loss: 21.9587
Epoch 4/5
[1m1148/1148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 279us/step - accuracy: 0.0714 - loss: 23.8693 - val_accuracy: 0.0741 - val_loss: 27.3477
Epoch 5/5
[1m1148/1148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 279us/step - accuracy: 0.0792 - loss: 29.1329 - val_accuracy: 0.0784 - val_loss: 34.2420


<keras.src.callbacks.history.History at 0x31fb87d50>

In [91]:
#Step 9: Create confusion matrix
predictions = {0: 'basel', 1: 'belgrade', 2: 'budapest', 3: 'debilt', 4: 'dusseldorf', 5: 'heathrow', 
               6: 'kassel', 7: 'ljubljana', 8: 'maastricht', 9: 'madrid', 10: 'munchenb', 11: 'oslo', 
               12: 'sonnblink', 13: 'stockholm', 14: 'valentia'}

def confusion_matrix(Y_true, Y_pred):
    Y_true = pd.Series([predictions[y] for y in np.argmax(Y_true, axis=1)])
    Y_pred = pd.Series([predictions[y] for y in np.argmax(Y_pred, axis=1)])
    return pd.crosstab(Y_true, Y_pred, rownames=['True'], colnames=['Pred'])

y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))

[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 264us/step
Pred        basel  belgrade  madrid
True                               
basel        2220         5     730
belgrade      650         5     224
budapest      126         0      36
debilt         49         1      14
dusseldorf     18         0       7
heathrow       49         0      18
kassel          7         0       2
ljubljana      36         0      10
maastricht      6         0       1
madrid        270         1      89
munchenb        6         0       2
oslo            2         0       2
stockholm       3         0       0
valentia        1         0       0
