In [7]:
import pandas as pd
import numpy as np

# Load the two raw CSV files: dataset1 is indexed by its "datetime" column,
# dataset2 by its "Day" column (parsed as dates while reading).
# Both paths are raw strings so the Windows backslashes stay literal; in a
# normal string "\M" and "\L" are invalid escape sequences, which Python
# reports with a warning today and plans to reject in the future.
dataset1 = pd.read_csv(r'C:\ML\London 2000-01-01 to 2024-01-31.csv', index_col="datetime")
dataset2 = pd.read_csv(r'C:\ML\tfl-daily-cycle-hires.csv', index_col="Day", parse_dates=["Day"])

# start_date / end_date delimit the period to DROP from dataset1 (both ends
# inclusive). Every row dated before start_date or after end_date is kept,
# so the filtered frame starts on 2010-07-30.
start_date = pd.to_datetime('2000-01-01')
end_date = pd.to_datetime('2010-07-29')

# The index was read as plain strings; convert it so it can be compared with
# the Timestamp bounds above.
dataset1.index = pd.to_datetime(dataset1.index)
dataset1_filtered = dataset1[(dataset1.index < start_date) | (dataset1.index > end_date)]

# Combine the two datasets on their date indexes. The outer join keeps a date
# that appears in only one of the inputs; the other input's columns are NaN
# on such rows.
merged_data = pd.merge(dataset1_filtered, dataset2, left_index=True, right_index=True, how='outer')

# Ensure the merged index is datetime-typed, then show it.
merged_data.index = pd.to_datetime(merged_data.index)
print(merged_data.index)

# Lift pandas' display limits (rows, columns, line width) for the rest of the
# session so printed frames are never truncated.
for display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

# Preview both ends of the merged frame, followed by its column labels.
print(merged_data.head(), merged_data.tail(), merged_data.columns, sep="\n")

# Missing entries per column, as absolute counts.
print(merged_data.isna().sum())

# The same counts relative to the number of rows (a fraction between 0 and 1).
perc_missing_values = merged_data.isna().sum().div(len(merged_data))
print(perc_missing_values)

# Forward fill: each gap takes the value from the closest earlier row.
merged_data = merged_data.ffill()

# Re-count the missing entries to see what the forward fill left behind.
print(merged_data.isna().sum())

# Column dtypes, to check they are suitable for ML processing.
print(merged_data.dtypes)

# Rows per calendar year, to spot years with missing daily records.
rows_per_year = merged_data.index.year.value_counts()
print(rows_per_year.sort_index())

# Freeze an independent copy as the final dataset and summarise it.
final_dataset = merged_data.copy()
print(final_dataset.head(), final_dataset.tail(), final_dataset.shape, sep="\n")

DatetimeIndex(['2010-07-30', '2010-07-31', '2010-08-01', '2010-08-02',
               '2010-08-03', '2010-08-04', '2010-08-05', '2010-08-06',
               '2010-08-07', '2010-08-08',
               ...
               '2024-01-22', '2024-01-23', '2024-01-24', '2024-01-25',
               '2024-01-26', '2024-01-27', '2024-01-28', '2024-01-29',
               '2024-01-30', '2024-01-31'],
              dtype='datetime64[ns]', name='datetime', length=4934, freq=None)
              name  tempmax  tempmin  temp  feelslikemax  feelslikemin  \
datetime                                                                 
2010-07-30  london     22.3     14.0  17.9          22.3          14.0   
2010-07-31  london     23.5     17.6  20.1          23.5          17.6   
2010-08-01  london     22.8     15.3  18.8          22.8          15.3   
2010-08-02  london     22.1     16.3  18.9          22.1          16.3   
2010-08-03  london     21.8     13.0  18.0          21.8          13.0   

            

In [8]:
# Show the first five rows of the final dataset.
print(final_dataset.head(n=5))

              name  tempmax  tempmin  temp  feelslikemax  feelslikemin  \
datetime                                                                 
2010-07-30  london     22.3     14.0  17.9          22.3          14.0   
2010-07-31  london     23.5     17.6  20.1          23.5          17.6   
2010-08-01  london     22.8     15.3  18.8          22.8          15.3   
2010-08-02  london     22.1     16.3  18.9          22.1          16.3   
2010-08-03  london     21.8     13.0  18.0          21.8          13.0   

            feelslike   dew  humidity  precip  precipprob  precipcover  \
datetime                                                                 
2010-07-30       17.9  10.5      63.0   0.000           0         0.00   
2010-07-31       20.1  14.3      71.3   1.995         100         8.33   
2010-08-01       18.8  11.7      64.8   0.000           0         0.00   
2010-08-02       18.9  11.1      60.9   0.299         100         4.17   
2010-08-03       18.0  11.6      67.3

In [9]:
# Show the last five rows of the final dataset.
print(final_dataset.tail(n=5))

              name  tempmax  tempmin  temp  feelslikemax  feelslikemin  \
datetime                                                                 
2024-01-27  London      8.6      1.1   5.0           6.6          -0.6   
2024-01-28  London     11.5      3.1   7.0          11.5           0.5   
2024-01-29  London     12.1      8.6  10.8          12.1           7.2   
2024-01-30  London     12.5      7.6   9.1          12.5           4.9   
2024-01-31  London     12.1      6.8   8.8          12.1           4.3   

            feelslike  dew  humidity  precip  precipprob  precipcover  \
datetime                                                                
2024-01-27        3.3  2.0      81.8   0.000           0         0.00   
2024-01-28        5.4  4.1      82.4   0.000           0         0.00   
2024-01-29       10.5  8.8      87.3   0.000           0         0.00   
2024-01-30        7.5  5.3      77.5   0.000           0         0.00   
2024-01-31        7.5  4.1      72.5   0.19

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM
from keras.layers import Activation
from keras.optimizers import SGD

# --- Features and target ------------------------------------------------------
# The target is the column at position 32; the columns before it are the
# candidate features.
# NOTE(review): position 32 is presumably the cycle-hire count merged in from
# dataset2 - confirm against final_dataset.columns.
target_position = 32

# Keep only numeric feature columns. Text columns such as `name` ("london")
# cannot be standardised and would make StandardScaler raise.
feature_frame = final_dataset.iloc[:, :target_position].select_dtypes(include="number")
X = feature_frame.to_numpy(dtype="float64")
y = final_dataset.iloc[:, target_position].to_numpy(dtype="float64")
n_features = X.shape[1]

# A fixed seed makes the split, and so the reported scores, reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train = y_train.reshape((-1, 1))
y_test = y_test.reshape((-1, 1))

# --- Normalisation ------------------------------------------------------------
# Fit the scalers on the training data only, then apply them to the test data
# BEFORE the model sees it, so both sets live in the same scaled space.
x_scaler = StandardScaler()
x_train = x_scaler.fit_transform(x_train)
x_test = x_scaler.transform(x_test)

y_scaler = StandardScaler()
y_train = y_scaler.fit_transform(y_train)
y_test_raw = y_test  # unscaled target, used to report the error in original units
y_test = y_scaler.transform(y_test)

# An LSTM layer consumes 3-D input (samples, timesteps, features). Each row is
# treated as a sequence of length one.
x_train_seq = x_train.reshape((-1, 1, n_features))
x_test_seq = x_test.reshape((-1, 1, n_features))

# --- Model --------------------------------------------------------------------
# These hyperparameters will be decided based on the validation set later
batch_size = 32
epochs = 100
model = Sequential()
model.add(LSTM(64, activation="relu", input_shape=(1, n_features)))  # Not sure about 64
model.add(Dropout(0.2))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(1, activation="linear"))  # single output: one regression value per row
# This is a regression problem, so track mean absolute error; "accuracy" is a
# classification metric and carries no meaning for a continuous target.
model.compile(loss="mse", optimizer=SGD(), metrics=["mae"])
model.fit(x_train_seq, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_split=0.1)

# --- Evaluation on the held-out test set ----------------------------------------
score = model.evaluate(x_test_seq, y_test, verbose=0)
print('Test_loss:', score[0])
print('Test_mae (scaled units):', score[1])

# Map the predictions back to the target's original units.
raw_output = model.predict(x_test_seq)
output = y_scaler.inverse_transform(raw_output)
print(output)

# Mean absolute error in original units. This replaces sklearn's
# accuracy_score, which rejects continuous targets; it also avoids rebinding
# the imported `accuracy_score` name to a number.
test_mae = abs(output - y_test_raw).mean()
print('Test_mae (original units):', test_mae)


ModuleNotFoundError: No module named 'keras'