In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("/home/florian/bachelorarbeit/code/Cross-Silo-FL/datasets/horizontal/covid/owid-covid-data.csv")

In [48]:
data.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,


In [8]:
import pickle
from pathlib import Path

#loading covid data
from sktime.forecasting.model_selection import SlidingWindowSplitter
from sklearn.preprocessing import StandardScaler


def _get_samples_from_covid_data(n: int, attributes: list[str], num_of_samples: int, serialize: bool):
    """
    Generates samples from the covid dataset.
    Args:
        n (int): Numbers of records per sample.
        attributes (list[str]): List of attributes that will be used from the dataset. The fist element is the endogene variable.
        num_of_samples (int): Number of returned samples.
    """
    
    #load data
    data = pd.read_csv("../datasets/horizontal/covid/owid-covid-data.csv")

    #load data if already serialized
    path = Path(f'../datasets/samples/covid_{n}_{"_".join(attributes)}.pkl')
    if path.exists():
        pkl_file = open(path, 'rb')
        x_data, y_data = pickle.load(pkl_file)
        pkl_file.close()
        
        if len(data.index) > num_of_samples:
            return x_data[:num_of_samples], y_data[:num_of_samples]

        else:
            return x_data[:len(data.index) - 1], y_data[:len(data.index) - 1]

    
    #scale the data
    record_info = data[["iso_code", "continent", "location", "date", "tests_units"]]
    data = data.drop(["iso_code", "continent", "location", "date", "tests_units"], axis=1)
    data_columns = data.columns
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)
    data = pd.DataFrame(scaled_data, columns=data_columns)
    data = pd.concat([data, record_info], axis=1)

    
    x_data = []
    y_data = []

    #split the data
    countries = data.iso_code.drop_duplicates(keep="first")
    countries = countries[countries != "ESH"] #drop ESH because it only has one entry

    for country in countries:
        country_data = data[data.iso_code == country]

        splitter = SlidingWindowSplitter(fh=1, window_length=n)
        samples = splitter.split_series(country_data[attributes].to_numpy())

        for sample in samples:
            x_sample = sample[0].flatten()
            y_sample = sample[1].flatten()[attributes.index("new_cases")]

            if not np.isnan(np.sum(x_sample)) and not np.isnan(y_sample): #check for nans
                    x_data.append(x_sample)
                    y_data.append(y_sample)
   
    #save data
    if serialize:
            output = open(path, "wb")
            pickle.dump((x_data, y_data), output)
            output.close()

    #enusre that sample number is not out of range
    if len(data.index) > num_of_samples:
        return x_data[:num_of_samples], y_data[:num_of_samples]
    else:
        return x_data[:len(data.index) - 1], y_data[:len(data.index) - 1]


In [21]:
test_samples = _get_samples_from_covid_data(10, ["new_cases"], 100, True)

In [22]:
test_samples[0][:3]

[array([-0.1447642 , -0.14482342, -0.14482342, -0.14482342, -0.14482342,
        -0.14482342, -0.14482342, -0.14482342, -0.14482342, -0.14482342]),
 array([-0.14482342, -0.14482342, -0.14482342, -0.14482342, -0.14482342,
        -0.14482342, -0.14482342, -0.14482342, -0.14482342, -0.14482342]),
 array([-0.14482342, -0.14482342, -0.14482342, -0.14482342, -0.14482342,
        -0.14482342, -0.14482342, -0.14482342, -0.14482342, -0.14482342])]

In [23]:
test_samples[1][:3]

[-0.14482341997863235, -0.14482341997863235, -0.14478789054959926]

_Weather data starts here_

In [2]:
data = pd.read_csv("/home/florian/bachelorarbeit/code/Cross-Silo-FL/datasets/vertical/weather/berlin_alexanderplatz.csv", names=["time", "temp", "dwpt", "rhum", "prcp", "snow", "wdir", "wspd", "wpgt", "pres", "tsun", "coco"])
data.describe()

Unnamed: 0,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco
count,16515.0,16513.0,16514.0,11987.0,4117.0,10598.0,11034.0,8395.0,11936.0,31.0,0.0
mean,10.608653,7.365076,14.120358,1.579361,5.176099,208.35167,28.778204,53.423204,1010.997009,15.483871,
std,7.861329,6.997318,8.975399,3.642694,23.663484,93.444179,15.034219,21.76871,9.549732,60.651942,
min,-17.6,-18.7,-14.9,0.0,0.0,0.0,0.0,0.0,961.3,0.0,
25%,4.6,2.2,7.1,0.0,0.0,131.0,16.6,36.0,1005.2,0.0,
50%,10.6,7.4,14.1,0.0,0.0,235.0,26.6,51.5,1011.3,0.0,
75%,16.8,13.0,21.1,1.5,0.0,280.0,38.9,67.7,1017.2,0.0,
max,31.1,24.7,38.0,60.4,310.0,360.0,115.2,154.8,1044.8,276.0,


In [3]:

import sys
sys.path.insert(1, '../scripts')
import helper

2023-01-26 15:40:54.771133: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-26 15:40:56.375555: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-26 15:40:56.377150: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [9]:
from sktime.forecasting.model_selection import SlidingWindowSplitter


def _get_samples_from_weather_data(n: int, attributes: list[str], station: str, num_of_samples: int):
    """
    Generates samples from the covid dataset.
    Args:
        n (int): Numbers of records per sample.
        attributes (list[str]): List of attributes that will be used from the dataset. The fist element is the endogene variable.
        station (str): Name of the station.
        num_of_samples (int): Number of returned samples.
    """



    #load data
    path = f"/home/florian/bachelorarbeit/code/Cross-Silo-FL/datasets/vertical/weather/{station}.csv"
    data = pd.read_csv(path, names=["time", "temp", "dwpt", "rhum", "prcp", "snow", "wdir", "wspd", "wpgt", "pres", "tsun", "coco"])
    
    #scale the data
    data = data.drop("time", axis=1)
    data_columns = data.columns
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)
    data = pd.DataFrame(scaled_data, columns=data_columns)

    #split the data
    splitter = SlidingWindowSplitter(fh=1, window_length=n)
    samples = splitter.split_series(data[attributes].to_numpy())

    x_data = []
    y_data = []

    for sample in samples:
        x_sample = sample[0].flatten()
        y_sample = sample[1].flatten()[0] #the endogene temperature variable

        if not np.isnan(np.sum(x_sample)) and not np.isnan(y_sample): #check for nans
            x_data.append(x_sample)
            y_data.append(y_sample)

    return x_data[:num_of_samples], y_data[:num_of_samples]



In [10]:
x_train, y_train = _get_samples_from_weather_data(10, ["temp"], "berlin_alexanderplatz", 10000)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [6]:
pd_x = pd.DataFrame(x_train)
pd_x.describe()
x_train[0]

NameError: name 'x_train' is not defined

In [70]:
pd_y = pd.Series(y_train)
pd_y.describe()
y_train[0]

-1.285909696369293

In [71]:
old_x_train[0]

array([-1.3,  1.5, -0.5, -3.2, -2.1, -2.4, -3.2, -3.8, -3.4,  0.4])

In [2]:
import tensorflow as tf

2023-01-03 18:59:26.252891: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-03 18:59:26.695633: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-01-03 18:59:26.695667: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-01-03 18:59:27.812762: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [3]:
path = "/home/florian/bachelorarbeit/code/Cross-Silo-FL/datasets/vertical/weather/muenchen.csv"

In [4]:
data = pd.read_csv(path, names=["time", "temp", "dwpt", "rhum", "prcp", "snow", "wdir", "wspd", "wpgt", "pres", "tsun", "coco"])
data.head()

Unnamed: 0,time,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco
0,1879-01-01,,4.6,9.8,0.5,,,,,,,
1,1879-01-02,,0.7,8.5,2.8,,,,,,,
2,1879-01-03,,-0.9,8.6,0.7,,,,,,,
3,1879-01-04,,0.4,8.0,1.0,,,,,,,
4,1879-01-05,,-4.7,0.0,5.9,,,,,,,


In [5]:
exog_columns = ["temp", "pres", "tsun"]
edog_column = "temp"

In [9]:
tf_dataset = tf.data.Dataset.from_tensor_slices(dict(data))

In [55]:
exog = None
edog = None
x_train = []
y_train = []
for window in tf_dataset.batch(11, drop_remainder=True).take(10000):
    new_df = pd.DataFrame(window)
    exog = new_df[exog_columns].iloc[:10]
    edog = new_df["time"].iloc[-1]

    if not exog.isnull().values.any() or not edog: #check for NaNs
        exog = exog.to_numpy().flatten()


        x_train.append(exog)
        y_train.append(edog)
    #     exog = None
    #     edog = None

print(y_train[0:2])

[b'1985-01-15', b'1985-01-26']


## LSTM model

In [17]:
samples = helper.get_samples("weather", 10, ["temp"], "potsdam", True, 10000)

In [16]:
import tensorflow as tf

In [38]:
model = tf.keras.Sequential()
input_shape = np.array(samples[0]).shape[1]

#add layers
model.add(tf.keras.layers.Input(shape=(input_shape,)))
model.add(tf.keras.layers.Conv1D(32, kernel_size=(10,)))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(64))
#output layer
model.add(tf.keras.layers.Dense(1))

model.compile(optimizer=tf.keras.optimizers.Adam(),
        loss=tf.keras.losses.MeanAbsoluteError(),
        metrics=["mean_absolute_error"])

model.summary()

ValueError: Input 0 of layer "conv1d_8" is incompatible with the layer: expected min_ndim=3, found ndim=2. Full shape received: (None, 10)

In [29]:
model.fit(np.array(samples[0]), np.array(samples[1]))

InvalidArgumentError: Graph execution error:

Detected at node 'sequential_6/embedding/embedding_lookup' defined at (most recent call last):
    File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "/home/florian/.local/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/home/florian/.local/lib/python3.10/site-packages/traitlets/config/application.py", line 992, in launch_instance
      app.start()
    File "/home/florian/.local/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 711, in start
      self.io_loop.start()
    File "/home/florian/.local/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "/usr/lib/python3.10/asyncio/base_events.py", line 600, in run_forever
      self._run_once()
    File "/usr/lib/python3.10/asyncio/base_events.py", line 1896, in _run_once
      handle._run()
    File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/home/florian/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/home/florian/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/home/florian/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/home/florian/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "/home/florian/.local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 411, in do_execute
      res = shell.run_cell(
    File "/home/florian/.local/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 531, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/home/florian/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2940, in run_cell
      result = self._run_cell(
    File "/home/florian/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2995, in _run_cell
      return runner(coro)
    File "/home/florian/.local/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/home/florian/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3194, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/home/florian/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3373, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/home/florian/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3433, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_1226/3115605718.py", line 1, in <module>
      model.fit(np.array(samples[0]), np.array(samples[1]))
    File "/home/florian/.local/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/florian/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1650, in fit
      tmp_logs = self.train_function(iterator)
    File "/home/florian/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in train_function
      return step_function(self, iterator)
    File "/home/florian/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1233, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/florian/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1222, in run_step
      outputs = model.train_step(data)
    File "/home/florian/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1023, in train_step
      y_pred = self(x, training=True)
    File "/home/florian/.local/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/florian/.local/lib/python3.10/site-packages/keras/engine/training.py", line 561, in __call__
      return super().__call__(*args, **kwargs)
    File "/home/florian/.local/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/florian/.local/lib/python3.10/site-packages/keras/engine/base_layer.py", line 1132, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/florian/.local/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/home/florian/.local/lib/python3.10/site-packages/keras/engine/sequential.py", line 413, in call
      return super().call(inputs, training=training, mask=mask)
    File "/home/florian/.local/lib/python3.10/site-packages/keras/engine/functional.py", line 511, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "/home/florian/.local/lib/python3.10/site-packages/keras/engine/functional.py", line 668, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/home/florian/.local/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/florian/.local/lib/python3.10/site-packages/keras/engine/base_layer.py", line 1132, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/florian/.local/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/home/florian/.local/lib/python3.10/site-packages/keras/layers/core/embedding.py", line 208, in call
      out = tf.nn.embedding_lookup(self.embeddings, inputs)
Node: 'sequential_6/embedding/embedding_lookup'
indices[1,0] = -1 is not in [0, 2)
	 [[{{node sequential_6/embedding/embedding_lookup}}]] [Op:__inference_train_function_3712]