In [21]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("/home/florian/bachelorarbeit/code/Cross-Silo-FL/datasets/vertical/covid/owid-covid-data.csv")

In [3]:
data.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,


In [45]:
import pickle
from pathlib import Path

#loading covid data
from sktime.forecasting.model_selection import SlidingWindowSplitter
from sklearn.preprocessing import StandardScaler


def _get_samples_from_covid_data(n: int, attributes: list[str], num_of_samples: int, serialize: bool):
    """
    Generates samples from the covid dataset.
    Args:
        n (int): Numbers of records per sample.
        attributes (list[str]): List of attributes that will be used from the dataset. The fist element is the endogene variable.
        num_of_samples (int): Number of returned samples.
    """

    #load data
    data = pd.read_csv("../datasets/vertical/covid/owid-covid-data.csv")

    #load data if already serialized
    path = Path(f'../datasets/samples/covid_{n}_{"_".join(attributes)}.pkl')
    if path.exists():
        pkl_file = open(path, 'rb')
        x_data, y_data = pickle.load(pkl_file)
        pkl_file.close()
        
        if len(data.index) > num_of_samples:
            return x_data[:num_of_samples], y_data[:num_of_samples]

        else:
            return x_data[:len(data.index) - 1], y_data[:len(data.index) - 1]

    #fill nan with 0
    data[attributes] = data[attributes].fillna(0)
    data["new_cases"] = data["new_cases"].fillna(0)


    # #scale the data
    record_info = data[["iso_code", "continent", "location", "date", "tests_units"]]

    #scale selected attributes
    selected_data = data[attributes]
    selected_data_columns = selected_data.columns
    scaler = StandardScaler()
    scaled_selected_data = scaler.fit_transform(selected_data)
    selected_data = pd.DataFrame(scaled_selected_data, columns=selected_data_columns)

    #scale 'new_cases' (target)
    if not "new_cases" in attributes:
        new_cases_data = data[["new_cases"]]
        scaler = StandardScaler()
        scaled_new_cases_data = scaler.fit_transform(new_cases_data)
        new_cases_data = pd.DataFrame(scaled_new_cases_data, columns=["new_cases"])

        #combine scaled data
        data = pd.concat([selected_data, new_cases_data, record_info], axis=1)

    else:
        data = pd.concat([selected_data, record_info], axis=1)



    
    x_data = []
    y_data = []

    #split the data
    countries = data.iso_code.drop_duplicates(keep="first")
    countries = countries[countries != "ESH"] #drop ESH because it only has one entry

    for country in countries:
        country_data = data[data.iso_code == country]

        #generate input values
        splitter = SlidingWindowSplitter(fh=1, window_length=n)
        x_samples = splitter.split_series(country_data[attributes].to_numpy())

        for sample in x_samples:
            x_sample = sample[0].flatten()

            if not np.isnan(np.sum(x_sample)): #check for nans
                    x_data.append(x_sample)


        #generate target values
        y_samples = splitter.split_series(country_data["new_cases"].to_numpy())

        for sample in y_samples:
            y_sample = sample[1].flatten()[0]

            if not np.isnan(y_sample): #check for nans
                y_data.append(y_sample)


    #save data
    if serialize:
            output = open(path, "wb")
            pickle.dump((x_data, y_data), output)
            output.close()

    #enusre that sample number is not out of range
    if len(data.index) > num_of_samples:
        return x_data[:num_of_samples], y_data[:num_of_samples]
    else:
        return x_data[:len(data.index) - 1], y_data[:len(data.index) - 1]


In [49]:
test_samples = _get_samples_from_covid_data(10, ["new_cases"], 100000000, True)

In [50]:
len(test_samples[0])

232956

In [48]:
len(test_samples[1])

232956

In [15]:
import tensorflow as tf
#load data
data = []
targets = []
for attribute in ["new_cases", "weekly_hosp_admissions"]:
    if targets == []:
        X, y = helper.get_samples("covid", 10, [attribute], "", True, max_samples=100000, standardize=True)
        targets = tf.data.Dataset.from_tensor_slices(y).batch(1000)

        tf_dataset = tf.data.Dataset.from_tensor_slices(X).batch(1000)
        data.append(tf_dataset)

    else:
        X, _ = helper.get_samples("covid", 10, [attribute], "", True, max_samples=100000, standardize=True)
        tf_dataset = tf.data.Dataset.from_tensor_slices(X).batch(1000)
        data.append(tf_dataset)

#add targets as last entry
data.append(targets)

debug_var = data

tf_dataset = tf.data.Dataset.zip(tuple(data))

In [20]:
i = True
for batch in tf_dataset:
    if i:
        print(batch[-1])
        i = False


tf.Tensor(
[-0.14054086 -0.14054086 -0.1405043  -0.14054086 -0.14054086 -0.14054086
 -0.1405043  -0.14054086 -0.14054086 -0.1405043  -0.14046773 -0.14047992
 -0.14052867 -0.14054086 -0.14054086 -0.14054086 -0.14054086 -0.14041897
 -0.14046773 -0.14051649 -0.1401508  -0.14046773 -0.14040678 -0.14035802
 -0.14044335 -0.14054086 -0.13990701 -0.14022394 -0.14001672 -0.14012642
 -0.14052867 -0.14018737 -0.14007767 -0.14017518 -0.13985826 -0.14028489
 -0.13960228 -0.14054086 -0.14012642 -0.13990701 -0.13983388 -0.13926098
 -0.14024832 -0.1399192  -0.13977293 -0.14023613 -0.13977293 -0.14017518
 -0.13973636 -0.13951695 -0.13993139 -0.13927316 -0.13891967 -0.13971198
 -0.13844428 -0.13902938 -0.14054086 -0.1363477  -0.13690842 -0.14054086
 -0.14054086 -0.14054086 -0.13133785 -0.13849304 -0.13845647 -0.14054086
 -0.13031394 -0.13734724 -0.13684747 -0.14054086 -0.1360064  -0.13183762
 -0.13628676 -0.13770073 -0.13521409 -0.13343443 -0.13456805 -0.13406828
 -0.13395858 -0.13156945 -0.13181324 -0.

_Weather data starts here_

In [2]:
data = pd.read_csv("/home/florian/bachelorarbeit/code/Cross-Silo-FL/datasets/horizontal/weather/berlin_alexanderplatz.csv", names=["time", "temp", "dwpt", "rhum", "prcp", "snow", "wdir", "wspd", "wpgt", "pres", "tsun", "coco"])
data.describe()

Unnamed: 0,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco
count,16515.0,16513.0,16514.0,11987.0,4117.0,10598.0,11034.0,8395.0,11936.0,31.0,0.0
mean,10.608653,7.365076,14.120358,1.579361,5.176099,208.35167,28.778204,53.423204,1010.997009,15.483871,
std,7.861329,6.997318,8.975399,3.642694,23.663484,93.444179,15.034219,21.76871,9.549732,60.651942,
min,-17.6,-18.7,-14.9,0.0,0.0,0.0,0.0,0.0,961.3,0.0,
25%,4.6,2.2,7.1,0.0,0.0,131.0,16.6,36.0,1005.2,0.0,
50%,10.6,7.4,14.1,0.0,0.0,235.0,26.6,51.5,1011.3,0.0,
75%,16.8,13.0,21.1,1.5,0.0,280.0,38.9,67.7,1017.2,0.0,
max,31.1,24.7,38.0,60.4,310.0,360.0,115.2,154.8,1044.8,276.0,


In [1]:

import sys
sys.path.insert(1, '../scripts')
import helper

2023-02-12 14:48:59.003338: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-12 14:49:00.324999: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-02-12 14:49:00.325413: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [2]:
import numpy as np
from sklearn.model_selection import train_test_split

#load test dataset
all_stations = ["berlin_alexanderplatz", "frankfurt_am_main_westend", "hamburg_airport", "leipzig", "muenchen", "potsdam", "hannover", "koeln_bonn_airport", "stuttgart_schnarrenberg", "weimar"]
X_test_list = []
y_test_list = []

for station in all_stations:
    X, y = helper.get_samples(
            "weather", 10, ["temp", "dwpt", "rhum"], station, False, 10000, True)


    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_test_list.append(X_test)
    y_test_list.append(y_test)

    print(station)
    print(f"y_test: {np.isnan(np.sum(y_test))}")
    print(f"X_test: {np.isnan(np.sum(X_test))}")
X_test_list = np.array(X_test_list).flatten().reshape((20000, 30))
y_test_list = np.array(y_test_list).flatten()

berlin_alexanderplatz
y_test: False
X_test: False
frankfurt_am_main_westend
y_test: False
X_test: False
hamburg_airport
y_test: False
X_test: False
leipzig
y_test: False
X_test: False
muenchen
y_test: False
X_test: False
potsdam
y_test: False
X_test: False
hannover
y_test: False
X_test: False
koeln_bonn_airport
y_test: False
X_test: False
stuttgart_schnarrenberg
y_test: False
X_test: False
weimar
y_test: False
X_test: False


In [5]:
X_test_list.shape

(20000, 30)

In [43]:
import pickle
test_dataset_weather = (X_test_list, y_test_list)
output = open("../datasets/horizontal/weather/weather_test_dataset.pkl", "wb")
pickle.dump(test_dataset_weather, output)
output.close()

In [2]:
from sktime.forecasting.model_selection import SlidingWindowSplitter
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np


def _get_samples_from_weather_data(n: int, attributes: list[str], station: str, num_of_samples: int):
    """
    Generates samples from the covid dataset.
    Args:
        n (int): Numbers of records per sample.
        attributes (list[str]): List of attributes that will be used from the dataset. The fist element is the endogene variable.
        station (str): Name of the station.
        num_of_samples (int): Number of returned samples.
    """



    #load data
    path = f"/home/florian/bachelorarbeit/code/Cross-Silo-FL/datasets/horizontal/weather/{station}.csv"
    data = pd.read_csv(path, names=["time", "temp", "dwpt", "rhum", "prcp", "snow", "wdir", "wspd", "wpgt", "pres", "tsun", "coco"])
    
    #scale the data
    # data = data.drop("time", axis=1)
    # data_columns = data.columns
    # scaler = StandardScaler()
    # scaled_data = scaler.fit_transform(data)
    # data = pd.DataFrame(scaled_data, columns=data_columns)

    #split the data
    splitter = SlidingWindowSplitter(fh=1, window_length=n)
    samples = splitter.split_series(data[attributes].to_numpy())

    x_data = []
    y_data = []

    for sample in samples:
        x_sample = sample[0].flatten()
        y_sample = sample[1].flatten()[0] #the endogene temperature variable

        if not np.isnan(np.sum(x_sample)) and not np.isnan(y_sample): #check for nans
            x_data.append(x_sample)
            y_data.append(y_sample)

    return x_data[:num_of_samples], y_data[:num_of_samples]



In [8]:
x_train, y_train = _get_samples_from_weather_data(10, ["temp"], "berlin_alexanderplatz", 10000)

In [9]:
import statsmodels.api as sm

ols = sm.OLS(y_train, x_train)
result = ols.fit()
result.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.972
Model:,OLS,Adj. R-squared (uncentered):,0.972
Method:,Least Squares,F-statistic:,34100.0
Date:,"Thu, 02 Feb 2023",Prob (F-statistic):,0.0
Time:,13:32:59,Log-Likelihood:,-22019.0
No. Observations:,10000,AIC:,44060.0
Df Residuals:,9990,BIC:,44130.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.0538,0.010,5.392,0.000,0.034,0.073
x2,-0.0071,0.014,-0.492,0.623,-0.035,0.021
x3,0.0265,0.015,1.799,0.072,-0.002,0.055
x4,0.0045,0.015,0.304,0.761,-0.024,0.033
x5,0.0253,0.015,1.716,0.086,-0.004,0.054
x6,0.0019,0.015,0.127,0.899,-0.027,0.031
x7,0.0140,0.015,0.949,0.343,-0.015,0.043
x8,0.1031,0.015,7.003,0.000,0.074,0.132
x9,-0.2769,0.014,-19.140,0.000,-0.305,-0.249

0,1,2,3
Omnibus:,166.351,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,329.485
Skew:,-0.035,Prob(JB):,2.8400000000000004e-72
Kurtosis:,3.886,Cond. No.,42.8


In [70]:
pd_y = pd.Series(y_train)
pd_y.describe()
y_train[0]

-1.285909696369293

In [71]:
old_x_train[0]

array([-1.3,  1.5, -0.5, -3.2, -2.1, -2.4, -3.2, -3.8, -3.4,  0.4])

In [2]:
import tensorflow as tf

2023-01-03 18:59:26.252891: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-03 18:59:26.695633: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-01-03 18:59:26.695667: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-01-03 18:59:27.812762: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [3]:
path = "/home/florian/bachelorarbeit/code/Cross-Silo-FL/datasets/horizontal/weather/muenchen.csv"

In [4]:
data = pd.read_csv(path, names=["time", "temp", "dwpt", "rhum", "prcp", "snow", "wdir", "wspd", "wpgt", "pres", "tsun", "coco"])
data.head()

Unnamed: 0,time,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco
0,1879-01-01,,4.6,9.8,0.5,,,,,,,
1,1879-01-02,,0.7,8.5,2.8,,,,,,,
2,1879-01-03,,-0.9,8.6,0.7,,,,,,,
3,1879-01-04,,0.4,8.0,1.0,,,,,,,
4,1879-01-05,,-4.7,0.0,5.9,,,,,,,


In [5]:
exog_columns = ["temp", "pres", "tsun"]
edog_column = "temp"

In [9]:
tf_dataset = tf.data.Dataset.from_tensor_slices(dict(data))

In [55]:
exog = None
edog = None
x_train = []
y_train = []
for window in tf_dataset.batch(11, drop_remainder=True).take(10000):
    new_df = pd.DataFrame(window)
    exog = new_df[exog_columns].iloc[:10]
    edog = new_df["time"].iloc[-1]

    if not exog.isnull().values.any() or not edog: #check for NaNs
        exog = exog.to_numpy().flatten()


        x_train.append(exog)
        y_train.append(edog)
    #     exog = None
    #     edog = None

print(y_train[0:2])

[b'1985-01-15', b'1985-01-26']
