In [2]:
# Imports

import numpy as np
import torch
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


# Preprocess data

def preprocess_bike_df(df):
    """Build the numeric feature/target table from the raw bike-sharing frame.

    Reads the columns ``dteday``, ``hr``, ``weekday``, ``yr``, ``holiday``,
    ``workingday``, ``weathersit``, ``temp``, ``atemp``, ``hum``,
    ``windspeed`` and ``cnt``. The input frame is left unmodified.

    Args:
        df: Raw DataFrame containing the columns listed above.

    Returns:
        A new DataFrame with 20 columns, in this order: ``yr``, cyclic date
        and hour encodings, seven one-hot weekday columns, ``holiday``,
        ``workingday``, ``weathersit`` and the standardized ``temp``,
        ``atemp``, ``hum``, ``windspeed`` and ``cnt`` (target, last column).
    """
    # Date string to datetime object
    dates = pd.to_datetime(df['dteday'])

    # Day of the year (1-366), continued into the second year so that
    # e.g. Jan 1, 2012 becomes 366 instead of 1
    day_enumerated = dates.dt.dayofyear + 365 * (dates.dt.year == 2012)

    # Normalize to [0, 4pi]; 731 = 365 + 366 is the index of Dec 31, 2012
    date_scaled = (day_enumerated / 731) * 4 * np.pi

    # Normalize to [0, 2pi]
    hour_scaled = (df['hr'] / 24) * 2 * np.pi

    # Encode cyclic nature of date and hour
    cyclic = pd.DataFrame({
        "date_cos": np.cos(date_scaled),
        "date_sin": np.sin(date_scaled),
        "hour_cos": np.cos(hour_scaled),
        "hour_sin": np.sin(hour_scaled),
    })

    # One-hot encode the weekday. A float dtype keeps the final table purely
    # numeric: boolean dummy columns mixed with numeric ones would turn
    # `.values` into an object array that cannot be converted to a tensor.
    weekday_columns = ["weekday_" + str(day) for day in range(7)]
    weekday_dummies = pd.get_dummies(
        df['weekday'].astype(str), prefix='weekday', dtype=float
    )
    # Guarantee all seven columns exist even if a weekday is absent in `df`
    weekday_dummies = weekday_dummies.reindex(
        columns=weekday_columns, fill_value=0.0
    )

    # Standardize features and target. The result is built as a fresh float
    # frame instead of being written into the existing columns, so no silent
    # dtype upcasting of an existing (possibly integer) column is needed.
    standardized = ["temp", "atemp", "hum", "windspeed", "cnt"]
    scaled = pd.DataFrame(
        StandardScaler().fit_transform(df[standardized]),
        columns=standardized,
        index=df.index,
    )

    passthrough = df[["yr", "holiday", "workingday", "weathersit"]]
    features = pd.concat([passthrough, cyclic, weekday_dummies, scaled], axis=1)

    return features[
        ["yr", "date_cos", "date_sin", "hour_cos", "hour_sin"]
        + weekday_columns
        + ["holiday", "workingday", "weathersit"]
        + standardized
    ]


def homogeneous_client_slices(df):
    """Assign rows to 8 clients uniformly at random.

    The row positions are shuffled with NumPy's global RNG and cut into 8
    equally sized chunks; rows left over after the integer division are not
    assigned to any client.

    Args:
        df: Any object supporting ``len``; only its length is used.

    Returns:
        A list of 8 boolean arrays of length ``len(df)``, one mask per client.
    """
    n_clients = 8
    n_rows = len(df)
    per_client = n_rows // n_clients

    shuffled = np.random.permutation(n_rows)
    # One row of shuffled positions per client (remainder rows dropped)
    chunks = shuffled[:n_clients * per_client].reshape(n_clients, per_client)

    positions = np.arange(n_rows)
    return [np.isin(positions, chunk) for chunk in chunks]


def heterogeneous_client_slices(df):
    """Assign rows to 8 clients as consecutive, equally sized blocks.

    Client ``k`` receives rows ``k * n_c`` up to (excluding) ``(k + 1) * n_c``
    with ``n_c = len(df) // 8``; trailing remainder rows are not assigned.

    Args:
        df: Any object supporting ``len``; only its length is used.

    Returns:
        A list of 8 boolean arrays of length ``len(df)``, one mask per client.
    """
    n_clients = 8
    n_rows = len(df)
    per_client = n_rows // n_clients
    positions = np.arange(n_rows)

    return [
        (positions >= k * per_client) & (positions < (k + 1) * per_client)
        for k in range(n_clients)
    ]


def client_split_df(df, client_slices):
    """Split ``df`` into one row-shuffled DataFrame per client.

    Args:
        df: DataFrame to split.
        client_slices: Iterable of boolean row masks, one per client.

    Returns:
        A list with, for every mask, the selected rows of ``df`` in a
        randomly permuted order (``sample(frac=1)``).
    """
    return [df[mask].sample(frac=1) for mask in client_slices]


def client_dfs_to_tensor(client_dfs):
    """Stack per-client DataFrames into a single tensor.

    Args:
        client_dfs: Iterable of DataFrames; they must have identical shapes
            for the stacking to succeed.

    Returns:
        A tensor whose leading dimension indexes the clients, followed by
        the row and column dimensions of the individual frames.
    """
    per_client = []
    for frame in client_dfs:
        per_client.append(torch.tensor(frame.values))
    return torch.stack(per_client)


def dev_test_split(data):
    """Hold out the last 252 rows of every client as test data.

    A singleton axis is inserted at position 1, then the row axis (the
    original axis 1) is cut into a development part and a test part.

    Args:
        data: Array/tensor indexed as (client, row, column).

    Returns:
        Tuple ``(devdata, testdata)``: everything but the last 252 rows, and
        the last 252 rows, each with the extra singleton axis.
    """
    n_test = 252
    expanded = data[:, None]
    dev_part = expanded[:, :, :-n_test, :]
    test_part = expanded[:, :, -n_test:, :]
    return dev_part, test_part


def train_val_split(data):
    """Split development data into grouped training and validation parts.

    The data is reshaped to ``(8, n_cp, rows_per_group, n_cols)`` with
    ``n_cp = 160`` groups and ``n_cols = 20`` columns. Within every group the
    last ``n_cp_val = 2`` rows are separated from the rest.

    Args:
        data: Array/tensor whose total size is compatible with the reshape
            to ``(8, 160, -1, 20)``.

    Returns:
        Tuple ``(cp_traindata, cp_valdata)``:
          * ``cp_traindata`` - all but the last 2 rows of every group.
          * ``cp_valdata`` - the last 2 rows of the first ``n_cp // 2``
            groups only.
    """
    n_cp = 160
    n_cp_val = 2
    n_cols = 20
    cp_data = data.reshape(8, n_cp, -1, n_cols)
    cp_traindata = cp_data[:, :, :-n_cp_val, :]
    # Floor division: `n_cp / 2` is a float in Python 3 and is rejected as a
    # slice bound.
    cp_valdata = cp_data[:, :n_cp // 2, -n_cp_val:, :]
    return cp_traindata, cp_valdata


def X_y_split(data):
    """Separate features from the target along the last axis.

    Args:
        data: 4-dimensional array/tensor whose last axis holds the feature
            columns followed by the target column.

    Returns:
        Tuple ``(X, y)``: ``X`` keeps all but the last entry of the final
        axis; ``y`` is the last entry, with that axis removed.
    """
    leading = (slice(None),) * 3  # take the first three axes in full
    features = data[leading + (slice(None, -1),)]
    target = data[leading + (-1,)]
    return features, target


def static_dataset(X, y):
    """Flatten grouped data into one flat sample axis per client.

    Args:
        X: Features, reshapeable to ``(8, -1, 19)``.
        y: Targets, reshapeable to ``(8, -1)``.

    Returns:
        Tuple ``(X_flat, y_flat)`` with shapes ``(8, n_samples, 19)`` and
        ``(8, n_samples)``.
    """
    n_clients, n_features = 8, 19
    flat_X = X.reshape((n_clients, -1, n_features))
    flat_y = y.reshape((n_clients, -1))
    return flat_X, flat_y


def get_clean_data(heterogeneous=True, seed=None,
                   csv_path="/proj/fair-ai/fair-fl/data/bike.csv"):
    """Load the bike dataset and return per-client train/val/test splits.

    Pipeline: read the CSV, preprocess it, assign rows to clients, shuffle
    each client's rows, stack the clients into one tensor, hold out the test
    rows, and split the remainder into training and validation data.

    Args:
        heterogeneous: If true, clients receive consecutive blocks of rows
            (``heterogeneous_client_slices``); otherwise rows are assigned
            at random (``homogeneous_client_slices``).
        seed: Passed to ``np.random.seed`` before any random operation.
        csv_path: Location of the bike-sharing CSV file. Defaults to the
            path that was previously hard-coded.

    Returns:
        A 3-tuple ``((X_train, y_train), (X_val, y_val), (X_test, y_test))``.
        The training pair keeps the grouped layout produced by
        ``train_val_split``; the validation and test pairs are flattened by
        ``static_dataset``.
    """
    np.random.seed(seed)
    df = pd.read_csv(csv_path)
    df_Xy = preprocess_bike_df(df)

    if heterogeneous:
        client_slices = heterogeneous_client_slices(df)
    else:
        client_slices = homogeneous_client_slices(df)

    client_dfs = client_split_df(df_Xy, client_slices)
    data = client_dfs_to_tensor(client_dfs)
    devdata, testdata = dev_test_split(data)
    traindata, valdata = train_val_split(devdata)

    train_set = X_y_split(traindata)
    val_set = static_dataset(*X_y_split(valdata))
    test_set = static_dataset(*X_y_split(testdata))
    return train_set, val_set, test_set