# All Imports

In [None]:
import os
import glob
import time
import random
import numpy as np
import scipy as sp
import pandas as pd
print ("Numpy:",np.__version__ , "Scipy:",sp.__version__, "Pandas",pd.__version__)

#pytorch and pytorch lightning API
import torch
# import torchmetrics
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.cuda as cuda
from torch.utils.data import TensorDataset, Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchsummary import summary
from datetime import datetime
print ("Pytorch Normal:", torch.__version__)
print ("Pytorch Lightning:", pl.__version__)

#weights and biases for monitoring DL training progress 
import wandb
from pytorch_lightning.loggers import WandbLogger

#plotting
from IPython import display
# from matplotlib import pyplot as plt
# import matplotlib.cm as cm
# from matplotlib.colors import ListedColormap
# import plotly.express as px
# import plotly.figure_factory as ff
# import plotly.graph_objs as go

#sklearn
# from sklearn.decomposition import PCA 
# from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import normalize
# from sklearn.cluster import AgglomerativeClustering
# from sklearn.preprocessing import StandardScaler
# from sklearn.cluster import KMeans
# from sklearn.utils import resample
from sklearn.model_selection import train_test_split

# import cv2
# import PIL
# import imageio
# import pickle

Numpy: 1.23.1 Scipy: 1.9.1 Pandas 1.5.0
Pytorch Normal: 1.12.1
Pytorch Lightning: 1.7.6


In [None]:
# Pick the compute device: CUDA when torch reports it as available, else the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Cell output: (CUDA available?, number of CUDA devices).
cuda.is_available(), cuda.device_count()

(False, 0)

# SHARE data extraction and preprocessing

Download Wave 8 Release 8.0.0 from https://releases.sharedataportal.eu/releases . You need to apply for data access if you do not already have it.

In [None]:
# Read the two Stata files of the release (the `ac` and `gv_health` datasets)
# into DataFrames, in that order.
df_ac, df_gv = (
    pd.read_stata(stata_path)
    for stata_path in (
        "my/path/sharew8_rel8-0-0_ALL_datasets_stata/sharew8_rel8-0-0_ac.dta",
        "my/path/sharew8_rel8-0-0_ALL_datasets_stata/sharew8_rel8-0-0_gv_health.dta",
    )
)

In [None]:
# Index both frames by their `mergeid` column.
df_ac, df_gv = (frame.set_index("mergeid") for frame in (df_ac, df_gv))
# Reorder the gv_health rows so that they follow the row order of df_ac.
df_gv_rearr = df_gv.loc[df_ac.index]
# Cell output: True when both indexes agree element by element.
(df_ac.index == df_gv_rearr.index).all()

In [None]:
# Cell output: (minimum, maximum) of the `casp` column.
tuple(reduce_fn(df_gv_rearr["casp"]) for reduce_fn in (np.min, np.max))

In [None]:
# Join the two frames on their shared index (default inner join of pd.merge).
df = df_ac.merge(df_gv_rearr, left_index=True, right_index=True)

In [None]:
# Keep the twelve questionnaire items (ac014_ ... ac025_) and the `casp` column.
df2 = df[["ac014_","ac015_","ac016_","ac017_","ac018_","ac019_","ac020_","ac021_","ac022_","ac023_","ac024_","ac025_", "casp"]]
# Recode answer labels to the integer codes 0-3 and turn non-answers into missing values.
# `np.nan` is used for both non-answer labels: the `np.NAN` alias was removed in NumPy 2.0.
df3 = df2.replace({"Refusal": np.nan, "Don't know": np.nan, "Often": 3, "Sometimes": 2, "Rarely": 1, "Never": 0})
# df3.isna().sum()
# Drop every row that has at least one missing value.
df4 = df3.dropna()
# Cell output: shapes before and after dropping incomplete rows.
df3.shape , df4.shape

In [None]:
# Features: the first 12 columns of df4.
x_data = df4.iloc[:, :12].to_numpy()
# Target: the 13th column, kept 2-D and linearly scaled by dividing it by 50.
y_data = df4.iloc[:, [12]].to_numpy() / 50

In [None]:
# Hold out 20% of the rows as the test set; the seed fixes which rows are held out.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=55,
)

In [None]:
# Write the four split arrays to disk, one .npy file each.
for target_path, split_array in {
    "my/path/x_train.npy": x_train,
    "my/path/y_train.npy": y_train,
    "my/path/x_test.npy": x_test,
    "my/path/y_test.npy": y_test,
}.items():
    np.save(target_path, split_array)

In [None]:
# x_train = np.load("my/path/x_train.npy", allow_pickle= True)
# y_train = np.load("my/path/y_train.npy",  allow_pickle= True)

# x_test = np.load("my/path/x_test.npy",  allow_pickle= True)
# y_test = np.load("my/path/y_test.npy",  allow_pickle= True)

In [None]:
# x_train_c1,x_train_c2,y_train_c1,y_train_c2 = train_test_split(x_train,y_train, test_size=0.5, random_state=55)

In [None]:
# n_components = 3
# pca = PCA(n_components=n_components)
# components = pca.fit_transform(x)

# fig = px.scatter_matrix(
#     components,
#     color=y.reshape(y.shape[0]),
#     # labels=labels,
#     # symbol=symbol,
#     dimensions=range(n_components),
#     # title=f'Total Explained Variance: {total_var:.2f}%',
# )
# fig.update_traces(diagonal_visible=False)
# fig.show()

# Model

## Import data extracted from SHARE

In [None]:
# Load the saved training features and targets.
x, y = (
    np.load(array_path, allow_pickle=True)
    for array_path in ("my/path/x_train.npy", "my/path/y_train.npy")
)
# Cell output: shapes of the loaded arrays.
x.shape, y.shape

((34315, 12), (34315, 1))

In [None]:
# Load the saved test features and targets.
x_test, y_test = (
    np.load(array_path, allow_pickle=True)
    for array_path in ("my/path/x_test.npy", "my/path/y_test.npy")
)
# Cell output: shapes of the loaded arrays.
x_test.shape, y_test.shape

((8579, 12), (8579, 1))

In [None]:
# Split again to get the train and validation sets (10% validation).
# The split is seeded, like the train/test split, so that the validation rows
# (and therefore the reported validation metrics) are the same on every run.
x_train_bar,x_val,y_train_bar,y_val= train_test_split(x,y, test_size=0.1, random_state=55)
 

In [None]:
# Apply one-hot encoding: cast each split to int, wrap it in a tensor and
# expand every answer code into 4 classes.
x_train_bar_cat, x_val_cat, x_test_cat = (
    F.one_hot(torch.tensor(split.astype("int")), num_classes=4)
    for split in (x_train_bar, x_val, x_test)
)

### create data for federated

In [None]:
# Reload the saved train and test arrays before building the per-client data.
x, y, x_test, y_test = (
    np.load(array_path, allow_pickle=True)
    for array_path in (
        "my/path/x_train.npy",
        "my/path/y_train.npy",
        "my/path/x_test.npy",
        "my/path/y_test.npy",
    )
)


In [None]:
# Put features and target side by side (target becomes the last column).
data = pd.DataFrame(np.concatenate([x, y], axis=1))

In [None]:
def split_by_fractions(df: pd.DataFrame, fracs: list, random_state: int):
    """Shuffle ``df`` and cut it into disjoint parts sized by ``fracs``.

    Args:
        df: Frame to partition.
        fracs: Fraction of the rows for each part, in order. The values must
            sum to 1 (checked with a floating-point tolerance).
        random_state: Seed used for the initial shuffle and for every
            sampling step.

    Returns:
        A list with one DataFrame per entry of ``fracs``; each part has its
        index reset to 0..n-1.

    Raises:
        AssertionError: If the fractions do not sum to 1.
    """
    import math  # local import so the function does not depend on another cell

    fractions_total = sum(fracs)
    # Exact equality would reject valid inputs such as [0.1] * 10, whose float
    # sum is 0.9999999999999999, so compare with a tolerance instead.
    assert math.isclose(fractions_total, 1.0, rel_tol=1e-9, abs_tol=1e-9), \
        'fractions sum is not 1.0 (fractions_sum={})'.format(fractions_total)

    df = df.sample(frac=1.0, random_state=random_state)
    # Frame holding the index labels that have not been assigned to a part yet.
    remain = df.index.copy().to_frame()
    res = []
    for i in range(len(fracs)):
        # Re-express the i-th fraction relative to the rows that are still left.
        fractions_sum = sum(fracs[i:])
        frac = fracs[i] / fractions_sum
        idxs = remain.sample(frac=frac, random_state=random_state).index
        remain = remain.drop(idxs)
        res.append(idxs)
    return [df.loc[idxs].reset_index(drop=True) for idxs in res]

In [None]:
# Cut the training data into five client partitions of 10/15/15/30/30 percent.
client_fracs = [0.1, 0.15, 0.15, 0.3, 0.3]
p1, p2, p3, p4, p5 = split_by_fractions(data, fracs=client_fracs, random_state=55)

In [None]:
# For every client partition: first 12 columns are the features, the 13th
# column (kept 2-D) is the target.
(xp1, yp1), (xp2, yp2), (xp3, yp3), (xp4, yp4), (xp5, yp5) = (
    (part.iloc[:, 0:12].to_numpy(), part.iloc[:, 12:13].to_numpy())
    for part in (p1, p2, p3, p4, p5)
)

In [None]:
# One-hot encode each client's features (4 classes per answer code).
xp1_cat, xp2_cat, xp3_cat, xp4_cat, xp5_cat = (
    F.one_hot(torch.tensor(client_x.astype("int")), num_classes=4)
    for client_x in (xp1, xp2, xp3, xp4, xp5)
)

In [None]:

# Persist each client's one-hot encoded features.
np.save("my/path/xp1_cat.npy", xp1_cat)
np.save("my/path/xp2_cat.npy", xp2_cat)
np.save("my/path/xp3_cat.npy", xp3_cat)
np.save("my/path/xp4_cat.npy", xp4_cat)
np.save("my/path/xp5_cat.npy", xp5_cat)

# Persist each client's targets as well: the per-client loading cell reads
# my/path/yp1.npy ... my/path/yp5.npy, which were previously never written.
np.save("my/path/yp1.npy", yp1)
np.save("my/path/yp2.npy", yp2)
np.save("my/path/yp3.npy", yp3)
np.save("my/path/yp4.npy", yp4)
np.save("my/path/yp5.npy", yp5)

### Load data for individual central mode training 

This part is only for training on an individual client (participant).

Do not run it for the central model; for the central model, go directly to the Hyperparameter section.

In [None]:

# Load every client's one-hot encoded features.
xp1_cat, xp2_cat, xp3_cat, xp4_cat, xp5_cat = (
    np.load(feature_path, allow_pickle=True)
    for feature_path in (
        "my/path/xp1_cat.npy",
        "my/path/xp2_cat.npy",
        "my/path/xp3_cat.npy",
        "my/path/xp4_cat.npy",
        "my/path/xp5_cat.npy",
    )
)

# Load every client's targets.
yp1, yp2, yp3, yp4, yp5 = (
    np.load(target_path, allow_pickle=True)
    for target_path in (
        "my/path/yp1.npy",
        "my/path/yp2.npy",
        "my/path/yp3.npy",
        "my/path/yp4.npy",
        "my/path/yp5.npy",
    )
)

In [None]:
# Only change this part and run the rest: pick which client's data to train on.
y = np.array(yp5, dtype=np.float64)
x = xp5_cat

In [None]:
# Seeded (like the train/test split) so the client's validation rows, and the
# metrics computed on them, are reproducible across runs.
x_train_bar_cat,x_val_cat,y_train_bar,y_val= train_test_split(x,y, test_size=0.1, random_state=55)
# Remember: these features are already categorical (one-hot encoded).

In [None]:
# One-hot encode the shared test features (4 classes per answer code).
x_test_int = torch.tensor(x_test.astype("int"))
x_test_cat = F.one_hot(x_test_int, num_classes=4)

## Hyperparameter

In [None]:
# Optimisation settings.
learning_rate = 1e-4  # passed to the model and used by its optimizer
batch_size = 128      # samples per batch in every DataLoader
num_epochs = 300      # upper bound on training epochs given to the Trainer
# every_n_train_steps=25

# Shape of one sample: 12 questionnaire items, each one-hot encoded into 4 classes.
input_size = (12, 4)


## Dataloaders

In [None]:
# Convert features and targets to float32 tensors.
# torch.as_tensor accepts both NumPy arrays (per-client path) and tensors
# (central path, where F.one_hot already returned tensors); torch.tensor() on
# an existing tensor copy-constructs it and emits a UserWarning.
x_train_tt = torch.as_tensor(x_train_bar_cat, dtype=torch.float)
x_val_tt = torch.as_tensor(x_val_cat, dtype=torch.float)
x_test_tt = torch.as_tensor(x_test_cat, dtype=torch.float)
y_train_tt = torch.as_tensor(y_train_bar, dtype=torch.float)
y_val_tt = torch.as_tensor(y_val, dtype=torch.float)
y_test_tt = torch.as_tensor(y_test, dtype=torch.float)

  x_test_tt = torch.tensor(x_test_cat, dtype=torch.float)#.unsqueeze(1)


In [None]:
# Pair each feature tensor with its target tensor.
train_dataset, val_dataset, test_dataset = (
    TensorDataset(features, targets)
    for features, targets in (
        (x_train_tt, y_train_tt),
        (x_val_tt, y_val_tt),
        (x_test_tt, y_test_tt),
    )
)

In [None]:
# Change num_workers to match the number of CPU cores available for computation.
shared_loader_args = {"batch_size": batch_size, "num_workers": 8}
# Only the training data is shuffled; validation and test keep their order.
training_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_args)
validation_loader = DataLoader(val_dataset, shuffle=False, **shared_loader_args)
test_loader = DataLoader(test_dataset, shuffle=False, **shared_loader_args)

## Architecture

In [None]:
class Pred(pl.LightningModule):
    """Small fully connected regressor trained with an MSE loss.

    Forward pass: flatten -> Linear(48, 16) -> Dropout(p=0.3) ->
    BatchNorm1d(16, affine=False) -> Linear(16, 1) -> tanh.

    Args:
        input_size: Shape of one input sample. It is stored on the instance
            and written to checkpoints; the layer sizes are fixed at 48 inputs.
        learning_rate: Learning rate handed to the Adam optimizer.
    """

    def __init__(self, input_size,learning_rate ):
        super().__init__()
        self.input_size = input_size
        self.learning_rate = learning_rate
        self.fc1 = nn.Linear(48, 16)
        self.fc2 = nn.Linear(16, 1)
        self.drop = nn.Dropout(p=0.3)
        self.bn1 = nn.BatchNorm1d(num_features=16, eps=1e-5, momentum=0.1, affine=False, track_running_stats=True )
        self.tanh = nn.Tanh()

    def forward(self,x):
        """Return one tanh-squashed prediction per sample of the batch ``x``."""
        # Collapse everything after the batch dimension into one feature vector.
        x = torch.flatten(x, start_dim=1)
        x = self.fc1(x)
        x = self.drop(x)
        x = self.bn1(x)
        x = self.fc2(x)
        x = self.tanh(x)
        return x

    def training_step(self, batch, batch_idx):
        """Compute the MSE loss for one training batch and log it as ``train/loss``."""
        data, label = batch
        y_hat = self(data)
        loss = F.mse_loss(y_hat, label)
        self.log("train/loss", loss, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute the MSE loss for one validation batch and log it as ``valid/val_loss``."""
        data, label = batch
        y_hat = self(data)
        loss = F.mse_loss(y_hat, label)
        self.log("valid/val_loss", loss, on_epoch=True)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters."""
        # Use the rate given to this instance, not whatever the notebook-level
        # `learning_rate` variable happens to hold when training starts.
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

    def on_save_checkpoint(self, checkpoint):
        """Store the run's hyperparameters in the checkpoint dictionary.

        The tuple layout is (input_size, num_epochs, batch_size, learning_rate).
        ``num_epochs`` and ``batch_size`` are not owned by the model and are
        read from the notebook-level variables of the same name.
        """
        checkpoint["hyperparameters"] = (self.input_size, num_epochs, batch_size, self.learning_rate)

    def on_load_checkpoint(self, checkpoint):
        """Restore the model-owned hyperparameters saved by ``on_save_checkpoint``."""
        saved = checkpoint.get("hyperparameters")
        if saved is not None:
            # Assign to the instance: unpacking into local names would discard
            # the stored values as soon as this hook returns.
            self.input_size, _saved_num_epochs, _saved_batch_size, self.learning_rate = saved


In [None]:
# Build the network from the hyperparameters defined above
# (positional order: input_size, learning_rate).
model = Pred(input_size, learning_rate)

In [None]:
# Log in to Weights & Biases if you want to monitor the model; otherwise skip this cell.
wandb.login()

In [None]:
# Create the checkpoint directory. Unlike `! mkdir`, this also creates missing
# parent directories and does not fail when the cell is run again.
os.makedirs("my/path/m0t0", exist_ok=True)

# Training 

Change the following code depending on the availability of a GPU; in our case we trained on the CPU.

documentation of lightning trainer : https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html

Change the `logger` argument of the trainer if you are not using Weights & Biases.

In [None]:
# Weights & Biases logger for this run (run name 'm0t0', project 'DL').
wandb_logger = WandbLogger(name='m0t0',project='DL') 
wandb_logger.watch(model, log="all", log_graph=True) # log frequency != num epochs to print loss, rather num steps 
# Save a checkpoint every 25 epochs into my/path/m0t0, keep all of them
# (save_top_k=-1) and additionally keep the last one.
# NOTE(review): the filename placeholders `training_loss` / `val_loss` differ
# from the metric names logged by Pred ("train/loss", "valid/val_loss");
# confirm whether the loss values embedded in the checkpoint names are
# meaningful. The evaluation cell below relies on the current naming.
checkpoint_callback = ModelCheckpoint(dirpath='my/path/m0t0',
                                      filename='{epoch}-{training_loss:.2f}-{val_loss:.2f}',
                                      # monitor="valid/val_loss", mode = "min",
                                      # every_n_train_steps=every_n_train_steps,
                                      # every_n_epochs= 25
                                      verbose=True,
                                      # save_on_train_epoch_end=True,
                                      save_last = True,
                                      every_n_epochs= 25,
                                      save_top_k=-1
                                      )
# Train for at most num_epochs epochs, with checkpointing and W&B logging.
trainer = pl.Trainer(max_epochs=num_epochs,
                     callbacks=[ checkpoint_callback],
                    #  accelerator="gpu", amp_backend="native")
                     logger=wandb_logger)
                    #  callbacks=[EarlyStopping(monitor="val/val_loss", mode="min")])
# trainer = pl.Trainer
# Fit on the training loader and validate on the validation loader.
trainer.fit(model=model, train_dataloaders=training_loader, val_dataloaders= validation_loader)

# trainer.test(dataloaders=test_loader)

# Model evaluation

Use the following cell to load a saved model from a checkpoint (for example, from another epoch).

In [None]:
# Checkpoint file to evaluate.
checkpoint= "my/path/m0t0/epoch=299-training_loss=0.00-val_loss=0.00.ckpt"
# load_from_checkpoint is a classmethod, so call it on the class. Building a
# fresh Pred instance first is unnecessary and would replace the trained
# `model` with an untrained one.
model_trained = Pred.load_from_checkpoint(checkpoint, input_size=input_size, learning_rate=learning_rate)
# Switch to inference mode: Dropout is disabled and BatchNorm uses its running
# statistics. Modules are in training mode by default after loading.
model_trained.eval();

Calculate the accuracy metrics on the whole 20 percent test dataset that we kept separate.

In [None]:
# pred = model(x_test_tt).detach().numpy()
# Predict in inference mode. Without eval(), Dropout keeps zeroing activations
# and BatchNorm normalises with the statistics of the evaluated batch, so the
# metrics would be noisy and not reproducible. no_grad() skips autograd tracking.
model_trained.eval()
with torch.no_grad():
    pred = model_trained(x_test_tt).numpy()
# pred.shape, y_test.shape
test_residual = pred - y_test
n_test_rows = y_test.shape[0]
# RMSE
print ("RMSE:", (np.sum(np.square(test_residual))/n_test_rows)**0.5)
print("MAE:", np.sum(np.abs(test_residual))/n_test_rows)

# R^2 = 1 - (residual sum of squares) / (total sum of squares around the mean)
print("R_square:", 1 - (np.sum(np.square(test_residual))) / np.sum(np.square(y_test-np.average(y_test))) )

If you are training an individual (per-client) model, use the following cell to calculate the accuracy metrics on the client's own held-out set. This is not required for the central model.

In [None]:
# pred = model(x_val_tt).detach().numpy()
# Predict in inference mode. Without eval(), Dropout keeps zeroing activations
# and BatchNorm normalises with the statistics of the evaluated batch, so the
# metrics would be noisy and not reproducible. no_grad() skips autograd tracking.
model_trained.eval()
with torch.no_grad():
    pred = model_trained(x_val_tt).numpy()
# pred.shape, y_val.shape
val_residual = pred - y_val
n_val_rows = y_val.shape[0]
# RMSE
print ("RMSE:", (np.sum(np.square(val_residual))/n_val_rows)**0.5)
print("MAE:", np.sum(np.abs(val_residual))/n_val_rows)

# R^2 = 1 - (residual sum of squares) / (total sum of squares around the mean)
print("R_square:", 1 - (np.sum(np.square(val_residual))) / np.sum(np.square(y_val-np.average(y_val))) )

In [None]:
# Finish the Weights & Biases run. The parentheses are required: a bare
# `wandb.finish` only references the function and never ends the run.

wandb.finish()