In [22]:
from torch.profiler import profile, tensorboard_trace_handler, ProfilerActivity, schedule
from torch.utils.data import DataLoader
import torch.nn as nn
import torch
import time 

class MLP(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear -> ReLU -> Linear.

    Args:
        c_in: size of the last dimension of the input.
        h_dim1: width of the first hidden layer.
        h_dim2: width of the second hidden layer.
        c_out: size of the last dimension of the output.
        device: stored on the instance as ``self.device``; the constructor
            does not move any parameters to it.
    """

    def __init__(self, c_in=4, h_dim1=16, h_dim2=16, c_out=1, device='cpu'):
        super().__init__()
        # Layers are created in the same order as they are applied in forward().
        self.linear1 = nn.Linear(c_in, h_dim1)
        self.linear2 = nn.Linear(h_dim1, h_dim2)
        self.linear3 = nn.Linear(h_dim2, c_out)
        self.relu = nn.ReLU()
        self.device = device

    def forward(self, x):
        """Apply the three linear layers to ``x`` with a ReLU after the first two."""
        hidden = self.relu(self.linear1(x))
        hidden = self.relu(self.linear2(hidden))
        return self.linear3(hidden)
    
def load_profile_dataloader(inputs,B,num_workers,persistent_workers,pin_memory,prefetch_factor,drop_last,
                            log_dir='./profiler/trial_profiler'):
    """Build a torch profiler and a shuffling DataLoader over ``inputs``.

    Args:
        inputs: dataset handed to ``DataLoader``.
        B: batch size.
        num_workers: number of DataLoader worker processes (0 = load in the
            main process).
        persistent_workers: keep workers alive between epochs. Forced to
            ``False`` when ``num_workers == 0``, because ``DataLoader`` rejects
            that combination with a ``ValueError``.
        pin_memory: forwarded to ``DataLoader``.
        prefetch_factor: forwarded to ``DataLoader``; replaced by ``None`` when
            ``num_workers == 0`` since it only applies to worker processes.
        drop_last: forwarded to ``DataLoader``.
        log_dir: directory given to ``tensorboard_trace_handler``. Defaults to
            the path that was previously hard-coded.

    Returns:
        Tuple ``(prof, dataloader)``. The profiler is not started; use it as a
        context manager and call ``prof.step()`` to advance its schedule.
    """
    # CPU activity is always recorded; CUDA only when a GPU is visible.
    activities = [ProfilerActivity.CPU]
    if torch.cuda.is_available():
        activities.append(ProfilerActivity.CUDA)

    prof = profile(activities=activities,
                   schedule=schedule(wait=1, warmup=1, active=12, repeat=1),
                   on_trace_ready=tensorboard_trace_handler(log_dir),
                   profile_memory=True,
                   record_shapes=False,
                   with_stack=False,
                   with_flops=False)

    # Worker-only options are neutralised together so that a single-process
    # configuration never trips DataLoader's argument validation.
    uses_workers = num_workers > 0
    dataloader = DataLoader(inputs,
                            batch_size=B,
                            shuffle=True,
                            num_workers=num_workers,
                            persistent_workers=bool(persistent_workers) and uses_workers,
                            pin_memory=pin_memory,
                            prefetch_factor=prefetch_factor if uses_workers else None,
                            drop_last=drop_last)

    return(prof,dataloader)


def load_model_loss_opt(c_in,h_dim1,h_dim2,c_out,device,lr=1e-3):
    """Create the MSE loss, an ``MLP`` placed on ``device``, and its SGD optimizer.

    Args:
        c_in, h_dim1, h_dim2, c_out: layer sizes forwarded to ``MLP``.
        device: device the model parameters are moved to. It is also passed to
            ``MLP``, which stores it.
        lr: SGD learning rate (defaults to the previously hard-coded 1e-3).

    Returns:
        Tuple ``(loss, model, optimizer)``.
    """
    loss = nn.MSELoss()
    # Move the parameters before building the optimizer so that the optimizer
    # is constructed over the tensors that will actually be trained. Without
    # this the model stays on the CPU even when ``device`` is 'cuda'.
    model = MLP(c_in,h_dim1, h_dim2, c_out,device).to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr)

    return(loss,model,optimizer)


def train_model(model,optimizer,loss,prof,epochs,dataloader,device):
    """Run the training loop under ``prof`` and print wall-clock timings.

    Each batch is moved to ``device``, passed through ``model``, scored with
    ``loss`` and used for one optimizer step. ``prof.step()`` is called once
    per epoch. The model itself is not moved here, so it is expected to be on
    ``device`` already.

    Args:
        model: module called as ``model(x)``.
        optimizer: optimizer over the model parameters.
        loss: callable ``loss(pred, y)`` returning a scalar tensor.
        prof: context manager exposing ``step()`` (e.g. ``torch.profiler.profile``).
        epochs: number of passes over ``dataloader``.
        dataloader: iterable yielding ``(x, y)`` tensor pairs.
        device: target device for the batches.

    Printed figures:
        Total time          - whole call, including profiler enter/exit.
        Time per epoch      - mean over all epochs except the first
                              (``nan`` when fewer than two epochs ran).
        Time first epoch    - first epoch only (``nan`` when ``epochs == 0``).
        Time Communication  - cumulative ``.to(device)`` time.
        Time forwards       - cumulative ``model(x)`` time.
        Time Backward       - cumulative ``l.backward()`` time.

    NOTE(review): timings are host-side wall clock with no device
    synchronisation, so on an asynchronous device they may not reflect the
    actual kernel time - confirm before comparing GPU runs.
    """
    t_epochs, t_coms, t_forwards, t_backwards = 0.0, 0.0, 0.0, 0.0
    epochs1 = float('nan')  # stays nan if the loop never runs
    total_t = time.perf_counter()
    with prof:
        for epoch in range(epochs):
            t_epoch = time.perf_counter()
            for x,y in dataloader:
                t_com = time.perf_counter()
                x,y = x.to(device),y.to(device)
                t_coms += time.perf_counter()-t_com

                t_forward = time.perf_counter()
                pred = model(x)
                t_forwards += time.perf_counter()-t_forward

                # Only drop the trailing output axis; a bare squeeze() would
                # also remove a size-1 batch or sequence axis and let the loss
                # broadcast against y silently.
                pred = pred.squeeze(-1)
                l = loss(pred,y)

                optimizer.zero_grad()
                # Time the backward pass itself (previously the timer wrapped
                # the loss evaluation instead).
                t_backward = time.perf_counter()
                l.backward()
                t_backwards += time.perf_counter() - t_backward
                optimizer.step()

            epoch_time = time.perf_counter()-t_epoch
            if epoch == 0:
                # Reported separately and excluded from the per-epoch mean.
                epochs1 = epoch_time
            else:
                t_epochs += epoch_time
            prof.step()
    total_time = time.perf_counter() - total_t

    # Guard the mean: with 0 or 1 epochs there is nothing after the first one.
    time_per_epoch = t_epochs/(epochs-1) if epochs > 1 else float('nan')

    print(f"Total time: {total_time} \nTime per epoch: {time_per_epoch} \
        \nTime first epoch: {(epochs1)} \nTime Communication: {t_coms}\
            \nTime forwards: {t_forwards} \nTime Backward: {t_backwards}\
            ")


## Inputs

In [23]:
# Seed the global torch RNG so the synthetic data (and everything drawn from
# the same generator afterwards) is reproducible on Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

B = 8        # batch size passed to the DataLoader
T = 200      # number of (x, y) samples
L = 4        # size of the last input dimension; must match c_in below
N = 10       # middle dimension of each sample
epochs = 12
device = 'cuda' if torch.cuda.is_available() else 'cpu'
c_in,h_dim1,h_dim2,c_out = 4, 16, 16, 1

# Random inputs of shape (T, N, L) with targets of shape (T, N), paired up
# into a list of (x, y) samples.
X,Y=  torch.randn(T,N,L),torch.randn(T,N)
inputs = list(zip(X,Y))

In [24]:
# Trial: batches are loaded in the main process (no worker processes).
# prefetch_factor is replaced by None inside load_profile_dataloader
# whenever num_workers is 0.
num_workers = 0
persistent_workers = False
pin_memory = False
prefetch_factor = 2
drop_last = False

prof, dataloader = load_profile_dataloader(
    inputs, B, num_workers, persistent_workers, pin_memory, prefetch_factor, drop_last
)
loss, model, optimizer = load_model_loss_opt(c_in, h_dim1, h_dim2, c_out, device)
train_model(model, optimizer, loss, prof, epochs, dataloader, device)

Total time: 8.773043394088745 
Time per epoch: 0.07012965462424538         
Time first epoch: 0.03415632247924805 
Time Communication: 0.002839803695678711            
Time forwards: 0.08481502532958984 
Time Backward: 0.04641318321228027            


In [25]:
# Trial: two worker processes, non-persistent, with prefetch_factor passed
# through to the DataLoader as None.
num_workers = 2
persistent_workers = False
pin_memory = False
prefetch_factor = None
drop_last = False

prof, dataloader = load_profile_dataloader(
    inputs, B, num_workers, persistent_workers, pin_memory, prefetch_factor, drop_last
)
loss, model, optimizer = load_model_loss_opt(c_in, h_dim1, h_dim2, c_out, device)
train_model(model, optimizer, loss, prof, epochs, dataloader, device)

Total time: 47.92680335044861 
Time per epoch: 3.499491821635853         
Time first epoch: 3.528473377227783 
Time Communication: 0.008521080017089844            
Time forwards: 0.05214691162109375 
Time Backward: 0.03460192680358887            


In [26]:
# Trial: four worker processes, non-persistent, with prefetch_factor passed
# through to the DataLoader as None.
num_workers = 4
persistent_workers = False
pin_memory = False
prefetch_factor = None
drop_last = False

prof, dataloader = load_profile_dataloader(
    inputs, B, num_workers, persistent_workers, pin_memory, prefetch_factor, drop_last
)
loss, model, optimizer = load_model_loss_opt(c_in, h_dim1, h_dim2, c_out, device)
train_model(model, optimizer, loss, prof, epochs, dataloader, device)

Total time: 54.66448473930359 
Time per epoch: 4.0494403405623         
Time first epoch: 4.000981569290161 
Time Communication: 0.0038945674896240234            
Time forwards: 0.09450149536132812 
Time Backward: 0.039552927017211914            


In [27]:
# Trial: two worker processes, non-persistent, with an explicit
# prefetch_factor of 2.
num_workers = 2
persistent_workers = False
pin_memory = False
prefetch_factor = 2
drop_last = False

prof, dataloader = load_profile_dataloader(
    inputs, B, num_workers, persistent_workers, pin_memory, prefetch_factor, drop_last
)
loss, model, optimizer = load_model_loss_opt(c_in, h_dim1, h_dim2, c_out, device)
train_model(model, optimizer, loss, prof, epochs, dataloader, device)

Total time: 47.87871742248535 
Time per epoch: 3.49210169098594         
Time first epoch: 3.538088798522949 
Time Communication: 0.003049135208129883            
Time forwards: 0.08310961723327637 
Time Backward: 0.0594942569732666            


In [28]:
# Trial: two worker processes with persistent workers, pinned memory and
# drop_last all switched on, and an explicit prefetch_factor of 2.
num_workers = 2
persistent_workers = True
pin_memory = True
prefetch_factor = 2
drop_last = True

prof, dataloader = load_profile_dataloader(
    inputs, B, num_workers, persistent_workers, pin_memory, prefetch_factor, drop_last
)
loss, model, optimizer = load_model_loss_opt(c_in, h_dim1, h_dim2, c_out, device)
train_model(model, optimizer, loss, prof, epochs, dataloader, device)

Total time: 9.682175874710083 
Time per epoch: 0.06153908642855557         
Time first epoch: 3.1031179428100586 
Time Communication: 0.032616376876831055            
Time forwards: 0.032059431076049805 
Time Backward: 0.044452667236328125            
