In [1]:
from torch.profiler import profile, tensorboard_trace_handler, ProfilerActivity, schedule
from torch.utils.data import Dataset, DataLoader
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
import torch.multiprocessing as mp 

from torch.cuda.amp import autocast, GradScaler
import torch.nn as nn
import torch
import time 
import os 

from dl_models.CNN_based_model import CNN

import psutil 


# ============ Profiler Memory Usage ============
def get_memory_usage(max_memory):
    print(f"\nMax GPU memory allocated: {torch.cuda.max_memory_allocated() / 1024**3} GB")
    print(f"Max GPU memory cached: {torch.cuda.max_memory_reserved() / 1024**3} GB")
    print(f"Max CPU memory allocated: {max_memory} GB")
# ============ .... ============    

# ============ DDP ============
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = '137.121.170.69'
    #os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    dist.destroy_process_group()
# ============ .... ============   



class CustomDataset(Dataset):
    def __init__(self,X,Y):
        self.X = X
        self.Y = Y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self,idx):
        return self.X[idx], self.Y[idx]

class MLP(nn.Module):
    def __init__(self,c_in = 4,h_dim1 = 16, h_dim2 = 16, c_out = 1):
        super(MLP,self).__init__()
        self.linear1 = nn.Linear(c_in,h_dim1)
        self.linear2 = nn.Linear(h_dim1,h_dim2)
        self.linear3 = nn.Linear(h_dim2,c_out)
        self.relu = nn.ReLU()
    def forward(self,x):
        return(self.linear3(self.relu(self.linear2(self.relu(self.linear1(x))))))
    
def load_profile_dataloader(dataset,B,num_workers,persistent_workers,pin_memory,prefetch_factor,drop_last,dataparallel=False,prof = False):
    if prof:
        activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA] if torch.cuda.is_available() else [ProfilerActivity.CPU]
        prof =  profile(activities=activities,
                        schedule=schedule(wait=1, warmup=1, active=12, repeat=1),
                        on_trace_ready=tensorboard_trace_handler('./profiler/trial_profiler'),
                        profile_memory=True,
                        record_shapes=False, 
                        with_stack=False,
                        with_flops=False
                        )
    else:
        prof = None
    
    if dataparallel:
        sampler = torch.utils.data.distributed.DistributedSampler(dataset,
                                                                num_replicas=2,
                                                                rank=0,
                                                                shuffle=True)
    else: 
        sampler = None
        
    

    dataloader = DataLoader(dataset,batch_size=B,shuffle =False if dataparallel else True ,
                            num_workers=num_workers, 
                            persistent_workers= False if num_workers == 0 else persistent_workers,
                            pin_memory=pin_memory,
                            prefetch_factor=None if num_workers==0 else prefetch_factor, #2,3,4,5...
                            drop_last=drop_last,
                           sampler = sampler 
                           )
    
    return(prof,dataloader,sampler)


def load_model_loss_opt(c_in=1,h_dim1=16,h_dim2=16,c_out=1,device='cuda',dataparallel = False,model_name = 'MLP',H_dims =[64,128,64],C_outs = [64,32,1],kernel_size= (1,2),input_shape = [6000,40,8]
                        ,compile_model=False,memory_format_last = False, compile_backend = 'inductor', rank = 0, world_size = 2):
    loss = nn.MSELoss()
    if model_name == 'MLP': 
        model = MLP(c_in,h_dim1, h_dim2, c_out).to(device)
    if model_name == 'CNN':
        model = CNN(c_in,L,H_dims, C_outs, kernel_size, input_shape = input_shape).to(device)
        if memory_format_last:
            model = model.to(memory_format = torch.channels_last)
        
    # DataParallel:
    if dataparallel:
        dist.init_process_group(backend='nccl',# init_method='env://',
                                world_size=world_size, 
                                rank=rank)
        model = DDP(model,device_ids = [rank])
    # ....
    
    # Compiler :
    if compile_model:
        model = torch.compile(model,backend = compile_backend)
        #model = torch.jit.script(model)
    # ...
        
    optimizer = torch.optim.SGD(model.parameters(), 1e-3)
    
    print('number of total parameters: {}'.format(sum([p.numel() for p in model.parameters()])))
    print('number of trainable parameters: {}'.format(sum([p.numel() for p in model.parameters() if p.requires_grad])))

    return(loss,model,optimizer)


def training(model,optimizer,loss,prof,epochs,dataloader,device,scaler,memory_format_last,mixed_precision):
    global max_memory
    t_epochs,t_batchs, t_coms, t_forwards, t_backwards,total_time = 0,0,0,0,0,0
    for epoch in range(epochs):
        if sampler is not None:
            sampler.set_epoch(epoch)
        epoch1 = time.time()
        t_epoch = time.time()
        for x,y in dataloader:
            
            # ==== Mesure le temps de lecture CPU -> GPU
            t_com = time.time()
            if  memory_format_last:
                x,y = x.to(device,memory_format = torch.channels_last),x.to(device)
            else:
                x,y = x.to(device),y.to(device)
            t_coms += time.time()-t_com
            # ==== ...
            
            
            # ==== Mesure le temps total d'entrainement des batch (forward + backward + Optimizer update):
            t_batch = time.time()

            if mixed_precision:
                with autocast():
                    # Mesure temps de backward: 
                    t_forward = time.time()
                    pred = model(x)
                    t_forwards += time.time()-t_forward
                    # ...
                        
                    if len(y.size()) != len(pred.size()): 
                        pred = pred.squeeze()   
                        
                    # Mesure temps de backward: 
                    t_backward = time.time()
                    l = loss(pred,y)
                    t_backwards += time.time() - t_backward
                    # ...    
    
            else:
                # Mesure temps de backward: 
                t_forward = time.time()
                pred = model(x)
                t_forwards += time.time()-t_forward
                # ...
                        
                if len(y.size()) != len(pred.size()): 
                    pred = pred.squeeze()
                
                # Mesure temps de backward: 
                t_backward = time.time()
                l = loss(pred,y)
                t_backwards += time.time() - t_backward
                # ...
            
            if scaler is not None:
                scaler.scale(l).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                optimizer.zero_grad()
                l.backward()
                optimizer.step()

            t_batchs += time.time()-t_batch
            # ==== ...
            
        t_epochs += time.time()-t_epoch
        if epoch == 0:
            epochs1 = time.time()-epoch1
        if prof:
            prof.step()
            
        process = psutil.Process()
        max_memory = max(max_memory, process.memory_info().rss / 1024**3)
    get_memory_usage(max_memory)
    return(t_epochs,t_batchs,epochs1,t_coms,t_forwards,t_backwards)

def train_model(model,optimizer,loss,prof,epochs,dataloader,sampler,device,T,memory_format_last,mixed_precision):
    total_t = time.time()
    if sampler is not None:
        scaler = GradScaler()
    else:
        scaler = None
    if prof is not None:
        with prof:
            (t_epochs,epochs1,t_coms,t_forwards,t_backwards) = training(model,optimizer,loss,prof,epochs,dataloader,device,scaler,memory_format_last,mixed_precision)
    else:
        (t_epochs,t_batchs,epochs1,t_coms,t_forwards,t_backwards) = training(model,optimizer,loss,prof,epochs,dataloader,device,scaler,memory_format_last,mixed_precision)
        
    total_time = time.time() - total_t
    throughput = f"{'{:.0f}'.format(T*epochs/total_time)} sequences /s "

    print(f"Throughput: {throughput} \nTotal time: {total_time} \nTime per epoch: {(t_epochs)/(epochs-1)} \
    \nTotal TimeEpoch {t_epochs} and Total TimeBatch: {t_batchs}. Difference from CPU dataloader : {t_epochs-t_batchs} \
        \nTime first epoch: {(epochs1)} \nTime Communication: {t_coms}\
            \nTime forwards: {t_forwards} \nTime Backward: {t_backwards}\
            ")
    
    
    
class CNN(nn.Module):
    def __init__(self,c_in,L,H_dims =[64,128,64], C_outs = [64,32,1], kernel_size = (1,2),dilation = 1, stride = 1,args_embedding = None,dic_class2rpz=None, input_shape = [6000,40,8]):
        super().__init__()
    
        self.c_out = C_outs[-1]
        self.dropout = nn.Dropout(0.2)
        if len(input_shape) == 4:
            self.Convs = nn.ModuleList([nn.Conv2d(c_in_, c_out_, kernel_size,padding=0,dilation=dilation) for c_in_,c_out_ in zip([c_in]+H_dims[:-1], H_dims)])
            l_out_add = (2*0 - dilation*(kernel_size[1]-1) -1)/stride + 1
        if len(input_shape)== 3:
            self.Convs = nn.ModuleList([nn.Conv1d(c_in_, c_out_, kernel_size,padding=0,dilation=dilation) for c_in_,c_out_ in zip([c_in]+H_dims[:-1], H_dims)])
            l_out_add = (2*0 - dilation*(kernel_size[0]-1) -1)/stride + 1

        
        l_out = int(L/stride**len(H_dims) + sum([l_out_add/stride**k for k in range(len(H_dims))])) 
        
        self.l_out = l_out


        self.relu = nn.ReLU()
        self.flatten = nn.Flatten()


        self.Dense_outs = nn.ModuleList([nn.Linear(c_in_,c_out_) for c_in_,c_out_ in zip([l_out*H_dims[-1]]+C_outs[:-1], C_outs)])

            
    def forward(self,x):
        if len(x.shape) == 3:
            B,N,L = x.shape
            C = 1
            x = x.unsqueeze(2)
            x = x.reshape(B*N,C,L)
        if len(x.shape) == 4:
            B,C,N,L = x.shape
            x.reshape(B*N,C,L)
             
        # Conv Layers :        
        for conv in self.Convs:
            x = self.dropout(self.relu(conv(x)))

        # Flatten :
        if False:
            x = x.permute(0,2,1,3)
            x = x.reshape(x.shape[0]*x.shape[1],-1)
        x = self.flatten(x)
        # Output Module : 
        for dense_out in self.Dense_outs[:-1]:
            x = self.dropout(self.relu(dense_out(x)))

        x = self.Dense_outs[-1](x)    # No activation
        # Reshape 
        x = x.reshape(B,N,self.c_out)
        return(x)


## Test CNN. Profiling Code. Pourquoi c'est si long sur mon framework ?

In [2]:
model_name = 'CNN'


B = 256 #8
T = 6000
L = 8
N = 40
epochs = 50
device = 'cuda' if torch.cuda.is_available() else 'cpu'
c_in,h_dim1,h_dim2,c_out = L, 64, 64, 1

# Profiling: 
max_memory = 0
# ...
    
    
# DataLoader: 
num_workers = 0
persistent_workers = False
pin_memory = False
prefetch_factor = None
drop_last = False
# ....

# DataParallel:
dataparallel = False
# ...

# Compiler :
compile_model = False
# ...

# Memory format = channel_last:
memory_format_last = False
# ....


# Mixed-Precision : FP16 Tesor Core
# mplémenter l'autocasting (le changement de précision, FP32 à FP16) dans le forward , avec la ligne with autocast(): dans la boucle de TRAINING ET la boucle de VALIDATION
mixed_precision = False
# ...


# Inputs : 
X,Y=  torch.randn(T,N,L),torch.randn(T,N)
inputs = CustomDataset(X,Y)
# ....


(prof,dataloader,sampler) = load_profile_dataloader(inputs,B,num_workers,persistent_workers,pin_memory,prefetch_factor,drop_last)
if model_name == 'MLP':
    (loss,model,optimizer) = load_model_loss_opt(c_in,h_dim1,h_dim2,c_out,device,model_name =model_name,compile_model = compile_model)
if model_name == 'CNN':
    # kernel_size= (1,2)
    kernel_size = (2,)
    (loss,model,optimizer) = load_model_loss_opt(H_dims =[64,128,64],C_outs = [64,32,1],kernel_size= kernel_size,c_in=1,device=device,dataparallel = dataparallel,model_name =model_name,input_shape = [6000,40,8],
                                                 compile_model = compile_model,memory_format_last = memory_format_last)

train_model(model,optimizer,loss,prof,epochs,dataloader,sampler,device,T,memory_format_last,mixed_precision)

number of total parameters: 55809
number of trainable parameters: 55809

Max GPU memory allocated: 0.2066488265991211 GB
Max GPU memory cached: 0.298828125 GB
Max CPU memory allocated: 3.2891807556152344 GB
Throughput: 23792 sequences /s  
Total time: 12.609472274780273 
Time per epoch: 0.2571245018316775     
Total TimeEpoch 12.599100589752197 and Total TimeBatch: 6.229737758636475. Difference from CPU dataloader : 6.369362831115723         
Time first epoch: 1.9661011695861816 
Time Communication: 4.335877180099487            
Time forwards: 2.5469892024993896 
Time Backward: 0.09984564781188965            


## Ajout de la mixed Precision : 
Aucun changement pour des petits batch size (8). Mais x2 pour un batch-size plus grand ! (256)
Etonnament, alors que 'with autocast():' n'est pas censé être compris dans le chargement du dataloader, on remarque un temps bien plus court pour çalorsque la mixed precision est activée (et pas pour le forward + calcul de loss)

In [4]:
# Profiling: 
max_memory = 0
# ...
    
    
# Mixed-Precision : FP16 Tesor Core
mixed_precision = True
# ...


# Inputs : 
X,Y=  torch.randn(T,N,L),torch.randn(T,N)
inputs = CustomDataset(X,Y)
# ....


(prof,dataloader,sampler) = load_profile_dataloader(inputs,B,num_workers,persistent_workers,pin_memory,prefetch_factor,drop_last)
(loss,model,optimizer) = load_model_loss_opt(H_dims =[64,128,64],C_outs = [64,32,1],kernel_size= kernel_size,c_in=1,device=device,dataparallel = dataparallel,model_name =model_name,input_shape = [6000,40,8],
                                                 compile_model = compile_model,memory_format_last = memory_format_last)
train_model(model,optimizer,loss,prof,epochs,dataloader,sampler,device,T,memory_format_last,mixed_precision)

number of total parameters: 55809
number of trainable parameters: 55809

Max GPU memory allocated: 0.2066488265991211 GB
Max GPU memory cached: 0.30078125 GB
Max CPU memory allocated: 3.300487518310547 GB
Throughput: 33981 sequences /s  
Total time: 8.828543663024902 
Time per epoch: 0.1799228288689438     
Total TimeEpoch 8.816218614578247 and Total TimeBatch: 5.945619106292725. Difference from CPU dataloader : 2.8705995082855225         
Time first epoch: 0.19704151153564453 
Time Communication: 0.26018214225769043            
Time forwards: 1.5356342792510986 
Time Backward: 0.15899991989135742            


## Ajout de Num workers: 
Pour un petit batch, num_worker = 1 semble meilleur (x2). Pour un Gros Batch, pas du tout, baisse de throughput ...

In [6]:
# Profiling: 
max_memory = 0
# ...
    

# DataLoader: 
persistent_workers = True
pin_memory = True
prefetch_factor = 3
drop_last = False
# ....

# Mixed-Precision : FP16 Tesor Core
mixed_precision = False
# ...

# Inputs : 
X,Y=  torch.randn(T,N,L),torch.randn(T,N)
inputs = CustomDataset(X,Y)
# ....

for num_workers in [0,1,2]:
    print(f'------------------------------')
    print(f'\nNum workers: {num_workers}')
    (prof,dataloader,sampler) = load_profile_dataloader(inputs,B,num_workers,persistent_workers,pin_memory,prefetch_factor,drop_last)
    (loss,model,optimizer) = load_model_loss_opt(H_dims =[64,128,64],C_outs = [64,32,1],kernel_size= kernel_size,c_in=1,device=device,dataparallel = dataparallel,model_name =model_name,input_shape = [6000,40,8],
                                                     compile_model = compile_model,memory_format_last = memory_format_last)

    train_model(model,optimizer,loss,prof,epochs,dataloader,sampler,device,T,memory_format_last,mixed_precision)

------------------------------

Num workers: 0
number of total parameters: 55809
number of trainable parameters: 55809

Max GPU memory allocated: 0.2066488265991211 GB
Max GPU memory cached: 0.30078125 GB
Max CPU memory allocated: 3.305156707763672 GB
Throughput: 29619 sequences /s  
Total time: 10.128560304641724 
Time per epoch: 0.20653580159557108     
Total TimeEpoch 10.120254278182983 and Total TimeBatch: 2.8240091800689697. Difference from CPU dataloader : 7.296245098114014         
Time first epoch: 0.21078705787658691 
Time Communication: 5.36291241645813            
Time forwards: 1.031101942062378 
Time Backward: 0.10753703117370605            
------------------------------

Num workers: 1
number of total parameters: 55809
number of trainable parameters: 55809

Max GPU memory allocated: 0.2066488265991211 GB
Max GPU memory cached: 0.30078125 GB
Max CPU memory allocated: 3.305248260498047 GB
Throughput: 26092 sequences /s  
Total time: 11.4976167678833 
Time per epoch: 0.2344

## Choix de num_workers avec la mixed precision activée:
Visiblement, il vaut mieux utiliser 1 Worker avec la mixed precision

In [7]:
# Profiling: 
max_memory = 0
# ...
    
# DataLoader: 
persistent_workers = True
pin_memory = True
prefetch_factor = 3
drop_last = False
# ....

# Mixed-Precision : FP16 Tesor Core
mixed_precision = True
# ...

# Inputs : 
X,Y=  torch.randn(T,N,L),torch.randn(T,N)
inputs = CustomDataset(X,Y)
# ....

for num_workers in [0,1,2]:
    print(f'------------------------------')
    print(f'\nNum workers: {num_workers}')
    (prof,dataloader,sampler) = load_profile_dataloader(inputs,B,num_workers,persistent_workers,pin_memory,prefetch_factor,drop_last)
    (loss,model,optimizer) = load_model_loss_opt(H_dims =[64,128,64],C_outs = [64,32,1],kernel_size= kernel_size,c_in=1,device=device,dataparallel = dataparallel,model_name =model_name,input_shape = [6000,40,8],
                                                     compile_model = compile_model,memory_format_last = memory_format_last)

    train_model(model,optimizer,loss,prof,epochs,dataloader,sampler,device,T,memory_format_last,mixed_precision)

------------------------------

Num workers: 0
number of total parameters: 55809
number of trainable parameters: 55809

Max GPU memory allocated: 0.2066488265991211 GB
Max GPU memory cached: 0.30078125 GB
Max CPU memory allocated: 3.3061561584472656 GB
Throughput: 47927 sequences /s  
Total time: 6.259514093399048 
Time per epoch: 0.12758264736253389     
Total TimeEpoch 6.25154972076416 and Total TimeBatch: 3.850377321243286. Difference from CPU dataloader : 2.401172399520874         
Time first epoch: 0.14849138259887695 
Time Communication: 0.10798120498657227            
Time forwards: 1.3080604076385498 
Time Backward: 0.13345575332641602            
------------------------------

Num workers: 1
number of total parameters: 55809
number of trainable parameters: 55809

Max GPU memory allocated: 0.2066488265991211 GB
Max GPU memory cached: 0.30078125 GB
Max CPU memory allocated: 3.3061752319335938 GB
Throughput: 59691 sequences /s  
Total time: 5.0258495807647705 
Time per epoch: 0.

## Choix de Batch-size: 

In [8]:
# Profiling: 
max_memory = 0
# ...
    
# DataLoader: 
persistent_workers = True
pin_memory = True
prefetch_factor = 3
drop_last = False
# ....

# Mixed-Precision : FP16 Tesor Core
mixed_precision = True
# ...

# Inputs : 
X,Y=  torch.randn(T,N,L),torch.randn(T,N)
inputs = CustomDataset(X,Y)
# ....

for batch_size in [8,16,32,64,128,256,512,1024]:
    print(f'------------------------------')
    print(f'\nBatch size: {batch_size}')
    (prof,dataloader,sampler) = load_profile_dataloader(inputs,batch_size,num_workers,persistent_workers,pin_memory,prefetch_factor,drop_last)
    (loss,model,optimizer) = load_model_loss_opt(H_dims =[64,128,64],C_outs = [64,32,1],kernel_size= kernel_size,c_in=1,device=device,dataparallel = dataparallel,model_name =model_name,input_shape = [6000,40,8],
                                                     compile_model = compile_model,memory_format_last = memory_format_last)

    train_model(model,optimizer,loss,prof,epochs,dataloader,sampler,device,T,memory_format_last,mixed_precision)

------------------------------

Batch size: 8
number of total parameters: 55809
number of trainable parameters: 55809

Max GPU memory allocated: 0.2066488265991211 GB
Max GPU memory cached: 0.30078125 GB
Max CPU memory allocated: 3.314830780029297 GB
Throughput: 2437 sequences /s  
Total time: 123.12023329734802 
Time per epoch: 2.5123647232444917     
Total TimeEpoch 123.1058714389801 and Total TimeBatch: 111.8541350364685. Difference from CPU dataloader : 11.251736402511597         
Time first epoch: 2.1629693508148193 
Time Communication: 6.7922515869140625            
Time forwards: 38.89456915855408 
Time Backward: 4.156979322433472            
------------------------------

Batch size: 16
number of total parameters: 55809
number of trainable parameters: 55809

Max GPU memory allocated: 0.2066488265991211 GB
Max GPU memory cached: 0.302734375 GB
Max CPU memory allocated: 3.3161048889160156 GB
Throughput: 4542 sequences /s  
Total time: 66.05665755271912 
Time per epoch: 1.3478025

## Ajout de DataParallelism: 

In [85]:
# Profiling: 
max_memory = 0
# ...
    
num_workers = 1
persistent_workers = True
pin_memory = True
prefetch_factor = 3
drop_last = False

# DataParallel:
dataparallel = True
world_size = 2
# ...

# Inputs : 
X,Y=  torch.randn(T,N,L),torch.randn(T,N)
inputs = CustomDataset(X,Y)
# ....


def mp_train(rank,world_size,X,Y,B,num_workers, epochs, device):
    setup(rank, world_size)
    
    (prof,dataloader,sampler) = load_profile_dataloader(inputs,B,num_workers,persistent_workers,pin_memory,prefetch_factor,drop_last)
    (loss,model,optimizer) = load_model_loss_opt(H_dims =[64,128,64],C_outs = [64,32,1],kernel_size= kernel_size,c_in=1,device=device,dataparallel = dataparallel,model_name =model_name,input_shape = [6000,40,8],
                                                     compile_model = compile_model,memory_format_last = memory_format_last,rank = rank, world_size = world_size)

    train_model(model,optimizer,loss,prof,epochs,dataloader,sampler,device,T,memory_format_last,mixed_precision)
    
    cleanup()

t0 = time.time()
mp.spawn(mp_train, args=(world_size, X, Y, B, num_workers, epochs, device), nprocs=world_size, join=True) 
print('Parallelised time: ',time.time()-t0)

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/root/anaconda3/envs/pytorch-2.0.1_py-3.10.5/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/root/anaconda3/envs/pytorch-2.0.1_py-3.10.5/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'mp_train' on <module '__main__' (built-in)>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/root/anaconda3/envs/pytorch-2.0.1_py-3.10.5/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/root/anaconda3/envs/pytorch-2.0.1_py-3.10.5/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'mp_train' on <module '__main__' (built-in)>


ProcessExitedException: process 1 terminated with exit code 1

## Ajout de Compiler: 
Inutile, voir plus mauvais ...
L'utilisation de 'inductor' semble cependant avoir améliorer le temps de forward

In [76]:
# Profiling: 
max_memory = 0
# ...
    
# Compiler :
#torch._dynamo.config.suppress_errors = True
torch._dynamo.reset()
compile_model = True
backend = 'cudagraphs'
# ...

# Inputs : 
X,Y=  torch.randn(T,N,L),torch.randn(T,N)
inputs = CustomDataset(X,Y)
# ....


(prof,dataloader,sampler) = load_profile_dataloader(inputs,B,num_workers,persistent_workers,pin_memory,prefetch_factor,drop_last)
(loss,model,optimizer) = load_model_loss_opt(H_dims =[64,128,64],C_outs = [64,32,1],kernel_size= kernel_size,c_in=1,device=device,dataparallel = False,model_name =model_name,input_shape = [6000,40,8],
                                                 compile_model = compile_model,memory_format_last = memory_format_last,compile_backend = backend)

train_model(model,optimizer,loss,prof,epochs,dataloader,sampler,device,T,memory_format_last,mixed_precision)



Throughput: 22414 sequences /s  
Total time: 13.384491920471191 
Time per epoch: 0.2731508527483259     
Total TimeEpoch 13.384391784667969 and Total TimeBatch: 5.277243614196777. Difference from CPU dataloader : 8.107148170471191         
Time first epoch: 1.5007538795471191 
Time Communication: 6.448937892913818            
Time forwards: 2.015937328338623 
Time Backward: 0.0968012809753418            


In [77]:
# Profiling: 
max_memory = 0
# ...
    
# Compiler :
torch._dynamo.config.suppress_errors = True
torch._dynamo.reset()
compile_model = True
backend = 'inductor'
# ...

# Inputs : 
X,Y=  torch.randn(T,N,L),torch.randn(T,N)
inputs = CustomDataset(X,Y)
# ....


(prof,dataloader,sampler) = load_profile_dataloader(inputs,B,num_workers,persistent_workers,pin_memory,prefetch_factor,drop_last)
(loss,model,optimizer) = load_model_loss_opt(H_dims =[64,128,64],C_outs = [64,32,1],kernel_size= kernel_size,c_in=1,device=device,dataparallel = False,model_name =model_name,input_shape = [6000,40,8],
                                                 compile_model = compile_model,memory_format_last = memory_format_last,compile_backend = backend)

train_model(model,optimizer,loss,prof,epochs,dataloader,sampler,device,T,memory_format_last,mixed_precision)

[2024-06-17 15:13:22,355] torch._dynamo.convert_frame: [ERROR] WON'T CONVERT forward /tmp/ipykernel_260934/332402667.py line 223 
due to: 
Traceback (most recent call last):
  File "/root/anaconda3/envs/pytorch-2.0.1_py-3.10.5/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 25, in <module>
    from pkg_resources import packaging  # type: ignore[attr-defined]
ImportError: cannot import name 'packaging' from 'pkg_resources' (/root/anaconda3/envs/pytorch-2.0.1_py-3.10.5/lib/python3.10/site-packages/pkg_resources/__init__.py)

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/root/anaconda3/envs/pytorch-2.0.1_py-3.10.5/lib/python3.10/site-packages/torch/_dynamo/output_graph.py", line 675, in call_user_compiler
    raise BackendCompilerFailed(self.compiler_fn, e) from e
torch._dynamo.exc.BackendCompilerFailed: debug_wrapper raised ImportError: cannot import name 'packaging' from 'pkg_resources' (/root/anaconda3

Throughput: 24224 sequences /s  
Total time: 12.384198427200317 
Time per epoch: 0.25273663657052176     
Total TimeEpoch 12.384095191955566 and Total TimeBatch: 4.791990280151367. Difference from CPU dataloader : 7.592104911804199         
Time first epoch: 0.63791823387146 
Time Communication: 6.234528541564941            
Time forwards: 1.3105883598327637 
Time Backward: 0.09683609008789062            


In [None]:
# Profiling: 
max_memory = 0
# ...
    
# DataLoader: 
num_workers = 2
persistent_workers = True
pin_memory = True
prefetch_factor = 2
drop_last = False
# ....

# Mixed-Precision : FP16 Tesor Core
mixed_precision = False
# ...

# Inputs : 
X,Y=  torch.randn(T,N,L),torch.randn(T,N)
inputs = CustomDataset(X,Y)
# ....


(prof,dataloader,sampler) = load_profile_dataloader(inputs,B,num_workers,persistent_workers,pin_memory,prefetch_factor,drop_last)
(loss,model,optimizer) = load_model_loss_opt(H_dims =[64,128,64],C_outs = [64,32,1],kernel_size= kernel_size,c_in=1,device=device,dataparallel = False,model_name =model_name,input_shape = [6000,40,8],
                                                 compile_model = compile_model,memory_format_last = memory_format_last)

train_model(model,optimizer,loss,prof,epochs,dataloader,sampler,device,T,memory_format_last,mixed_precision)

## Memory-format channel Last pour le CNN : Uniquement pour des inputs 4D

In [55]:
# Profiling: 
max_memory = 0
# ...
    
    
# Memory format = channel_last:
memory_format_last = True
# ....


RuntimeError: required rank 4 tensor to use channels_last format

## Test avec different worker, sur model moyen (petit), input shape proche des miens : 

In [43]:
blabla=  1568.73849
f"{'{:.0f}'.format(blabla)} sequences /s "

'1569 sequences /s '

In [None]:
model_name = 'CNN'

B = 8
T = 6000
L = 8
N = 40
epochs = 300
device = 'cuda' if torch.cuda.is_available() else 'cpu'
c_in,h_dim1,h_dim2,c_out = L, 64, 64, 1

persistent_workers = False
pin_memory = False
prefetch_factor = None
drop_last = False

X,Y=  torch.randn(T,N,L),torch.randn(T,N)
#inputs = list(zip(X,Y))
inputs = CustomDataset(X,Y)

for num_workers in [0,1,2,4,6,8]:
    print('\nNum workers:',num_workers)
    (prof,dataloader,sampler) = load_profile_dataloader(inputs,B,num_workers,persistent_workers,pin_memory,prefetch_factor,drop_last)
    if model_name == 'MLP':
        (loss,model,optimizer) = load_model_loss_opt(c_in,h_dim1,h_dim2,c_out,device,model_name =model_name)
    if model_name == 'CNN':
        # kernel_size= (1,2)
        kernel_size = (2,)
        (loss,model,optimizer) = load_model_loss_opt(H_dims =[64,128,64],C_outs = [64,32,1],kernel_size= kernel_size,c_in=1,device=device,dataparallel = False,model_name =model_name,input_shape = [6000,40,8])
        
    train_model(model,optimizer,loss,prof,epochs,dataloader,sampler,device)

## Essaie avec toute les données chargées initialement en mémoire : 
Ici, impossible de les charger en mémoire en amont pour du num_worker > 0. 

In [24]:
B = 8
T = 6000
L = 8
N = 40
epochs = 300

persistent_workers = False
pin_memory = False
prefetch_factor = None
drop_last = False

device = 'cuda' if torch.cuda.is_available() else 'cpu'


c_in,h_dim1,h_dim2,c_out = L, 64, 64, 1

X,Y=  torch.randn(T,N,L).to(device),torch.randn(T,N,1).to(device)
inputs = CustomDataset(X,Y)

for num_workers in [0]:
    print('\nNum workers:',num_workers)
    (prof,dataloader,sampler) = load_profile_dataloader(inputs,B,num_workers,persistent_workers,pin_memory,prefetch_factor,drop_last)
    (loss,model,optimizer) = load_model_loss_opt(c_in,h_dim1,h_dim2,c_out,device)
    train_model(model,optimizer,loss,prof,epochs,dataloader,sampler,device)


Num workers: 0
Total time: 0.634476900100708 
Time per epoch: 0.0         
Time first epoch: 0.634458065032959 
Time Communication: 0.0035734176635742188            
Time forwards: 0.11711645126342773 
Time Backward: 0.038048505783081055            


## Ajout de : persistent_worker, pin_memory, prefetch_factor = 2 

In [22]:
persistent_workers = True
pin_memory = True
drop_last = True

X,Y=  torch.randn(T,N,L),torch.randn(T,N,1)
inputs = CustomDataset(X,Y)

for num_workers in [0,1]:
    for prefetch_factor in [None,2,4,8]:
        print('\nNum workers:',num_workers,'and prefetch_factor: ',prefetch_factor)
        (prof,dataloader,sampler) = load_profile_dataloader(inputs,B,num_workers,persistent_workers,pin_memory,prefetch_factor,drop_last)
        (loss,model,optimizer) = load_model_loss_opt(c_in,h_dim1,h_dim2,c_out,device)
        train_model(model,optimizer,loss,prof,epochs,dataloader,sampler,device)


Num workers: 0 and prefetch_factor:  None
Total time: 0.7066452503204346 
Time per epoch: 0.0         
Time first epoch: 0.7066271305084229 
Time Communication: 0.03134489059448242            
Time forwards: 0.11990141868591309 
Time Backward: 0.03914356231689453            

Num workers: 0 and prefetch_factor:  2
Total time: 0.6967222690582275 
Time per epoch: 0.0         
Time first epoch: 0.6967051029205322 
Time Communication: 0.031247854232788086            
Time forwards: 0.1203000545501709 
Time Backward: 0.03901362419128418            

Num workers: 0 and prefetch_factor:  4
Total time: 0.6892411708831787 
Time per epoch: 0.0         
Time first epoch: 0.6892240047454834 
Time Communication: 0.031081199645996094            
Time forwards: 0.11827659606933594 
Time Backward: 0.0383143424987793            

Num workers: 0 and prefetch_factor:  8
Total time: 0.6936221122741699 
Time per epoch: 0.0         
Time first epoch: 0.6936051845550537 
Time Communication: 0.03083586692810

## Choice of best config: 

In [71]:
num_workers= 1 #4
persistent_workers = True
pin_memory = True
prefetch_factor = 4
drop_last = False

## Trial with Dataparallel : 

In [None]:
import os
import socket

def find_free_port():
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind(('', 0))
    addr, port = s.getsockname()
    s.close()
    return port

free_port = find_free_port()
os.environ['MASTER_ADDR'] = '137.121.170.69'
os.environ['MASTER_PORT'] = 8888 #Ne fonctionne pas #str(free_port) 
print(f"Using port {free_port} for MASTER_PORT")

dataparallel = True

(prof,dataloader,sampler) = load_profile_dataloader(inputs,B,num_workers,persistent_workers,pin_memory,prefetch_factor,drop_last,dataparallel)
(loss,model,optimizer) = load_model_loss_opt(c_in,h_dim1,h_dim2,c_out,device,dataparallel)
train_model(model,optimizer,loss,prof,epochs,dataloader,sampler,device)