In [1]:
import torch
import psutil
import GPUtil

import torch
from torch import nn
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from datasets import COCODataset
from tqdm import tqdm
import json
import time

def print_memory_usage():
    """Print a two-line snapshot of host RAM and first-GPU memory usage.

    The first line reports ``psutil.virtual_memory()`` used/total in GB.
    The second line reports ``memoryUsed``/``memoryTotal`` (MB) of the first
    device returned by ``GPUtil.getGPUs()``, or "No GPU found" when that list
    is empty. Returns ``None``.
    """
    bytes_per_gb = 1024 ** 3

    # Host memory: take a single reading so used/total come from one sample.
    ram = psutil.virtual_memory()
    used_gb = ram.used / bytes_per_gb
    total_gb = ram.total / bytes_per_gb
    print(f"CPU Memory: {used_gb:.2f} GB / {total_gb:.2f} GB")

    # Device memory: only the first listed GPU is reported.
    detected = GPUtil.getGPUs()
    if not detected:
        print("No GPU found")
        return

    first_gpu = detected[0]
    print(f"GPU Memory: {first_gpu.memoryUsed:.2f} MB / {first_gpu.memoryTotal:.2f} MB")

# Memory benchmark for the unmodified model: load on CPU, move to GPU, release.
# `gc` is only needed by the cleanup in step 6, so it is imported here.
import gc

# 1. Print initial memory usage (baseline before anything is loaded)
print("Initial Memory Usage:")
print_memory_usage()
print()

# 2. Load model into CPU memory.
# The delta is taken from psutil.virtual_memory().used before/after the load.
# NOTE(review): that figure is presumably system-wide, so other processes can
# skew it - confirm if exact per-process numbers are needed.
print("Loading model into CPU memory...")
cpu_memory_before = psutil.virtual_memory().used
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
cpu_memory_after = psutil.virtual_memory().used

print(f"CPU Memory Used for Model: {(cpu_memory_after - cpu_memory_before) / (1024 ** 2):.2f} MB")
print()

# 3. Print memory usage after loading to CPU
print("Memory Usage after loading to CPU:")
print_memory_usage()
print()

# 4. Load model into GPU memory and report the change in the first GPU's
# `memoryUsed` as seen by GPUtil (0 is used when GPUtil lists no devices).
if torch.cuda.is_available():
    print("Loading model into GPU memory...")
    gpus = GPUtil.getGPUs()
    gpu_memory_before = gpus[0].memoryUsed if gpus else 0

    model.cuda()
    torch.cuda.synchronize()  # Ensure the transfer has finished before measuring

    gpus = GPUtil.getGPUs()
    gpu_memory_after = gpus[0].memoryUsed if gpus else 0

    print(f"GPU Memory Used for Model: {gpu_memory_after - gpu_memory_before:.2f} MB")
else:
    print("CUDA is not available. Skipping GPU memory loading.")
print()

# 5. Print memory usage after loading to GPU
print("Memory Usage after loading to GPU:")
print_memory_usage()
print()

# 6. Remove model from GPU or move back to CPU
remove_from_gpu = True  # Set this to False to move back to CPU instead

if remove_from_gpu:
    print("Removing model from GPU memory...")
    del model
else:
    print("Moving model back to CPU and removing from GPU memory...")
    model = model.cpu()

# `del` only drops the name: tensors kept alive by reference cycles are freed
# by the garbage collector, and empty_cache() can only hand back blocks that
# are no longer occupied. Collect first so the cache release is effective.
gc.collect()
torch.cuda.empty_cache()


# Give the driver a moment before taking the final reading.
print("sleeping for 5 seconds")
time.sleep(5)
# 7. Print final memory usage
print("Final Memory Usage:")
print_memory_usage()

Initial Memory Usage:
CPU Memory: 4.57 GB / 31.27 GB
GPU Memory: 550.00 MB / 24576.00 MB

Loading model into CPU memory...




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

CPU Memory Used for Model: 14223.23 MB

Memory Usage after loading to CPU:
CPU Memory: 18.47 GB / 31.27 GB
GPU Memory: 548.00 MB / 24576.00 MB

Loading model into GPU memory...
GPU Memory Used for Model: 14870.00 MB

Memory Usage after loading to GPU:
CPU Memory: 4.76 GB / 31.27 GB
GPU Memory: 15422.00 MB / 24576.00 MB

Removing model from GPU memory...
sleeping for 5 seconds
Final Memory Usage:
CPU Memory: 4.79 GB / 31.27 GB
GPU Memory: 813.00 MB / 24576.00 MB


In [2]:
import copy

# Step 1: baseline snapshot taken before this cell loads anything.
print("Initial Memory Usage:")
print_memory_usage()
print()

# Step 2: load the BLIP-2 checkpoint on the CPU, sampling psutil's "used"
# counter immediately before and after to estimate the model's RAM cost.
print("Loading model into CPU memory...")
cpu_memory_before = psutil.virtual_memory().used
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
cpu_memory_after = psutil.virtual_memory().used

cpu_memory_delta_mb = (cpu_memory_after - cpu_memory_before) / (1024 ** 2)
print(f"CPU Memory Used for Model: {cpu_memory_delta_mb:.2f} MB")
print()

# Step 3: snapshot with the model resident in host memory.
print("Memory Usage after loading to CPU:")
print_memory_usage()
print()

def apply_uniform_quantization(model, num_bits):
    """Simulate uniform (affine) quantization of every ``nn.Linear`` in ``model``.

    Each Linear weight (and bias, when present) is mapped per-tensor onto
    ``2**num_bits`` evenly spaced levels spanning its own min..max range and
    immediately mapped back to floating point. The result is written back to
    ``.data`` as a float tensor of the same shape, so values are coarsened but
    no packed integer storage is produced.

    Args:
        model: ``nn.Module`` whose Linear layers are modified in place.
        num_bits: Number of quantization bits; must be at least 1.

    Returns:
        The same ``model`` object (as returned by ``nn.Module.apply``).

    Raises:
        ValueError: If ``num_bits`` is less than 1 (there would be no usable
            quantization range and the result would be NaN).
    """
    if num_bits < 1:
        raise ValueError(f"num_bits must be >= 1, got {num_bits}")

    def quantize_uniform(tensor, num_bits):
        """Quantize ``tensor`` to ``2**num_bits`` levels and dequantize it."""
        # min()/max() raise on zero-element tensors; nothing to quantize anyway.
        if tensor.numel() == 0:
            return tensor

        qmin, qmax = 0, 2**num_bits - 1
        t_min, t_max = tensor.min(), tensor.max()
        scale = (t_max - t_min) / (qmax - qmin)

        # A constant tensor (e.g. an all-zero bias) has zero range, so scale is
        # 0 and the divisions below would turn every element into NaN. Such a
        # tensor is already exactly representable, so return it unchanged.
        if scale == 0:
            return tensor

        zero_point = qmin - torch.round(t_min / scale)

        quantized = torch.clamp(torch.round(tensor / scale + zero_point), qmin, qmax)
        dequantized = (quantized - zero_point) * scale

        return dequantized

    def quantize_layer(layer):
        """Fake-quantize the parameters of ``layer`` if it is an ``nn.Linear``."""
        if isinstance(layer, nn.Linear):
            layer.weight.data = quantize_uniform(layer.weight.data, num_bits)
            if layer.bias is not None:
                layer.bias.data = quantize_uniform(layer.bias.data, num_bits)
        return layer

    # Module.apply visits every submodule (and the module itself).
    return model.apply(quantize_layer)

# `gc` is needed by the cleanup in step 6; `time` by the pause before step 7.
import gc
import time

# Apply uniform quantization to the loaded model. No copy is made: the Linear
# layers of `model` are overwritten in place and the same object is returned.
model = apply_uniform_quantization(model, num_bits=6)  # You can adjust num_bits as needed



# Print memory usage after quantization. The printed label still says
# "Cloning Model" so it keeps matching previously recorded output; nothing is
# cloned at this point.
print("--")
print("Memory Usage after Cloning Model:")
print_memory_usage()
print()


# 4. Load model into GPU memory and report the change in the first GPU's
# `memoryUsed` as seen by GPUtil (0 is used when GPUtil lists no devices).
if torch.cuda.is_available():
    print("Loading model into GPU memory...")
    gpus = GPUtil.getGPUs()
    gpu_memory_before = gpus[0].memoryUsed if gpus else 0

    model.cuda()
    torch.cuda.synchronize()  # Ensure the transfer has finished before measuring

    gpus = GPUtil.getGPUs()
    gpu_memory_after = gpus[0].memoryUsed if gpus else 0

    print(f"GPU Memory Used for Model: {gpu_memory_after - gpu_memory_before:.2f} MB")
else:
    print("CUDA is not available. Skipping GPU memory loading.")
print()

# 5. Print memory usage after loading to GPU
print("Memory Usage after loading to GPU:")
print_memory_usage()
print()

# 6. Remove model from GPU or move back to CPU
remove_from_gpu = True  # Set this to False to move back to CPU instead

if remove_from_gpu:
    print("Removing model from GPU memory...")
    del model
else:
    print("Moving model back to CPU and removing from GPU memory...")
    model = model.cpu()

# `del` only drops the name: tensors kept alive by reference cycles are freed
# by the garbage collector, and empty_cache() can only hand back blocks that
# are no longer occupied. Collect first so the cache release is effective.
gc.collect()
torch.cuda.empty_cache()


# Give the driver a moment before taking the final reading.
print("sleeping for 5 seconds")
time.sleep(5)
# 7. Print final memory usage
print("Final Memory Usage:")
print_memory_usage()

Initial Memory Usage:
CPU Memory: 4.80 GB / 31.27 GB
GPU Memory: 813.00 MB / 24576.00 MB

Loading model into CPU memory...




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

CPU Memory Used for Model: 14162.66 MB

Memory Usage after loading to CPU:
CPU Memory: 18.63 GB / 31.27 GB
GPU Memory: 795.00 MB / 24576.00 MB

--
Memory Usage after Cloning Model:
CPU Memory: 18.88 GB / 31.27 GB
GPU Memory: 797.00 MB / 24576.00 MB

Loading model into GPU memory...
GPU Memory Used for Model: 14614.00 MB

Memory Usage after loading to GPU:
CPU Memory: 9.67 GB / 31.27 GB
GPU Memory: 15411.00 MB / 24576.00 MB

Removing model from GPU memory...
sleeping for 5 seconds
Final Memory Usage:
CPU Memory: 9.34 GB / 31.27 GB
GPU Memory: 801.00 MB / 24576.00 MB


In [None]:
import copy

# Step 1: baseline snapshot taken before this cell loads anything.
print("Initial Memory Usage:")
print_memory_usage()
print()

# Step 2: load the BLIP-2 checkpoint on the CPU, sampling psutil's "used"
# counter immediately before and after to estimate the model's RAM cost.
print("Loading model into CPU memory...")
cpu_memory_before = psutil.virtual_memory().used
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
cpu_memory_after = psutil.virtual_memory().used

cpu_memory_delta_mb = (cpu_memory_after - cpu_memory_before) / (1024 ** 2)
print(f"CPU Memory Used for Model: {cpu_memory_delta_mb:.2f} MB")
print()

# Step 3: snapshot with the model resident in host memory.
print("Memory Usage after loading to CPU:")
print_memory_usage()
print()

def apply_uniform_quantization(model, num_bits):
    """Simulate uniform (affine) quantization of every ``nn.Linear`` in ``model``.

    Each Linear weight (and bias, when present) is mapped per-tensor onto
    ``2**num_bits`` evenly spaced levels spanning its own min..max range and
    immediately mapped back to floating point. The result is written back to
    ``.data`` as a float tensor of the same shape, so values are coarsened but
    no packed integer storage is produced.

    Args:
        model: ``nn.Module`` whose Linear layers are modified in place.
        num_bits: Number of quantization bits; must be at least 1.

    Returns:
        The same ``model`` object (as returned by ``nn.Module.apply``).

    Raises:
        ValueError: If ``num_bits`` is less than 1 (there would be no usable
            quantization range and the result would be NaN).
    """
    if num_bits < 1:
        raise ValueError(f"num_bits must be >= 1, got {num_bits}")

    def quantize_uniform(tensor, num_bits):
        """Quantize ``tensor`` to ``2**num_bits`` levels and dequantize it."""
        # min()/max() raise on zero-element tensors; nothing to quantize anyway.
        if tensor.numel() == 0:
            return tensor

        qmin, qmax = 0, 2**num_bits - 1
        t_min, t_max = tensor.min(), tensor.max()
        scale = (t_max - t_min) / (qmax - qmin)

        # A constant tensor (e.g. an all-zero bias) has zero range, so scale is
        # 0 and the divisions below would turn every element into NaN. Such a
        # tensor is already exactly representable, so return it unchanged.
        if scale == 0:
            return tensor

        zero_point = qmin - torch.round(t_min / scale)

        quantized = torch.clamp(torch.round(tensor / scale + zero_point), qmin, qmax)
        dequantized = (quantized - zero_point) * scale

        return dequantized

    def quantize_layer(layer):
        """Fake-quantize the parameters of ``layer`` if it is an ``nn.Linear``."""
        if isinstance(layer, nn.Linear):
            layer.weight.data = quantize_uniform(layer.weight.data, num_bits)
            if layer.bias is not None:
                layer.bias.data = quantize_uniform(layer.bias.data, num_bits)
        return layer

    # Module.apply visits every submodule (and the module itself).
    return model.apply(quantize_layer)

# `gc` is needed by the cleanup in step 6; `time` by the pause before step 7.
import gc
import time

# Apply uniform quantization to the loaded model. No copy is made: the Linear
# layers of `model` are overwritten in place and the same object is returned.
model = apply_uniform_quantization(model, num_bits=6)  # You can adjust num_bits as needed



# Print memory usage after quantization. The printed label still says
# "Cloning Model" to stay consistent with the earlier cell's recorded output;
# nothing is cloned at this point.
print("--")
print("Memory Usage after Cloning Model:")
print_memory_usage()
print()


# 4. Load model into GPU memory and report the change in the first GPU's
# `memoryUsed` as seen by GPUtil (0 is used when GPUtil lists no devices).
if torch.cuda.is_available():
    print("Loading model into GPU memory...")
    gpus = GPUtil.getGPUs()
    gpu_memory_before = gpus[0].memoryUsed if gpus else 0

    model.cuda()
    torch.cuda.synchronize()  # Ensure the transfer has finished before measuring

    gpus = GPUtil.getGPUs()
    gpu_memory_after = gpus[0].memoryUsed if gpus else 0

    print(f"GPU Memory Used for Model: {gpu_memory_after - gpu_memory_before:.2f} MB")
else:
    print("CUDA is not available. Skipping GPU memory loading.")
print()

# 5. Print memory usage after loading to GPU
print("Memory Usage after loading to GPU:")
print_memory_usage()
print()

# 6. Remove model from GPU or move back to CPU
remove_from_gpu = True  # Set this to False to move back to CPU instead

if remove_from_gpu:
    print("Removing model from GPU memory...")
    del model
else:
    print("Moving model back to CPU and removing from GPU memory...")
    model = model.cpu()

# `del` only drops the name: tensors kept alive by reference cycles are freed
# by the garbage collector, and empty_cache() can only hand back blocks that
# are no longer occupied. Collect first so the cache release is effective.
gc.collect()
torch.cuda.empty_cache()


# Give the driver a moment before taking the final reading.
print("sleeping for 5 seconds")
time.sleep(5)
# 7. Print final memory usage
print("Final Memory Usage:")
print_memory_usage()