In [1]:
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, DistributedSampler
import torchvision
import lightning


  Referenced from: <DAC8FDCB-770B-356E-BA9C-E2F40A2AA20E> /opt/anaconda3/lib/python3.9/site-packages/torchvision/image.so
  Expected in:     <AE6DCE26-A528-35ED-BB3D-88890D27E6B9> /opt/anaconda3/lib/python3.9/site-packages/torch/lib/libtorch_cpu.dylib
  warn(f"Failed to load image Python extension: {e}")


In [3]:
# Preview the resolved location of the combined training pickle. The path is
# built relative to the parent of the current working directory, so the value
# shown depends on where the notebook kernel was started.
os.path.join(
    os.path.dirname(os.getcwd()),
    "Final/Data/CIFAR10/cifar-10-batches-py",
    "train_data.pkl",
)

'/Users/adityatandon/Documents/VS Code/Deep_Learning/Final/Data/CIFAR10/cifar-10-batches-py/train_data.pkl'

In [4]:
# --- Data locations ---------------------------------------------------------
# Both paths are resolved relative to the *parent* of the current working
# directory, i.e. <parent-of-cwd>/Final/Data/CIFAR10/cifar-10-batches-py/...
data_dir = os.path.join(
    os.path.dirname(os.getcwd()),
    "Final/Data/CIFAR10/cifar-10-batches-py",
    "train_data.pkl",
)
data_dir_val = os.path.join(
    os.path.dirname(os.getcwd()), "Final/Data/CIFAR10/cifar-10-batches-py/test_batch"
)

# --- Shared hyper-parameters ------------------------------------------------
num_in_channels = 3  # image channels: discriminator input / generator output
dim_z = 16  # channel count of the latent noise tensor fed to the generator
kern_size = 4  # kernel size shared by every (transposed) convolution
# NOTE(review): `lr`, `num_epochs`, `gen_stride` and `gen_padding_size` are not
# referenced by the cells visible in this review; kept for compatibility.
lr = 2e-4
num_epochs = 3

num_disc_feat = 32
disc_stride = 1
disc_padding_size = 0

num_gen_feat = 32
gen_stride = 3
gen_padding_size = 1

# Parameters of the normal distribution used to initialise conv weights.
mean = 0
std = 0.02  # from the DCGAN paper

# --- Device selection -------------------------------------------------------
# `torch.has_mps` is deprecated and only reports whether the build includes
# MPS support; `torch.backends.mps.is_available()` checks that MPS can actually
# be used. `getattr` keeps this working on torch builds without the backend.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
else:
    device = "cpu"

# NOTE(review): the detected device is unconditionally overridden here, so
# every cell runs on the CPU. Presumably a debugging override -- confirm
# before removing it.
device = "cpu"

In [58]:
def unpickle(file):
    """Load and return the object pickled at path ``file``.

    The file is read with ``encoding="bytes"``, so 8-bit strings written by
    Python 2 (as in the CIFAR-10 batches, whose keys are ``b"data"`` /
    ``b"labels"``) come back as ``bytes``.

    Args:
        file: Path of the pickle file to read.

    Returns:
        Whatever object the pickle contains.
    """
    # Security note: unpickling can execute arbitrary code -- only use this on
    # trusted files.
    with open(file, "rb") as handle:
        return pickle.load(handle, encoding="bytes")

In [59]:
def process_data(folder_dir):
    """Merge every ``data_batch*`` file in ``folder_dir`` into ``train_data.pkl``.

    Each batch file is unpickled once, its ``b"data"`` and ``b"labels"``
    entries are collected, and the concatenated arrays are written to
    ``<folder_dir>/train_data.pkl`` as a dict with the same two keys.

    Batch files are processed in sorted name order: ``os.listdir`` returns
    entries in arbitrary order, so without sorting the sample order of the
    combined file could differ from one machine/run to the next.

    Args:
        folder_dir: Directory containing the ``data_batch*`` pickle files.

    Raises:
        ValueError: If ``folder_dir`` contains no file whose name includes
            ``"data_batch"``.
    """
    batch_names = sorted(
        name for name in os.listdir(folder_dir) if "data_batch" in name
    )
    if not batch_names:
        raise ValueError(f"No 'data_batch' files found in {folder_dir!r}")

    img_chunks, label_chunks = [], []
    for name in batch_names:
        # Unpickle each file once and reuse the result for both fields.
        batch = unpickle(os.path.join(folder_dir, name))
        img_chunks.append(batch[b"data"])
        label_chunks.append(batch[b"labels"])

    train_data = {
        b"data": np.concatenate(img_chunks),
        b"labels": np.concatenate(label_chunks),
    }
    with open(os.path.join(folder_dir, "train_data.pkl"), "wb") as f:
        pickle.dump(train_data, f)

In [None]:
# Build train_data.pkl in the very folder `data_dir` reads it from. The
# previous hard-coded "Data/CIFAR10/..." path (without the "Final/" prefix used
# by `data_dir`) pointed at a different directory than the one the datasets
# load from.
process_data(os.path.dirname(data_dir))

In [None]:
# Show the top-level keys stored in the combined training pickle.
unpickle(data_dir).keys()


In [61]:
class TrainDataset(Dataset):
    """Map-style dataset over the combined training pickle.

    The pickle at ``data_dir`` is loaded eagerly and must provide ``b"data"``
    (one flat row per image) and ``b"labels"`` entries.

    NOTE(review): pixel values are passed through unscaled, whereas the
    Generator in this notebook ends in Tanh -- confirm whether real images
    should be rescaled to [-1, 1] before being shown to the discriminator.
    """

    def __init__(self, data_dir):
        super().__init__()
        self.train_data = unpickle(data_dir)

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.train_data[b"labels"])

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``{"img": (3, 32, 32) float32, "label": float32 scalar}``."""
        flat_pixels = self.train_data[b"data"][idx]
        raw_label = self.train_data[b"labels"][idx]
        return {
            "img": torch.tensor(
                flat_pixels.reshape(3, 32, 32), dtype=torch.float32
            ),
            "label": torch.tensor(raw_label, dtype=torch.float32),
        }

In [62]:
class ValDataset(Dataset):
    """Map-style dataset over a single validation pickle.

    The pickle at ``data_dir`` is loaded eagerly and must provide ``b"data"``
    (one flat row per image) and ``b"labels"`` entries.
    """

    def __init__(self, data_dir):
        super().__init__()
        self.val_data = unpickle(data_dir)

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.val_data[b"labels"])

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``{"img": (3, 32, 32) float32, "label": float32 scalar}``."""
        flat_pixels = self.val_data[b"data"][idx]
        raw_label = self.val_data[b"labels"][idx]
        return {
            "img": torch.tensor(
                flat_pixels.reshape(3, 32, 32), dtype=torch.float32
            ),
            "label": torch.tensor(raw_label, dtype=torch.float32),
        }

In [63]:
class Discriminator(nn.Module):
    """Convolutional critic that maps an image batch to one raw logit per image.

    Four ``Conv2d`` layers (all with kernel ``kern_size`` and padding
    ``disc_padding_size``; strides ``disc_stride``, 4, ``disc_stride``,
    ``disc_stride``) reduce the input to a single-channel map. With the
    notebook's default configuration (kernel 4, stride 1, padding 0) a 32x32
    input shrinks 32 -> 29 -> 7 -> 4 -> 1, giving an output of shape
    ``(batch, 1, 1, 1)``.

    The network ends on the last convolution so that it returns *raw logits*,
    which is what ``BCEWithLogitsLoss`` expects. A batch-norm on the
    one-channel output would standardise the logits within every batch (real
    and fake batches would then share the same mean), and a trailing LeakyReLU
    would squash negative logits towards zero -- both prevent the loss from
    separating real from fake.

    Conv weights are initialised from ``N(mean, std)``.
    """

    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            # No batch-norm on the input layer.
            nn.Conv2d(
                num_in_channels,
                num_disc_feat,
                kern_size,
                disc_stride,
                disc_padding_size,
                bias=False,
            ),
            nn.LeakyReLU(),
            # The only strided layer: hard-coded stride 4 does the bulk of the
            # spatial down-sampling.
            nn.Conv2d(
                num_disc_feat,
                num_disc_feat * 2,
                kern_size,
                4,
                disc_padding_size,
                bias=False,
            ),
            nn.BatchNorm2d(num_disc_feat * 2),
            nn.LeakyReLU(),
            nn.Conv2d(
                num_disc_feat * 2,
                num_disc_feat * 4,
                kern_size,
                disc_stride,
                disc_padding_size,
                bias=False,
            ),
            nn.BatchNorm2d(num_disc_feat * 4),
            nn.LeakyReLU(),
            # Output layer: raw logits, deliberately without normalisation or
            # activation (see class docstring).
            nn.Conv2d(
                num_disc_feat * 4,
                1,
                kern_size,
                disc_stride,
                disc_padding_size,
                bias=False,
            ),
        )

        for layer in self.net.modules():
            if isinstance(layer, nn.Conv2d):
                nn.init.normal_(layer.weight, mean, std)

    def forward(self, x):
        """Return the raw (pre-sigmoid) logits for image batch ``x``."""
        return self.net(x)

In [64]:
class Generator(nn.Module):
    """Transposed-convolution generator mapping latent noise to an image batch.

    Three ``ConvTranspose2d -> BatchNorm2d -> ReLU`` stages are followed by a
    final ``ConvTranspose2d -> Tanh``. With the notebook's default
    configuration (kernel 4) a ``(batch, dim_z, 1, 1)`` input grows
    1 -> 4 -> 8 -> 16 -> 32, so the output is
    ``(batch, num_in_channels, 32, 32)`` with values in [-1, 1].

    Transposed-conv weights are initialised from ``N(mean, std)``.
    """

    def __init__(self):
        super().__init__()
        # (in_channels, out_channels, stride, padding) of each normalised
        # up-sampling stage; ConvTranspose2d is the "fractionally strided
        # convolution" used for upscaling in the DCGAN paper.
        stages = [
            (dim_z, num_gen_feat * 4, 1, 0),
            (num_gen_feat * 4, num_gen_feat * 2, 2, 1),
            (num_gen_feat * 2, num_gen_feat, 2, 1),
        ]

        layers = []
        for c_in, c_out, stride, pad in stages:
            layers.append(
                nn.ConvTranspose2d(c_in, c_out, kern_size, stride, pad, bias=False)
            )
            layers.append(nn.BatchNorm2d(c_out))
            layers.append(nn.ReLU())

        # Output stage: project to image channels and squash with Tanh.
        layers.append(
            nn.ConvTranspose2d(
                num_gen_feat, num_in_channels, kern_size, 2, 1, bias=False
            )
        )
        layers.append(nn.Tanh())

        # Kept as one flat Sequential so sub-module indices stay 0..10.
        self.net = nn.Sequential(*layers)

        for module in self.net:
            if isinstance(module, nn.ConvTranspose2d):
                nn.init.normal_(module.weight, mean, std)

    def forward(self, x):
        """Generate images from the latent batch ``x``."""
        return self.net(x)

In [65]:
# Settings shared by the training and validation loaders. Ordering is
# delegated to the samplers, hence shuffle=False on the loaders themselves.
_loader_kwargs = dict(
    batch_size=100,
    shuffle=False,
    num_workers=0,
    pin_memory=True,
)

# Training split: a single-replica DistributedSampler (num_replicas=1, rank=0)
# drives the iteration order.
train_dataset = TrainDataset(data_dir)
train_sampler = DistributedSampler(train_dataset, num_replicas=1, rank=0)
train_loader = DataLoader(
    dataset=train_dataset, sampler=train_sampler, **_loader_kwargs
)

# Validation split, configured the same way.
val_dataset = ValDataset(data_dir_val)
val_sampler = DistributedSampler(val_dataset, num_replicas=1, rank=0)
val_loader = DataLoader(
    dataset=val_dataset, sampler=val_sampler, **_loader_kwargs
)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/adityatandon/Documents/VS Code/Deep_Learning/Data/CIFAR10/cifar-10-batches-py/test_batch'

In [None]:
def train(num_epochs, data_dir, data_dir_val, device, train=True, **kwargs):
    """Adversarially train the module-level ``gen`` and ``disc`` networks.

    For every mini-batch the discriminator is updated ``disc_steps`` times
    (real batch with target 1, freshly generated batch with target 0),
    followed by a single generator update whose target is 1.

    Args:
        num_epochs: Number of passes over the training set.
        data_dir: Path of the training pickle handed to ``TrainDataset``.
        data_dir_val: Path of the validation pickle handed to ``ValDataset``;
            only read when ``train`` is false.
        device: Device the batches, noise and targets are created on. ``gen``
            and ``disc`` must already live on this device.
        train: When false, only the (currently unused) validation loader is
            built and two empty lists are returned.
        **kwargs: Optional settings --
            ``batch_size`` (default 100),
            ``disc_steps`` (default 3) discriminator updates per batch,
            ``lr`` (default 2e-4) and ``betas`` (default ``(0.5, 0.0)``) for
            both Adam optimisers.

    Returns:
        ``(loss_d, loss_g)``: two lists of Python floats with one entry per
        mini-batch. ``loss_d`` holds the mean of the real/fake discriminator
        losses from the last discriminator update of that batch.

    Raises:
        ValueError: If ``disc_steps`` is smaller than 1.
    """
    batch_size = kwargs.get("batch_size", 100)
    disc_steps = kwargs.get("disc_steps", 3)
    learning_rate = kwargs.get("lr", 2e-4)
    # NOTE(review): beta2 = 0.0 is kept as the original default; the common
    # DCGAN setting is (0.5, 0.999) -- confirm that 0.0 is intentional.
    betas = kwargs.get("betas", (0.5, 0.0))
    if disc_steps < 1:
        raise ValueError("disc_steps must be at least 1")

    # Pinned host memory only helps host-to-CUDA copies.
    pin_memory = str(device).startswith("cuda")

    loss_fn = torch.nn.BCEWithLogitsLoss()

    train_dataset = TrainDataset(data_dir)
    train_sampler = DistributedSampler(train_dataset, num_replicas=1, rank=0)
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
        pin_memory=pin_memory,
        sampler=train_sampler,
    )

    loss_d, loss_g = [], []

    if not train:
        # NOTE(review): the validation loader is built but nothing consumes it
        # yet; kept so that a bad `data_dir_val` still fails loudly.
        val_dataset = ValDataset(data_dir_val)
        val_sampler = DistributedSampler(val_dataset, num_replicas=1, rank=0)
        val_loader = DataLoader(  # noqa: F841 - intentionally unused for now
            dataset=val_dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=0,
            pin_memory=pin_memory,
            sampler=val_sampler,
        )
        return loss_d, loss_g

    optimG = torch.optim.Adam(gen.parameters(), lr=learning_rate, betas=betas)
    optimD = torch.optim.Adam(disc.parameters(), lr=learning_rate, betas=betas)

    for i in range(num_epochs):
        # Without set_epoch the sampler reuses the same shuffle every epoch.
        train_sampler.set_epoch(i)
        print(f"Epoch no. :  {i+1}")
        for j, mini_batch in enumerate(train_loader):
            x_real = mini_batch["img"].to(device)
            # Use the real batch length: the last batch of an epoch may be
            # shorter than `batch_size`.
            cur_bs = x_real.size(0)
            real_targets = torch.ones(cur_bs, dtype=torch.float32, device=device)
            fake_targets = torch.zeros(cur_bs, dtype=torch.float32, device=device)

            # ---- discriminator updates ----
            for _ in range(disc_steps):
                optimD.zero_grad()

                loss_disc_r = loss_fn(disc(x_real).flatten(), real_targets)
                loss_disc_r.backward()

                noise = torch.randn(cur_bs, dim_z, 1, 1, device=device)
                # Detach so the discriminator loss does not leak gradients
                # into the generator's parameters.
                x_fake = gen(noise).detach()
                loss_disc_f = loss_fn(disc(x_fake).flatten(), fake_targets)
                loss_disc_f.backward()

                optimD.step()
                loss_disc = (loss_disc_f.item() + loss_disc_r.item()) / 2.0

            # ---- generator update ----
            # Zero right before the generator pass so only this loss
            # contributes to the step.
            optimG.zero_grad()
            x_gen = gen(torch.randn(cur_bs, dim_z, 1, 1, device=device))
            loss_gen = loss_fn(disc(x_gen).flatten(), real_targets)
            loss_gen.backward()
            optimG.step()

            loss_d.append(loss_disc)
            loss_g.append(loss_gen.item())

            if j % 200 == 0:
                print(f"Discriminator loss after {j} steps = {loss_d[-1]}")
                print(f"Generator loss after {j} steps = {loss_g[-1]}")

    return loss_d, loss_g

In [8]:
# Create the discriminator first, then the generator, and move each one to
# the configured device.
disc, gen = Discriminator().to(device), Generator().to(device)


In [None]:
# Run the adversarial training loop and keep the per-batch loss histories.
loss_d, loss_g = train(
    num_epochs=3,
    data_dir=data_dir,
    data_dir_val=data_dir_val,
    device=device,
    batch_size=50,
)

In [None]:
# Plot both per-batch loss histories on a single set of axes.
fig, ax = plt.subplots()
ax.plot(loss_d, label="Discriminator loss")
ax.plot(loss_g, label="Generator loss")
ax.legend()
plt.show()


In [None]:
# Sample a few images from the generator and display one of them.
with torch.no_grad():
    # Create the noise on the same device as the networks.
    gen_img = gen(torch.randn(10, dim_z, 1, 1, device=device))
    print(gen_img.shape)

    # The generator emits channel-first (C, H, W) images; imshow needs
    # (H, W, C). `permute` reorders the axes, whereas `reshape` would only
    # reinterpret the flat buffer and scramble the picture.
    sample = gen_img[1].permute(1, 2, 0)
    # Map the Tanh range [-1, 1] to the [0, 1] range imshow expects for floats.
    sample = ((sample + 1.0) / 2.0).clamp(0.0, 1.0)
    plt.imshow(sample.cpu().numpy())

    # Raw discriminator outputs for the generated batch.
    # NOTE(review): both networks are still in training mode here, so
    # batch-norm layers use the statistics of this 10-sample batch.
    print(disc(gen_img).flatten())

In [None]:
# Scratch cell: run the discriminator/generator passes on ONE mini-batch so
# the intermediate tensors and gradients can be inspected afterwards.

# The discriminator output is used un-squashed (no sigmoid), so the loss has
# to be the logits variant; BCELoss requires inputs already inside [0, 1].
loss_fn = torch.nn.BCEWithLogitsLoss()

train_dataset = TrainDataset(data_dir)
train_sampler = DistributedSampler(train_dataset, num_replicas=1, rank=0)
batch_size = 50
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    pin_memory=True,
    sampler=train_sampler,
)
# (The former `if train == False:` validation branch was removed: at module
# level `train` is the training function, so the comparison was never true.)

optimG = torch.optim.Adam(gen.parameters(), lr=2e-4, betas=(0.5, 0.0))
optimD = torch.optim.Adam(disc.parameters(), lr=2e-4, betas=(0.5, 0.0))

mini_batch = next(iter(train_loader))

x_real = mini_batch["img"].to(device)
# Size everything from the batch actually received rather than `batch_size`.
cur_bs = x_real.size(0)

for d_step in range(3):
    optimD.zero_grad()

    # train disc first: real images, target 1
    labels = torch.full((cur_bs,), 1.0, dtype=torch.float32, device=device)
    out_disc_r = disc(x_real).flatten()
    loss_disc_r = loss_fn(out_disc_r, labels)
    loss_disc_r.backward()

    # generated images, target 0; detached so this loss does not push
    # gradients into the generator
    x_fake = gen(torch.randn(cur_bs, dim_z, 1, 1, device=device)).detach()
    labels = torch.full((cur_bs,), 0.0, dtype=torch.float32, device=device)
    out_disc_f = disc(x_fake).flatten()
    loss_disc_f = loss_fn(out_disc_f, labels)
    loss_disc_f.backward()

    loss_disc = (loss_disc_f.item() + loss_disc_r.item()) / 2.0
    # The final pass skips the optimiser step -- presumably so its gradients
    # stay available for inspection (confirm).
    if d_step != 2:
        optimD.step()

# train gen: zero right before the pass so only this loss contributes
optimG.zero_grad()
x_gen = gen(torch.randn(cur_bs, dim_z, 1, 1, device=device))
out = disc(x_gen).flatten()
labels = torch.full((cur_bs,), 1.0, dtype=torch.float32, device=device)
loss_gen = loss_fn(out, labels)
loss_gen.backward()

# optimG.step() is intentionally not called in this scratch cell.

In [None]:
def train(gpu, args):
    """Per-process entry point sketch for a multi-GPU DistributedDataParallel run.

    Joins an NCCL process group, builds a model on GPU ``gpu``, wraps it in
    ``DistributedDataParallel`` and iterates over the module-level
    ``train_loader`` for ``args.epochs`` epochs, printing a progress line every
    100 steps on GPU 0.

    Args:
        gpu: Index of the local GPU this process drives.
        args: Object providing ``nr``, ``gpus``, ``world_size`` and ``epochs``.

    NOTE(review): this definition reuses the name ``train`` and therefore
    replaces the GAN training function defined earlier in the notebook once
    this cell runs.
    NOTE(review): the visible part of the loop performs no forward/backward
    pass and never assigns ``loss``; ``test_net`` is not defined in the cells
    seen here either. The function looks unfinished (or continues beyond the
    reviewed chunk) -- confirm before use.
    """
    # Global rank = node index * GPUs per node + local GPU index.
    rank = args.nr * args.gpus + gpu
    torch.distributed.init_process_group(
        backend="nccl",
        init_method="env://",
        world_size=args.world_size,
        rank=rank,
    )

    # Same seed in every process.
    torch.manual_seed(0)
    torch.cuda.set_device(gpu)
    model = test_net().cuda(gpu)

    # NOTE(review): `batch_size` and `loss_fn` are not used in the visible code.
    batch_size = 100

    loss_fn = torch.nn.BCELoss()
    # NOTE(review): lr=1e4 (ten thousand) looks like a typo for 1e-4 -- confirm.
    # The local name `optim` also hides the `torch.optim` module alias inside
    # this function.
    optim = torch.optim.Adam(model.parameters(), lr=1e4)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    # model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
    # model = DDP(model)

    # `train_loader` is read from module scope, not passed in.
    total_steps = len(train_loader)
    for epoch in range(args.epochs):
        # NOTE(review): the datasets in this notebook yield dicts with
        # "img"/"label" keys; unpacking such a batch as a 2-tuple would bind
        # the key strings, not tensors -- verify the expected batch format.
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)

            if (i + 1) % 100 == 0 and gpu == 0:
                print(
                    "Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}".format(
                        epoch + 1, args.epochs, i + 1, total_steps, loss.item()
                    )
                )