In [1]:
import os

# The default path for Kaggle input files
input_path = '/kaggle/input/'

# List all items in the input directory and filter for folders
folders = [folder for folder in os.listdir(input_path) 
           if os.path.isdir(os.path.join(input_path, folder))]

# Print the list of folders
print(folders)

['indian-monuments-image-dataset', 'snake-dataset-india', 'indian-paintings-dataset', 'top-500-indian-cities', 'indian-signboard-image-dataset', 'indian-currency-notes-classifier', 'postage-stamp-data-set', 'indian-actor-images-dataset', 'indian-food-images-dataset', 'indian-currency-note-images-dataset-2020', 'indian-cricketers-images', 'indian-classical-musical-instruments', 'bird-species-classification', 'india-famous-personalities-image-dataset', 'indian-dance-images', '200-bird-species-with-11788-images']


In [2]:
import os
import shutil
from tqdm import tqdm

# --- Configuration ---
# 1. Define the base path for Kaggle input
BASE_INPUT_PATH = '/kaggle/input/'

# 2. List of the source folders you want to combine
SOURCE_FOLDERS = [
    'india-famous-personalities-image-dataset', 'indian-classical-musical-instruments',
     'indian-dance-images',
    'indian-monuments-image-dataset', 'indian-currency-note-images-dataset-2020',
    'postage-stamp-data-set',
    'indian-currency-notes-classifier', 'indian-food-images-dataset', 'top-500-indian-cities', 'indian-paintings-dataset'
]

# 3. Define the destination for your new, combined dataset
#    Kaggle allows you to write to the '/kaggle/working/' directory
DESTINATION_PATH = '/kaggle/working/combined_indian_dataset/'

# --- Main Script ---
print(f"Starting the dataset combination process...")

# Create the destination folder if it doesn't exist
os.makedirs(DESTINATION_PATH, exist_ok=True)
print(f"Destination folder created at: {DESTINATION_PATH}")

image_counter = 0
image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.gif']

# Loop through each source folder
for folder_name in tqdm(SOURCE_FOLDERS, desc="Processing Folders"):
    source_path = os.path.join(BASE_INPUT_PATH, folder_name)
    
    # Check if the source directory actually exists
    if not os.path.isdir(source_path):
        print(f"Warning: Folder '{folder_name}' not found. Skipping.")
        continue

    # os.walk() recursively finds all files in all subdirectories
    for dirpath, _, filenames in os.walk(source_path):
        for filename in filenames:
            # Check if the file is an image
            if any(filename.lower().endswith(ext) for ext in image_extensions):
                
                # Construct the full path of the source image
                source_file_path = os.path.join(dirpath, filename)
                
                # Get the original file extension (e.g., '.jpg')
                original_extension = os.path.splitext(filename)[1]
                
                # Create a new, unique filename to avoid overwrites
                new_filename = f"image_{image_counter:06d}{original_extension}"
                destination_file_path = os.path.join(DESTINATION_PATH, new_filename)
                
                # Copy the file to the new destination
                shutil.copy2(source_file_path, destination_file_path)
                
                # Increment the counter
                image_counter += 1

print("\n-------------------------------------------------")
print("✅ Dataset combination complete!")
print(f"Total images copied: {image_counter}")
print(f"Your new dataset is ready at: {DESTINATION_PATH}")
print("-------------------------------------------------")

Starting the dataset combination process...
Destination folder created at: /kaggle/working/combined_indian_dataset/


Processing Folders: 100%|██████████| 10/10 [03:31<00:00, 21.12s/it]


-------------------------------------------------
✅ Dataset combination complete!
Total images copied: 21813
Your new dataset is ready at: /kaggle/working/combined_indian_dataset/
-------------------------------------------------





In [3]:
import os
import shutil
from tqdm import tqdm

# The path to your combined dataset
base_path = '/kaggle/working/combined_indian_dataset/'

# The path for the new sub-folder that ImageFolder expects
new_class_folder_path = os.path.join(base_path, 'images')

# Create the new sub-folder
print(f"Creating sub-folder at: {new_class_folder_path}")
os.makedirs(new_class_folder_path, exist_ok=True)

# Find all files in the base directory
files_to_move = [f for f in os.listdir(base_path) if os.path.isfile(os.path.join(base_path, f))]

print(f"Found {len(files_to_move)} images to move...")

# Move each file into the new sub-folder
for filename in tqdm(files_to_move, desc="Organizing files"):
    source = os.path.join(base_path, filename)
    destination = os.path.join(new_class_folder_path, filename)
    shutil.move(source, destination)

print("\n✅ All files have been moved successfully!")
print("You can now re-run your training script.")

Creating sub-folder at: /kaggle/working/combined_indian_dataset/images
Found 21813 images to move...


Organizing files: 100%|██████████| 21813/21813 [00:00<00:00, 36705.66it/s]


✅ All files have been moved successfully!
You can now re-run your training script.





In [4]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import math
from tqdm import tqdm
import torchvision
import os

# ======================================================================================
# PART 1: THE VAE ARCHITECTURE (THE COMPRESSOR) - (No changes needed here)
# ======================================================================================
class VAE(nn.Module):
    def __init__(self, in_channels=3, latent_dim=128):
        super(VAE, self).__init__()
        self.latent_dim = latent_dim
        
        modules = []
        hidden_dims = [32, 64, 128, 256]
        for h_dim in hidden_dims:
            modules.append(
                nn.Sequential(
                    nn.Conv2d(in_channels, out_channels=h_dim,
                              kernel_size=3, stride=2, padding=1),
                    nn.BatchNorm2d(h_dim),
                    nn.LeakyReLU())
            )
            in_channels = h_dim
        self.encoder = nn.Sequential(*modules)
        
        self.fc_mu = nn.Linear(hidden_dims[-1]*16, latent_dim)
        self.fc_var = nn.Linear(hidden_dims[-1]*16, latent_dim)

        modules = []
        self.decoder_input = nn.Linear(latent_dim, hidden_dims[-1] * 16)
        hidden_dims.reverse()

        for i in range(len(hidden_dims) - 1):
            modules.append(
                nn.Sequential(
                    nn.ConvTranspose2d(hidden_dims[i],
                                       hidden_dims[i + 1],
                                       kernel_size=3, stride=2,
                                       padding=1, output_padding=1),
                    nn.BatchNorm2d(hidden_dims[i + 1]),
                    nn.LeakyReLU())
            )
        self.decoder = nn.Sequential(*modules)
        
        self.final_layer = nn.Sequential(
                            nn.ConvTranspose2d(hidden_dims[-1],
                                               hidden_dims[-1],
                                               kernel_size=3, stride=2,
                                               padding=1, output_padding=1),
                            nn.BatchNorm2d(hidden_dims[-1]),
                            nn.LeakyReLU(),
                            nn.Conv2d(hidden_dims[-1], out_channels=3,
                                      kernel_size=3, padding=1),
                            nn.Tanh())

    def encode(self, x):
        spatial_latent = self.encoder(x)
        result = torch.flatten(spatial_latent, start_dim=1)
        mu = self.fc_mu(result)
        log_var = self.fc_var(result)
        return mu, log_var, spatial_latent

    def decode(self, z):
        result = self.decoder_input(z)
        result = result.view(-1, 256, 4, 4)
        result = self.decoder(result)
        result = self.final_layer(result)
        return result

    def reparameterize(self, mu, log_var):
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        mu, log_var, _ = self.encode(x)
        z = self.reparameterize(mu, log_var)
        reconstruction = self.decode(z)
        return reconstruction, mu, log_var

# ======================================================================================
# PART 2: THE U-NET ARCHITECTURE (THE DENOISER) - (No changes needed here)
# ======================================================================================
class TimeEmbedding(nn.Module):
    def __init__(self, n_channels: int):
        super().__init__()
        self.n_channels = n_channels
    def forward(self, t: torch.Tensor):
        half_dim = self.n_channels // 2
        exponents = torch.arange(half_dim, device=t.device).float() / (half_dim - 1)
        embeddings = torch.exp(-math.log(10000) * exponents)
        embeddings = t[:, None] * embeddings[None, :]
        return torch.cat([embeddings.sin(), embeddings.cos()], dim=-1)

class ResidualBlock(nn.Module):
    def __init__(self, in_channels: int, out_channels: int, time_channels: int):
        super().__init__()
        self.norm1 = nn.GroupNorm(32, in_channels)
        self.act1 = nn.SiLU()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        self.norm2 = nn.GroupNorm(32, out_channels)
        self.act2 = nn.SiLU()
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        self.shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1) if in_channels != out_channels else nn.Identity()
        self.time_emb = nn.Linear(time_channels, out_channels)
    def forward(self, x: torch.Tensor, t: torch.Tensor):
        h = self.conv1(self.act1(self.norm1(x)))
        time_emb_proj = self.time_emb(self.act2(t))
        h = h + time_emb_proj[:, :, None, None]
        h = self.conv2(self.act2(self.norm2(h)))
        return h + self.shortcut(x)

class AttentionBlock(nn.Module):
    def __init__(self, n_channels: int):
        super().__init__()
        self.norm = nn.GroupNorm(32, n_channels)
        self.qkv = nn.Conv2d(n_channels, n_channels * 3, kernel_size=1)
        self.proj_out = nn.Conv2d(n_channels, n_channels, kernel_size=1)
    def forward(self, x: torch.Tensor):
        b, c, h, w = x.shape
        h_ = self.norm(x)
        qkv = self.qkv(h_)
        q, k, v = qkv.chunk(3, dim=1)
        q = q.view(b, c, h * w); k = k.view(b, c, h * w); v = v.view(b, c, h * w)
        attn = torch.einsum('bci,bcj->bij', q, k) * (c ** -0.5)
        attn = attn.softmax(dim=-1)
        out = torch.einsum('bij,bcj->bci', attn, v)
        out = out.view(b, c, h, w)
        return x + self.proj_out(out)

class UNet(nn.Module):
    def __init__(self, in_channels=256, out_channels=256, n_channels=320):
        super().__init__()
        time_emb_dim = n_channels * 4
        self.time_embedding = TimeEmbedding(time_emb_dim)
        self.inc = nn.Conv2d(in_channels, n_channels, kernel_size=3, padding=1)
        self.down1_res = ResidualBlock(n_channels, n_channels, time_emb_dim)
        self.down1_attn = AttentionBlock(n_channels)
        self.down2_conv = nn.Conv2d(n_channels, n_channels * 2, kernel_size=3, stride=2, padding=1)
        self.down2_res = ResidualBlock(n_channels * 2, n_channels * 2, time_emb_dim)
        self.down2_attn = AttentionBlock(n_channels * 2)
        self.bot_res1 = ResidualBlock(n_channels * 2, n_channels * 2, time_emb_dim)
        self.bot_attn = AttentionBlock(n_channels * 2)
        self.bot_res2 = ResidualBlock(n_channels * 2, n_channels * 2, time_emb_dim)
        self.up1_res = ResidualBlock(n_channels * 4, n_channels * 2, time_emb_dim)
        self.up1_attn = AttentionBlock(n_channels * 2)
        self.up1_conv_transpose = nn.ConvTranspose2d(n_channels * 2, n_channels, kernel_size=2, stride=2)
        self.up2_res = ResidualBlock(n_channels * 2, n_channels, time_emb_dim)
        self.up2_attn = AttentionBlock(n_channels)
        self.outc = nn.Sequential(nn.GroupNorm(32, n_channels), nn.SiLU(), nn.Conv2d(n_channels, out_channels, kernel_size=3, padding=1))

    def forward(self, x: torch.Tensor, t: torch.Tensor):
        t_emb = self.time_embedding(t)
        x1 = self.inc(x)
        x2 = self.down1_res(x1, t_emb); x2 = self.down1_attn(x2)
        x3_conv = self.down2_conv(x2)
        x3 = self.down2_res(x3_conv, t_emb); x3 = self.down2_attn(x3)
        x_bot = self.bot_res1(x3, t_emb); x_bot = self.bot_attn(x_bot); x_bot = self.bot_res2(x_bot, t_emb)
        x = torch.cat([x_bot, x3], dim=1)
        x = self.up1_res(x, t_emb); x = self.up1_attn(x); x = self.up1_conv_transpose(x)
        x = torch.cat([x, x2], dim=1)
        x = self.up2_res(x, t_emb); x = self.up2_attn(x)
        return self.outc(x)

# ======================================================================================
# PART 3: THE DRIVER / CONTROLLER (THE LOGIC) - (Major changes here)
# ======================================================================================
class DiffusionTrainer:
    ### CHANGE: Added epochs to __init__ to configure the scheduler
    def __init__(self, unet_model, vae_model, timesteps=1000, device='cpu', lr=1e-4, epochs=50):
        self.device = device
        self.unet_model = unet_model.to(device)
        self.vae_model = vae_model.to(device)
        self.timesteps = timesteps

        for param in self.vae_model.parameters():
            param.requires_grad = False
        
        self.betas = self._linear_beta_schedule(timesteps).to(device)
        self.alphas = 1. - self.betas
        self.alphas_cumprod = torch.cumprod(self.alphas, axis=0)
        
        ### CHANGE: Switched to AdamW, a better optimizer
        self.optimizer = torch.optim.AdamW(self.unet_model.parameters(), lr=lr)
        self.criterion = nn.MSELoss()

        ### CHANGE: Added a learning rate scheduler for better convergence
        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=epochs)

        ### CHANGE: Added a gradient scaler for automatic mixed precision (faster training)
        self.scaler = torch.cuda.amp.GradScaler(enabled=(device == 'cuda'))


    def _linear_beta_schedule(self, timesteps):
        beta_start = 0.0001
        beta_end = 0.02
        return torch.linspace(beta_start, beta_end, timesteps)

    def _get_index_from_list(self, vals, t, x_shape):
        batch_size = t.shape[0]
        out = vals.gather(-1, t) 
        return out.reshape(batch_size, *((1,) * (len(x_shape) - 1))).to(t.device)

    def train_step(self, real_images):
        self.optimizer.zero_grad()
        
        # Use autocast for mixed precision to speed up training on GPU
        with torch.cuda.amp.autocast(enabled=(self.device == 'cuda')):
            _, _, x_0 = self.vae_model.encode(real_images)
            t = torch.randint(0, self.timesteps, (x_0.shape[0],), device=self.device).long()
            noise = torch.randn_like(x_0)
            
            alphas_cumprod_t = self._get_index_from_list(self.alphas_cumprod, t, x_0.shape)
            
            noisy_latent = torch.sqrt(alphas_cumprod_t) * x_0 + torch.sqrt(1. - alphas_cumprod_t) * noise
            predicted_noise = self.unet_model(noisy_latent, t)
            loss = self.criterion(noise, predicted_noise)

        ### CHANGE: Use the scaler for the backward pass
        self.scaler.scale(loss).backward()
        self.scaler.step(self.optimizer)
        self.scaler.update()
        
        return loss.item()

    @torch.no_grad()
    def sample(self, num_images=4):
        latent_x = torch.randn((num_images, 256, 4, 4), device=self.device)
        
        for i in tqdm(reversed(range(0, self.timesteps)), desc="Sampling", total=self.timesteps):
            t = torch.full((num_images,), i, device=self.device, dtype=torch.long)
            predicted_noise = self.unet_model(latent_x, t)
            alpha_t = self._get_index_from_list(self.alphas, t, latent_x.shape)
            alphas_cumprod_t = self._get_index_from_list(self.alphas_cumprod, t, latent_x.shape)
            beta_t = self._get_index_from_list(self.betas, t, latent_x.shape)
            noise_term = ((1 - alpha_t) / torch.sqrt(1 - alphas_cumprod_t)) * predicted_noise
            latent_x = (1 / torch.sqrt(alpha_t)) * (latent_x - noise_term)
            if i > 0:
                z = torch.randn_like(latent_x)
                latent_x += torch.sqrt(beta_t) * z

        sampled_images = self.vae_model.decoder(latent_x)
        sampled_images = self.vae_model.final_layer(sampled_images)
        return sampled_images

# ======================================================================================
# PART 4: MAIN EXECUTION BLOCK (PUTTING IT ALL TOGETHER)
# ======================================================================================
if __name__ == '__main__':
    # --- Configuration ---
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    BATCH_SIZE = 8
    IMG_SIZE = 64
    ### CHANGE: Reduced epochs as requested. With faster training, 25 might be enough.
    EPOCHS = 15
    LEARNING_RATE = 2e-5 ### CHANGE: Slightly increased learning rate, good starting point with a scheduler
    DATASET_PATH = "/kaggle/working/combined_indian_dataset/"

    print(f"Using device: {DEVICE}")

    # --- Data Loading and Transformations ---
    transforms = torchvision.transforms.Compose([
        torchvision.transforms.Resize((IMG_SIZE, IMG_SIZE)),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])

    dataset = torchvision.datasets.ImageFolder(root=DATASET_PATH, transform=transforms)
    
    ### CHANGE: Reduced num_workers to prevent CPU bottleneck and added pin_memory for speed
    train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
    
    # --- Initialize Models ---
    vae = VAE(in_channels=3, latent_dim=128)
    # You should load your pre-trained VAE weights here if you have them
    # vae.load_state_dict(torch.load('path/to/your/vae.pth'))
    
    VAE_ENCODER_OUTPUT_CHANNELS = 256 
    unet = UNet(in_channels=VAE_ENCODER_OUTPUT_CHANNELS, out_channels=VAE_ENCODER_OUTPUT_CHANNELS)
    
    # --- Initialize the Driver/Controller ---
    trainer = DiffusionTrainer(unet, vae, timesteps=1000, device=DEVICE, lr=LEARNING_RATE, epochs=EPOCHS)

    # --- Training Loop ---
    for epoch in range(EPOCHS):
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
        for batch_idx, (real_images, _) in enumerate(progress_bar):
            real_images = real_images.to(DEVICE)
            loss = trainer.train_step(real_images)
            total_loss += loss
            progress_bar.set_postfix({"Loss": f"{loss:.4f}", "LR": f"{trainer.scheduler.get_last_lr()[0]:.1e}"})
        
        ### CHANGE: Step the scheduler at the end of each epoch
        trainer.scheduler.step()
        
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} finished. Average Loss: {avg_loss:.4f}")

        ### CHANGE: Save a comprehensive checkpoint instead of just the weights
        checkpoint = {
            'epoch': epoch + 1,
            'unet_state_dict': trainer.unet_model.state_dict(),
            'optimizer_state_dict': trainer.optimizer.state_dict(),
            'scheduler_state_dict': trainer.scheduler.state_dict(),
            'loss': avg_loss,
        }
        torch.save(checkpoint, f"checkpoint_epoch_{epoch+1}.pth")
        print(f"Saved checkpoint to checkpoint_epoch_{epoch+1}.pth")

    # --- Sampling ---
    print("Training finished. Starting sampling...")
    sampled_imgs = trainer.sample(num_images=4)
    grid = torchvision.utils.make_grid(sampled_imgs, nrow=4, normalize=True)
    torchvision.utils.save_image(grid, "final_generated_sample.png")
    print("Saved final generated images to 'final_generated_sample.png'")

Using device: cuda


  self.scaler = torch.cuda.amp.GradScaler(enabled=(device == 'cuda'))
  with torch.cuda.amp.autocast(enabled=(self.device == 'cuda')):
Epoch 1/15: 100%|██████████| 2725/2725 [04:28<00:00, 10.14it/s, Loss=0.7645, LR=2.0e-05]


Epoch 1 finished. Average Loss: 0.8485
Saved checkpoint to checkpoint_epoch_1.pth


Epoch 2/15: 100%|██████████| 2725/2725 [04:29<00:00, 10.13it/s, Loss=0.5801, LR=2.0e-05]


Epoch 2 finished. Average Loss: 0.6426
Saved checkpoint to checkpoint_epoch_2.pth


Epoch 3/15: 100%|██████████| 2725/2725 [04:09<00:00, 10.91it/s, Loss=0.5132, LR=1.9e-05]


Epoch 3 finished. Average Loss: 0.5458
Saved checkpoint to checkpoint_epoch_3.pth


Epoch 4/15: 100%|██████████| 2725/2725 [04:26<00:00, 10.24it/s, Loss=0.4231, LR=1.8e-05]


Epoch 4 finished. Average Loss: 0.4813
Saved checkpoint to checkpoint_epoch_4.pth


Epoch 5/15: 100%|██████████| 2725/2725 [04:27<00:00, 10.20it/s, Loss=0.4289, LR=1.7e-05]


Epoch 5 finished. Average Loss: 0.4360
Saved checkpoint to checkpoint_epoch_5.pth


Epoch 6/15: 100%|██████████| 2725/2725 [04:35<00:00,  9.88it/s, Loss=0.2805, LR=1.5e-05]


Epoch 6 finished. Average Loss: 0.4049
Saved checkpoint to checkpoint_epoch_6.pth


Epoch 7/15: 100%|██████████| 2725/2725 [04:29<00:00, 10.13it/s, Loss=0.3865, LR=1.3e-05]


Epoch 7 finished. Average Loss: 0.3778
Saved checkpoint to checkpoint_epoch_7.pth


Epoch 8/15: 100%|██████████| 2725/2725 [04:29<00:00, 10.10it/s, Loss=0.2440, LR=1.1e-05]


Epoch 8 finished. Average Loss: 0.3544
Saved checkpoint to checkpoint_epoch_8.pth


Epoch 9/15: 100%|██████████| 2725/2725 [04:31<00:00, 10.05it/s, Loss=0.2563, LR=9.0e-06]


Epoch 9 finished. Average Loss: 0.3380
Saved checkpoint to checkpoint_epoch_9.pth


Epoch 10/15: 100%|██████████| 2725/2725 [04:28<00:00, 10.16it/s, Loss=0.3077, LR=6.9e-06]


Epoch 10 finished. Average Loss: 0.3256
Saved checkpoint to checkpoint_epoch_10.pth


Epoch 11/15: 100%|██████████| 2725/2725 [04:27<00:00, 10.20it/s, Loss=0.3578, LR=5.0e-06]


Epoch 11 finished. Average Loss: 0.3135
Saved checkpoint to checkpoint_epoch_11.pth


Epoch 12/15: 100%|██████████| 2725/2725 [04:26<00:00, 10.22it/s, Loss=0.4096, LR=3.3e-06]


Epoch 12 finished. Average Loss: 0.3116
Saved checkpoint to checkpoint_epoch_12.pth


Epoch 13/15: 100%|██████████| 2725/2725 [04:31<00:00, 10.05it/s, Loss=0.1984, LR=1.9e-06]


Epoch 13 finished. Average Loss: 0.3059
Saved checkpoint to checkpoint_epoch_13.pth


Epoch 14/15: 100%|██████████| 2725/2725 [04:30<00:00, 10.09it/s, Loss=0.1993, LR=8.6e-07]


Epoch 14 finished. Average Loss: 0.3047
Saved checkpoint to checkpoint_epoch_14.pth


Epoch 15/15: 100%|██████████| 2725/2725 [04:31<00:00, 10.05it/s, Loss=0.3200, LR=2.2e-07]


Epoch 15 finished. Average Loss: 0.3027
Saved checkpoint to checkpoint_epoch_15.pth
Training finished. Starting sampling...


Sampling: 100%|██████████| 1000/1000 [00:08<00:00, 117.29it/s]


Saved final generated images to 'final_generated_sample.png'
