<a href="https://colab.research.google.com/github/Charan6924/Deep-Learning/blob/main/SuperResolution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import pyarrow.parquet as pq
from torch.utils.data import IterableDataset, DataLoader
from google.colab import drive
import glob
import os
import torch.optim as optim
import random
import gc

In [25]:
all_files = [
    '/content/data/QCDToGGQQ_IMGjet_RH1all_jet0_run0_n36272_LR.parquet',
    '/content/data/QCDToGGQQ_IMGjet_RH1all_jet0_run1_n47540_LR.parquet',
    '/content/data/QCDToGGQQ_IMGjet_RH1all_jet0_run2_n55494_LR.parquet',]

In [32]:
class JetImageDataset(IterableDataset):
    def __init__(
        self,
        parquet_files,
        split='train',
        train_ratio=0.8,
        chunk_size=500,
        normalize=True
    ):
        self.parquet_files = parquet_files
        self.split = split
        self.threshold = int(train_ratio * 100)
        self.chunk_size = chunk_size
        self.normalize = normalize

        self.lr_col = 'X_jets_LR'
        self.hr_col = 'X_jets'
        self.lr_shape = (3, 64, 64)
        self.hr_shape = (3, 125, 125)

        print(f"✓ Dataset initialized for {split} split")
        print(f"  LR shape: {self.lr_shape}")
        print(f"  HR shape: {self.hr_shape}")
        print(f"  Chunk size: {chunk_size}")

    def _nested_to_tensor(self, nested_array, target_shape):
        stacked = np.stack(nested_array)
        flat = stacked.flatten()
        tensor = torch.from_numpy(flat.astype(np.float32))
        tensor = tensor.reshape(target_shape)
        if self.normalize:
            if tensor.max() > 1.0:
                tensor = tensor / 255.0
            elif tensor.max() > 10.0:
                tensor = tensor / tensor.max()

        return tensor

    def __iter__(self):
        global_idx = 0

        for file_path in self.parquet_files:
            print(f"Reading: {file_path}")
            parquet_file = pq.ParquetFile(file_path)

            chunk_num = 0
            for batch in parquet_file.iter_batches(
                batch_size=self.chunk_size,
                columns=[self.lr_col, self.hr_col]
            ):
                chunk_num += 1
                if chunk_num % 20 == 0:
                    print(f"  Chunk {chunk_num}... (row {global_idx})")

                df = batch.to_pandas()

                for idx in range(len(df)):
                    is_train = (global_idx % 100) < self.threshold

                    if (self.split == 'train' and is_train) or \
                       (self.split == 'val' and not is_train):

                        try:
                            row = df.iloc[idx]

                            lr = self._nested_to_tensor(row[self.lr_col], self.lr_shape)
                            hr = self._nested_to_tensor(row[self.hr_col], self.hr_shape)

                            yield lr, hr

                        except Exception as e:
                            if global_idx < 3:
                                print(f"Error at row {global_idx}: {e}")

                    global_idx += 1

                del df, batch
                gc.collect()

            print(f"✓ Finished file")

In [20]:
def collate_fn(batch):
    lr_batch = torch.stack([item[0] for item in batch])
    hr_batch = torch.stack([item[1] for item in batch])
    return lr_batch, hr_batch

In [28]:
train_dataset = JetImageSRDataset(
    parquet_files=all_files,
    split='train',
    train_ratio=0.8,
    chunk_size=500,
    normalize=True
)

val_dataset = JetImageSRDataset(
    parquet_files=all_files,
    split='val',
    train_ratio=0.8,
    chunk_size=500,
    normalize=True
)

train_loader = DataLoader(train_dataset, batch_size=32, num_workers=0, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=32, num_workers=0, pin_memory=True)


 Dataset initialized for train split
  LR column: X_jets_LR
  HR column: X_jets
  Chunk size: 500 rows
 Dataset initialized for val split
  LR column: X_jets_LR
  HR column: X_jets
  Chunk size: 500 rows


In [31]:
def diagnose_data_format(file_path, num_samples=3):
    """
    Check exactly how your data is stored.
    This will help us understand the format.
    """
    print("=" * 60)
    print("DIAGNOSING DATA FORMAT")
    print("=" * 60)

    pf = pq.ParquetFile(file_path)

    # Read just a few rows
    batch = pf.iter_batches(batch_size=num_samples, columns=['X_jets_LR', 'X_jets']).__next__()
    df = batch.to_pandas()

    print("\n1. Checking X_jets_LR (Low Resolution):")
    print("-" * 60)
    for i in range(min(num_samples, len(df))):
        lr_sample = df['X_jets_LR'].iloc[i]
        print(f"\n  Sample {i}:")
        print(f"    Type: {type(lr_sample)}")

        if isinstance(lr_sample, np.ndarray):
            print(f"    Shape: {lr_sample.shape}")
            print(f"    Dtype: {lr_sample.dtype}")
            if lr_sample.dtype == object:
                print(f"    First element type: {type(lr_sample.flat[0])}")
                if hasattr(lr_sample.flat[0], 'shape'):
                    print(f"    First element shape: {lr_sample.flat[0].shape}")
        elif isinstance(lr_sample, (list, tuple)):
            print(f"    Length: {len(lr_sample)}")
            print(f"    First element type: {type(lr_sample[0])}")
            if isinstance(lr_sample[0], (list, np.ndarray)):
                print(f"    First element length/shape: {len(lr_sample[0]) if isinstance(lr_sample[0], list) else lr_sample[0].shape}")

        # Try to show actual values
        try:
            arr = np.array(lr_sample, dtype=np.float32)
            print(f"    Successfully converted to shape: {arr.shape}")
            print(f"    Value range: [{arr.min():.4f}, {arr.max():.4f}]")
        except Exception as e:
            print(f"    ❌ Conversion error: {e}")
            # Try flattening
            try:
                if isinstance(lr_sample, np.ndarray) and lr_sample.dtype == object:
                    flat_list = [np.array(x).flatten() for x in lr_sample.flat]
                    arr = np.concatenate(flat_list)
                    print(f"    Flattened shape: {arr.shape}")
            except:
                pass

    print("\n2. Checking X_jets (High Resolution):")
    print("-" * 60)
    hr_sample = df['X_jets'].iloc[0]
    print(f"  Type: {type(hr_sample)}")
    if isinstance(hr_sample, np.ndarray):
        print(f"  Shape: {hr_sample.shape}")
        print(f"  Dtype: {hr_sample.dtype}")

    del df, batch
    gc.collect()

    print("\n" + "=" * 60)


file_path = '/content/data/QCDToGGQQ_IMGjet_RH1all_jet0_run0_n36272_LR.parquet'
diagnose_data_format(file_path, num_samples=3)

DIAGNOSING DATA FORMAT

1. Checking X_jets_LR (Low Resolution):
------------------------------------------------------------

  Sample 0:
    Type: <class 'numpy.ndarray'>
    Shape: (3,)
    Dtype: object
    First element type: <class 'numpy.ndarray'>
    First element shape: (64,)
    ❌ Conversion error: setting an array element with a sequence.
    Flattened shape: (192,)

  Sample 1:
    Type: <class 'numpy.ndarray'>
    Shape: (3,)
    Dtype: object
    First element type: <class 'numpy.ndarray'>
    First element shape: (64,)
    ❌ Conversion error: setting an array element with a sequence.
    Flattened shape: (192,)

  Sample 2:
    Type: <class 'numpy.ndarray'>
    Shape: (3,)
    Dtype: object
    First element type: <class 'numpy.ndarray'>
    First element shape: (64,)
    ❌ Conversion error: setting an array element with a sequence.
    Flattened shape: (192,)

2. Checking X_jets (High Resolution):
------------------------------------------------------------
  Type: <clas

In [8]:
class ResBlock(nn.Module):
  def __init__(self,channels):
    super(ResBlock, self).__init__()
    self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
    self.bn1 = nn.BatchNorm2d(channels)
    self.prelu = nn.PReLU(num_parameters=channels)
    self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
    self.bn2 = nn.BatchNorm2d(channels)

  def forward(self,x):
    residual = self.conv1(x)
    residual = self.bn1(residual)
    residual = self.prelu(residual)
    residual = self.conv2(residual)
    residual = self.bn2(residual)
    return x + residual

In [9]:
class Generator(nn.Module):
  def __init__(self,resblocks = 8,channels = 3):
    super(Generator, self).__init__()
    self.conv1 = nn.Conv2d(channels, 64, kernel_size=9, padding=4)
    self.bn1 = nn.BatchNorm2d(64)
    self.prelu = nn.PReLU(num_parameters=64)
    self.resblocks = nn.Sequential(*[ResBlock(64) for _ in range(resblocks)])
    self.conv2 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
    self.bn2 = nn.BatchNorm2d(64)
    self.upsample = nn.Sequential(
            nn.Conv2d(64, 256, kernel_size=3, padding=1),
            nn.PixelShuffle(upscale_factor=2),
            nn.PReLU()
        )
    self.conv3 = nn.Conv2d(64, channels, kernel_size=9, padding=4)

  def forward(self, x):
    block1 = self.prelu(self.conv1(x))
    blocks = self.resblocks(block1)
    block2 = self.bn2(self.conv2(blocks))
    x_up = self.upsample(block1 + block2)
    out = self.conv3(x_up)
    return out[:, :, 1:126, 1:126]


In [None]:
device = 'cuda'
model = Generator().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.MSELoss()
model.train()

In [11]:
import torch

EPOCHS = 20
PATIENCE = 3
best_val_loss = float('inf')
patience_counter = 0
train_lossi = []
val_lossi = []

for epoch in range(EPOCHS):
    model.train()
    running_train_loss = 0.0
    train_count = 0

    for i, (lr, hr) in enumerate(train_loader):
        lr, hr = lr.to(device), hr.to(device)

        optimizer.zero_grad()
        outputs = model(lr)
        loss = criterion(outputs, hr)
        loss.backward()
        optimizer.step()
        train_lossi.append(loss.item())

        running_train_loss += loss.item()
        train_count += 1

        if i % 500 == 0:
            print(f"   Ep {epoch+1} | Step {i} | Train Loss: {loss.item():.6f}")

    avg_train_loss = running_train_loss / train_count

    model.eval()
    running_val_loss = 0.0
    val_count = 0

    with torch.no_grad():
        for val_lr, val_hr in val_loader:
            val_lr, val_hr = val_lr.to(device), val_hr.to(device)
            val_outputs = model(val_lr)
            v_loss = criterion(val_outputs, val_hr)
            val_lossi.append(v_loss.item())
            running_val_loss += v_loss.item()
            val_count += 1

    avg_val_loss = running_val_loss / val_count

    print(f"Epoch {epoch+1} Summary: Train Loss: {avg_train_loss:.6f} | Val Loss: {avg_val_loss:.6f}")
    if avg_val_loss < best_val_loss:
        print(f"IMPROVEMENT! ({best_val_loss:.6f} -> {avg_val_loss:.6f})")
        best_val_loss = avg_val_loss
        patience_counter = 0
        save_file = f"/content/sr_model_best.pth"
        torch.save(model.state_dict(), save_file)
        print(f"Saved Best Model to {save_file}")

    else:
        print(f"No Improvement. (Best Val was: {best_val_loss:.6f})")
        patience_counter += 1
        print(f"Patience: {patience_counter}/{PATIENCE}")

        if patience_counter >= PATIENCE:
            print(f"EARLY STOPPING TRIGGERED")
            break

FileNotFoundError: [Errno 2] Failed to open local file '/content/QCDToGGQQ_IMGjet_RH1all_jet0_run0_n36272_LR.parquet'. Detail: [errno 2] No such file or directory