CELL 1

In [None]:
# Mount Google Drive at /content/drive (Colab-only API); the dataset paths
# used by later cells are all rooted under this mount point.
from google.colab import drive
drive.mount('/content/drive')

CELL 2

In [None]:
import pandas as pd
import json
import os

CELL 3

In [None]:
# Root folder of the dataset on the mounted Google Drive.
base_path = "/content/drive/My Drive/DATASET/"

# The two image folders and the two prediction JSON files all sit directly
# under base_path, so derive them in one pass.
real_images_path, fake_images_path, real_json_path, fake_json_path = (
    os.path.join(base_path, leaf)
    for leaf in (
        "real_cifake_images",
        "fake_cifake_images",
        "real_cifake_preds.json",
        "fake_cifake_preds.json",
    )
)

# The master CSV is written to the local Colab disk rather than to Drive.
output_csv_path = "/content/training_master.csv"

# Echo the resolved locations, one per line.
print(
    f"Real images folder: {real_images_path}",
    f"Fake images folder: {fake_images_path}",
    f"Real JSON file: {real_json_path}",
    f"Fake JSON file: {fake_json_path}",
    f"Output CSV will be saved to: {output_csv_path}",
    sep="\n",
)

CELL 4

In [None]:
def _records_from_predictions(json_path, images_dir):
    """Load one prediction JSON file and turn it into CSV-ready records.

    Args:
        json_path: Path to a JSON file holding a list of objects, each with
            an 'index' and a 'prediction' key.
        images_dir: Folder holding the matching images, named "<index>.png".

    Returns:
        A tuple (entries, records): the parsed JSON list, and one
        {'image_path': ..., 'label': ...} dict per entry, in file order.

    Raises:
        KeyError: If an entry lacks 'index' or 'prediction'.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(json_path, 'r', encoding='utf-8') as f:
        entries = json.load(f)

    records = [
        {
            # The filenames are 1.png, 2.png, etc.
            'image_path': os.path.join(images_dir, f"{entry['index']}.png"),
            # The label is whatever the JSON's 'prediction' field holds
            # (presumably 'real' or 'fake' -- verify against the data).
            'label': entry['prediction'],
        }
        for entry in entries
    ]
    return entries, records


# This list will hold all our data before we make the CSV
master_list = []

# --- 1. Process the REAL images ---
print("Processing real_cifake_preds.json...")
real_data, real_records = _records_from_predictions(real_json_path, real_images_path)
master_list.extend(real_records)
print(f"Processed {len(real_data)} real image records.")

# --- 2. Process the FAKE images ---
print("\nProcessing fake_cifake_preds.json...")
fake_data, fake_records = _records_from_predictions(fake_json_path, fake_images_path)
master_list.extend(fake_records)
print(f"Processed {len(fake_data)} fake image records.")

# --- 3. Convert to Pandas DataFrame and Save ---
print("\nConverting to DataFrame...")
df = pd.DataFrame(master_list)

# Save the DataFrame to our CSV file (using the local path)
df.to_csv(output_csv_path, index=False)

print(f"\nSUCCESS! Master CSV created with {len(df)} total records.")
print(f"File saved to: {output_csv_path}")

# --- 4. Display a sample of the data ---
print("\nHere's a sample of your master dataset:")
print(df.head())

CELL 5

In [None]:
# Install our required libraries
!pip install timm scikit-learn

CELL 6

In [None]:
# [THIS IS THE CORRECTED CELL 5.5]

import os
import time

print("Starting to copy dataset from Drive to local Colab disk...")
print("This might take a minute or two, but we only do it once.")
start_time = time.time()

# Define source paths on Google Drive
drive_real_path = "/content/drive/My Drive/DATASET/real_cifake_images"
drive_fake_path = "/content/drive/My Drive/DATASET/fake_cifake_images"

# Define destination paths on the fast local Colab disk
local_real_path = "/content/local_dataset/real_cifake_images"
local_fake_path = "/content/local_dataset/fake_cifake_images"

# Create the local directories
os.makedirs(local_real_path, exist_ok=True)
os.makedirs(local_fake_path, exist_ok=True)

# --- We are now copying *.png files ---
print("Copying REAL images (approx. 1000 files)...")
!cp -n "/content/drive/My Drive/DATASET/real_cifake_images"/*.png "/content/local_dataset/real_cifake_images/"

print("Copying FAKE images (approx. 1000 files)...")
!cp -n "/content/drive/My Drive/DATASET/fake_cifake_images"/*.png "/content/local_dataset/fake_cifake_images/"
# --- END OF FIX ---

end_time = time.time()
print(f"\n--- Copy complete in {end_time - start_time:.2f} seconds! ---")
print("All images are now on the fast local disk.")

CELL 7

In [None]:
# [THIS IS THE NEW, CORRECTED CELL 7]

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import cv2  # OpenCV for loading images
import pandas as pd
from sklearn.model_selection import train_test_split
import timm
from tqdm import tqdm # A nice progress bar

# --- 1. Basic Setup ---
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Define our label mapping
label_map = {"real": 0, "fake": 1}

# --- 2. Load and Split the Data ---
csv_path = "/content/training_master.csv"
df = pd.read_csv(csv_path)

# The CSV stores Google Drive paths; rewrite them to the local copy under
# /content/local_dataset so training reads from the fast local disk.
print("Updating image paths to point to the new local copy...")
drive_base_path = "/content/drive/My Drive/DATASET"
local_base_path = "/content/local_dataset"
# regex=False: the Drive path is a literal prefix, not a pattern. The default
# for `regex` differs between pandas versions, so state it explicitly.
df['image_path'] = df['image_path'].str.replace(
    drive_base_path, local_base_path, regex=False
)
print("Paths updated. Here is a sample of the new path:")
print(df.iloc[0]['image_path'])

# Map string labels to numbers
df['label_id'] = df['label'].map(label_map)

# .map() yields NaN for any label missing from label_map. Fail here with a
# clear message instead of letting NaN label ids reach the split/training code.
unknown_labels = df.loc[df['label_id'].isna(), 'label'].unique()
if len(unknown_labels) > 0:
    raise ValueError(
        f"Unexpected label values in {csv_path}: {list(unknown_labels)}"
    )
df['label_id'] = df['label_id'].astype('int64')

# Split the data (80% train, 20% validation)
train_df, val_df = train_test_split(
    df,
    test_size=0.2,  # 20% for validation
    random_state=42, # For reproducible results
    stratify=df['label_id']  # keep the real/fake ratio equal in both splits
)

print(f"Total images: {len(df)}")
print(f"Training images: {len(train_df)}")
print(f"Validation images: {len(val_df)}")

CELL 8

In [None]:
# [THIS IS THE NEW MODIFIED CELL 8 - EXPERIMENT 4]

# Input resolution plus the per-channel ImageNet statistics that are the
# standard normalisation for pre-trained models.
IMG_SIZE = 224
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Training pipeline: heavy augmentation so images are harder to memorise.
train_transform = transforms.Compose(
    [
        transforms.ToPILImage(),
        # Strong augmentation policy, applied while the image is still PIL.
        transforms.TrivialAugmentWide(
            interpolation=transforms.InterpolationMode.BILINEAR
        ),
        transforms.Resize(size=(IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
        # Applied last, on the normalised tensor: with probability 0.5 a
        # random rectangle is overwritten with 0.
        transforms.RandomErasing(
            p=0.5, scale=(0.02, 0.33), ratio=(0.3, 3.3), value=0
        ),
    ]
)

# Validation pipeline: resize and normalise only, no random augmentation.
val_transform = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Resize(size=(IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ]
)

# --- 4. Custom Dataset ---
class DeepfakeCloneDataset(Dataset):
    """Map-style dataset yielding (image, label) pairs from a DataFrame.

    Args:
        df: DataFrame with an 'image_path' column (file paths) and a
            'label_id' column (integer class ids).
        transform: Optional callable applied to each loaded image.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform
        # Pull the columns out as plain arrays for positional access.
        self.image_paths = df['image_path'].values
        self.labels = df['label_id'].values

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Load the image at position `index`.

        Returns:
            (image, label): the image (transformed when a transform is set)
            and the label as a torch.long tensor.
        """
        path = self.image_paths[index]
        target = self.labels[index]
        try:
            bgr = cv2.imread(path)
            if bgr is None:
                raise IOError(f"Could not read image: {path}")
            # OpenCV loads images as BGR; convert to RGB.
            image = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        except Exception as e:
            # Best-effort fallback: log the failure and substitute an
            # all-zero uint8 image with label 0.
            # NOTE(review): this silently feeds a label-0 sample into the
            # batch -- confirm that is acceptable for training.
            print(f"Error loading image {path}: {e}")
            image = torch.zeros((3, IMG_SIZE, IMG_SIZE), dtype=torch.uint8)
            target = 0
        if self.transform:
            image = self.transform(image)
        return image, torch.tensor(target, dtype=torch.long)

print("Dataset class and NEW AGGRESSIVE transforms defined.")

CELL 9

In [None]:
# [THIS IS THE NEW MODIFIED CELL 9 - FOR V7]

# Small batch size because the model is large.
BATCH_SIZE = 8

# Wrap each split in a dataset with its own transform pipeline.
train_dataset = DeepfakeCloneDataset(train_df, transform=train_transform)
val_dataset = DeepfakeCloneDataset(val_df, transform=val_transform)

# Only the training loader shuffles. Both use num_workers=0 (load in the
# main process) to save RAM.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

print(f"DataLoaders created with NEW batch size {BATCH_SIZE} and 0 workers.")

CELL 10

In [None]:
# [THIS IS THE NEW MODIFIED CELL 10 - EXPERIMENT 5]

import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import timm

# --- 1. Build the network ---
# ConvNeXt-Base from timm, created with pretrained=True and num_classes=2,
# then moved to the selected device (.to() returns the module itself).
model = timm.create_model('convnext_base', pretrained=True, num_classes=2).to(device)

# --- 2. Loss, optimizer and LR scheduler ---
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-2)
# mode='min': the value passed to scheduler.step() is expected to decrease;
# when it stalls, the learning rate is multiplied by `factor`.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)

# --- 3. Checkpoint bookkeeping ---
# The best weights are written to Google Drive; the running best validation
# accuracy starts at zero.
BEST_MODEL_PATH = "/content/drive/My Drive/best_model_v7.pth"
best_val_accuracy = 0.0

print("V7 Model (ConvNeXt-Base + Aggressive Aug), saving to Drive. Ready to train.")

CELL 11

In [None]:
# --- 3. Define the Training Function ---
def train_one_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over `dataloader` and report averaged metrics.

    Args:
        model: Network to train; switched to train mode here.
        dataloader: Iterable of (images, labels) batches.
        criterion: Loss callable taking (outputs, labels).
        optimizer: Optimizer updating the parameters of `model`.
        device: Device the batches are moved to before the forward pass.

    Returns:
        (epoch_loss, epoch_accuracy): the sample-weighted average loss and
        the accuracy in percent over all samples seen.
    """
    model.train()

    loss_sum, hits, seen = 0.0, 0, 0

    for images, labels in tqdm(dataloader, desc="Training"):
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass: raw class scores, then the loss.
        logits = model(images)
        loss = criterion(logits, labels)

        # Backward pass: clear stale gradients, backpropagate, update weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight the batch loss by batch size so the epoch figure is a
        # per-sample average (assumes the criterion returns a batch mean).
        loss_sum += loss.item() * images.size(0)
        # Predicted class = index of the highest score per sample.
        hits += (logits.argmax(dim=1) == labels).sum().item()
        seen += labels.size(0)

    return loss_sum / seen, (hits / seen) * 100

# --- 4. Define the Validation Function ---
def validate_one_epoch(model, dataloader, criterion, device):
    """Evaluate `model` on `dataloader` without updating any weights.

    Args:
        model: Network to evaluate; switched to eval mode here.
        dataloader: Iterable of (images, labels) batches.
        criterion: Loss callable taking (outputs, labels).
        device: Device the batches are moved to before the forward pass.

    Returns:
        (epoch_loss, epoch_accuracy): the sample-weighted average loss and
        the accuracy in percent over all samples seen.
    """
    model.eval()

    loss_sum, hits, seen = 0.0, 0, 0

    # Gradient tracking is switched off for the whole pass.
    with torch.no_grad():
        for images, labels in tqdm(dataloader, desc="Validating"):
            images = images.to(device)
            labels = labels.to(device)

            logits = model(images)
            loss = criterion(logits, labels)

            # Same bookkeeping as in training: size-weighted loss and a
            # count of correct argmax predictions.
            loss_sum += loss.item() * images.size(0)
            hits += (logits.argmax(dim=1) == labels).sum().item()
            seen += labels.size(0)

    return loss_sum / seen, (hits / seen) * 100

print("Training and Validation helper functions are defined.")

CELL 12

In [None]:
# [THIS IS THE NEW MODIFIED CELL 12 - EXPERIMENT 4]

# Number of full passes over the training set.
NUM_EPOCHS = 30

for epoch in range(NUM_EPOCHS):
    print(f"\n--- Epoch {epoch + 1} / {NUM_EPOCHS} ---")

    train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = validate_one_epoch(model, val_loader, criterion, device)

    # The plateau scheduler is driven by the validation loss.
    scheduler.step(val_loss)

    print(
        f"Epoch {epoch + 1} Summary:",
        f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%",
        f"  Valid Loss: {val_loss:.4f} | Valid Acc: {val_acc:.2f}%",
        sep="\n",
    )

    # Checkpoint only on a strict improvement in validation accuracy.
    if not (val_acc > best_val_accuracy):
        continue
    best_val_accuracy = val_acc
    torch.save(model.state_dict(), BEST_MODEL_PATH)
    print(f"  *** New best model saved to Drive! Accuracy: {val_acc:.2f}% ***")

print("\n--- Training Complete ---")
print(f"Best validation accuracy achieved: {best_val_accuracy:.2f}%")
print(f"Best model saved to: {BEST_MODEL_PATH}")

CELL 13

In [None]:
# [THIS IS YOUR CELL 13]

import time
import os

print("Starting to copy TEST dataset from Drive to local Colab disk...")
start_time = time.time()

# Define source and destination paths
drive_test_path = "/content/drive/My Drive/DATASET/test"
local_test_path = "/content/local_dataset/test"

# Create the local directory
os.makedirs(local_test_path, exist_ok=True)

# Copy all .png files
print("Copying TEST images...")
!cp -n "/content/drive/My Drive/DATASET/test/"*.png "/content/local_dataset/test/"

end_time = time.time()
print(f"\n--- Test set copy complete in {end_time - start_time:.2f} seconds! ---")

CELL 14

In [None]:
# [THIS IS THE FINAL CORRECTED CELL 14]

import glob
import timm
import pandas as pd
from torch.utils.data import DataLoader

# --- 1. Define the correct model path ---
BEST_MODEL_PATH = "/content/drive/My Drive/best_model_v7.pth" # Correct V7 path

# --- 2. Load the Best Model ---
# Re-create the 'convnext_base' structure without pretrained weights; the
# checkpoint supplies every parameter.
model = timm.create_model('convnext_base', pretrained=False, num_classes=2)
# map_location=device: a checkpoint saved from a GPU run stores CUDA tensors
# and cannot be deserialised on a CPU-only runtime unless it is remapped.
state_dict = torch.load(BEST_MODEL_PATH, map_location=device)
model.load_state_dict(state_dict)
model.to(device)
model.eval()
# NOTE(review): the 94.25% figure below is hard-coded in the message, not
# read from the checkpoint -- confirm it still matches the saved model.
print(f"Best model (94.25%) loaded from '{BEST_MODEL_PATH}'.")

# --- 3. Find All Test Images ---
# Sort numerically by file stem so that 10.png comes after 9.png.
local_test_image_paths = sorted(
    glob.glob("/content/local_dataset/test/*.png"),
    key=lambda x: int(os.path.basename(x).split('.')[0])
)
# DeepfakeCloneDataset requires a 'label_id' column; the test set has no
# labels, so a constant 0 is used as a placeholder.
test_df = pd.DataFrame({'image_path': local_test_image_paths, 'label_id': 0})

# --- 4. Create Test Dataset and DataLoader ---
test_dataset = DeepfakeCloneDataset(test_df, transform=val_transform)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    # shuffle=False keeps predictions in the same order as test_df rows.
    shuffle=False,
    num_workers=0
)
print("Test DataLoader is ready.")

CELL 15

In [None]:
# [THIS IS THE NEW MODIFIED CELL 15 - FOR V7]

import json
from tqdm import tqdm

# Class id -> label string.
inv_label_map = {0: "real", 1: "fake"}
all_predictions = []

# --- 1. Run the model over the whole test set ---
with torch.no_grad():
    for images, _ in tqdm(test_loader, desc="Predicting on test set"):
        outputs = model(images.to(device))
        # Predicted class = index of the highest score per sample.
        predicted_ids = outputs.argmax(dim=1)
        all_predictions.extend(predicted_ids.cpu().numpy())
print(f"Generated {len(all_predictions)} predictions.")

# --- 2. Pair each prediction with its image index ---
# Predictions are collected in the order the loader yields them; this pairs
# them positionally with the rows of test_df. The numeric file stem
# ("123.png" -> 123) becomes the record's index.
final_json_output = []
for image_path, pred_id in zip(test_df['image_path'], all_predictions):
    file_stem = os.path.basename(image_path).split('.')[0]
    final_json_output.append({
        "index": int(file_stem),
        "prediction": inv_label_map[pred_id],
    })

# --- 3. Write the submission file to Google Drive ---
submission_file_path = "/content/drive/My Drive/Catalyst_prediction_v7.json"

with open(submission_file_path, 'w') as f:
    json.dump(final_json_output, f, indent=4)

print(f"\n--- SUCCESS! ---")
print(f"V7 Submission file saved to your Google Drive: {submission_file_path}")