In [1]:
import os
import cv2
import torch
import torchaudio
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from PIL import Image
import librosa
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import json
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Notebook-wide locations: read-only Kaggle inputs and the writable output area.
KAGGLE_INPUT_PATH = "/kaggle/input"
PROCESSED_DATA_PATH = "/kaggle/working/processed_data"

# Create the output directory (and any missing parents); a no-op when it already exists.
Path(PROCESSED_DATA_PATH).mkdir(parents=True, exist_ok=True)

In [3]:

class RobustMultiModalDatasetManager:
    """Index the Kaggle deepfake datasets and persist one manifest per dataset.

    Every ``process_*`` method walks a fixed folder layout below
    ``kaggle_input_path``, builds a list of record dicts (``label`` is 0 for
    real/original material and 1 for fake/manipulated material), writes that
    list to ``processed_data_path`` and returns it. Missing folders are
    skipped, so a dataset that is not attached simply yields an empty list.
    """

    # Sub-folder names that encode the label in the image/audio datasets.
    _LABEL_FOLDERS = ("real", "fake")
    # Column order of the manifests; passing it explicitly keeps the header
    # present even when a scan finds nothing.
    _VIDEO_COLUMNS = ['video_file', 'category', 'label', 'frames', 'num_frames']
    _IMAGE_COLUMNS = ['image_file', 'image_path', 'split', 'label', 'label_name']
    _AUDIO_COLUMNS = ['audio_file', 'audio_path', 'label', 'label_name']

    def __init__(self, kaggle_input_path="/kaggle/input",
                 processed_data_path="/kaggle/working/processed_data"):
        """Store the input/output roots and build the default image transform.

        Args:
            kaggle_input_path: Directory that contains the dataset folders.
            processed_data_path: Directory the manifests are written to; it is
                created if it does not exist.
        """
        self.kaggle_input_path = kaggle_input_path
        self.processed_data_path = processed_data_path
        os.makedirs(self.processed_data_path, exist_ok=True)
        self.image_transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def extract_frames_from_video(self, video_path, num_frames=5, target_size=(224, 224)):
        """Sample up to ``num_frames`` evenly spaced RGB frames from a video.

        Args:
            video_path: Path handed to ``cv2.VideoCapture``.
            num_frames: Maximum number of frames to sample.
            target_size: Size passed to ``cv2.resize`` for every frame.

        Returns:
            A list of resized RGB frames. The list is empty when the video
            cannot be opened, reports no frames, or an error occurs (the error
            is printed, not raised).
        """
        cap = None
        try:
            cap = cv2.VideoCapture(video_path)
            if not cap.isOpened():
                return []
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if frame_count <= 0:
                return []
            if frame_count > num_frames:
                frame_indices = np.linspace(0, frame_count - 1, num_frames, dtype=int)
            else:
                # Short clip: take every frame instead of resampling.
                frame_indices = range(frame_count)
            frames = []
            for frame_idx in frame_indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
                ret, frame = cap.read()
                if ret:
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frame = cv2.resize(frame, target_size)
                    frames.append(frame)
            return frames
        except Exception as e:
            print(f"Error processing video {video_path}: {str(e)}")
            return []
        finally:
            # Release on every path, including failures while decoding, so the
            # capture handle is never leaked.
            if cap is not None:
                cap.release()

    def _collect_labeled_files(self, dataset_path, splits, extensions):
        """Scan ``<dataset_path>/<split>/<real|fake>`` folders for matching files."""
        records = []
        for split in splits:
            for label_folder in self._LABEL_FOLDERS:
                folder_path = os.path.join(dataset_path, split, label_folder)
                if not os.path.isdir(folder_path):
                    continue
                label = 0 if label_folder == "real" else 1
                # Sorted so that the manifest is identical from run to run.
                for img_file in sorted(os.listdir(folder_path)):
                    if img_file.lower().endswith(extensions):
                        records.append({
                            'image_file': img_file,
                            'image_path': os.path.join(folder_path, img_file),
                            'split': split,
                            'label': label,
                            'label_name': label_folder,
                        })
        return records

    def _save_csv(self, records, columns, file_name):
        """Write ``records`` as a CSV manifest with a fixed header."""
        manifest = pd.DataFrame(records, columns=columns)
        manifest.to_csv(os.path.join(self.processed_data_path, file_name), index=False)

    def process_ff_c23_dataset(self):
        """Extract frames from the FF-C23 videos and pickle the records.

        Returns:
            A list of dicts with keys ``video_file``, ``category``, ``label``,
            ``frames`` and ``num_frames``; videos that yield no frames are
            left out.
        """
        print("Processing FF-C23 dataset...")
        dataset_path = os.path.join(self.kaggle_input_path, "ff-c23", "FaceForensics++_C23")
        processed_data = []
        categories = ['original', 'Deepfakes', 'Face2Face', 'FaceSwap', 'NeuralTextures']
        for category in categories:
            category_path = os.path.join(dataset_path, category)
            if not os.path.isdir(category_path):
                continue
            label = 0 if category == 'original' else 1
            for video_file in sorted(os.listdir(category_path)):
                if not video_file.endswith('.mp4'):
                    continue
                video_path = os.path.join(category_path, video_file)
                frames = self.extract_frames_from_video(video_path)
                if frames:
                    processed_data.append({'video_file': video_file, 'category': category, 'label': label, 'frames': frames, 'num_frames': len(frames)})
        pd.DataFrame(processed_data, columns=self._VIDEO_COLUMNS).to_pickle(
            os.path.join(self.processed_data_path, "ff_c23_processed.pkl"))
        print(f"FF-C23 processed: {len(processed_data)} videos")
        return processed_data

    def process_celebdf_v2_dataset(self):
        """Index the CelebDF-v2 images and write ``celebdf_v2_processed.csv``.

        Returns:
            A list of dicts with keys ``image_file``, ``image_path``,
            ``split``, ``label`` and ``label_name``.
        """
        print("Processing CelebDF-v2 dataset...")
        dataset_path = os.path.join(self.kaggle_input_path, "celebdf-v2image-dataset", "Celeb_V2")
        processed_data = self._collect_labeled_files(
            dataset_path, ["Train", "Test", "Val"], ('.jpg', '.jpeg', '.png'))
        self._save_csv(processed_data, self._IMAGE_COLUMNS, "celebdf_v2_processed.csv")
        print(f"CelebDF-v2 processed: {len(processed_data)} images")
        return processed_data

    def process_asvspoof_21_cqt_dataset(self):
        """Index the ASVspoof-21 CQT PNG images and write their CSV manifest.

        Returns:
            A list of dicts with keys ``image_file``, ``image_path``,
            ``split``, ``label`` and ``label_name``.
        """
        print("Processing ASVspoof-21 CQT dataset...")
        dataset_path = os.path.join(self.kaggle_input_path, "asvspoof-21-df-cqt", "my_dataset")
        processed_data = self._collect_labeled_files(
            dataset_path, ['train', 'test', 'validation'], ('.png',))
        self._save_csv(processed_data, self._IMAGE_COLUMNS, "asvspoof_21_cqt_processed.csv")
        print(f"ASVspoof-21 CQT processed: {len(processed_data)} images")
        return processed_data

    def process_dfdc_faces_dataset(self):
        """Index the DFDC face crops and write ``dfdc_faces_processed.csv``.

        Returns:
            A list of dicts with keys ``image_file``, ``image_path``,
            ``split``, ``label`` and ``label_name``.
        """
        print("Processing DFDC faces dataset...")
        dataset_path = os.path.join(self.kaggle_input_path, "dfdc-faces-of-the-train-sample")
        processed_data = self._collect_labeled_files(
            dataset_path, ['train', 'validation'], ('.jpg', '.jpeg', '.png'))
        self._save_csv(processed_data, self._IMAGE_COLUMNS, "dfdc_faces_processed.csv")
        print(f"DFDC faces processed: {len(processed_data)} images")
        return processed_data

    def process_in_the_wild_audio(self):
        """Index the In-The-Wild audio files and write their CSV manifest.

        Every directory entry below ``real``/``fake`` is recorded; no
        extension filter is applied.

        Returns:
            A list of dicts with keys ``audio_file``, ``audio_path``,
            ``label`` and ``label_name``.
        """
        print("Processing In-The-Wild audio deepfake dataset...")
        dataset_path = os.path.join(self.kaggle_input_path, "in-the-wild-audio-deepfake", "release_in_the_wild")
        processed_data = []
        for label_folder in self._LABEL_FOLDERS:
            folder_path = os.path.join(dataset_path, label_folder)
            if not os.path.isdir(folder_path):
                continue
            label = 0 if label_folder == "real" else 1
            for audio_file in sorted(os.listdir(folder_path)):
                processed_data.append({
                    'audio_file': audio_file,
                    'audio_path': os.path.join(folder_path, audio_file),
                    'label': label,
                    'label_name': label_folder
                })
        self._save_csv(processed_data, self._AUDIO_COLUMNS, "in_the_wild_audio_processed.csv")
        print(f"In-The-Wild audio processed: {len(processed_data)} samples")
        return processed_data

    def process_all_datasets(self):
        """Run every dataset processor and write the processing summary.

        A processor that raises is reported on stdout and recorded as an
        empty result so that the remaining datasets are still processed.

        Returns:
            A dict mapping the dataset name to its list of records.
        """
        print("Starting comprehensive dataset processing...")
        print("=" * 80)
        results = {}
        datasets = [
            ('ff_c23', self.process_ff_c23_dataset),
            ('celebdf_v2', self.process_celebdf_v2_dataset),
            ('asvspoof_21_cqt', self.process_asvspoof_21_cqt_dataset),
            ('dfdc_faces', self.process_dfdc_faces_dataset),
            ('in_the_wild_audio', self.process_in_the_wild_audio)
        ]
        for dataset_name, process_func in datasets:
            try:
                print(f"\n{'-'*60}")
                results[dataset_name] = process_func()
            except Exception as e:
                print(f"Error processing {dataset_name}: {e}")
                results[dataset_name] = []
        self.generate_processing_summary(results)
        return results

    def generate_processing_summary(self, results):
        """Print a per-dataset summary table and save it as ``processing_summary.csv``.

        Args:
            results: Dict mapping a dataset name to its list of record dicts,
                as returned by ``process_all_datasets``.
        """
        summary = {'Dataset': [], 'Status': [], 'Total_Samples': [], 'Real_Samples': [], 'Fake_Samples': [], 'Data_Type': []}
        for dataset_name, data in results.items():
            summary['Dataset'].append(dataset_name)
            if data:
                summary['Status'].append('Success')
                summary['Total_Samples'].append(len(data))
                summary['Real_Samples'].append(sum(1 for item in data if item.get('label') == 0))
                summary['Fake_Samples'].append(sum(1 for item in data if item.get('label') == 1))
                # The modality is inferred from the keys of the first record.
                first = data[0]
                if 'frames' in first:
                    summary['Data_Type'].append('Video')
                elif 'image_path' in first:
                    summary['Data_Type'].append('Image')
                elif 'audio_path' in first:
                    summary['Data_Type'].append('Audio')
                else:
                    summary['Data_Type'].append('Unknown')
            else:
                summary['Status'].append('Failed/Empty')
                summary['Total_Samples'].append(0)
                summary['Real_Samples'].append(0)
                summary['Fake_Samples'].append(0)
                summary['Data_Type'].append('Unknown')
        summary_df = pd.DataFrame(summary)
        summary_path = os.path.join(self.processed_data_path, "processing_summary.csv")
        summary_df.to_csv(summary_path, index=False)
        print("\n" + "="*80)
        print("DATASET PROCESSING SUMMARY")
        print("="*80)
        print(summary_df.to_string(index=False))
        print(f"\nSummary saved to: {summary_path}")


In [4]:
# Advanced Data Augmentation Pipeline
class AdvancedDataAugmentation:
    """
    Advanced data augmentation techniques for deepfake detection
    """

    def __init__(self):
        self.setup_augmentations()

    def setup_augmentations(self):
        """Build the geometric, color and noise transform pipelines.

        The pipelines are stored on the instance as ``geometric_transforms``,
        ``color_transforms`` and ``noise_transforms``.
        """

        # Geometric augmentations
        self.geometric_transforms = transforms.Compose([
            transforms.RandomRotation(degrees=15),
            transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
            transforms.RandomPerspective(distortion_scale=0.2, p=0.5),
        ])

        # Color augmentations
        self.color_transforms = transforms.Compose([
            transforms.ColorJitter(brightness=0.3, contrast=0.3,
                                 saturation=0.3, hue=0.1),
            transforms.RandomGrayscale(p=0.1),
        ])

        # Noise augmentations
        self.noise_transforms = transforms.Compose([
            transforms.GaussianBlur(kernel_size=3, sigma=(0.1, 2.0)),
            transforms.RandomAdjustSharpness(sharpness_factor=2, p=0.5),
        ])

    def apply_cutmix(self, images, labels, alpha=1.0):
        """Apply CutMix augmentation to a 4-D batch.

        A random box is copied from a shuffled version of the batch into
        ``images``. NOTE: ``images`` is modified in place and the same tensor
        is returned.

        Args:
            images: 4-D batch tensor; the box is cut along dimensions 2 and 3.
            labels: Per-sample labels, indexable by a permutation tensor.
            alpha: Both parameters of the Beta distribution the initial mixing
                ratio is drawn from.

        Returns:
            ``(images, labels, shuffled_labels, lam)`` where ``lam`` is the
            fraction of each image that was left untouched.
        """
        batch_size = images.size(0)
        indices = torch.randperm(batch_size)

        lam = np.random.beta(alpha, alpha)

        bbx1, bby1, bbx2, bby2 = self.rand_bbox(images.size(), lam)
        images[:, :, bbx1:bbx2, bby1:bby2] = images[indices, :, bbx1:bbx2, bby1:bby2]

        # Recompute lam from the box that was actually pasted: clipping at the
        # border can make it smaller than the sampled ratio implies.
        lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / (images.size()[-1] * images.size()[-2]))

        return images, labels, labels[indices], lam

    def rand_bbox(self, size, lam):
        """Generate a random bounding box for CutMix.

        Args:
            size: 4-element size; ``size[2]`` and ``size[3]`` are the extents
                the box is drawn in.
            lam: Mixing ratio in ``[0, 1]``; the box covers roughly
                ``1 - lam`` of the area before clipping.

        Returns:
            ``(bbx1, bby1, bbx2, bby2)``: the ``bbx*`` values bound dimension
            2 and the ``bby*`` values bound dimension 3, each clipped to the
            valid range.
        """
        W = size[2]
        H = size[3]
        cut_rat = np.sqrt(1. - lam)
        # The ``np.int`` alias was removed in NumPy 1.24; the builtin ``int``
        # performs the same truncation.
        cut_w = int(W * cut_rat)
        cut_h = int(H * cut_rat)

        # Box centre, drawn uniformly over the image.
        cx = np.random.randint(W)
        cy = np.random.randint(H)

        bbx1 = np.clip(cx - cut_w // 2, 0, W)
        bby1 = np.clip(cy - cut_h // 2, 0, H)
        bbx2 = np.clip(cx + cut_w // 2, 0, W)
        bby2 = np.clip(cy + cut_h // 2, 0, H)

        return bbx1, bby1, bbx2, bby2

# Audio-specific preprocessing
class AudioPreprocessor:
    """
    Loads audio with librosa at a fixed sample rate and derives features
    or randomly augmented waveforms from it.
    """

    def __init__(self, sample_rate=16000):
        # Rate handed to ``librosa.load`` by the feature extractors.
        self.sample_rate = sample_rate

    def extract_mfcc_features(self, audio_path, n_mfcc=13):
        """Load ``audio_path`` and return its MFCC matrix with ``n_mfcc`` coefficients."""
        waveform, rate = librosa.load(audio_path, sr=self.sample_rate)
        return librosa.feature.mfcc(y=waveform, sr=rate, n_mfcc=n_mfcc)

    def extract_spectral_features(self, audio_path):
        """Load ``audio_path`` and return a dict of spectral descriptors.

        The dict has the keys ``spectral_centroid``, ``spectral_rolloff``,
        ``zero_crossing_rate`` and ``tempo``.
        """
        waveform, rate = librosa.load(audio_path, sr=self.sample_rate)

        centroid = librosa.feature.spectral_centroid(y=waveform, sr=rate)
        rolloff = librosa.feature.spectral_rolloff(y=waveform, sr=rate)
        crossings = librosa.feature.zero_crossing_rate(waveform)
        # Only the first element of the tempo result is kept.
        tempo = librosa.beat.tempo(y=waveform, sr=rate)[0]

        return {
            'spectral_centroid': centroid,
            'spectral_rolloff': rolloff,
            'zero_crossing_rate': crossings,
            'tempo': tempo,
        }

    def apply_audio_augmentation(self, audio, sr):
        """Randomly time-stretch, pitch-shift and add noise to ``audio``.

        Each step is gated by its own random draw (probabilities 0.5, 0.5 and
        0.3 respectively); the input is returned unchanged when no step fires.
        """
        # Step 1: time stretching by a factor in [0.8, 1.2).
        stretch = np.random.random() < 0.5
        if stretch:
            factor = np.random.uniform(0.8, 1.2)
            audio = librosa.effects.time_stretch(audio, rate=factor)

        # Step 2: pitch shifting by up to four steps in either direction.
        shift = np.random.random() < 0.5
        if shift:
            steps = np.random.uniform(-4, 4)
            audio = librosa.effects.pitch_shift(audio, sr=sr, n_steps=steps)

        # Step 3: additive Gaussian noise (a new array is produced).
        add_noise = np.random.random() < 0.3
        if add_noise:
            audio = audio + np.random.normal(0, 0.005, audio.shape)

        return audio

# Data Quality Assurance
class DataQualityChecker:
    """
    Quality assurance for processed datasets
    """

    def __init__(self, processed_data_path):
        # Directory that holds the CSV manifests to be checked.
        self.processed_data_path = processed_data_path

    def _image_issues(self, index, sample):
        """Return the issue messages for one sample (empty list when it is fine)."""
        found = []
        try:
            # The context manager closes the file handle even when a check raises.
            with Image.open(sample['image_path']) as img:
                # Check image dimensions
                if img.size[0] < 64 or img.size[1] < 64:
                    found.append(f"Image {index} too small: {img.size}")

                # Check image format
                if img.mode not in ['RGB', 'RGBA', 'L']:
                    found.append(f"Image {index} unsupported mode: {img.mode}")

                # Check file corruption
                img.verify()
        except Exception as e:
            found.append(f"Image {index} corrupted: {str(e)}")
        return found

    def validate_image_data(self, data):
        """Validate image data quality.

        Args:
            data: Iterable of mappings that each have an ``image_path`` entry.

        Returns:
            A flat list of issue messages. One sample can contribute several
            messages, so the length of the list is not a sample count.
        """
        issues = []
        for i, sample in enumerate(data):
            issues.extend(self._image_issues(i, sample))
        return issues

    def validate_audio_data(self, data):
        """Validate audio data quality.

        Only samples that carry a ``mel_spectrogram`` entry are checked: it
        must be 2-D and free of NaN and infinite values.

        Returns:
            A list of issue messages.
        """
        issues = []

        for i, sample in enumerate(data):
            try:
                if 'mel_spectrogram' in sample:
                    mel_spec = sample['mel_spectrogram']

                    # Check spectrogram shape
                    if len(mel_spec.shape) != 2:
                        issues.append(f"Audio {i} invalid spectrogram shape: {mel_spec.shape}")

                    # Check for NaN values
                    if np.isnan(mel_spec).any():
                        issues.append(f"Audio {i} contains NaN values")

                    # Check for infinite values
                    if np.isinf(mel_spec).any():
                        issues.append(f"Audio {i} contains infinite values")

            except Exception as e:
                issues.append(f"Audio {i} processing error: {str(e)}")

        return issues

    def generate_quality_report(self):
        """Generate a quality report for the image manifests.

        Every ``*.csv`` file in ``processed_data_path`` that has an
        ``image_path`` column is validated. Other CSV files (for example the
        processing summary or the audio manifest) and empty files are skipped,
        because running the image checks on them would flag every row.

        Returns:
            A DataFrame with the columns ``dataset``, ``total_samples``,
            ``valid_samples``, ``corrupted_samples`` and ``issues`` (the first
            five messages). The two counts are per sample, so they always add
            up to ``total_samples``.
        """
        report = {
            'dataset': [],
            'total_samples': [],
            'valid_samples': [],
            'corrupted_samples': [],
            'issues': []
        }

        # Sorted so that the report rows come out in a stable order.
        for file_name in sorted(os.listdir(self.processed_data_path)):
            if not file_name.endswith('.csv'):
                continue
            dataset_name = os.path.splitext(file_name)[0]
            file_path = os.path.join(self.processed_data_path, file_name)

            try:
                df = pd.read_csv(file_path)
            except pd.errors.EmptyDataError:
                # A manifest written for an empty scan has nothing to parse.
                continue
            if 'image_path' not in df.columns:
                continue

            issues = []
            flagged_samples = 0
            for i, sample in enumerate(df.to_dict('records')):
                sample_issues = self._image_issues(i, sample)
                if sample_issues:
                    flagged_samples += 1
                    issues.extend(sample_issues)

            report['dataset'].append(dataset_name)
            report['total_samples'].append(len(df))
            report['valid_samples'].append(len(df) - flagged_samples)
            report['corrupted_samples'].append(flagged_samples)
            report['issues'].append(issues[:5])  # First 5 issues

        return pd.DataFrame(report)



In [5]:
class MultiModalDeepfakeDataset(Dataset):
    """Torch dataset over image, video or audio deepfake records.

    Each record is a mapping with a ``label`` entry plus, depending on
    ``data_type``, ``image_path`` (image), ``frames`` (video) or
    ``mel_spectrogram`` (audio).
    """

    def __init__(self, data, transform=None, data_type='image'):
        """
        Args:
            data: Sequence of record mappings, or a DataFrame with one record
                per row (it is converted to a list of dicts so that integer
                indexing selects rows).
            transform: Optional callable applied to each PIL image / frame.
            data_type: One of ``'image'``, ``'video'`` or ``'audio'``.
        """
        if isinstance(data, pd.DataFrame):
            data = data.to_dict('records')
        self.data = data
        self.transform = transform
        self.data_type = data_type

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(sample, label)`` for the record at ``idx``.

        Raises:
            ValueError: If ``data_type`` is not a supported modality.
        """
        sample = self.data[idx]
        label = sample['label']

        if self.data_type == 'image':
            image = Image.open(sample['image_path']).convert('RGB')
            if self.transform:
                image = self.transform(image)
            return image, label

        elif self.data_type == 'video':
            frames = sample['frames']
            if self.transform:
                frames = [self.transform(Image.fromarray(frame)) for frame in frames]
            else:
                # torch.stack only accepts tensors; without a transform the
                # raw frame arrays are converted as they are (no rescaling
                # and no axis reordering).
                frames = [torch.as_tensor(frame) for frame in frames]
            return torch.stack(frames), label

        elif self.data_type == 'audio':
            mel_spec = sample['mel_spectrogram']
            # Prepend a singleton dimension to the float32 spectrogram.
            mel_spec = torch.as_tensor(mel_spec, dtype=torch.float32).unsqueeze(0)
            return mel_spec, label
        else:
            raise ValueError(f"Unsupported data type: {self.data_type}")


In [6]:
# Usage Example
if __name__ == "__main__":
    # Build the manager with its default input/output paths; the constructor
    # also creates the output directory.
    
    dataset_manager = RobustMultiModalDatasetManager()
    
    # Process all datasets
    # `results` maps each dataset name to its list of records; a dataset whose
    # processor raised is stored as an empty list.
    results = dataset_manager.process_all_datasets()
    
    print("\nPhase 2: Data Collection and Preparation completed successfully!")
    print("Processed datasets are saved in:", dataset_manager.processed_data_path)

Starting comprehensive dataset processing...

------------------------------------------------------------
Processing FF-C23 dataset...
FF-C23 processed: 5000 videos

------------------------------------------------------------
Processing CelebDF-v2 dataset...
CelebDF-v2 processed: 101031 images

------------------------------------------------------------
Processing ASVspoof-21 CQT dataset...
ASVspoof-21 CQT processed: 611828 images

------------------------------------------------------------
Processing DFDC faces dataset...
DFDC faces processed: 124647 images

------------------------------------------------------------
Processing In-The-Wild audio deepfake dataset...
In-The-Wild audio processed: 31779 samples

DATASET PROCESSING SUMMARY
          Dataset  Status  Total_Samples  Real_Samples  Fake_Samples Data_Type
           ff_c23 Success           5000          1000          4000     Video
       celebdf_v2 Success         101031         50360         50671     Image
  asvspoof_2