In [1]:
%%capture
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import librosa
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Set random seeds for reproducibility
SEED = 93
torch.manual_seed(SEED)
np.random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [5]:
# Define parameters
SAMPLE_RATE = 16000
MAX_AUDIO_LENGTH = 8  # in seconds
FEATURE_DIM = 256
BATCH_SIZE = 32
NUM_EPOCHS = 11
LEARNING_RATE = 0.0001

In [6]:
# Load dataset from CSV files
print("Loading dataset...")
train_df = pd.read_csv("/kaggle/input/multilingual-indian-speech-data/metadata/train.csv")
test_df = pd.read_csv("/kaggle/input/multilingual-indian-speech-data/metadata/test.csv")

print(f"Train samples: {len(train_df)}, Test samples: {len(test_df)}")

Loading dataset...
Train samples: 31102, Test samples: 2635


In [7]:
# Define audio paths
AUDIO_DIR = "/kaggle/input/multilingual-indian-speech-data/audio"

# Function to load audio file
def load_audio(file_id):
    file_path = os.path.join(AUDIO_DIR, f"{file_id}.wav")
    try:
        audio, sr = librosa.load(file_path, sr=None)
        return audio, sr
    except Exception as e:
        print(f"Error loading audio file {file_id}: {e}")
        return None, None

# Function to extract audio features
def extract_features(audio, sr=SAMPLE_RATE, max_length=MAX_AUDIO_LENGTH):
    if audio is None:
        return None
        
    # Resample if needed
    if sr != SAMPLE_RATE:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=SAMPLE_RATE)
    
    # Pad or trim to fixed length
    target_length = max_length * SAMPLE_RATE
    if len(audio) > target_length:
        audio = audio[:target_length]
    else:
        audio = np.pad(audio, (0, max(0, target_length - len(audio))), mode='constant')
    
    # Extract Mel spectrogram features
    mel_spec = librosa.feature.melspectrogram(
        y=audio, 
        sr=SAMPLE_RATE,
        n_mels=FEATURE_DIM,
        hop_length=512
    )
    
    # Convert to log scale
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    
    # Normalize
    log_mel_spec = (log_mel_spec - log_mel_spec.mean()) / (log_mel_spec.std() + 1e-8)
    
    return log_mel_spec

# Feature extraction for training data
def extract_and_save_features(df, output_dir, is_train=True):
    os.makedirs(output_dir, exist_ok=True)
    
    features_list = []
    labels_list = []
    ids_list = []
    
    for i, row in enumerate(tqdm(df.itertuples(), total=len(df), desc="Extracting features")):
        file_id = row.id
        audio, sr = load_audio(file_id)
        
        if audio is None:
            continue
            
        try:
            features = extract_features(audio, sr)
            if features is None:
                continue
                
            features_list.append(features)
            
            if is_train:
                labels_list.append(row.is_tts)
            else:
                labels_list.append(-1)  # For test set
            
            ids_list.append(file_id)
            
            # Save in batches to avoid memory issues
            if (i + 1) % 1000 == 0 or i == len(df) - 1:
                batch_idx = i // 1000
                np.save(f"{output_dir}/features_batch_{batch_idx}.npy", np.array(features_list))
                np.save(f"{output_dir}/labels_batch_{batch_idx}.npy", np.array(labels_list))
                np.save(f"{output_dir}/ids_batch_{batch_idx}.npy", np.array(ids_list))
                
                features_list = []
                labels_list = []
                ids_list = []
                
        except Exception as e:
            print(f"Error processing audio {file_id}: {e}")
            continue

In [8]:
import librosa
import numpy as np

# Sample a few files to check duration
sample_ids = train_df['id'].sample(100).values
durations = []

for file_id in sample_ids:
    file_path = os.path.join(AUDIO_DIR, f"{file_id}.wav")
    try:
        audio, sr = librosa.load(file_path, sr=None)
        duration = len(audio) / sr
        durations.append(duration)
    except:
        pass

print(f"Average duration: {np.mean(durations):.2f} seconds")
print(f"Max duration: {np.max(durations):.2f} seconds")
print(f"Min duration: {np.min(durations):.2f} seconds")

Average duration: 6.55 seconds
Max duration: 12.00 seconds
Min duration: 1.13 seconds


In [9]:

# Create directories for features
os.makedirs("features", exist_ok=True)
train_features_dir = "features/train"
test_features_dir = "features/test"

# Extract and save features
print("Extracting features for training data...")
extract_and_save_features(train_df, train_features_dir, is_train=True)
print("Extracting features for test data...")
extract_and_save_features(test_df, test_features_dir, is_train=False)

Extracting features for training data...


Extracting features:   0%|          | 0/31102 [00:00<?, ?it/s]

Extracting features for test data...


Extracting features:   0%|          | 0/2635 [00:00<?, ?it/s]

In [10]:
# Function to load saved features
def load_features(features_dir):
    all_features = []
    all_labels = []
    all_ids = []
    
    feature_files = [f for f in os.listdir(features_dir) if f.startswith('features_batch_')]
    
    for batch_idx in range(len(feature_files)):
        features = np.load(f"{features_dir}/features_batch_{batch_idx}.npy")
        labels = np.load(f"{features_dir}/labels_batch_{batch_idx}.npy")
        ids = np.load(f"{features_dir}/ids_batch_{batch_idx}.npy")
        
        all_features.append(features)
        all_labels.append(labels)
        all_ids.append(ids)
    
    return np.vstack(all_features), np.concatenate(all_labels), np.concatenate(all_ids)

In [11]:
# Load features
print("Loading saved features...")
train_features, train_labels, train_ids = load_features(train_features_dir)
test_features, _, test_ids = load_features(test_features_dir)

# Split training data for validation
train_features, val_features, train_labels, val_labels = train_test_split(
    train_features, train_labels, test_size=0.1, random_state=SEED, stratify=train_labels
)

print(f"Training features shape: {train_features.shape}")
print(f"Validation features shape: {val_features.shape}")
print(f"Test features shape: {test_features.shape}")

Loading saved features...
Training features shape: (27991, 256, 251)
Validation features shape: (3111, 256, 251)
Test features shape: (2635, 256, 251)


In [12]:
# Custom Dataset
class AudioFeatureDataset(Dataset):
    def __init__(self, features, labels=None):
        self.features = features
        self.labels = labels
        
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        feature = torch.FloatTensor(self.features[idx])
        
        if self.labels is not None:
            label = torch.FloatTensor([self.labels[idx]])
            return feature, label
        return feature

In [13]:
import torch.nn.functional as F

# Define the ChannelAttention class
class ChannelAttention(nn.Module):
    def __init__(self, in_channels, reduction_ratio=16):
        super(ChannelAttention, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)
        self.fc = nn.Sequential(
            nn.Conv2d(in_channels, in_channels // reduction_ratio, 1, bias=False),
            nn.ReLU(),
            nn.Conv2d(in_channels // reduction_ratio, in_channels, 1, bias=False)
        )
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        avg_out = self.fc(self.avg_pool(x))
        max_out = self.fc(self.max_pool(x))
        out = avg_out + max_out
        return self.sigmoid(out) * x
        
class DeepfakeDetector(nn.Module):
    def __init__(self, input_dim=FEATURE_DIM):
        super(DeepfakeDetector, self).__init__()
        
        # Improved CNN blocks with residual connections
        self.conv1 = self._make_layer(1, 32)
        self.conv2 = self._make_layer(32, 64)
        self.conv3 = self._make_layer(64, 128)
        self.conv4 = self._make_layer(128, 256)
        
        # Attention mechanism
        self.channel_attention = ChannelAttention(256)
        
        # Global pooling instead of flatten
        self.global_pool = nn.AdaptiveAvgPool2d(1)
        
        # Improved classifier
        self.classifier = nn.Sequential(
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(64, 1)
        )
        
    def _make_layer(self, in_channels, out_channels):
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2)
        )
        
    def forward(self, x):
        # Add channel dimension
        x = x.unsqueeze(1)
        
        # CNN layers with residual connections
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        
        # Apply attention
        x = self.channel_attention(x)
        
        # Global pooling and classification
        x = self.global_pool(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        
        return torch.sigmoid(x)


In [14]:
# Create data loaders
train_dataset = AudioFeatureDataset(train_features, train_labels)
val_dataset = AudioFeatureDataset(val_features, val_labels)
test_dataset = AudioFeatureDataset(test_features)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Initialize model, loss, and optimizer
model = DeepfakeDetector().to(device)
criterion = nn.BCELoss()

optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=0.5, weight_decay=1e-4)

# Create Adam optimizer with recommended parameters
# optimizer = torch.optim.Adam(
#     model.parameters(),
#     lr=0.0001,           # Lower learning rate (0.0001-0.001) works well for deepfake detection
#     betas=(0.9, 0.999),  # Default momentum parameters work well for most cases
#     eps=1e-8,            # Default numerical stability constant
#     weight_decay=1e-5,   # Light regularization to prevent overfitting
#     amsgrad=False        # Standard setting (disable AMSGrad variant)
# )

# Optional: Create a learning rate scheduler to improve training
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.005,                      
    steps_per_epoch=len(train_loader),  
    epochs=11,                          
    pct_start=0.2,                      
    anneal_strategy='cos',              
    div_factor=10.0,                    
    final_div_factor=1000.0             
)

# Add a learning rate scheduler
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
#     optimizer, 
#     mode='max',  # Since you're likely monitoring AUC or accuracy
#     factor=0.3,  # This is reasonable
#     patience=7,  # Consider increasing slightly for more stability
#     verbose=True,
#     min_lr=1e-6  # Add a minimum learning rate to prevent it from becoming too small
# )

In [15]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
import numpy as np

# Training function
def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    all_predictions = []
    all_targets = []
    all_pred_classes = []
    
    for features, labels in tqdm(dataloader, desc="Training"):
        features, labels = features.to(device), labels.to(device)
        
        # Forward pass
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * features.size(0)
        all_predictions.extend(outputs.detach().cpu().numpy())
        all_targets.extend(labels.cpu().numpy())
        
        # For accuracy, precision, and recall, we need class predictions (0 or 1)
        pred_classes = (outputs > 0.5).float().cpu().numpy()
        all_pred_classes.extend(pred_classes)
    
    epoch_loss = running_loss / len(dataloader.dataset)
    
    # Convert lists to numpy arrays for metric calculation
    all_predictions = np.array(all_predictions)
    all_targets = np.array(all_targets)
    all_pred_classes = np.array(all_pred_classes)
    
    # Calculate metrics
    auc_score = roc_auc_score(all_targets, all_predictions)
    accuracy = accuracy_score(all_targets, all_pred_classes)
    precision = precision_score(all_targets, all_pred_classes, average='weighted', zero_division=0)
    recall = recall_score(all_targets, all_pred_classes, average='weighted', zero_division=0)
    
    return epoch_loss, auc_score, accuracy, precision, recall

# Validation function
def validate(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    all_predictions = []
    all_targets = []
    all_pred_classes = []
    
    with torch.no_grad():
        for features, labels in tqdm(dataloader, desc="Validating"):
            features, labels = features.to(device), labels.to(device)
            
            outputs = model(features)
            loss = criterion(outputs, labels)
            
            running_loss += loss.item() * features.size(0)
            all_predictions.extend(outputs.cpu().numpy())
            all_targets.extend(labels.cpu().numpy())
            
            # For accuracy, precision, and recall
            pred_classes = (outputs > 0.5).float().cpu().numpy()
            all_pred_classes.extend(pred_classes)
    
    val_loss = running_loss / len(dataloader.dataset)
    
    # Convert lists to numpy arrays for metric calculation
    all_predictions = np.array(all_predictions)
    all_targets = np.array(all_targets)
    all_pred_classes = np.array(all_pred_classes)
    
    # Calculate metrics
    val_auc = roc_auc_score(all_targets, all_predictions)
    val_accuracy = accuracy_score(all_targets, all_pred_classes)
    val_precision = precision_score(all_targets, all_pred_classes, average='weighted', zero_division=0)
    val_recall = recall_score(all_targets, all_pred_classes, average='weighted', zero_division=0)
    
    return val_loss, val_auc, val_accuracy, val_precision, val_recall

In [16]:
# Training loop
print("Starting training...")
best_val_auc = 0.0

for epoch in range(NUM_EPOCHS):
    # Train one epoch
    train_loss, train_auc, train_acc, train_precision, train_recall = train_epoch(
        model, train_loader, criterion, optimizer, device
    )
    
    # Validate
    val_loss, val_auc, val_acc, val_precision, val_recall = validate(
        model, val_loader, criterion, device
    )
    
    # Add scheduler step, using validation AUC
    scheduler.step(val_auc)
    
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}")
    print(f"Train - Loss: {train_loss:.4f}, AUC: {train_auc:.4f}, Acc: {train_acc:.4f}, Prec: {train_precision:.4f}, Recall: {train_recall:.4f}")
    print(f"Val   - Loss: {val_loss:.4f}, AUC: {val_auc:.4f}, Acc: {val_acc:.4f}, Prec: {val_precision:.4f}, Recall: {val_recall:.4f}")
    
    # Save the best model
    if val_auc > best_val_auc:
        best_val_auc = val_auc
        torch.save(model.state_dict(), "best_model.pth")
        print(f"Model saved with validation AUC: {val_auc:.4f}")


# Load the best model
model.load_state_dict(torch.load("best_model.pth"))

# Predict on test data
model.eval()
test_predictions = []

Starting training...


Training:   0%|          | 0/875 [00:00<?, ?it/s]

Validating:   0%|          | 0/98 [00:00<?, ?it/s]

Epoch 1/11
Train - Loss: 0.2578, AUC: 0.9660, Acc: 0.8869, Prec: 0.8869, Recall: 0.8869
Val   - Loss: 0.1749, AUC: 0.9996, Acc: 0.9241, Prec: 0.9340, Recall: 0.9241
Model saved with validation AUC: 0.9996


Training:   0%|          | 0/875 [00:00<?, ?it/s]

Validating:   0%|          | 0/98 [00:00<?, ?it/s]

Epoch 2/11
Train - Loss: 0.0523, AUC: 0.9985, Acc: 0.9855, Prec: 0.9855, Recall: 0.9855
Val   - Loss: 0.0372, AUC: 0.9999, Acc: 0.9884, Prec: 0.9887, Recall: 0.9884
Model saved with validation AUC: 0.9999


Training:   0%|          | 0/875 [00:00<?, ?it/s]

Validating:   0%|          | 0/98 [00:00<?, ?it/s]

Epoch 3/11
Train - Loss: 0.0275, AUC: 0.9995, Acc: 0.9932, Prec: 0.9932, Recall: 0.9932
Val   - Loss: 0.0114, AUC: 1.0000, Acc: 0.9974, Prec: 0.9974, Recall: 0.9974
Model saved with validation AUC: 1.0000


Training:   0%|          | 0/875 [00:00<?, ?it/s]

Validating:   0%|          | 0/98 [00:00<?, ?it/s]

Epoch 4/11
Train - Loss: 0.0203, AUC: 0.9997, Acc: 0.9949, Prec: 0.9949, Recall: 0.9949
Val   - Loss: 0.0409, AUC: 1.0000, Acc: 0.9855, Prec: 0.9859, Recall: 0.9855
Model saved with validation AUC: 1.0000


Training:   0%|          | 0/875 [00:00<?, ?it/s]

Validating:   0%|          | 0/98 [00:00<?, ?it/s]

Epoch 5/11
Train - Loss: 0.0140, AUC: 0.9999, Acc: 0.9965, Prec: 0.9965, Recall: 0.9965
Val   - Loss: 0.0096, AUC: 1.0000, Acc: 0.9971, Prec: 0.9971, Recall: 0.9971
Model saved with validation AUC: 1.0000


Training:   0%|          | 0/875 [00:00<?, ?it/s]

Validating:   0%|          | 0/98 [00:00<?, ?it/s]

Epoch 6/11
Train - Loss: 0.0132, AUC: 0.9999, Acc: 0.9964, Prec: 0.9964, Recall: 0.9964
Val   - Loss: 0.0053, AUC: 1.0000, Acc: 0.9987, Prec: 0.9987, Recall: 0.9987
Model saved with validation AUC: 1.0000


Training:   0%|          | 0/875 [00:00<?, ?it/s]

Validating:   0%|          | 0/98 [00:00<?, ?it/s]

Epoch 7/11
Train - Loss: 0.0086, AUC: 1.0000, Acc: 0.9980, Prec: 0.9980, Recall: 0.9980
Val   - Loss: 0.0018, AUC: 1.0000, Acc: 1.0000, Prec: 1.0000, Recall: 1.0000
Model saved with validation AUC: 1.0000


Training:   0%|          | 0/875 [00:00<?, ?it/s]

Validating:   0%|          | 0/98 [00:00<?, ?it/s]

Epoch 8/11
Train - Loss: 0.0092, AUC: 0.9999, Acc: 0.9973, Prec: 0.9973, Recall: 0.9973
Val   - Loss: 0.0357, AUC: 1.0000, Acc: 0.9891, Prec: 0.9893, Recall: 0.9891


Training:   0%|          | 0/875 [00:00<?, ?it/s]

Validating:   0%|          | 0/98 [00:00<?, ?it/s]

Epoch 9/11
Train - Loss: 0.0081, AUC: 0.9999, Acc: 0.9979, Prec: 0.9979, Recall: 0.9979
Val   - Loss: 0.0022, AUC: 1.0000, Acc: 0.9997, Prec: 0.9997, Recall: 0.9997


Training:   0%|          | 0/875 [00:00<?, ?it/s]

Validating:   0%|          | 0/98 [00:00<?, ?it/s]

Epoch 10/11
Train - Loss: 0.0071, AUC: 1.0000, Acc: 0.9980, Prec: 0.9980, Recall: 0.9980
Val   - Loss: 0.0022, AUC: 1.0000, Acc: 0.9994, Prec: 0.9994, Recall: 0.9994


Training:   0%|          | 0/875 [00:00<?, ?it/s]

Validating:   0%|          | 0/98 [00:00<?, ?it/s]

Epoch 11/11
Train - Loss: 0.0071, AUC: 1.0000, Acc: 0.9979, Prec: 0.9979, Recall: 0.9979
Val   - Loss: 0.0262, AUC: 1.0000, Acc: 0.9897, Prec: 0.9899, Recall: 0.9897


In [17]:
with torch.no_grad():
    for features in tqdm(test_loader, desc="Predicting on test data"):
        features = features.to(device)
        outputs = model(features)
        test_predictions.extend(outputs.cpu().numpy())

# Create submission file
submission = pd.DataFrame({
    'id': test_ids,
    'is_tts': np.array(test_predictions).flatten()
})

# Ensure predictions are probabilities between 0 and 1
submission['is_tts'] = submission['is_tts'].clip(0, 1)

# Save submission file
submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")

Predicting on test data:   0%|          | 0/83 [00:00<?, ?it/s]

Submission file created: submission.csv
