In [25]:
import os
import json
import torch
import soundfile as sf
from datasets import Dataset, Audio
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, TrainingArguments, Trainer
from dataclasses import dataclass
from typing import List, Dict, Any
import random
import re
import gc

# CRITICAL: Ensure CUDA is working from previous cell
assert torch.cuda.is_available(), "CUDA must be available! Run the CUDA fix cell first."
assert torch.cuda.device_count() > 0, "No CUDA devices found!"

print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")

# Memory management
torch.cuda.empty_cache()
gc.collect()

# Pashto Phoneme Tokenizer (same as before, but with CUDA-safe methods)
class PashtoPhonemeTokenizer:
    """CUDA-safe Pashto phoneme tokenizer"""
    
    def __init__(self):
        self.phonemes = {
            '<PAD>': '<PAD>', '<UNK>': '<UNK>', '<BOS>': '<BOS>', '<EOS>': '<EOS>', 
            '<SIL>': '<SIL>', '<WB>': '<WB>',
            'a': 'a', 'aː': 'aː', 'e': 'e', 'eː': 'eː', 'i': 'i', 'iː': 'iː',
            'o': 'o', 'oː': 'oː', 'u': 'u', 'uː': 'uː', 'ə': 'ə',
            'p': 'p', 'b': 'b', 't': 't', 'd': 'd', 'ʈ': 'ʈ', 'ɖ': 'ɖ',
            'k': 'k', 'g': 'g', 'q': 'q', 'ʔ': 'ʔ', 'f': 'f', 'v': 'v',
            's': 's', 'z': 'z', 'ʃ': 'ʃ', 'ʒ': 'ʒ', 'ʂ': 'ʂ', 'ʐ': 'ʐ',
            'x': 'x', 'ɣ': 'ɣ', 'h': 'h', 'ʣ': 'ʣ', 'ʦ': 'ʦ', 'm': 'm',
            'n': 'n', 'ɳ': 'ɳ', 'ŋ': 'ŋ', 'l': 'l', 'r': 'r', 'ɽ': 'ɽ',
            'j': 'j', 'w': 'w',
        }
        
        self.consonant_map = {
            'پ': 'p', 'ب': 'b', 'ت': 't', 'د': 'd', 'ټ': 'ʈ', 'ډ': 'ɖ',
            'ک': 'k', 'ګ': 'g', 'ق': 'q', 'ع': 'ʔ', 'ف': 'f', 'س': 's',
            'ز': 'z', 'ش': 'ʃ', 'ژ': 'ʒ', 'ښ': 'ʂ', 'ږ': 'ʐ', 'خ': 'x',
            'غ': 'ɣ', 'ح': 'h', 'ه': 'h', 'ځ': 'ʣ', 'څ': 'ʦ', 'م': 'm',
            'ن': 'n', 'ڼ': 'ɳ', 'ل': 'l', 'ر': 'r', 'ړ': 'ɽ', 'ي': 'j', 'و': 'w'
        }
        
        self.vowel_map = {
            'ا': 'a', 'آ': 'aː', 'ې': 'e', 'ي': 'i', 'و': 'u', 'ه': 'ə'
        }
        
        self.phoneme_to_id = {p: i for i, p in enumerate(self.phonemes.keys())}
        self.id_to_phoneme = {i: p for p, i in self.phoneme_to_id.items()}
        self.vocab_size = len(self.phonemes)
        
        self.pad_token_id = self.phoneme_to_id['<PAD>']
        self.unk_token_id = self.phoneme_to_id['<UNK>']
        self.bos_token_id = self.phoneme_to_id['<BOS>']
        self.eos_token_id = self.phoneme_to_id['<EOS>']
        
        self.model_input_names = ['input_ids']
        self.model_max_length = 150  # Shorter sequences for stability
    
    def normalize_text(self, text: str) -> str:
        text = re.sub(r'[\u064B-\u065F\u0670\u06D6-\u06ED]', '', text)
        text = re.sub(r'[\u200c\u200d\u200e\u200f]', '', text)
        text = text.replace('،', ' ').replace('؟', ' ').replace('؛', ' ').replace('!', ' ')
        text = re.sub(r'\s+', ' ', text)
        return text.strip()
    
    def text_to_phonemes(self, text: str) -> List[str]:
        text = self.normalize_text(text)
        words = text.split()
        phonemes = ['<BOS>']
        
        for word_idx, word in enumerate(words):
            if word_idx > 0:
                phonemes.append('<WB>')
            phonemes.extend(self.word_to_phonemes(word))
        
        phonemes.append('<EOS>')
        return phonemes[:self.model_max_length]  # Truncate long sequences
    
    def word_to_phonemes(self, word: str) -> List[str]:
        phonemes = []
        for char in word:
            if char in self.consonant_map:
                phonemes.append(self.consonant_map[char])
            elif char in self.vowel_map:
                phonemes.append(self.vowel_map[char])
            elif char == 'و':
                phonemes.append('w')  # Simplified waw handling
            elif char.strip():
                phonemes.append('<UNK>')
        return phonemes
    
    def __call__(self, texts, padding=True, truncation=True, max_length=None, return_tensors="pt"):
        if max_length is None:
            max_length = self.model_max_length
        
        if isinstance(texts, str):
            texts = [texts]
        
        all_input_ids = []
        all_attention_masks = []
        
        for text in texts:
            phonemes = self.text_to_phonemes(text)
            input_ids = [self.phoneme_to_id.get(p, self.unk_token_id) for p in phonemes]
            
            # Ensure all token IDs are valid
            input_ids = [min(id, self.vocab_size - 1) for id in input_ids]
            
            if truncation and len(input_ids) > max_length:
                input_ids = input_ids[:max_length-1] + [self.eos_token_id]
            
            attention_mask = [1] * len(input_ids)
            
            if padding and len(input_ids) < max_length:
                padding_length = max_length - len(input_ids)
                input_ids.extend([self.pad_token_id] * padding_length)
                attention_mask.extend([0] * padding_length)
            
            all_input_ids.append(input_ids)
            all_attention_masks.append(attention_mask)
        
        result = {
            'input_ids': all_input_ids,
            'attention_mask': all_attention_masks
        }
        
        if return_tensors == "pt":
            # Create tensors on CPU first, then move to GPU in training
            result = {k: torch.tensor(v, dtype=torch.long) for k, v in result.items()}
        
        return result
    
    def get_vocab(self):
        return self.phoneme_to_id.copy()

# Custom processor
class CUDASafeSpeechT5Processor:
    def __init__(self, phoneme_tokenizer):
        self.original_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        self.tokenizer = phoneme_tokenizer
        self.feature_extractor = self.original_processor.feature_extractor
    
    def save_pretrained(self, save_directory):
        os.makedirs(save_directory, exist_ok=True)
        self.feature_extractor.save_pretrained(save_directory)
        vocab_file = os.path.join(save_directory, "phoneme_vocab.json")
        with open(vocab_file, 'w', encoding='utf-8') as f:
            json.dump(self.tokenizer.get_vocab(), f, ensure_ascii=False, indent=2)

# Initialize components
print("Initializing phoneme tokenizer...")
phoneme_tokenizer = PashtoPhonemeTokenizer()
processor = CUDASafeSpeechT5Processor(phoneme_tokenizer)

# Test tokenizer
test_text = "ښه راغلاست"
test_result = phoneme_tokenizer(test_text)
print(f"Test: {test_text} -> {test_result['input_ids'].shape}")

# Setup paths
PASHTO_DATA_JSON = r"C:\Users\PC\Music\jj\new6.json"
LOCAL_AUDIO_DIR = r"C:\Users\PC\Downloads\AudioFiles"
OUTPUT_DIR = "s2t5_pashto_cuda_tts"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load model with CUDA safety
print("Loading SpeechT5 model...")

# Load on CPU first to avoid CUDA issues during initialization
model = SpeechT5ForTextToSpeech.from_pretrained(
    "microsoft/speecht5_tts", 
    torch_dtype=torch.float32,
    device_map=None  # Load on CPU first
)

print(f"Original vocab size: {model.config.vocab_size}")
print(f"New vocab size: {phoneme_tokenizer.vocab_size}")

# Resize embeddings
model.resize_token_embeddings(phoneme_tokenizer.vocab_size)

# Update config
model.config.vocab_size = phoneme_tokenizer.vocab_size
model.config.pad_token_id = phoneme_tokenizer.pad_token_id
model.config.bos_token_id = phoneme_tokenizer.bos_token_id
model.config.eos_token_id = phoneme_tokenizer.eos_token_id

# Move to GPU safely
print("Moving model to GPU...")
try:
    model = model.to('cuda:0')
    torch.cuda.empty_cache()
    print("✅ Model on GPU")
except Exception as e:
    print(f"❌ GPU move failed: {e}")
    raise

# Dataset loading (same as before but smaller)
def load_pashto_dataset(json_file_path: str, max_samples: int = 50):
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    if max_samples and len(data) > max_samples:
        random.seed(42)
        data = random.sample(data, max_samples)
    
    return Dataset.from_dict({
        'audio_url': [item['file_url'] for item in data],
        'text': [item['sentence'] for item in data],
        'speaker_id': [f"{item['gender']}_{item['accent']}" for item in data]
    })

def load_local_audio(example, idx=None):
    filename = os.path.basename(example['audio_url'])
    local_path = os.path.join(LOCAL_AUDIO_DIR, filename)
    
    if os.path.isfile(local_path):
        try:
            audio_array, sample_rate = sf.read(local_path)
            if len(audio_array.shape) > 1:
                audio_array = audio_array.mean(axis=1)
            
            # Limit audio length to prevent memory issues
            max_samples = 16000 * 4  # 4 seconds max
            if len(audio_array) > max_samples:
                audio_array = audio_array[:max_samples]
            
            example['audio'] = {'array': audio_array, 'sampling_rate': sample_rate}
        except Exception as e:
            print(f"Audio error {filename}: {e}")
            example['audio'] = None
    else:
        example['audio'] = None
    
    return example

# Load dataset
print("Loading dataset...")
dataset = load_pashto_dataset(PASHTO_DATA_JSON, max_samples=50)
dataset = dataset.map(load_local_audio, with_indices=True)
dataset = dataset.filter(lambda x: x['audio'] is not None)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

print(f"Final dataset size: {len(dataset)}")

# CUDA-safe data collator
@dataclass
class CUDASafeDataCollator:
    processor: CUDASafeSpeechT5Processor
    
    def __call__(self, features):
        valid_features = [f for f in features if f['audio'] is not None]
        texts = [f["text"] for f in valid_features]
        
        # Tokenize on CPU
        text_inputs = self.processor.tokenizer(
            texts, padding=True, truncation=True, 
            max_length=100, return_tensors="pt"
        )
        
        # Process audio with memory limits
        max_audio_len = 16000 * 3  # 3 seconds max
        audio_features = []
        
        for f in valid_features:
            audio = torch.tensor(f["audio"]["array"], dtype=torch.float32)
            if len(audio) > max_audio_len:
                audio = audio[:max_audio_len]
            audio_features.append(audio)
        
        # Pad audio
        if audio_features:
            max_len = min(max([len(a) for a in audio_features]), max_audio_len)
            padded_audio = []
            
            for audio in audio_features:
                if len(audio) < max_len:
                    padding = torch.zeros(max_len - len(audio))
                    padded_audio.append(torch.cat([audio, padding]))
                else:
                    padded_audio.append(audio[:max_len])
            
            labels = torch.stack(padded_audio, dim=0)
        else:
            labels = torch.zeros(len(valid_features), 1000)
        
        speaker_embeddings = torch.zeros(len(valid_features), 512)
        
        return {
            "input_ids": text_inputs["input_ids"],
            "attention_mask": text_inputs["attention_mask"], 
            "labels": labels,
            "speaker_embeddings": speaker_embeddings,
        }

data_collator = CUDASafeDataCollator(processor=processor)

# CUDA-optimized training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,  # Minimal batch size
    gradient_accumulation_steps=8,
    learning_rate=1e-5,
    num_train_epochs=2,
    logging_steps=1,
    save_steps=10,
    warmup_steps=2,
    
    # CUDA-specific settings
    fp16=False,  # Disable mixed precision for stability
    bf16=False,
    dataloader_drop_last=True,
    dataloader_num_workers=0,  # No multiprocessing
    
    # Memory management
    max_grad_norm=0.5,  # Gradient clipping
    remove_unused_columns=False,
    report_to=None,
    save_safetensors=False,
    
    # Reproducibility
    seed=42,
    data_seed=42,
)

# Create trainer with error handling
print("Creating trainer...")
try:
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
    )
    print("✅ Trainer created successfully")
    
    # Clear cache before training
    torch.cuda.empty_cache()
    gc.collect()
    
    print("Starting training...")
    print(f"GPU Memory before training: {torch.cuda.memory_allocated()/1024**3:.2f}GB")
    
    # Start training with monitoring
    trainer.train()
    
    print("✅ Training completed!")
    
except torch.cuda.OutOfMemoryError as e:
    print(f"❌ CUDA OOM: {e}")
    print("Try reducing batch_size to 1 and gradient_accumulation_steps")
    
except RuntimeError as e:
    if "device-side assert" in str(e):
        print(f"❌ CUDA device assert: {e}")
        print("Check token IDs are within vocabulary range")
    else:
        print(f"❌ Runtime error: {e}")
        
except Exception as e:
    print(f"❌ Training error: {e}")
    import traceback
    traceback.print_exc()

finally:
    # Save whatever we can
    try:
        print("Saving models...")
        trainer.save_model(os.path.join(OUTPUT_DIR, "final_model"))
        processor.save_pretrained(os.path.join(OUTPUT_DIR, "final_processor"))
        print(f"✅ Models saved to {OUTPUT_DIR}")
    except Exception as e:
        print(f"Save error: {e}")
    
    # Final cleanup
    torch.cuda.empty_cache()
    gc.collect()

print("Training process complete!")

✅ Using GPU: NVIDIA GeForce RTX 5060 Ti
Initializing phoneme tokenizer...
Test: ښه راغلاست -> torch.Size([1, 150])
Loading SpeechT5 model...
Original vocab size: 81
New vocab size: 49
Moving model to GPU...
❌ GPU move failed: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.



RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
