# 🎙️ DEEPFAKE-AUDIO: Real-Time Voice Cloning

Clone any voice in seconds! Choose your input method below:
- 🔴 **Record**: Use your microphone directly.
- 📂 **Upload**: Upload a WAV/MP3 file.
- 🎵 **Preset**: Choose a celebrity voice.

---

## ☁️ Cloud Environment Setup (Colab/Kaggle)
Run this cell ONLY if you are using Google Colab or Kaggle. It will clone the repository and install dependencies.

In [None]:
import os
import sys

# Detect Google Colab and, when running there, clone/update the repository
# and install the system and Python dependencies. Outside IPython the name
# `get_ipython` does not exist, which is handled as "local environment".
try:
    shell = get_ipython()
    if 'google.colab' in str(shell):
        print("üíª Running on Google Colab. Setting up...")

        # Use an absolute path for both the existence check and the clone
        # target. A relative check would miss the repository when this cell is
        # re-run after the chdir below and would clone a nested copy.
        repo_dir = "/content/TEST"
        if not os.path.exists(repo_dir):
            shell.system(f"git clone https://github.com/Amey-Thakur/TEST {repo_dir}")

        os.chdir(repo_dir)
        print("üîÑ Pulling latest changes from GitHub...")
        shell.system("git pull")

        shell.system("apt-get install -y libsndfile1")

        print("üì¶ Installing dependencies...")
        requirements = [
            "librosa==0.9.2",
            "unidecode",
            "webrtcvad",
            "inflect",
            "umap-learn",
            "scikit-learn>=1.3",
            "tqdm",
            "scipy",
            "matplotlib>=3.7",
            "Pillow>=10.2",
            "soundfile",
            "huggingface_hub",
        ]
        # Each requirement is quoted: the command goes through a shell, where
        # an unquoted ">" in a version specifier is an output redirection.
        quoted_requirements = " ".join(f'"{req}"' for req in requirements)
        shell.system(f"pip install {quoted_requirements}")
        print("‚úÖ Environment setup complete.")
    else:
        print("üè† Running in local or custom environment.")
except NameError:
    print("üè† Running in local or custom environment.")

## 1️⃣ Setup Environment and Download Models
**Sources Checked:**
1. `Dataset/` (Repository Local)
2. Kaggle Dataset (`/kaggle/input/deepfakeaudio/`)
3. HuggingFace Auto-Download (Fallback)

In [None]:
import sys
import os
from pathlib import Path
import zipfile
import shutil

# Make the repository's "Source Code" directory importable so later cells can
# import the encoder / synthesizer / vocoder packages from it.
source_path = os.path.abspath("Source Code")
if source_path not in sys.path:
    sys.path.append(source_path)

print(f"üìÇ Current Working Directory: {os.getcwd()}")
print(f"‚úÖ Source code path added: {source_path}")

extract_path = "pretrained_models"
zip_path = "Dataset/pretrained.zip"  # Not read in this cell; kept as a notebook global.

# Ensure the download/extraction directory exists.
os.makedirs(extract_path, exist_ok=True)

# --- Model availability check / auto-download ---
print("‚¨áÔ∏è Checking model availability...")

core_models = ["encoder.pt", "synthesizer.pt", "vocoder.pt"]

# Priority 1: the three core checkpoints shipped in the repository's Dataset/.
dataset_models_present = all(
    os.path.exists(os.path.join("Dataset", m)) for m in core_models
)
# Priority 2: the same checkpoints provided by an attached Kaggle dataset.
kaggle_models_present = all(
    os.path.exists(os.path.join("/kaggle/input/deepfakeaudio", m))
    for m in core_models
)

if dataset_models_present:
    print("‚úÖ Found local models in Dataset/. Using them.")
elif kaggle_models_present:
    # No download is needed when the Kaggle dataset already holds all three
    # checkpoints.
    print("Found models in Kaggle dataset (/kaggle/input/deepfakeaudio/). Using them.")
else:
    print("‚ö†Ô∏è Models missing in Dataset/. Checking backups...")

    # Priority 3: fall back to the project's auto-download helper.
    # Best effort: a failure is reported but does not abort the cell.
    try:
        from utils.default_models import ensure_default_models
        ensure_default_models(Path(extract_path))
        print("‚úÖ Models ensured/downloaded via HuggingFace.")
    except Exception as e:
        print(f"‚ö†Ô∏è Could not auto-download models: {e}")

## 2️⃣ Load Models
Make sure you are using a T4 GPU Runtime.

In [None]:
from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder
import numpy as np
import torch
from pathlib import Path

# Pick the compute device: CUDA when PyTorch reports it available, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"üéØ Using device: {device}")

def resolve_checkpoint(component_name: str, legacy_path_suffix: str) -> "Path | None":
    """Locate the checkpoint for one model component.

    Locations are probed in priority order and the first hit wins:

    1. ``Dataset/<component>.pt`` (repository-local).
    2. ``/kaggle/input/deepfakeaudio/<component>.pt`` (Kaggle dataset).
    3. ``pretrained_models/default/<component>.pt`` (auto-downloaded).
    4. ``pretrained_models/<legacy_path_suffix>`` (legacy layout). If this is
       a directory, the first ``.pt`` file directly inside it is preferred,
       then the first one anywhere below it, otherwise the directory itself
       is returned.

    Args:
        component_name: Component name; it is lower-cased to build the
            ``<component>.pt`` file name.
        legacy_path_suffix: Path, relative to ``pretrained_models``, of the
            legacy checkpoint file or directory.

    Returns:
        The resolved ``Path``, or ``None`` (after printing a warning) when no
        location matched.
    """
    filename = f"{component_name.lower()}.pt"

    # (log label, candidate path) pairs for the three single-file locations.
    prioritized = (
        ("üü¢ Using local model from Dataset/", Path("Dataset") / filename),
        ("üü¢ Using Kaggle dataset model", Path("/kaggle/input/deepfakeaudio") / filename),
        ("üü¢ Using auto-downloaded model", Path("pretrained_models/default") / filename),
    )
    for label, candidate in prioritized:
        if candidate.exists():
            print(f"{label}: {candidate}")
            return candidate

    # Legacy extraction layout.
    legacy_p = Path("pretrained_models") / legacy_path_suffix
    if legacy_p.exists():
        if legacy_p.is_dir():
            # Shallow matches take precedence over nested ones. Sorting makes
            # the choice deterministic instead of filesystem-order dependent.
            for search in (legacy_p.glob, legacy_p.rglob):
                matches = sorted(p for p in search("*.pt") if p.is_file())
                if matches:
                    return matches[0]
        return legacy_p

    print(f'‚ö†Ô∏è Warning: Checkpoint for {component_name} not found!')
    return None

print("‚è≥ Loading models...")


def _require_checkpoint(component_name, legacy_path_suffix):
    """Resolve a component's checkpoint, raising FileNotFoundError if none exists."""
    checkpoint = resolve_checkpoint(component_name, legacy_path_suffix)
    if checkpoint is None:
        # Fail with an explicit message instead of handing None to a loader.
        raise FileNotFoundError(
            f"no checkpoint found for {component_name}; "
            "run the setup/download cell first"
        )
    return checkpoint


# Resolve and load the three models in order. Any failure is reported rather
# than raised, so the notebook cell itself always completes.
try:
    # Load Encoder
    encoder_path = _require_checkpoint("Encoder", "encoder/saved_models")
    encoder.load_model(encoder_path)

    # Load Synthesizer
    synth_path = _require_checkpoint(
        "Synthesizer", "synthesizer/saved_models/logs-pretrained/taco_pretrained"
    )
    synthesizer = Synthesizer(synth_path)

    # Load Vocoder
    vocoder_path = _require_checkpoint("Vocoder", "vocoder/saved_models/pretrained")
    vocoder.load_model(vocoder_path)

    print("‚úÖ Models initialized successfully!")
except Exception as e:
    print(f"‚ùå Error initializing models: {e}")

## 3️⃣ Clone Your Voice!
Select your input method below.

In [None]:
import ipywidgets as widgets
from IPython.display import display, Javascript, Audio
from base64 import b64decode
import io
import librosa
import soundfile as sf

# `google.colab` only exists on Colab. Guard the import so the rest of this
# cell (presets, upload, cloning) still works elsewhere; only the JavaScript
# bridge call used for microphone recording fails, with a clear message.
try:
    from google.colab import output
except ImportError:
    class _ColabOutputUnavailable:
        """Stand-in for `google.colab.output` when not running on Colab."""

        def eval_js(self, *args, **kwargs):
            raise RuntimeError("google.colab is not available in this environment")

    output = _ColabOutputUnavailable()

# --- AUDIO RECORDING JS ---
# Browser-side snippet that defines a global `record(time)` function.
# `record` captures microphone audio with MediaRecorder for `time`
# milliseconds and resolves with the recording as a base64 data URL.
# The stop handler is registered before recording starts, and the stream's
# tracks are stopped afterwards so the microphone is released.
RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.srcElement.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  const recorder = new MediaRecorder(stream)
  const chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.onstop = async ()=>{
    stream.getTracks().forEach(track => track.stop())
    const blob = new Blob(chunks)
    const text = await b2text(blob)
    resolve(text)
  }
  recorder.start()
  await sleep(time)
  recorder.stop()
})"""

def record_audio(sec=10):
    """Record from the browser microphone and save the result to disk.

    Injects the ``RECORD`` JavaScript into the notebook output, asks the Colab
    bridge to evaluate ``record(<milliseconds>)``, decodes the base64 payload
    of the returned data URL and writes it to ``recording.wav``.

    Args:
        sec: Recording length in seconds.

    Returns:
        The path of the written file, ``'recording.wav'``.
    """
    # NOTE(review): the decoded bytes are written verbatim; whether they are
    # actually WAV-encoded depends on the browser's MediaRecorder -- confirm.
    destination = 'recording.wav'
    duration_ms = sec * 1000

    print("üî¥ Recording for %d seconds..." % sec)
    display(Javascript(RECORD))
    data_url = output.eval_js('record(%d)' % duration_ms)
    print("‚úÖ Recording complete.")

    # The data URL has the form "<header>,<base64 payload>".
    encoded_payload = data_url.split(',')[1]
    audio_bytes = b64decode(encoded_payload)
    with open(destination, 'wb') as recording_file:
        recording_file.write(audio_bytes)
    return destination

# --- UI SETUP ---
print("Select Input Method:")
tab = widgets.Tab()

# Tab 1: Presets
# Use the first candidate directory that exists and is non-empty; otherwise
# keep the repository default.
sample_roots = [
    "Source Code/samples",
    "Dataset/samples",
    "/kaggle/input/deepfakeaudio/samples"
]
samples_dir = "Source Code/samples"  # Default
for candidate in sample_roots:
    # isdir (not exists) so a stray file with this name cannot break listdir.
    if os.path.isdir(candidate) and os.listdir(candidate):
        samples_dir = candidate
        print(f"üìÇ Loading samples from: {candidate}")
        break

# Collect the audio presets. A missing samples directory yields an empty list
# instead of aborting the whole UI cell.
if os.path.isdir(samples_dir):
    preset_files = sorted(
        f for f in os.listdir(samples_dir)
        if f.lower().endswith((".wav", ".mp3"))
    )
else:
    preset_files = []
    print(f"Warning: no preset samples found (looked in: {', '.join(sample_roots)}).")

# Show this particular preset first when it is available.
if "Donald Trump.wav" in preset_files:
    preset_files.remove("Donald Trump.wav")
    preset_files.insert(0, "Donald Trump.wav")

dropdown = widgets.Dropdown(options=preset_files, description='Preset:')
tab1 = widgets.VBox([dropdown])

# Tab 2: Upload
uploader = widgets.FileUpload(accept='.wav,.mp3', multiple=False)
tab2 = widgets.VBox([uploader])

# Tab 3: Record
record_btn = widgets.Button(description="Start Recording (10s)", button_style='danger')
record_out = widgets.Output()
def on_record_click(b):
    """Handle a click on the record button.

    Disables the button, runs a 10-second recording with its output routed to
    ``record_out``, prints any error, and re-enables the button.

    Args:
        b: The clicked button, as passed by the widget callback (unused).
    """
    with record_out:
        record_btn.disabled = True
        try:
            record_audio(10)
        except Exception as e:
            print(f"Error: {e}. (Recording only works in Colab/Browser)")
        finally:
            # Re-enable even when the recording is interrupted by something
            # that is not an Exception (e.g. a kernel interrupt).
            record_btn.disabled = False
# Wire the record button to its handler and build the third tab.
record_btn.on_click(on_record_click)
tab3 = widgets.VBox([record_btn, record_out])

# Assemble the three tabs and label them in display order.
tab.children = [tab1, tab2, tab3]
_tab_titles = ('üéµ Presets', 'üìÇ Upload', 'üî¥ Record')
for _position, _title in enumerate(_tab_titles):
    tab.set_title(_position, _title)
display(tab)

# Text to synthesize, pre-filled with a demo sentence.
_text_layout = widgets.Layout(width='50%', height='100px')
text_input = widgets.Textarea(
    value='Hello! This is a real-time voice cloning test. The quality is truly amazing.',
    placeholder='Type something...',
    description='Text:',
    disabled=False,
    layout=_text_layout,
)

# Trigger button plus the output area that receives the cloning logs/result.
clone_btn = widgets.Button(description="Clone Voice! üöÄ", button_style='primary')
out = widgets.Output()

display(text_input, clone_btn, out)

def _first_upload(upload_value):
    """Return ``(filename, content)`` for the first file in a FileUpload value.

    Handles both shapes of ``FileUpload.value``: a dict keyed by filename
    (ipywidgets 7) and a sequence of dicts carrying ``name`` and ``content``
    (ipywidgets 8+).
    """
    if isinstance(upload_value, dict):
        fname, entry = next(iter(upload_value.items()))
        return fname, entry['content']
    entry = upload_value[0]
    return entry['name'], entry['content']


def run_cloning(b):
    """Clone the selected reference voice and play the synthesized text.

    The reference audio comes from the active tab: a preset file, an uploaded
    file (saved as ``uploaded_sample.wav``) or the previously saved
    ``recording.wav``. The audio is loaded with librosa, embedded with the
    encoder, turned into a spectrogram by the synthesizer for the text in
    ``text_input`` and converted to a waveform by the vocoder. All messages,
    errors and the resulting audio player are written to ``out``.

    Args:
        b: The clicked button, as passed by the widget callback (unused).
    """
    with out:
        out.clear_output()
        active_tab = tab.selected_index
        input_path = None

        try:
            if active_tab == 0:  # Preset
                if not dropdown.value:
                    raise ValueError("no preset sample is available")
                input_path = os.path.join(samples_dir, dropdown.value)
                print(f"Using Preset: {dropdown.value}")

            elif active_tab == 1:  # Upload
                if not uploader.value:
                    print("‚ùå Please upload a file first!")
                    return
                fname, content = _first_upload(uploader.value)
                # The upload is always stored under this fixed name,
                # regardless of its original extension.
                input_path = "uploaded_sample.wav"
                with open(input_path, "wb") as f:
                    f.write(content)
                print(f"Using Upload: {fname}")

            elif active_tab == 2:  # Record
                if not os.path.exists("recording.wav"):
                    print("‚ùå Please record audio first!")
                    return
                input_path = "recording.wav"
                print("Using Recording")

            else:
                # Avoid passing input_path=None to the audio loader.
                raise ValueError(f"unsupported input tab: {active_tab!r}")

            # Process
            print("‚è≥ Processing audio... (this may take a few seconds)")
            original_wav, sampling_rate = librosa.load(input_path)
            preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
            embed = encoder.embed_utterance(preprocessed_wav)
            specs = synthesizer.synthesize_spectrograms([text_input.value], [embed])
            spec = specs[0]
            generated_wav = vocoder.infer_waveform(spec)

            print("üéâ Success! Playing result:")
            display(Audio(generated_wav, rate=synthesizer.sample_rate))

        except Exception as e:
            print(f"‚ùå Error: {e}")

# Run the cloning pipeline whenever the clone button is clicked.
clone_btn.on_click(run_cloning)