<a href="https://colab.research.google.com/github/AlvinSMoyo/2XYDqXDc6wzA716j/blob/main/notebooks/MonReader_Clone_Your_Voice_Flask_API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 🔧 Sanitize notebook metadata so GitHub can preview it
# - Finds the notebook under Drive/Colab Notebooks
# - Removes metadata.widgets or adds an empty state

import nbformat, glob, os, sys

NAME = "MonReader_Clone_Your_Voice_Flask_API.ipynb"

# 1) Mount Drive if not already mounted.
#    Best-effort on purpose: outside Colab (or when the mount is declined) the
#    search below still runs against local paths, but the reason is reported
#    instead of being silently discarded.
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=False)
except Exception as mount_err:
    print(f"⚠️ Drive not mounted ({mount_err}); searching local paths only.")

# 2) Search common locations for the notebook by file name.
#    Globbing for the name itself avoids listing every .ipynb under /content and
#    Drive just to discard all but one; glob.escape keeps any special characters
#    in NAME literal.
name_pattern = glob.escape(NAME)
cands = []
cands += glob.glob(f"/content/{name_pattern}")
cands += glob.glob(f"/content/**/{name_pattern}", recursive=True)
cands += glob.glob(f"/content/drive/MyDrive/**/{name_pattern}", recursive=True)

# First candidate whose basename is exactly NAME (None when nothing matched).
target = next((p for p in cands if os.path.basename(p) == NAME), None)

if not target:
    raise FileNotFoundError(f"Could not find {NAME}. If you saved it elsewhere, set target path manually.")

print("Found notebook:", target)

# 3) Load, sanitize, write the notebook back in place.
nb = nbformat.read(target, as_version=4)
# Safest fix: drop the widgets metadata entirely (no-op when it is absent).
# Alternative would be to keep the key and give it an empty "state" mapping.
nb.get("metadata", {}).pop("widgets", None)

nbformat.write(nb, target)
print("Sanitized and saved:", target)

In [None]:
# ====================================================================
# STEP 1 — Definitive Setup (with Dependency Fixes)
# ====================================================================

# --- 1. Install System Libraries ---
# ffmpeg is invoked by the Step 2 cell to clean prompt clips; portaudio19-dev is
# presumably the native dependency of `sounddevice` installed below — TODO confirm.
print("Installing system dependencies...")
# Use -qq to make the output cleaner
!apt-get update -qq && apt-get install -y -qq portaudio19-dev ffmpeg

# --- 2. Install Python Libraries in a Specific Order ---
print("\nInstalling Python libraries...")

# First, clone and install 'csm' which has strict requirements
# NOTE(review): the clone is not guarded, so re-running this cell presumably just
# reports that ./csm already exists and carries on — confirm if reruns matter.
!git clone https://github.com/SesameAILabs/csm.git
# Step into the clone so the editable install below targets it.
%cd csm
# This will install its required version of huggingface-hub (0.28.1) and torch
!pip install -q -e .
# Return to the parent directory for the remaining installs.
%cd ..

# Next, install gradio WITHOUT its dependencies to avoid the huggingface-hub conflict
!pip install -q --no-deps gradio

# Finally, install the rest of the required libraries
!pip install -q openai groq sounddevice scipy flask flask-cors werkzeug

# --- 3. Authenticate with Hugging Face ---
# The token is read from Colab's userdata store under the key "HF_TOKEN" and
# passed to huggingface_hub.login().
from google.colab import userdata
from huggingface_hub import login
print("\nAuthenticating with Hugging Face...")
HF_TOKEN = userdata.get("HF_TOKEN")
login(token=HF_TOKEN)
print("✅ Hugging Face authentication complete.")

In [None]:
# ============================================================
# STEP 2 — Prepare Voices (Mount Drive + Clean Prompt Clips)
# ============================================================
from google.colab import drive
import os, glob, subprocess, shlex

# --- 2.1 Mount Google Drive ---
drive.mount('/content/drive', force_remount=True)
print("✅ Google Drive mounted at /content/drive")

# --- 2.2 Clean & Normalize Prompt Clips ---
VOICES_DIR = "/content/drive/MyDrive/My_Voice_API_Files/voices"
TARGET_SR  = 24000  # must match your backend SAMPLE_RATE

# ffmpeg audio filter chain applied to every prompt clip: band-limit to
# 80 Hz–11 kHz, trim leading silence, then loudness-normalize.
AUDIO_FILTERS = ",".join([
    "highpass=f=80",
    "lowpass=f=11000",
    "silenceremove=start_periods=1:start_silence=0.4:start_threshold=-40dB",
    "loudnorm=I=-18:TP=-1.0:LRA=11",
])


def _needs_cleaning(src: str, dst: str) -> bool:
    """Return True when `dst` is missing or older than `src`."""
    return not os.path.exists(dst) or os.path.getmtime(src) > os.path.getmtime(dst)


# Sanity check: voices dir exists?
if not os.path.isdir(VOICES_DIR):
    raise FileNotFoundError(
        f"Voices folder not found at: {VOICES_DIR}\n"
        "➡️ Create it like /My_Voice_API_Files/voices/<VoiceName>/<your_prompt>.wav"
    )

# Optional: probe ffmpeg up front (helps debugging audio issues live)
try:
    subprocess.run(["ffmpeg", "-version"], capture_output=True, check=False)
except OSError:
    print("⚠️ ffmpeg not found in PATH. Make sure Step 1 installed it.")

print("🧼 Cleaning & normalizing reference .wav files...")
total_cleaned = 0
voice_dirs = [d for d in sorted(os.listdir(VOICES_DIR)) if os.path.isdir(os.path.join(VOICES_DIR, d))]

if not voice_dirs:
    print("⚠️ No subfolders found in voices directory. Add at least one voice folder.")
else:
    for voice in voice_dirs:
        vdir = os.path.join(VOICES_DIR, voice)
        # glob.escape keeps folder names containing [, ] or * literal.
        wavs = sorted(glob.glob(os.path.join(glob.escape(vdir), "*.wav")))
        # Original WAVs that don't already have an up-to-date _clean version.
        # A clip is re-cleaned only when its source is newer than the clean copy.
        pending = [
            w for w in wavs
            if not w.endswith("_clean.wav") and _needs_cleaning(w, w[:-4] + "_clean.wav")
        ]
        if not pending:
            print(f"☑️ Nothing new to clean in: {vdir}")
            continue

        for wav in pending:
            out = wav[:-4] + "_clean.wav"
            # Argument list (no shell, no re-splitting): file names with quotes
            # or other shell metacharacters are passed through verbatim.
            cmd = [
                "ffmpeg", "-y", "-i", wav,
                "-ac", "1", "-ar", str(TARGET_SR),
                "-af", AUDIO_FILTERS,
                out,
            ]
            # Run, but don't crash the whole cell if one file fails
            try:
                subprocess.run(cmd, check=True, capture_output=True)
                total_cleaned += 1
                print(f"✅ Cleaned → {out}")
            except subprocess.CalledProcessError as e:
                # Drop any partial output so it is not mistaken for a finished
                # clean clip on the next run (or by the backend).
                if os.path.exists(out):
                    os.remove(out)
                print(f"❌ FFmpeg failed for {wav}: {e.stderr.decode('utf-8', errors='ignore')[:300]}")
            except OSError as e:
                # ffmpeg itself could not be started (e.g. not installed).
                print(f"❌ Could not run ffmpeg for {wav}: {e}")

print(f"🎯 Cleaning complete. New files created: {total_cleaned}")
print("ℹ️ Your backend will prefer *_clean.wav files if present.")

In [None]:
# ====================================================================
# STEP 3 — Start the Backend Voice Cloning API
# ====================================================================
import sys, os, re, base64, torch, torchaudio, unicodedata
from pathlib import Path
from threading import Thread
from flask import Flask, request, jsonify
from flask_cors import CORS
from werkzeug.serving import run_simple

# --- CSM imports ---
sys.path.append('/content/csm')
from generator import load_csm_1b, Segment

# -----------------------------
# Config & Constants
# -----------------------------
# Sample rate (Hz) that prompt audio is resampled to and that output is saved at.
SAMPLE_RATE = 24000
# Root folder on Drive holding one sub-folder of prompt clips per voice.
VOICES_DIR  = Path("/content/drive/MyDrive") / "My_Voice_API_Files" / "voices"

# Transcript of each voice's prompt clip, keyed by the voice folder name.
VOICE_TRANSCRIPTS = dict(
    MyVoice="The quick brown fox jumps over the lazy dog; that is a fact. Should we chase those azure clouds and judge their graceful, quiet movement? For my voice to be cloned with vision and expertise, I must speak this very sentence. My name is Alvin Moyo?",
    MyVoice2="What does it take to build an AI that truly understands a resume? I am Alvin Moyo; in this video I will walk you through the journey of how I built and evolved a resume ranking pipeline. Starting traditional machine learning and pushing towards something smarter.",
    SemihVoice="My name is Semih; I am the Director of AI at Apziva. We are very pleased to have you with us today. Today we have Chris Turner with us. Chris is an AI expert and, also an AI resident at Apziva. A background in genetics and biology with years of experience in",
)

# -----------------------------
# Load model
# -----------------------------
# Default to CPU and switch to the GPU only when torch reports CUDA is available.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"
print(f"✅ Using device: {DEVICE}")

# Build the CSM generator once; the request handlers reuse this instance.
print("🧠 Loading CSM model...")
generator = load_csm_1b(device=DEVICE)
print("✅ Model loaded.")

# -----------------------------
# Build voice anchors (prefer *_clean.wav; ensure mono + 24k)
# -----------------------------
if not VOICES_DIR.exists():
    raise FileNotFoundError(f"Voices directory not found: {VOICES_DIR}")

# One Segment per voice folder, keyed by folder name.
VOICE_PROMPTS = {}
print("🔊 Loading voice prompts...")
voice_folders = [entry for entry in VOICES_DIR.iterdir() if entry.is_dir()]
voice_folders.sort()
for voice_folder in voice_folders:
    voice_name = voice_folder.name
    try:
        wav_files = sorted(voice_folder.glob("*.wav"))
        if not wav_files:
            print(f"⚠️ No .wav files in '{voice_name}' — skipping.")
            continue

        # Prefer the first cleaned clip; otherwise fall back to the first raw one
        # (with no cleaned clips present, every entry in wav_files is raw).
        cleaned_files = [w for w in wav_files if w.name.endswith("_clean.wav")]
        prompt_file = cleaned_files[0] if cleaned_files else wav_files[0]

        waveform, native_sr = torchaudio.load(prompt_file)
        # Down-mix multi-channel audio to mono by averaging the channels.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        # Bring the clip to the rate the rest of the pipeline works at.
        if native_sr != SAMPLE_RATE:
            waveform = torchaudio.functional.resample(
                waveform, orig_freq=native_sr, new_freq=SAMPLE_RATE
            )

        VOICE_PROMPTS[voice_name] = Segment(
            speaker=0,
            text=VOICE_TRANSCRIPTS.get(voice_name, "A generic prompt."),
            audio=waveform.squeeze(0).contiguous(),
        )
        print(f"✅ Loaded '{voice_name}' → {prompt_file.name}")
    except Exception as e:
        # One bad voice folder must not stop the others from loading.
        print(f"❌ Error loading voice '{voice_name}': {e}")

# Warmup (optional): run one short generation with the first loaded voice.
if not VOICE_PROMPTS:
    print("⚠️ No voices loaded. /generate will return 503 until voices are available.")
else:
    warmup_anchor = next(iter(VOICE_PROMPTS.values()))
    with torch.inference_mode():
        generator.generate(text="Hello.", speaker=0, context=[warmup_anchor],
                           max_audio_length_ms=1500, temperature=0.7)

# -----------------------------
# Helpers
# -----------------------------
def split_into_sentences(text: str):
    """Split `text` into trimmed, non-empty sentences.

    A boundary is any run of whitespace that directly follows '.', '!' or '?'.
    The terminating punctuation stays attached to its sentence.
    """
    fragments = re.split(r'(?<=[.!?])\s+', text.strip())
    sentences = []
    for fragment in fragments:
        trimmed = fragment.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences

@torch.inference_mode()
def generate_long_form_speech(text: str, anchor: Segment, wps: float, temp: float):
    """Synthesize `text` sentence by sentence and return one float32 waveform.

    Args:
        text: Text to speak; NFKC-normalized and stripped first. Only the first
            20 sentences are synthesized.
        anchor: Voice prompt passed to the generator as the sole context segment.
        wps: Words per second used to budget audio length; limited to [1.2, 3.0].
        temp: Sampling temperature; limited to [0.2, 1.0].

    Returns:
        A 1-D tensor on the CPU. When there is nothing to say this is half a
        second of silence; otherwise the sentences joined by 0.25 s pauses,
        peak-normalized, scaled by 1.2 and clamped to [-1, 1].
    """
    cleaned = unicodedata.normalize("NFKC", text or "").strip()
    # Demo safety: never synthesize more than the first 20 sentences.
    sentences = split_into_sentences(cleaned)[:20]
    if not sentences:
        return torch.zeros(SAMPLE_RATE // 2, dtype=torch.float32)

    wps  = float(max(1.2, min(wps, 3.0)))
    temp = float(max(0.2, min(temp, 1.0)))

    gap = torch.zeros(int(0.25 * SAMPLE_RATE), dtype=torch.float32)
    pieces = []
    for index, sentence in enumerate(sentences):
        # Pauses go between sentences only, never after the last one.
        if index > 0:
            pieces.append(gap)
        # Length budget: expected speaking time plus 20% headroom, at least 3.5 s.
        word_count = len(sentence.split())
        budget_ms = int(max(3500, (word_count / wps) * 1000 * 1.2))
        rendered = generator.generate(text=sentence, speaker=0, context=[anchor],
                                      max_audio_length_ms=budget_ms, temperature=temp)
        pieces.append(rendered.to(dtype=torch.float32).contiguous().cpu())

    waveform = torch.cat(pieces)
    loudest = waveform.abs().max().item()
    # Skip normalization for (near-)silent audio to avoid dividing by ~0.
    if loudest > 1e-6:
        waveform = waveform / loudest
    # Gentle lift: after normalization this clamps samples above ~0.83 to ±1.0.
    return torch.clamp(waveform * 1.2, -1.0, 1.0)

# -----------------------------
# Flask API
# -----------------------------
# Flask application object; the @app.route handlers in this cell register on it.
app = Flask(__name__)
# Enable CORS handling through flask-cors; no options are passed, so the
# extension's default policy applies to the whole app.
CORS(app)

@app.route('/health', methods=['GET'])
def health():
    """Report server status: compute device, loaded voice names, sample rate."""
    status_report = {
        "status": "ok",
        "device": DEVICE,
        "voices": sorted(VOICE_PROMPTS),
        "sample_rate": SAMPLE_RATE,
    }
    return jsonify(status_report)

@app.route('/generate', methods=['POST'])
def generate_endpoint():
    """Synthesize speech for a JSON request and return it as base64 WAV.

    Request JSON fields:
        text (str, required): What to speak; at most 600 characters.
        voice (str, optional): Key into VOICE_PROMPTS; defaults to "MyVoice".
        words_per_sec (number, optional): Speed hint; defaults to 2.0.
        temperature (number, optional): Sampling temperature; defaults to 0.75.

    Responses:
        200: {"audio_data": <base64 16-bit PCM WAV>, "sample_rate", "mime"}.
        400: missing/overlong text, unknown voice, or non-numeric tuning values.
        503: no voices are loaded on the server.
    """
    import tempfile  # local import so this handler stays self-contained

    if not VOICE_PROMPTS:
        return jsonify({"error": "No voices available on server. Please add prompts and reload."}), 503

    data = request.get_json(silent=True)
    # A JSON body that is not an object (list, string, number) carries no fields.
    if not isinstance(data, dict):
        data = {}

    raw_text = data.get("text")
    # Non-string "text" is treated as missing instead of crashing on .strip().
    text = raw_text.strip() if isinstance(raw_text, str) else ""
    if not text:
        return jsonify({"error": "Missing 'text'"}), 400
    if len(text) > 600:
        return jsonify({"error": "Text too long for demo; please shorten to ≤600 chars."}), 400

    voice_name = data.get("voice", "MyVoice")
    # The isinstance guard keeps unhashable JSON values (lists/objects) from
    # raising inside the dict membership test.
    if not isinstance(voice_name, str) or voice_name not in VOICE_PROMPTS:
        return jsonify({"error": f"Voice '{voice_name}' not found."}), 400

    # Bad tuning values are a client error, not a server crash.
    try:
        wps  = float(data.get("words_per_sec", 2.0))
        temp = float(data.get("temperature", 0.75))
    except (TypeError, ValueError):
        return jsonify({"error": "'words_per_sec' and 'temperature' must be numbers."}), 400

    audio = generate_long_form_speech(text, VOICE_PROMPTS[voice_name], wps, temp)
    audio = audio.unsqueeze(0)  # (1, T)

    # The server handles requests on multiple threads, so each request encodes
    # into its own temporary directory; a shared fixed file name could be
    # overwritten or deleted by a concurrent request. The directory is removed
    # even if saving fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        wav_path = os.path.join(tmp_dir, "output.wav")
        torchaudio.save(wav_path, audio, SAMPLE_RATE, encoding="PCM_S", bits_per_sample=16)
        with open(wav_path, "rb") as f:
            encoded = base64.b64encode(f.read()).decode("utf-8")

    return jsonify({"audio_data": encoded, "sample_rate": SAMPLE_RATE, "mime": "audio/wav"})

# -----------------------------
# Run server (threaded for notebooks)
# -----------------------------
def run_server():
    """Serve the Flask app on all interfaces, port 8000, with threaded requests."""
    run_simple(hostname="0.0.0.0", port=8000, application=app, threaded=True)


# Daemon thread: the server runs in the background and does not block the cell.
server_thread = Thread(target=run_server, daemon=True)
server_thread.start()
print("🚀 Backend API Server is now running on :8000")

In [None]:
# ====================================================================
# STEP 4 — Launch the Gradio Conversational App (GPT Brain & Advanced Tuning)
# ====================================================================
import os, base64, requests, gradio as gr
import openai

# --- 1) OpenAI client (env first, Colab Secrets fallback) ---
# Environment variable wins; Colab Secrets is only consulted when it is unset.
api_key = os.environ.get("OPENAI_API_KEY")
secret_lookup_error = None
if not api_key:
    try:
        from google.colab import userdata  # only works in Colab
        api_key = userdata.get("OPENAI_API_KEY")
    except Exception as err:
        # Best-effort fallback: remember why it failed (not in Colab, secret
        # missing, access not granted, ...) so the error below can show it.
        api_key = None
        secret_lookup_error = err
if not api_key:
    # Chain the underlying cause (if any) so the traceback explains the failure.
    raise RuntimeError(
        "OPENAI_API_KEY not found. Set it in environment or Colab Secrets."
    ) from secret_lookup_error
openai_client = openai.OpenAI(api_key=api_key)

# --- 2) Config ---
# Base address of the local Flask backend started in Step 3.
_API_BASE = "http://127.0.0.1:8000"
VOICE_CLONE_URL = f"{_API_BASE}/generate"
HEALTH_URL      = f"{_API_BASE}/health"

# OpenAI model names: speech-to-text (primary, then fallback) and the chat model.
ASR_PRIMARY, ASR_FALLBACK = "gpt-4o-mini-transcribe", "whisper-1"
GPT_MODEL = "gpt-4o"

# --- 3) Resolve voice list (prefer in-notebook VOICE_PROMPTS; else /health) ---
# Use the in-notebook VOICE_PROMPTS dict when it exists and is non-empty;
# otherwise ask the backend's /health endpoint for its voice list.
_in_notebook_prompts = globals().get("VOICE_PROMPTS")
if isinstance(_in_notebook_prompts, dict) and _in_notebook_prompts:
    VOICE_CHOICES = sorted(_in_notebook_prompts)
else:
    VOICE_CHOICES = []
    try:
        resp = requests.get(HEALTH_URL, timeout=5)
        if resp.ok:
            VOICE_CHOICES = resp.json().get("voices", [])
    except Exception:
        # Backend unreachable or bad payload: fall through to the placeholder.
        VOICE_CHOICES = []

# Fallback placeholder so the dropdowns always have at least one entry.
VOICE_CHOICES = VOICE_CHOICES or ["MyVoice"]
DEFAULT_VOICE = VOICE_CHOICES[0]

# --- 4) Core helpers ---
def transcribe_audio(mic_path: str) -> str:
    """Transcribe microphone audio with 4o-mini-transcribe; fallback to whisper-1.

    Args:
        mic_path: Path of the recorded audio file; falsy means nothing recorded.

    Returns:
        The stripped transcript, or "" when there is no recording or both
        transcription attempts fail. Errors are printed, never raised.
    """
    if not mic_path:
        return ""

    def _as_text(result) -> str:
        # Only a *missing* `text` attribute falls back to str(result). An empty
        # transcript (silence) must stay empty rather than turning into the
        # response object's string form and being treated as speech.
        text = getattr(result, "text", None)
        return (str(result) if text is None else text).strip()

    try:
        with open(mic_path, "rb") as f:
            r = openai_client.audio.transcriptions.create(model=ASR_PRIMARY, file=f)
        return _as_text(r)
    except Exception as e:
        print(f"[ASR primary failed: {e}] Falling back to {ASR_FALLBACK}...")

    try:
        with open(mic_path, "rb") as f:
            # response_format="text" is requested, so the result is stringified directly.
            r = openai_client.audio.transcriptions.create(model=ASR_FALLBACK, file=f, response_format="text")
        return str(r).strip()
    except Exception as e2:
        print(f"[ASR fallback failed: {e2}]")
        return ""

def get_gpt_response(user_text: str) -> str:
    """Ask the chat model for a short reply to `user_text`.

    Returns "" for empty input, the stripped reply on success, and a fixed
    apology string when the request or response handling raises.
    """
    if not user_text:
        return ""

    conversation = [
        {"role": "system", "content": "You are a helpful, concise assistant. Keep replies to 1–2 short sentences."},
        {"role": "user", "content": user_text},
    ]
    try:
        completion = openai_client.chat.completions.create(
            model=GPT_MODEL,
            messages=conversation,
            temperature=0.5,
            max_tokens=120,
        )
        reply = completion.choices[0].message.content
        return reply.strip()
    except Exception as e:
        print(f"[GPT error] {e}")
        return "Sorry, I hit an error generating a reply."

def tts_request(text: str, voice: str, wps: float, temp: float) -> tuple[str|None, str]:
    """Call your local Flask TTS API and return (audio_path, message)."""
    if not text or not text.strip():
        return None, "Please provide some text."
    payload = {"text": text.strip(), "voice": voice, "words_per_sec": float(wps), "temperature": float(temp)}
    try:
        resp = requests.post(VOICE_CLONE_URL, json=payload, timeout=(9, 60))
        if resp.status_code != 200:
            return None, f"TTS error: HTTP {resp.status_code} — {resp.text}"
        data = resp.json()
        b = base64.b64decode(data["audio_data"])
        out_path = "ai_voice.wav"
        with open(out_path, "wb") as f:
            f.write(b)
        return out_path, text.strip()
    except requests.Timeout:
        return None, "TTS timeout: generation took too long."
    except Exception as e:
        return None, f"TTS exception: {e}"

# --- 5) Gradio handlers ---
def talk_to_brain(mic, voice, wps, temp):
    """Mic → Transcribe → GPT → TTS.

    Returns (audio_path, text): the spoken reply and its text on success, the
    TTS error message when synthesis fails, or a retry hint when nothing was
    transcribed.
    """
    heard = transcribe_audio(mic)
    if heard:
        answer = get_gpt_response(heard)
        audio_path, status = tts_request(answer, voice, wps, temp)
        # Show the reply text when audio exists, otherwise the TTS error.
        if audio_path:
            return audio_path, answer
        return audio_path, status
    return None, "I couldn't hear that—please try again."

def type_to_voice(text, voice, wps, temp):
    """Type → TTS: speak exactly the typed text; returns (audio_path, message)."""
    audio_path, message = tts_request(text, voice, wps, temp)
    outcome = (audio_path, message)
    return outcome

# --- 6) UI: two tabs, one app ---
def _voice_controls():
    """Build the voice / speed / temperature row used by both tabs."""
    with gr.Row():
        voice = gr.Dropdown(VOICE_CHOICES, value=DEFAULT_VOICE, label="Choose AI Voice")
        wps   = gr.Slider(1.5, 3.0, value=2.0, step=0.1, label="Words Per Second (Speed)")
        temp  = gr.Slider(0.2, 1.0, value=0.75, step=0.05, label="Temperature (Creativity)")
    return voice, wps, temp


# The Blocks object is named `demo` rather than `app` so it does not rebind the
# notebook-global `app` that holds the Flask application from the Step 3 cell.
with gr.Blocks(title="Real-Time Conversational Voice Cloning") as demo:
    gr.Markdown("### Real-Time Conversational Voice Cloning\nSpeak or type, pick a voice, and hear the AI reply.")

    with gr.Tabs():
        # Tab 1: Talk to Me (mic → transcript → GPT reply → cloned voice)
        with gr.Tab("🎤 Talk to Me"):
            mic = gr.Audio(sources=["microphone"], type="filepath", label="Hold to record, then release")
            voice1, wps1, temp1 = _voice_controls()
            out_audio1 = gr.Audio(label="AI Response (Audio)")
            out_text1  = gr.Textbox(label="AI Reply (Text)", lines=3)
            gr.Button("Respond").click(talk_to_brain, [mic, voice1, wps1, temp1], [out_audio1, out_text1])

        # Tab 2: Type to Speak (typed text → cloned voice, no GPT step)
        with gr.Tab("⌨️ Type to Speak"):
            txt = gr.Textbox(label="Type something for the AI to speak", lines=3,
                             placeholder="e.g., Welcome to my workshop!")
            voice2, wps2, temp2 = _voice_controls()
            out_audio2 = gr.Audio(label="Synthesized Audio")
            out_text2  = gr.Textbox(label="(Echo) Text Sent", lines=2)
            gr.Button("Speak").click(type_to_voice, [txt, voice2, wps2, temp2], [out_audio2, out_text2])

    gr.Markdown("> Tip: If first audio is slow, that's model warmup. Subsequent runs are faster.")

# share=True requests a public link; debug=True is passed through to Gradio unchanged.
demo.launch(share=True, debug=True)

=================================================================================================================


---