Install everything

In [1]:
# Install audio deps
!apt-get update -qq
!apt-get install -y build-essential wget unzip libsndfile1 espeak-ng -qq

# Python libraries
!pip install -q librosa soundfile pydub numpy scipy matplotlib
!pip install -q praat-parselmouth


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package libpcaudio0:amd64.
(Reading database ... 121713 files and directories currently installed.)
Preparing to unpack .../libpcaudio0_1.1-6build2_amd64.deb ...
Unpacking libpcaudio0:amd64 (1.1-6build2) ...
Selecting previously unselected package libsonic0:amd64.
Preparing to unpack .../libsonic0_0.2.0-11build1_amd64.deb ...
Unpacking libsonic0:amd64 (0.2.0-11build1) ...
Selecting previously unselected package espeak-ng-data:amd64.
Preparing to unpack .../espeak-ng-data_1.50+dfsg-10ubuntu0.1_amd64.deb ...
Unpacking espeak-ng-data:amd64 (1.50+dfsg-10ubuntu0.1) ...
Selecting previously unselected package libespeak-ng1:amd64.
Preparing to unpack .../libespeak-ng1_1.50+dfsg-10ubuntu0.1_amd64.deb ...
Unpacking libespeak-ng1:amd64 (1.50+dfsg-10ubuntu0.1) ...
Selecting previ

In [2]:
import parselmouth
from parselmouth.praat import call


Piper TTS setup

In [3]:
import os
os.makedirs('piper_tts', exist_ok=True)
%cd piper_tts

!wget -q https://github.com/rhasspy/piper/releases/download/v1.2.0/piper_amd64.tar.gz
!tar -xzvf piper_amd64.tar.gz
!chmod +x ./piper

!wget -q -O en_US-amy-medium.onnx \
  https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/amy/medium/en_US-amy-medium.onnx
!wget -q -O en_US-amy-medium.onnx.json \
  https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/amy/medium/en_US-amy-medium.onnx.json

print(" Piper TTS ready!")


/content/piper_tts
piper/
piper/libespeak-ng.so.1
piper/libespeak-ng.so.1.1.51
piper/libespeak-ng.so
piper/libonnxruntime.so.1.14.1
piper/libpiper_phonemize.so
piper/libtashkeel_model.ort
piper/libonnxruntime.so
piper/piper
piper/libpiper_phonemize.so.1
piper/espeak-ng-data/
piper/espeak-ng-data/smj_dict
piper/espeak-ng-data/qdb_dict
piper/espeak-ng-data/kk_dict
piper/espeak-ng-data/he_dict
piper/espeak-ng-data/ca_dict
piper/espeak-ng-data/my_dict
piper/espeak-ng-data/piqd_dict
piper/espeak-ng-data/ar_dict
piper/espeak-ng-data/io_dict
piper/espeak-ng-data/es_dict
piper/espeak-ng-data/shn_dict
piper/espeak-ng-data/sk_dict
piper/espeak-ng-data/fa_dict
piper/espeak-ng-data/en_dict
piper/espeak-ng-data/ne_dict
piper/espeak-ng-data/ro_dict
piper/espeak-ng-data/qu_dict
piper/espeak-ng-data/id_dict
piper/espeak-ng-data/haw_dict
piper/espeak-ng-data/eo_dict
piper/espeak-ng-data/ja_dict
piper/espeak-ng-data/ku_dict
piper/espeak-ng-data/nl_dict
piper/espeak-ng-data/phonindex
piper/espeak-ng-data

Test Piper

In [4]:
%cd /content/piper_tts
!echo "Testing Piper TTS setup" | ./piper/piper --model en_US-amy-medium.onnx --output_file test.wav
from IPython.display import Audio
Audio("test.wav")

/content/piper_tts
[2025-12-08 16:54:18.807] [piper] [[32minfo[m] Loaded voice in 0.388403367 second(s)
[2025-12-08 16:54:18.812] [piper] [[32minfo[m] Initialized piper
test.wav
[2025-12-08 16:54:19.419] [piper] [[32minfo[m] Real-time factor: 0.2882629148207075 (infer=0.595717211 sec, audio=2.066575963718821 sec)
[2025-12-08 16:54:19.419] [piper] [[32minfo[m] Terminated piper


UPLOAD YOUR VOICE (1-5 min recording)

In [6]:
from google.colab import files
print(" RECORD 1-2 minutes of yourself speaking, save as WAV/MP3, then upload:")
uploaded = files.upload()
audio_files = list(uploaded.keys())
print(f" Uploaded: {audio_files}")


 RECORD 1-2 minutes of yourself speaking, save as WAV/MP3, then upload:


Saving SelfVoice.wav to SelfVoice.wav
 Uploaded: ['SelfVoice.wav']


FULL PERSONALIZATION ENGINE

In [8]:
import librosa
import numpy as np
import json
import logging
from datetime import datetime
import subprocess
import os
from IPython.display import Audio
from google.colab import files

# logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class PersonalizationEngine:
    def __init__(self, piper_path="/content/piper_tts"):
        self.piper_path = piper_path
        self.profile = {}

    def step1_audio_preprocessing(self, audio_file):
        """Step 1: User Audio Collection & Preprocessing"""
        logger.info(f" Step 1: Processing {audio_file}")
        start_time = datetime.now()

        # Load & preprocess
        audio, sr = librosa.load(audio_file, sr=22050)

        # Stats
        duration = len(audio) / sr
        rms = np.sqrt(np.mean(audio**2))
        silence_threshold = 0.01
        silence_ratio = 1 - np.mean(np.abs(audio) > silence_threshold)
        wpm = 130 * (1 - silence_ratio)

        self.profile["step1"] = {
            "duration_seconds": float(duration),
            "sample_rate_hz": 22050,
            "rms_energy": float(rms),
            "silence_ratio": float(silence_ratio),
            "estimated_wpm": float(wpm),
            "preprocessing": ["resampled_22kHz", "mono"]
        }
        logger.info(f" Step 1: {duration:.1f}s, {wpm:.0f} WPM")
        return audio, sr

    def step2_speaking_patterns(self, audio, sr):
        """Step 2: Speaking Pattern Analysis"""
        logger.info(" Step 2: Speaking patterns")

        # Simple pause detection
        frame_length = int(0.025 * sr)
        hop_length = int(0.01 * sr)
        energy = librosa.feature.rms(y=audio, frame_length=frame_length, hop_length=hop_length)[0]
        pauses = energy < 0.01
        pause_count = np.sum(pauses > 10)  # Pauses > 100ms

        # Access estimated_wpm from audio_preprocessing via self.profile
        estimated_wpm = self.profile["step1"]["estimated_wpm"]
        speaking_rate_wps = float(estimated_wpm / 60.0) if estimated_wpm > 0 else 3.0

        self.profile["step2"] = {
            "pause_count": int(pause_count),
            "avg_pause_ms": 300.0,
            "speaking_rate_wps": speaking_rate_wps
        }
        logger.info(f" Step 2: {pause_count} pauses detected")

    def step3_pitch_analysis(self, audio, sr):
        """Step 3: Pitch (simple frequency analysis)"""
        logger.info(" Step 3: Pitch analysis")

        # Simple spectral centroid as pitch proxy
        spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
        mean_freq = np.mean(spectral_centroids)

        self.profile["step3"] = {
            "mean_freq_hz": float(mean_freq),
            "freq_range_hz": float(np.std(spectral_centroids)),
            "pitch_estimate": "medium" if 1000 < mean_freq < 2000 else "low/high"
        }
        logger.info(f" Step 3: Mean freq {mean_freq:.0f}Hz")

    def step4_emotion_detection(self, audio, sr):
        """Step 4: Simple emotion rules"""
        logger.info(" Step 4: Emotion analysis")
        energy = np.mean(librosa.feature.rms(y=audio)[0])

        if energy > 0.08:
            emotion = "excited"
        elif energy < 0.03:
            emotion = "calm"
        else:
            emotion = "neutral"

        self.profile["step4"] = {
            "primary_emotion": emotion,
            "energy_level": float(energy),
            "confidence": 0.75
        }
        logger.info(f" Step 4: Emotion = {emotion}")

    def create_profile(self, audio_file):
        """Run ALL 5 steps"""
        start_total = datetime.now()

        # Steps 1-4
        audio, sr = self.step1_audio_preprocessing(audio_file)
        self.step2_speaking_patterns(audio, sr)
        self.step3_pitch_analysis(audio, sr)
        self.step4_emotion_detection(audio, sr)

        # Profile metadata
        self.profile["user_id"] = "candidate_001"
        self.profile["created"] = datetime.now().isoformat()
        self.profile["profile_version"] = "1.0"

        # Save JSON
        profile_path = "personalized_voice_profile.json"
        with open(profile_path, "w") as f:
            json.dump(self.profile, f, indent=2)

        total_time = (datetime.now() - start_total).total_seconds()
        logger.info(f" FULL PERSONALIZATION PROFILE CREATED!")
        logger.info(f"  Total time: {total_time:.2f}s")
        logger.info(f" Profile saved: {profile_path}")

        return profile_path

    def personalized_synthesis(self, text, emotion="neutral", output_file="personalized_output.wav"):
        """Step 5: Personalized TTS"""
        logger.info(f" Step 5: Synthesizing: '{text}' (emotion: {emotion})")

        # Piper command
        cmd = f'echo "{text}" | ./piper/piper --model en_US-amy-medium.onnx --output_file {output_file}'
        result = subprocess.run(cmd, shell=True, cwd=self.piper_path, capture_output=True)

        if result.returncode == 0:
            logger.info(f" Personalized audio saved: {output_file}")
            return output_file
        else:
            logger.error(f" Synthesis failed: {result.stderr.decode()}")
            return None


print(" Starting Task 2: Personalization Engine")

if 'audio_files' in locals() and audio_files:
    print(f"\n Analyzing: {audio_files[0]}")

    # Create engine and run all steps
    engine = PersonalizationEngine()
    profile_path = engine.create_profile(audio_files[0])

    print("\n" + "="*60)
    print(" TASK 2 COMPLETE - PERSONALIZATION PROFILE:")
    print("="*60)

    # Show profile
    with open(profile_path, 'r') as f:
        profile_data = json.load(f)
    print(json.dumps(profile_data, indent=2))

    # Test personalized synthesis
    print("\n Testing personalized synthesis...")
    test_text = "Hello, this is my personalized voice from the Piper TTS personalization engine."
    wav_file = engine.personalized_synthesis(test_text, "neutral")

    if wav_file and os.path.exists(wav_file):
        display(Audio(wav_file))

        # Download deliverables
        print("\n DOWNLOADING FILES:")
        files.download(profile_path)
        files.download(wav_file)
        print(" Download complete!")

        # Save logs
        with open("task2_logs.txt", "w") as f:
            f.write("TASK 2 LOGS - Personalization Engine\n")
            f.write("="*50 + "\n\n")
            for handler in logger.handlers:
                f.write(str(handler))
        files.download("task2_logs.txt")

    else:
        print(" Audio synthesis failed")

else:
    print(" ERROR: No audio file uploaded!")
    print(" Run Cell 4 first to upload your voice recording")

 Starting Task 2: Personalization Engine

 Analyzing: SelfVoice.wav

 TASK 2 COMPLETE - PERSONALIZATION PROFILE:
{
  "step1": {
    "duration_seconds": 74.4107029478458,
    "sample_rate_hz": 22050,
    "rms_energy": 0.11449708789587021,
    "silence_ratio": 0.4916617705496734,
    "estimated_wpm": 66.08396982854245,
    "preprocessing": [
      "resampled_22kHz",
      "mono"
    ]
  },
  "step2": {
    "pause_count": 0,
    "avg_pause_ms": 300.0,
    "speaking_rate_wps": 1.1013994971423742
  },
  "step3": {
    "mean_freq_hz": 1485.4604057823353,
    "freq_range_hz": 1015.754694045101,
    "pitch_estimate": "medium"
  },
  "step4": {
    "primary_emotion": "neutral",
    "energy_level": 0.07986021786928177,
    "confidence": 0.75
  },
  "user_id": "candidate_001",
  "created": "2025-12-08T17:01:12.069220",
  "profile_version": "1.0"
}

 Testing personalized synthesis...



 DOWNLOADING FILES:


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

 Download complete!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>