In [None]:
# New Code to run in VSCode
# Step 1: Remove old SadTalker if it exists
import os
import shutil

if os.path.exists("SadTalker"):
    print("Removing existing SadTalker folder...")
    shutil.rmtree("SadTalker")

# Step 2: Clone SadTalker repo
print("Cloning SadTalker repo...")
!git clone https://github.com/OpenTalker/SadTalker.git

# Step 3: Move into SadTalker folder
print("Moving into SadTalker directory...")
%cd SadTalker

# Step 4: Modify requirements.txt using Python (since 'sed' doesn't work on Windows)
print("Modifying requirements.txt...")
with open("requirements.txt", "r") as f:
    lines = f.readlines()
with open("requirements.txt", "w") as f:
    for line in lines:
        if not any(x in line for x in ["torch==1.12.1", "torchvision==0.13.1", "torchaudio==0.12.1"]):
            f.write(line)

# Step 5: Install SadTalker requirements
print("Installing SadTalker requirements...")
%pip install -r requirements.txt

# Step 6: Install Coqui TTS (for voice synthesis)
print("Installing TTS package...")
%pip install TTS

# Step 7: Install video/audio processing tools (skip apt-get)
print("Installing video/audio processing tools...")
%pip install ffmpeg-python imageio imageio-ffmpeg
%pip install --upgrade imageio

# Step 8: Run model download script using Git Bash-compatible shell
print("Attempting to run download_models.sh (requires bash)...")
!bash scripts/download_models.sh || echo "⚠️ Run this manually in Git Bash if it fails on Windows!"

# Step 9: Go back to root if needed
%cd ..

# Step 10: Verify imports
print("Verifying imports...")
import numpy as np
import torch
import torchvision
import TTS

print("✅ All core libraries imported successfully!")

# Optional: Show installed versions
print("\nInstalled Package Versions:")
%pip show torch torchvision torchaudio numpy TTS basicsr facexlib gfpgan

In [3]:
import fileinput
import sys

file_path = './SadTalker/src/face3d/util/my_awing_arch.py'
old_string = 'preds = preds.astype(np.float, copy=False)'
new_string = 'preds = preds.astype(float, copy=False)'  # Use built-in float

# Rewrite the file in place, swapping the deprecated np.float for float.
# With inplace=True, everything written to stdout inside the loop goes to the
# file; the context manager restores stdout and closes the file even on error.
replacements = 0
with fileinput.FileInput(file_path, inplace=True) as patched_file:
    for line in patched_file:
        if old_string in line:
            line = line.replace(old_string, new_string)
            replacements += 1
        sys.stdout.write(line)

# Only claim success when a line was actually changed.
if replacements:
    print(f"✅ Fixed np.float issue in: {file_path}")
else:
    print(f"ℹ️ Nothing to fix (np.float line not found) in: {file_path}")

✅ Fixed np.float issue in: ./SadTalker/src/face3d/util/my_awing_arch.py


In [4]:
import sys
import os
import numpy as np

file_path = './SadTalker/src/face3d/util/preprocess.py'
debug_string_identifier = "DEBUG: align_img vars -"
# Statement to rewrite and its replacement (both without leading indentation).
# NOTE(review): the fix indexes t[0]/t[1] one level deeper, presumably because
# they are one-element arrays — confirm against preprocess.py.
broken_assignment = 'trans_params = np.array([w0, h0, s, t[0], t[1]])'
fixed_assignment = 'trans_params = np.array([w0, h0, s, t[0][0], t[1][0]])  # Fixed shape error'


def patch_preprocess_lines(lines):
    """Return a patched copy of the lines of preprocess.py.

    - Prepends `import sys` / `import numpy as np` when no line mentions them.
    - Drops every line containing the debug marker.
    - Replaces the line holding the broken `trans_params` assignment with the
      fixed one, keeping that line's own leading whitespace so the patch is
      valid at any nesting depth.

    Args:
        lines: list of source lines, each including its trailing newline.

    Returns:
        A new list of lines; the input list is not modified.
    """
    updated_lines = []

    # Add imports if missing
    if not any("import sys" in line for line in lines):
        updated_lines.append("import sys\n")
    if not any("import numpy" in line for line in lines):
        updated_lines.append("import numpy as np\n")

    for line in lines:
        if debug_string_identifier in line:
            continue  # Remove debug line
        if broken_assignment in line:
            indent = line[:len(line) - len(line.lstrip())]
            updated_lines.append(f"{indent}{fixed_assignment}\n")
        else:
            updated_lines.append(line)

    return updated_lines


try:
    # Read the current content
    with open(file_path, 'r') as f:
        lines = f.readlines()

    updated_lines = patch_preprocess_lines(lines)

    # Write back
    with open(file_path, 'w') as f:
        f.writelines(updated_lines)

    print(f"✅ Patch applied to '{file_path}' successfully!")

except FileNotFoundError:
    print(f"❌ File not found at {file_path}. Make sure SadTalker is cloned properly.")
except Exception as e:
    print(f"⚠️ Error occurred during patching: {e}")


✅ Patch applied to './SadTalker/src/face3d/util/preprocess.py' successfully!


In [5]:
import fileinput
import importlib.util
import sys
import os

problematic_import = 'from torchvision.transforms.functional_tensor import rgb_to_grayscale'
# The module still needs rgb_to_grayscale, so the import is redirected rather
# than deleted.
replacement_import = 'from torchvision.transforms.functional import rgb_to_grayscale'


def rewrite_import_line(line):
    """Return `line` with the broken torchvision import replaced by the working one.

    Lines that do not contain the broken import are returned unchanged.
    """
    return line.replace(problematic_import, replacement_import)


# Locate basicsr's install directory WITHOUT importing it.
# NOTE(review): `import basicsr` may itself fail on the import being patched
# here, so only the package spec is looked up — confirm for your basicsr version.
basicsr_spec = importlib.util.find_spec('basicsr')
if basicsr_spec is None or basicsr_spec.origin is None:
    basicsr_dir = None
    degradations_path = None
else:
    basicsr_dir = os.path.dirname(basicsr_spec.origin)
    degradations_path = os.path.join(basicsr_dir, 'data', 'degradations.py')

if degradations_path is None:
    print("❌ basicsr is not installed in this environment — install the SadTalker requirements first.")
else:
    print(f"🚧 Attempting to patch '{degradations_path}' to replace the broken import...")

    try:
        if not os.path.exists(degradations_path):
            print(f"❌ File not found: {degradations_path} — basicsr might be broken or incomplete.")
        else:
            patched_lines = 0
            # The context manager restores stdout before any message below is
            # printed, so an error can never be written into the module file.
            with fileinput.FileInput(degradations_path, inplace=True) as module_file:
                for line in module_file:
                    if problematic_import in line:
                        print(f"⚠️ Replaced line: {line.strip()}", file=sys.stderr)
                        line = rewrite_import_line(line)
                        patched_lines += 1
                    sys.stdout.write(line)

            if patched_lines:
                print(f"✅ Successfully patched '{degradations_path}' and replaced the import.")
            else:
                print(f"ℹ️ Broken import not found in '{degradations_path}' — nothing to patch.")

    except Exception as e:
        print(f"🔥 Error during patching: {e}")


🚧 Attempting to patch 'c:\Users\DEEPESH KUMAR K\Documents\UP_TO_SKILLS\avatar\venv\lib\site-packages\basicsr\data\degradations.py' to remove the broken import...
✅ Successfully patched 'c:\Users\DEEPESH KUMAR K\Documents\UP_TO_SKILLS\avatar\venv\lib\site-packages\basicsr\data\degradations.py' and removed the import.


In [7]:
from tkinter import Tk, filedialog
import os

# A Tk root window must exist for the file dialog; it is kept invisible.
root = Tk()
try:
    root.withdraw()  # Hide the empty main window
    root.lift()      # Bring it to front
    root.attributes('-topmost', True)  # Force it above all other windows

    # Open file dialog; the result is an empty string when the user cancels
    print("📷 Please select a portrait image for the avatar (JPG or PNG)")
    filename = filedialog.askopenfilename(
        title="Select an avatar image",
        filetypes=[("Image files", "*.jpg *.jpeg *.png")]
    )
finally:
    # Always tear the hidden window down, even if the dialog raised,
    # so no stray Tk instance is left alive in the kernel.
    root.destroy()

# `filename` is consumed by the SadTalker inference cell further down
if filename:
    print(f"✅ Uploaded: {os.path.basename(filename)}")
else:
    print("❌ No file selected")

📷 Please select a portrait image for the avatar (JPG or PNG)
✅ Uploaded: amitha_bachan.jpg


In [8]:
# Script spoken by the avatar; consumed by the TTS cell.
# NOTE(review): the spaces around "’" and before the comma after "data science"
# look unintended, but they are kept verbatim so the synthesized input is unchanged.
lesson_text = (
    " Hello everyone, this is Deepeshkumar ’ s AI-powered learning avatar. "
    "In this session, I’ll walk you through an Introduction to Python — "
    "the most beginner-friendly programming language for data science , "
    "automation and Artificial Intelligence. "
)

In [4]:
from pydub import AudioSegment

# Source recording and the normalized copy to produce
input_path = "mohanlal_voice.mp3"      # Replace with your file
output_path = "clean_voice.wav"

# Decode the recording, then downmix to one channel and resample to 16 kHz
audio = (
    AudioSegment.from_file(input_path)
    .set_channels(1)
    .set_frame_rate(16000)
)

# Write the normalized audio out as WAV
audio.export(output_path, format="wav")

print("✅ Conversion complete: 16kHz Mono WAV saved.")


✅ Conversion complete: 16kHz Mono WAV saved.


In [9]:
from TTS.api import TTS

# Multilingual voice-cloning model
tts = TTS(
    model_name="tts_models/multilingual/multi-dataset/your_tts",
    progress_bar=False,
)

# Reference recording of the voice to imitate.
# NOTE(review): this is an .mp3, not the 16 kHz mono "clean_voice.wav" written
# by the conversion cell — confirm which sample is intended.
speaker_wav = "amita_bhachan.mp3"

# Text to speak (defined in the lesson_text cell)
text_to_speak = lesson_text

# Where the synthesized speech is written; read by the inference cell
output_wav = "amitha_bhachan.wav"

# language must be given explicitly for multilingual models
synthesis_options = {
    "text": text_to_speak,
    "speaker_wav": speaker_wav,
    "language": "en",
    "file_path": output_wav,
}
tts.tts_to_file(**synthesis_options)


 > tts_models/multilingual/multi-dataset/your_tts is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10
 | > min_level_db:-

'amitha_bhachan.wav'

In [10]:
import os
import shutil
from pydub import AudioSegment
import psutil
print(f"Available memory: {psutil.virtual_memory().available / 1e6} MB")


# 📷 Automatically find the uploaded avatar image file (jpg/png/jpeg)
avatar_image = filename

# 🔉 Path to the generated lesson audio
audio_path = output_wav  # Must be defined from the previous TTS step

# 📁 Create input/output folders if not already there
# os.makedirs('input', exist_ok=True)
# os.makedirs('output', exist_ok=True)


# 🗂️ Move files into input folder for SadTalker
shutil.copy(avatar_image, 'input/amitha_bachan.jpg')
shutil.copy(audio_path, 'input/amitha_bhachan.wav')

# 📍 Make sure you're inside SadTalker directory before running inference
current_dir = os.getcwd()
if os.path.basename(current_dir) != 'SadTalker':
    %cd SadTalker

# 🚀 Run SadTalker to generate the talking avatar
# Added --size 256 and removed --enhancer gfpgan for faster processing
print("🔥 Running SadTalker inference with faster settings...")
!python inference.py \
--driven_audio ../input/amitha_bhachan.wav \
--source_image ../input/amitha_bachan.jpg \
--result_dir ../output \
--still \
--preprocess crop \
--size 256 \
--batch_size 1 \
--expression_scale 0.8 \
--input_yaw 0 --input_pitch 0 --input_roll 0


# ⬅️ Go back to root directory after running the model
if os.path.basename(os.getcwd()) == 'SadTalker':
    %cd ..

Available memory: 1516.6464 MB
c:\Users\DEEPESH KUMAR K\Documents\UP_TO_SKILLS\avatar\SadTalker
🔥 Running SadTalker inference with faster settings...


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


using safetensor as defaultc:\Users\DEEPESH KUMAR K\Documents\UP_TO_SKILLS\avatar



landmark Det::   0%|          | 0/1 [00:00<?, ?it/s]
landmark Det:: 100%|██████████| 1/1 [00:01<00:00,  1.12s/it]
landmark Det:: 100%|██████████| 1/1 [00:01<00:00,  1.12s/it]

3DMM Extraction In Video::   0%|          | 0/1 [00:00<?, ?it/s]
3DMM Extraction In Video:: 100%|██████████| 1/1 [00:00<00:00,  1.37it/s]
3DMM Extraction In Video:: 100%|██████████| 1/1 [00:00<00:00,  1.36it/s]

mel::   0%|          | 0/402 [00:00<?, ?it/s]
mel:: 100%|██████████| 402/402 [00:00<?, ?it/s]

audio2exp::   0%|          | 0/41 [00:00<?, ?it/s]
audio2exp::  15%|█▍        | 6/41 [00:00<00:00, 52.00it/s]
audio2exp::  32%|███▏      | 13/41 [00:00<00:00, 56.77it/s]
audio2exp::  46%|████▋     | 19/41 [00:00<00:00, 54.14it/s]
audio2exp::  63%|██████▎   | 26/41 [00:00<00:00, 56.48it/s]
audio2exp::  78%|███████▊  | 32/41 [00:00<00:00, 54.47it/s]
audio2exp::  98%|█████████▊| 40/41 [00:00<00:00, 56.70it/s]
audio2exp:: 100%|██████████| 41/41 [00:00<00:00, 56.03it/s]

Face Renderer::   0%|          | 0/402 [00:00


3DMM Extraction for source image
The generated video is named ../output\2025_07_30_12.58.41/amitha_bachan##amitha_bhachan.mp4
The generated video is named: ../output\2025_07_30_12.58.41.mp4


In [12]:
from IPython.display import Video, display
import os

# 📂 Define the output directory
output_dir = 'output'

# 🎥 Find every MP4 file directly inside the output directory.
# Matching on the '.mp4' extension (not on one run's timestamp suffix) keeps
# this cell working for every run; a missing directory counts as "no videos".
if os.path.isdir(output_dir):
    mp4_files = [f for f in os.listdir(output_dir) if f.lower().endswith('.mp4')]
else:
    mp4_files = []

if mp4_files:
    # Show the most recently written video, i.e. the latest inference result
    video_file = max(
        mp4_files,
        key=lambda name: os.path.getmtime(os.path.join(output_dir, name)),
    )
    video_path = f"{output_dir}/{video_file}"
    print(f"✅ Found video file: {video_path}")

    # 🎬 Display the video right in the notebook
    print("🎥 Playing generated avatar video below...")
    display(Video(video_path, embed=True, width=400))
else:
    print(f"❌ No video found in '{output_dir}' folder.")
    print("⚠️ Check if SadTalker ran successfully and generated the output.")


✅ Found video file: output/2025_07_30_12.58.41.mp4
🎥 Playing generated avatar video below...


In [None]:
from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip
import textwrap

# === Config ===
# NOTE(review): this path names a specific earlier run, not the video produced
# by the inference cell above — update it to the file you want to compose.
avatar_path = "output/2025_07_29_22.41.13.mp4"
output_path = "final_output_centered.mp4"
full_text = "Hello everyone,This session is delivered by an intelligent virtual instructor, designed using advanced AI tools.Today, I will be guiding you through an Introduction to Python Programming — a powerful and beginner-friendly language that is widely used in data science, automation, and artificial intelligence."
font_size = 40
font = "Arial-Bold"
screen_width = 1280
screen_height = 720
avatar_width = 480
text_area_width = screen_width - avatar_width  # space right of the avatar
text_x_start = avatar_width + 20               # 20 px gap between avatar and text

# === Load avatar video ===
avatar_full = VideoFileClip(avatar_path).resize(width=avatar_width)
# Vertically center the avatar; the text block is positioned assuming a
# height of roughly four lines of text.
avatar_y_position = (screen_height - avatar_full.h) // 2
text_y_start = (screen_height - font_size * 4) // 2  # Adjusted height for 3–4 lines of text

# Use at most the first 4 seconds, but never more than the clip actually has,
# so the subclip and the text timing cannot run past the end of the video.
avatar_duration = min(4.0, avatar_full.duration)

# === Calculate word timing ===
words = full_text.split()
words_count = len(words)
word_duration = avatar_duration / words_count  # Even distribution across the clip duration

print(f"Words: {words_count}, Duration per word: {word_duration:.2f}s")

# === Create individual text clips (each replaces the previous one) ===
# Clip i shows words 0..i, so the text appears to grow one word at a time.
text_clips = []
accumulated_text = ""

for i, word in enumerate(words):
    accumulated_text += word + " "

    # Wrap text to fit in the text area
    wrapped_text = textwrap.fill(accumulated_text.strip(), width=30)

    # Create text clip for this accumulated text
    txt_clip = TextClip(wrapped_text,
                        fontsize=font_size,
                        font=font,
                        color='white',
                        size=(text_area_width - 40, None),
                        method='caption')

    # Set timing: each clip starts when its word should appear
    start_time = i * word_duration

    # Each clip lasts until the next word appears (or until end for last word)
    if i < len(words) - 1:
        end_time = (i + 1) * word_duration
    else:
        end_time = avatar_duration

    txt_clip = txt_clip.set_position((text_x_start, text_y_start)).set_start(start_time).set_end(end_time)
    text_clips.append(txt_clip)

# === Create final composite ===
# Trim the avatar to the chosen duration and pin it to the left edge
avatar_trimmed = avatar_full.subclip(0, avatar_duration).set_position(("left", avatar_y_position))


# Combine avatar with text clips (the text clips are back-to-back in time, so
# only one is visible at any moment)
final_video = CompositeVideoClip([avatar_trimmed] + text_clips, size=(screen_width, screen_height))

# === Write final video ===
final_video.write_videofile(output_path, codec="libx264", fps=24)

print(f"Video created successfully: {output_path}")
print(f"Duration: {final_video.duration:.2f} seconds")