In [None]:
# CELL 1 — runtime check (no installs on server)
import warnings
warnings.filterwarnings("ignore")

import sys, os, time

# Quiet TF info/warnings. This has to happen BEFORE TensorFlow (or anything that
# imports it, such as ddsp) is imported: the native runtime reads
# TF_CPP_MIN_LOG_LEVEL when it is first loaded, so setting it afterwards is too late.
# setdefault() keeps any value the user already exported in the environment.
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import ddsp, ddsp.training
import librosa, librosa.display
import soundfile as sf

# Report the versions this notebook is actually running against.
print("Python:", sys.version.split()[0])
print("TF:", tf.__version__, "| DDSP:", ddsp.__version__)
print("librosa:", librosa.__version__)

  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
[1;31merror[0m: [1msubprocess-exited-with-error[0m

[31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
[31m│[0m exit code: [1;36m1[0m
[31m╰─>[0m See above for output.

[1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.


ModuleNotFoundError: No module named 'ddsp'

In [None]:
# CELL 2 #
# ---- Light replacements for colab_utils ----
DEFAULT_SAMPLE_RATE = 16000

def play(y, sr=DEFAULT_SAMPLE_RATE):
    """Show an inline Jupyter audio player for ``y`` (no google.colab needed).

    A 2-D array with a single row (shape ``[1, N]``) is unwrapped to its only
    row before playback; anything else is handed to the player unchanged.
    """
    from IPython.display import Audio, display
    is_single_row = y.ndim == 2 and y.shape[0] == 1
    samples = y[0] if is_single_row else y
    display(Audio(samples, rate=sr))

def specplot(audio, sr=DEFAULT_SAMPLE_RATE, title=None):
    """Plot a log-frequency power spectrogram (in dB relative to the peak).

    ``audio`` may be 1-D or a single-row 2-D array; the single row is unwrapped.
    """
    mono = audio[0] if (audio.ndim == 2 and audio.shape[0] == 1) else audio

    # Power spectrogram from the STFT magnitude, then dB relative to the maximum.
    magnitude = np.abs(librosa.stft(mono, n_fft=1024, hop_length=256))
    power_db = librosa.power_to_db(magnitude**2, ref=np.max)

    plt.figure(figsize=(8, 3))
    librosa.display.specshow(power_db, sr=sr, hop_length=256, x_axis='time', y_axis='log')
    plt.colorbar(format="%+2.0f dB")
    if title:
        plt.title(title)
    plt.tight_layout()
    plt.show()

def load_audio(path, sr=DEFAULT_SAMPLE_RATE):
    """Read ``path`` as mono audio at ``sr`` and return it as float32 with shape [1, samples]."""
    samples, _ = librosa.load(path, sr=sr, mono=True)
    samples = samples.astype(np.float32)
    # Add a leading axis of length 1 in front of the sample axis.
    return samples[np.newaxis, :]

def reset_crepe():
    """Thin wrapper that calls ``ddsp.spectral_ops.reset_crepe()``.

    NOTE(review): presumably this discards DDSP's cached CREPE pitch model so the
    next feature extraction starts from a clean state — confirm against the DDSP docs.
    """
    ddsp.spectral_ops.reset_crepe()

# Optional: quiet TensorFlow GPU warnings if you're CPU-only.
# NOTE(review): the first cell has already imported TensorFlow by the time this runs,
# so this assignment presumably comes too late to affect the current kernel's
# logging — confirm, and set it before the TF import if it is meant to take effect.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"


In [None]:
# CELL 3 #
# =================
AUDIO_PATH = "./audio/my_voice_3s.wav"
# =================

# Load the input clip via load_audio() at DEFAULT_SAMPLE_RATE
# (mono float32, shape [1, samples]).
audio = load_audio(AUDIO_PATH, sr=DEFAULT_SAMPLE_RATE)

print("Extracting audio features...")
reset_crepe()
t0 = time.time()
# Later cells read the 'audio', 'f0_hz', 'f0_confidence' and 'loudness_db'
# entries of this dict.
audio_features = ddsp.training.metrics.compute_audio_features(audio)
# Force loudness to float32.
audio_features['loudness_db'] = audio_features['loudness_db'].astype(np.float32)
# Placeholder; the feature-modification cell later replaces it with an adjusted copy.
audio_features_mod = None
print(f"Audio features took {time.time()-t0:.1f} s")

# Quick preview of the input: spectrogram plus inline player.
specplot(audio, title="Input Audio")
play(audio)


In [None]:
# CELL 4 #
import os, time, pickle, gin
import ddsp, ddsp.training
import numpy as np
import tensorflow.compat.v2 as tf

# Fail fast with a readable message if the audio-loading / feature-extraction
# cell has not been run in this kernel yet.
assert 'audio' in globals(), "Run your audio loading cell first."
assert 'audio_features' in globals(), "Compute audio_features first."

def find_gin_file(model_dir):
    """Return the path of the gin config to load from ``model_dir``.

    Files named ``operative_config*.gin`` (the config written next to a DDSP
    checkpoint) are preferred; if there is none, any other ``.gin`` file is
    accepted. Ties are broken alphabetically so the choice is stable.

    Raises:
        FileNotFoundError: if ``model_dir`` contains no ``.gin`` file.
    """
    gin_names = sorted(f for f in os.listdir(model_dir) if f.endswith(".gin"))
    if not gin_names:
        raise FileNotFoundError(f"No .gin found in {model_dir}")
    # Prefer the operative config over unrelated .gin files that merely sort first.
    preferred = [f for f in gin_names if f.startswith("operative_config")]
    return os.path.join(model_dir, (preferred or gin_names)[0])

def find_ckpt_prefix(model_dir):
    """Return the checkpoint prefix (path without ``.index``) with the highest step.

    Looks for ``ckpt-<step>.index`` files in ``model_dir`` and orders them by the
    numeric step, so ``ckpt-100`` beats ``ckpt-20``. Names whose step cannot be
    parsed sort before every real step.

    Raises:
        FileNotFoundError: if ``model_dir`` contains no ``ckpt-*.index`` file.
    """
    def step_of(n):
        # "ckpt-1234.index" -> 1234. Only parsing failures are swallowed; a bare
        # except would also hide KeyboardInterrupt and genuine bugs.
        try:
            return int(n.split("-")[1].split(".")[0])
        except (IndexError, ValueError):
            return -1

    idx_files = [f for f in os.listdir(model_dir) if f.startswith("ckpt-") and f.endswith(".index")]
    if not idx_files:
        raise FileNotFoundError(f"No checkpoint .index file in {model_dir}")
    idx_files.sort(key=step_of)
    latest = idx_files[-1]
    return os.path.join(model_dir, latest.rsplit(".index", 1)[0])

# MODEL_DIR is assigned by the model-selection cell further down the notebook, so on a
# fresh kernel (Restart & Run All) it does not exist yet when this cell runs. Fall back
# to the notebook's default model locations instead of failing with a NameError.
if globals().get("MODEL_DIR") is None:
    _default_model_dirs = ("./models/violin_ae", "./models/solo_violin_ckpt")
    MODEL_DIR = next((d for d in _default_model_dirs if os.path.isdir(d)), None)
    if MODEL_DIR is None:
        raise FileNotFoundError(
            "MODEL_DIR is not set and none of the default model directories exist: "
            + ", ".join(_default_model_dirs)
        )

gin_file = find_gin_file(MODEL_DIR)
ckpt = find_ckpt_prefix(MODEL_DIR)
print("gin:", gin_file)
print("ckpt:", ckpt)

# Optional dataset stats (used later for automatic pitch/loudness adjustment).
# SECURITY: pickle.load executes arbitrary code from the file; only use
# dataset_statistics.pkl from a checkpoint you trust.
DATASET_STATS = None
ds_stats_fp = os.path.join(MODEL_DIR, "dataset_statistics.pkl")
if tf.io.gfile.exists(ds_stats_fp):
    try:
        with tf.io.gfile.GFile(ds_stats_fp, "rb") as f:
            DATASET_STATS = pickle.load(f)
        print("Loaded dataset_statistics.pkl")
    except Exception as e:
        # Best effort: the stats are optional, so report and carry on without them.
        print("Warn: dataset stats load failed:", e)

# Parse the training-time gin config, then derive the hop size it was trained with.
with gin.unlock_config():
    gin.parse_config_file(gin_file, skip_unknown=True)

time_steps_train = gin.query_parameter('F0LoudnessPreprocessor.time_steps')
n_samples_train = gin.query_parameter('Harmonic.n_samples')
hop_size = int(n_samples_train / time_steps_train)

# Largest whole number of hops that fits in the loaded audio.
time_steps = int(audio.shape[1] / hop_size)
n_samples = time_steps * hop_size

# Override the configured lengths so the model matches this clip.
with gin.unlock_config():
    gin.parse_config([
        f'Harmonic.n_samples = {n_samples}',
        f'FilteredNoise.n_samples = {n_samples}',
        f'F0LoudnessPreprocessor.time_steps = {time_steps}',
        'oscillator_bank.use_angular_cumsum = True',
    ])

# Trim features to match the new shapes (slicing to the same length again is a no-op,
# so re-running this cell is safe).
for k in ['f0_hz', 'f0_confidence', 'loudness_db']:
    audio_features[k] = audio_features[k][:time_steps]
audio_features['audio'] = audio_features['audio'][:, :n_samples]

# Restore the checkpoint, then run one forward pass so the model is built.
model = ddsp.training.models.Autoencoder()
print("Restoring…")
t0 = time.time()
model.restore(ckpt)
_ = model(audio_features, training=False)
print(f"Model ready in {time.time()-t0:.1f}s")


In [None]:
# CELL 5 #
# ==== Cell 5: Load DDSP model (server-safe, no google.colab) ====
import os, pickle, time, gin
import numpy as np
import ddsp, ddsp.training
import tensorflow.compat.v2 as tf

# We are NOT in Colab UI here, so define the constant directly
DEFAULT_SAMPLE_RATE = 16000

# Safety: ensure 'audio' & 'audio_features' from Cell 3 exist
assert 'audio' in globals(), "Audio not loaded. Run Cell 3 first."
assert 'audio_features' in globals(), "Audio features not computed. Run Cell 3 first."

# 1) Build the list of model directories to try, in priority order:
#    ./models/violin_ae first, then ./models/solo_violin_ckpt (unzipped on demand).
MODEL_DIR_CANDIDATES = []

if os.path.isdir("./models/violin_ae"):
    MODEL_DIR_CANDIDATES.append("./models/violin_ae")

# If ./models/solo_violin_ckpt.zip is present, extract it into ./models once.
# NOTE(review): this assumes the archive contains a top-level solo_violin_ckpt/
# folder; otherwise the candidate appended below will not exist — confirm.
if os.path.exists("./models/solo_violin_ckpt.zip"):
    os.makedirs("./models", exist_ok=True)
    # Unzip only if target dir doesn't already exist (keeps re-runs idempotent)
    if not os.path.isdir("./models/solo_violin_ckpt"):
        import zipfile
        with zipfile.ZipFile("./models/solo_violin_ckpt.zip", "r") as zf:
            zf.extractall("./models")
    MODEL_DIR_CANDIDATES.append("./models/solo_violin_ckpt")
# Choose the first directory that looks valid
def has_minimum_files(d):
    """Return True if directory ``d`` looks like a restorable DDSP checkpoint.

    Requires at least one ``.gin`` config, one ``ckpt-*.index`` file and one
    ``ckpt-*.data-00000-of-00001`` file. Returns False if ``d`` is not a directory.
    """
    if not os.path.isdir(d):
        return False
    names = os.listdir(d)
    has_gin = any(n.endswith(".gin") for n in names)  # any .gin is fine
    has_ckpt = any(n.startswith("ckpt-") and n.endswith(".index") for n in names)
    has_ckpt_data = any(n.startswith("ckpt-") and n.endswith(".data-00000-of-00001") for n in names)
    return has_gin and has_ckpt and has_ckpt_data

# Use the first candidate that contains a usable checkpoint (None if there is none).
MODEL_DIR = next((d for d in MODEL_DIR_CANDIDATES if has_minimum_files(d)), None)

if MODEL_DIR is None:
    # Helpful diagnostics. The tip is a single notebook shell line: only the leading
    # "!" is Jupyter syntax, and listing both paths in one `ls` still shows ./models
    # when ./models/violin_ae does not exist.
    raise FileNotFoundError(
        "Could not find a valid DDSP checkpoint directory.\n"
        "Expected either ./models/violin_ae or ./models/solo_violin_ckpt (unzipped) "
        "to contain: operative_config-*.gin, ckpt-XXXX.index, ckpt-XXXX.data-00000-of-00001\n"
        "Tip: verify with: !ls -la ./models ./models/violin_ae"
    )

print(f"Using model directory: {MODEL_DIR}")

# 2) Locate gin file and checkpoint prefix
def find_gin_file(model_dir):
    """Return the path of the gin config to load from ``model_dir``.

    Prefers ``operative_config*.gin`` (e.g. ``operative_config-0.gin``) and only
    falls back to another ``.gin`` file when no operative config exists. Ties are
    broken alphabetically so the pick is stable across runs.

    Raises:
        FileNotFoundError: if ``model_dir`` contains no ``.gin`` file.
    """
    gin_names = sorted(f for f in os.listdir(model_dir) if f.endswith(".gin"))
    if not gin_names:
        raise FileNotFoundError(f"No .gin config found in {model_dir}.")
    # Without this filter an unrelated file such as "aaa.gin" would win the sort.
    preferred = [f for f in gin_names if f.startswith("operative_config")]
    return os.path.join(model_dir, (preferred or gin_names)[0])

def find_ckpt_prefix(model_dir):
    """Return the checkpoint prefix (path without ``.index``) with the largest step.

    Considers ``ckpt-<step>.index`` files in ``model_dir`` and compares steps
    numerically. Names whose step cannot be parsed rank below every real step.

    Raises:
        FileNotFoundError: if ``model_dir`` contains no ``ckpt-*.index`` file.
    """
    def step_of(name):
        # "ckpt-1234.index" -> 1234. Catch only parse failures; a bare except
        # would also swallow KeyboardInterrupt and unrelated errors.
        try:
            return int(name.split("-")[1].split(".")[0])
        except (IndexError, ValueError):
            return -1

    idx_files = [f for f in os.listdir(model_dir) if f.startswith("ckpt-") and f.endswith(".index")]
    if not idx_files:
        raise FileNotFoundError(f"No checkpoint index file found in {model_dir}.")
    # pick the largest step if multiple
    idx_files.sort(key=step_of)
    latest = idx_files[-1]
    base = latest.rsplit(".index", 1)[0]
    return os.path.join(model_dir, base)

gin_file = find_gin_file(MODEL_DIR)
ckpt = find_ckpt_prefix(MODEL_DIR)
print(f"Found gin:   {gin_file}")
print(f"Found ckpt:  {ckpt}")

# 3) Load dataset statistics if present (optional; the auto-adjust cell skips
#    itself when DATASET_STATS stays None).
#    SECURITY NOTE: pickle.load can execute arbitrary code from the file; only use
#    a dataset_statistics.pkl that comes from a checkpoint you trust.
DATASET_STATS = None
ds_stats_fp = os.path.join(MODEL_DIR, "dataset_statistics.pkl")
if tf.io.gfile.exists(ds_stats_fp):
    try:
        with tf.io.gfile.GFile(ds_stats_fp, "rb") as f:
            DATASET_STATS = pickle.load(f)
        print("Loaded dataset statistics.")
    except Exception as e:
        # Best effort: report the failure and continue without stats.
        print(f"Warning: failed to load dataset_statistics.pkl: {e}")

# 4) Parse gin and align dimensions with our current audio length
with gin.unlock_config():
    gin.parse_config_file(gin_file, skip_unknown=True)

# Hop size = samples per feature frame in the parsed (training-time) config.
time_steps_train = gin.query_parameter('F0LoudnessPreprocessor.time_steps')
n_samples_train  = gin.query_parameter('Harmonic.n_samples')
hop_size = int(n_samples_train / time_steps_train)

# Derive the length we need for this audio: the largest whole number of hops that
# fits, and the matching sample count (any remainder at the end is dropped).
time_steps = int(audio.shape[1] / hop_size)
n_samples = time_steps * hop_size

# Override the configured lengths with the ones derived from this clip.
gin_overrides = [
    f'Harmonic.n_samples = {n_samples}',
    f'FilteredNoise.n_samples = {n_samples}',
    f'F0LoudnessPreprocessor.time_steps = {time_steps}',
    'oscillator_bank.use_angular_cumsum = True',  # numerical stability
]
with gin.unlock_config():
    gin.parse_config(gin_overrides)

# 5) Trim features to match the new shapes. This rebinds entries of audio_features
#    in place; slicing again to the same length is a no-op, so re-running is safe.
for key in ['f0_hz', 'f0_confidence', 'loudness_db']:
    audio_features[key] = audio_features[key][:time_steps]
audio_features['audio'] = audio_features['audio'][:, :n_samples]

# 6) Restore model and do a dry forward to build it
model = ddsp.training.models.Autoencoder()
print("Restoring model…")
t0 = time.time()
model.restore(ckpt)
_ = model(audio_features, training=False)  # build graph
print(f"Model restored and built in {time.time() - t0:.1f}s")
print("Ready for the next cell (resynthesis).")


In [None]:
# CELL 6 #
# --- Safe helpers (no google.colab) ---
import numpy as np, librosa, matplotlib.pyplot as plt
from IPython.display import Audio, display

DEFAULT_SAMPLE_RATE = 16000

def play(arr_2d_or_1d, sr=DEFAULT_SAMPLE_RATE):
    """Show an inline audio player; for a 2-D NumPy array only its first row is played."""
    y = arr_2d_or_1d
    if isinstance(y, np.ndarray) and y.ndim == 2:
        y = y[0]
    display(Audio(y, rate=sr))

def specplot(x_2d_or_1d, title=None, sr=DEFAULT_SAMPLE_RATE):
    """Plot an 80-band mel spectrogram in dB relative to the peak (no colab_utils).

    For a 2-D NumPy array only the first row is plotted.
    """
    y = x_2d_or_1d
    if isinstance(y, np.ndarray) and y.ndim == 2:
        y = y[0]

    mel_power = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=80, fmax=sr//2)
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    plt.figure(figsize=(8,3))
    librosa.display.specshow(mel_db, sr=sr, x_axis='time', y_axis='mel')
    if title:
        plt.title(title)
    plt.colorbar(format="%+0.1f dB")
    plt.tight_layout()
    plt.show()

# Optional autotune: try to import, else fall back to pass-through
try:
    from ddsp.colab.colab_utils import auto_tune as _at, get_tuning_factor as _gtf  # may fail if google.colab is required
    _HAVE_AT = True
except Exception:
    _HAVE_AT = False
    def _at(f0_midi, tuning_factor, mask_on, amount=0.0):
        return f0_midi  # no-op
    def _gtf(f0_midi, f0_confidence, mask_on):
        return 1.0

from ddsp.training.postprocessing import detect_notes, fit_quantile_transform
import ddsp

# --- Params you can tweak ---
threshold = 1.0      # passed to detect_notes() as the note-detection threshold
ADJUST = True        # enable the automatic pitch/loudness adjustment (also needs DATASET_STATS)
quiet = 20           # scales how much loudness is subtracted on note-off frames
autotune_amt = 0.0   # 0..1; keep 0.0 unless _HAVE_AT is True
pitch_shift_oct = 0  # manual pitch shift in octaves (f0 is multiplied by 2**value)
loudness_shift_db = 0  # manual offset added to loudness_db

# --- Work off a copy ---
# Copy every array so the edits below never touch audio_features itself;
# re-running this cell therefore always starts from the unmodified features.
audio_features_mod = {k: v.copy() for k, v in audio_features.items()}

def shift_ld(feats, ld_shift=0.0):
    """Add ``ld_shift`` to ``feats['loudness_db']`` and return the same ``feats`` dict.

    The update uses ``+=`` on the dict entry, so ``feats`` is modified rather than copied.
    """
    feats['loudness_db'] += ld_shift
    return feats

def shift_f0(feats, pitch_shift=0.0):
    """Scale ``feats['f0_hz']`` by ``2 ** pitch_shift`` (a shift in octaves) and clip it.

    The result is clipped to the range from 0 Hz to the frequency of MIDI note 110.
    ``feats`` is modified and returned.
    """
    feats['f0_hz'] *= (2.0 ** pitch_shift)
    # Keep the shifted pitch inside [0 Hz, midi_to_hz(110)].
    feats['f0_hz'] = np.clip(feats['f0_hz'], 0.0, librosa.midi_to_hz(110.0))
    return feats

# Automatic adjustment: move the input's pitch register and loudness towards the
# statistics stored with the model. Runs only when ADJUST is set and dataset
# statistics were loaded; otherwise audio_features_mod stays an unmodified copy.
mask_on = None
if ADJUST and (DATASET_STATS is not None):
    # Note detection always uses the ORIGINAL features, not the modified copy.
    mask_on, note_on_value = detect_notes(
        audio_features['loudness_db'],
        audio_features['f0_confidence'],
        threshold
    )
    if np.any(mask_on):
        # Match register: compare the mean MIDI pitch of note-on frames with the
        # dataset's mean pitch and shift by a whole number of octaves.
        target_mean_pitch = DATASET_STATS['mean_pitch']
        pitch_midi = ddsp.core.hz_to_midi(audio_features['f0_hz'])
        mean_pitch = np.mean(pitch_midi[mask_on])
        p_diff = target_mean_pitch - mean_pitch
        p_diff_oct = p_diff / 12.0
        # Round down for differences above 1.5 octaves, round up otherwise.
        p_diff_oct = np.floor(p_diff_oct) if p_diff_oct > 1.5 else np.ceil(p_diff_oct)
        audio_features_mod = shift_f0(audio_features_mod, p_diff_oct)

        # Loudness quantile normalize (note-on / note-off handling)
        _, loudness_norm = fit_quantile_transform(
            audio_features['loudness_db'],
            mask_on,
            inv_quantile=DATASET_STATS['quantile_transform'])
        mask_off = np.logical_not(mask_on)
        # Lower note-off frames by up to `quiet`, scaled by (1 - note_on_value).
        loudness_norm[mask_off] -= quiet * (1.0 - note_on_value[mask_off][:, np.newaxis])
        # Restore the original loudness_db shape before storing the result.
        loudness_norm = np.reshape(loudness_norm, audio_features['loudness_db'].shape)
        audio_features_mod['loudness_db'] = loudness_norm

        # Optional autotune (only with a non-zero amount AND the real helpers imported)
        if autotune_amt and _HAVE_AT:
            f0_midi = np.array(ddsp.core.hz_to_midi(audio_features_mod['f0_hz']))
            tuning_factor = _gtf(f0_midi, audio_features_mod['f0_confidence'], mask_on)
            f0_midi_at = _at(f0_midi, tuning_factor, mask_on, amount=autotune_amt)
            audio_features_mod['f0_hz'] = ddsp.core.midi_to_hz(f0_midi_at)
    else:
        print("Skipping auto-adjust (no notes detected).")
else:
    print("Skipping auto-adjust (disabled or no dataset stats).")

# Manual tweaks, applied on top of any automatic adjustment above.
audio_features_mod = shift_ld(audio_features_mod, loudness_shift_db)
audio_features_mod = shift_f0(audio_features_mod, pitch_shift_oct)

# Quick comparison plots: original vs. modified features.
TRIM = -15  # the [:TRIM] slices below leave the last 15 frames out of the plots
plt.figure(figsize=(8,3))
plt.plot(audio_features['loudness_db'][:TRIM], label="orig")
plt.plot(audio_features_mod['loudness_db'][:TRIM], label="mod")
plt.legend(); plt.title("loudness_db"); plt.tight_layout(); plt.show()

# Pitch is plotted in MIDI note numbers rather than Hz.
plt.figure(figsize=(8,3))
plt.plot(librosa.hz_to_midi(audio_features['f0_hz'][:TRIM]), label="orig")
plt.plot(librosa.hz_to_midi(audio_features_mod['f0_hz'][:TRIM]), label="mod")
plt.legend(); plt.title("f0 [midi]"); plt.tight_layout(); plt.show()


In [None]:
# CELL 7 #
import numpy as np, librosa, matplotlib.pyplot as plt
from IPython.display import Audio, display
import tensorflow as tf
import librosa.display as ldisplay  # <-- import once, avoid shadowing

DEFAULT_SAMPLE_RATE = 16000

def _to_np1d(x):
    """Coerce ``x`` into a 1-D float32 NumPy array of audio samples.

    Accepts a list/tuple (its first element is used), a TensorFlow tensor, or
    anything ``np.asarray`` understands. A ``[1, N]`` array is unwrapped to ``[N]``.

    Raises:
        ValueError: if the result is not 1-D after unwrapping.
    """
    candidate = x[0] if isinstance(x, (list, tuple)) else x
    if tf.is_tensor(candidate):
        candidate = candidate.numpy()

    samples = np.asarray(candidate)
    if samples.ndim == 2 and samples.shape[0] == 1:
        samples = samples[0]

    if samples.ndim == 1:
        return samples.astype(np.float32)
    raise ValueError(f"Expected 1D audio, got shape {samples.shape}")

def play(arr, sr=DEFAULT_SAMPLE_RATE):
    """Show an inline audio player for ``arr`` after converting it with ``_to_np1d``."""
    display(Audio(_to_np1d(arr), rate=sr))

def specplot(arr, title=None, sr=DEFAULT_SAMPLE_RATE):
    """Plot an 80-band mel spectrogram (dB relative to the peak) of ``arr``.

    ``arr`` is first converted to 1-D float32 samples by ``_to_np1d``.
    """
    samples = _to_np1d(arr)
    mel_power = librosa.feature.melspectrogram(y=samples, sr=sr, n_mels=80, fmax=sr//2)
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    plt.figure(figsize=(8,3))
    ldisplay.specshow(mel_db, sr=sr, x_axis='time', y_axis='mel')  # use alias
    if title:
        plt.title(title)
    plt.colorbar(format="%+0.1f dB")
    plt.tight_layout()
    plt.show()


In [None]:
# CELL 8 #
# Resynthesize. No other cell in this notebook defines `audio_gen`, so it has to be
# produced here before it can be played, plotted or saved. Use the adjusted
# features when the modification cell has produced them, else the raw features.
features_for_synth = audio_features if audio_features_mod is None else audio_features_mod
outputs = model(features_for_synth, training=False)
audio_gen = model.get_audio_from_outputs(outputs)

print("Original:")
play(audio, sr=DEFAULT_SAMPLE_RATE)
print("Resynthesis:")
play(audio_gen, sr=DEFAULT_SAMPLE_RATE)

specplot(audio, title="Original", sr=DEFAULT_SAMPLE_RATE)
specplot(audio_gen, title="Resynthesis", sr=DEFAULT_SAMPLE_RATE)

# Save the resynthesized audio next to the notebook as a WAV at DEFAULT_SAMPLE_RATE.
import soundfile as sf
OUT_WAV = "./resynthesis_violin.wav"
sf.write(OUT_WAV, _to_np1d(audio_gen), DEFAULT_SAMPLE_RATE)
print("Saved:", OUT_WAV)
