## Preprocessing

- mono channel
- 48kHz

In [5]:
import os

# Show the first five entries of the raw input directory
# (os.listdir returns names in arbitrary order).
os.listdir('../static_files/suno')[:5]

['31-Vochop.mp3',
 '41-Normal Sunset.mp3',
 '11-Let Me Go.mp3',
 '21-what u want.mp3',
 '61-poster 2 2.mp3']

In [None]:
# 1m 14.7s

import os
import librosa
import soundfile as sf

# Re-encode every MP3 found in input_dir as a mono, 48 kHz file with the same
# name in output_dir.
TARGET_SR = 48000  # the embedding step later in this notebook loads audio at 48 kHz

input_dir = '../static_files/suno'
output_dir = '../static_files/suno_mono'
os.makedirs(output_dir, exist_ok=True)

for filename in os.listdir(input_dir):
    if filename.lower().endswith('.mp3'):
        input_path = os.path.join(input_dir, filename)
        # Downmix to one channel AND resample explicitly. With sr=None librosa
        # keeps the source rate, so e.g. a 44.1 kHz input would be written out
        # at 44.1 kHz even though this step is meant to produce 48 kHz audio.
        y, sr = librosa.load(input_path, sr=TARGET_SR, mono=True)
        output_path = os.path.join(output_dir, filename)
        # NOTE(review): writing ".mp3" through soundfile depends on the installed
        # libsndfile having MP3 support — confirm in the target environment.
        sf.write(output_path, y, sr)

In [8]:
import librosa
import IPython.display as ipd
import random

# Spot-check one randomly chosen converted file: print its basic properties,
# report its channel layout, and embed an audio player for it.
mono_dir = '../static_files/suno_mono'
mp3_files = [f for f in os.listdir(mono_dir) if f.lower().endswith('.mp3')]

if mp3_files:
    sample_file = random.choice(mp3_files)
    sample_path = os.path.join(mono_dir, sample_file)
    # mono=False keeps the channel layout stored in the file. With mono=True
    # librosa downmixes while loading, so the array is always 1-D and the
    # channel check below could never report anything other than mono.
    y, sr = librosa.load(sample_path, sr=None, mono=False)
    # Samples are on the last axis for both (n,) and (channels, n) layouts.
    n_samples = y.shape[-1]

    # Print file information
    print(f"파일명: {sample_file}")
    print(f"샘플링 레이트: {sr}")
    print(f"총 샘플 수: {n_samples}")
    print(f"재생 시간(초): {n_samples / sr:.2f}")
    if y.ndim == 1:
        print("이 파일은 mono입니다.")
    elif y.shape[0] == 2:
        print("이 파일은 stereo입니다.")
    else:
        # More than two channels: report the real channel count, not ndim.
        print(f"채널 수: {y.shape[0]}")

    # Actually play the file
    ipd.display(ipd.Audio(sample_path))
else:
    print("mp3 파일이 존재하지 않습니다.")

파일명: 35-Ambiguous Neon.mp3
샘플링 레이트: 48000
총 샘플 수: 6566399
재생 시간(초): 136.80
이 파일은 mono입니다.


## Embedding using laion-CLAP

- using laion/larger_clap_music
- 추출된 벡터를 확인해 보니 L2 norm이 이미 1임: 모델 내부에서 L2 정규화가 수행되므로 별도의 L2 normalization은 필요 없음

In [None]:
# 13.3s

from transformers import ClapProcessor, ClapModel
import torch
import librosa

import numpy as np  # used only to seed the feature extractor's random crop

# Load the laion/larger_clap_music model and its processor
processor = ClapProcessor.from_pretrained("laion/larger_clap_music")
model = ClapModel.from_pretrained("laion/larger_clap_music")

# Default sampling rate of the CLAP model (48 kHz)
TARGET_SR = 48000
# Fixed seed so that re-running this cell reproduces the same embeddings.
CROP_SEED = 0

# Maps file name -> numpy array holding that track's audio embedding.
embeddings = {}
for mp3_file in mp3_files:
    file_path = os.path.join(mono_dir, mp3_file)

    # Load the audio with librosa (48 kHz, mono)
    audio_data, sr = librosa.load(file_path, sr=TARGET_SR, mono=True)

    # The processor does not feed the whole track to the model: the feature
    # tensor has a fixed number of frames regardless of track length, so long
    # audio is cropped to a window.
    # NOTE(review): the CLAP feature extractor appears to pick that window with
    # NumPy's global RNG (truncation="rand_trunc") — confirm for this checkpoint.
    # Re-seeding before each file makes the chosen window depend only on the
    # file itself, not on run history or on the order of mp3_files.
    np.random.seed(CROP_SEED)

    # Preprocess with the processor (sampling_rate given explicitly)
    inputs = processor(
        audios=audio_data,
        sampling_rate=TARGET_SR,
        return_tensors="pt"
    )

    # Inference only: no gradients needed.
    with torch.no_grad():
        audio_emb = model.get_audio_features(**inputs)

    embeddings[mp3_file] = audio_emb.cpu().numpy()
    print(f"✓ {mp3_file} - shape: {audio_emb.shape}")

print(f"\n총 {len(embeddings)}개 파일 임베딩 완료!")


  inputs = processor(


✓ 31-Vochop.mp3 - shape: torch.Size([1, 512])
✓ 41-Normal Sunset.mp3 - shape: torch.Size([1, 512])
✓ 11-Let Me Go.mp3 - shape: torch.Size([1, 512])
✓ 21-what u want.mp3 - shape: torch.Size([1, 512])
✓ 61-poster 2 2.mp3 - shape: torch.Size([1, 512])
✓ 67-diff way.mp3 - shape: torch.Size([1, 512])
✓ 74-deep night 9.mp3 - shape: torch.Size([1, 512])
✓ 19-Wimps.mp3 - shape: torch.Size([1, 512])
✓ 85-fade dream.mp3 - shape: torch.Size([1, 512])
✓ 68-deep night 3.mp3 - shape: torch.Size([1, 512])
✓ 26-whatever.mp3 - shape: torch.Size([1, 512])
✓ 53-straight go my way ver 5.mp3 - shape: torch.Size([1, 512])
✓ 30-diriring.mp3 - shape: torch.Size([1, 512])
✓ 6-so.mp3 - shape: torch.Size([1, 512])
✓ 60-Poster 2 UK House Remix.mp3 - shape: torch.Size([1, 512])
✓ 40-String Loopy.mp3 - shape: torch.Size([1, 512])
✓ 28-seeya.mp3 - shape: torch.Size([1, 512])
✓ 39-wigle.mp3 - shape: torch.Size([1, 512])
✓ 86-puzzle in city.mp3 - shape: torch.Size([1, 512])
✓ 51-walking ver2.mp3 - shape: torch.Size([1

In [21]:
# Check the shape of the stored embedding for one track (entries are keyed by file name).
embeddings['31-Vochop.mp3'].shape

(1, 512)

In [19]:
# `inputs` is whatever the embedding loop assigned last, i.e. the processor
# output for the final file it handled; show the shape of its feature tensor.
inputs['input_features'].shape

torch.Size([1, 1, 1001, 64])

In [22]:
import numpy as np

# Apply L2 normalization: flatten each embedding to 1-D and rescale it to unit
# length, storing the result under the same file-name key.
normalized_embeddings = {}
for fname, emb in embeddings.items():
    # emb shape: (1, 512) -> squeeze to (512,) before normalizing
    emb_flat = emb.squeeze()  # (512,)
    l2_norm = np.linalg.norm(emb_flat)
    # A zero vector has no direction; dividing by its norm would fill it with
    # NaN, so it is kept unchanged instead.
    normalized_emb = emb_flat / l2_norm if l2_norm > 0 else emb_flat
    normalized_embeddings[fname] = normalized_emb

# Verification: check that the L2 norm of one entry is 1.
# Skipped when there are no embeddings, so an empty run does not raise.
if normalized_embeddings:
    sample_key = next(iter(normalized_embeddings))
    sample_norm = np.linalg.norm(normalized_embeddings[sample_key])
    print(f"샘플 '{sample_key}'의 L2 norm: {sample_norm:.6f}")
    print(f"임베딩 shape: {normalized_embeddings[sample_key].shape}")


샘플 '31-Vochop.mp3'의 L2 norm: 1.000000
임베딩 shape: (512,)


In [23]:
# Display the full normalized vector for one track.
normalized_embeddings['31-Vochop.mp3']

array([-0.00677449,  0.00303425,  0.13320889, -0.04869   , -0.02569649,
        0.01157782, -0.03957724, -0.03620983, -0.03711814,  0.05846758,
        0.05416006, -0.01636868, -0.08753783,  0.01269977, -0.00771324,
       -0.05010893,  0.07501113,  0.09401571, -0.04398833, -0.01495767,
        0.0402547 ,  0.02504187,  0.03780269, -0.06393874, -0.05887164,
        0.04821058, -0.00996679,  0.00690144, -0.04429559,  0.03789781,
        0.03744894,  0.10529441, -0.06303916, -0.04891729,  0.00898471,
       -0.05593539, -0.03597819,  0.01018485, -0.04394026,  0.03854926,
        0.09427232, -0.00127043,  0.01309529,  0.01019805, -0.04769791,
       -0.02716067, -0.06406462,  0.06911797, -0.07344545, -0.04450381,
       -0.04635184,  0.00692972, -0.00756504, -0.00021641, -0.02697095,
        0.00623455, -0.01825501, -0.04850756,  0.02832927,  0.01383859,
        0.06064751, -0.02506255, -0.0016869 , -0.07165688, -0.05812662,
       -0.02703802, -0.02195805,  0.04951827, -0.05282455, -0.02

In [24]:
# Display the raw model output (before the explicit normalization step) for one track.
embeddings['31-Vochop.mp3']

array([[-0.00677449,  0.00303425,  0.13320889, -0.04869   , -0.02569649,
         0.01157782, -0.03957724, -0.03620983, -0.03711814,  0.05846758,
         0.05416006, -0.01636868, -0.08753783,  0.01269977, -0.00771324,
        -0.05010893,  0.07501113,  0.09401571, -0.04398833, -0.01495767,
         0.0402547 ,  0.02504187,  0.03780269, -0.06393874, -0.05887164,
         0.04821058, -0.00996679,  0.00690144, -0.04429559,  0.03789781,
         0.03744894,  0.10529441, -0.06303916, -0.04891729,  0.00898471,
        -0.05593539, -0.03597819,  0.01018485, -0.04394026,  0.03854926,
         0.09427232, -0.00127043,  0.01309529,  0.01019805, -0.04769791,
        -0.02716067, -0.06406462,  0.06911797, -0.07344545, -0.04450381,
        -0.04635184,  0.00692972, -0.00756504, -0.00021641, -0.02697095,
         0.00623455, -0.01825501, -0.04850756,  0.02832927,  0.01383859,
         0.06064751, -0.02506255, -0.0016869 , -0.07165688, -0.05812662,
        -0.02703802, -0.02195805,  0.04951827, -0.0

In [25]:
# Compare the vector magnitude before and after L2 normalization
sample_key = '31-Vochop.mp3'

# Raw embedding flattened to 1-D, and its normalized counterpart
original_emb = np.squeeze(embeddings[sample_key])
normalized_emb = normalized_embeddings[sample_key]

# Euclidean length of each vector
original_l2_norm, normalized_l2_norm = (
    np.linalg.norm(vec) for vec in (original_emb, normalized_emb)
)

# One print call, one report line per argument.
print(
    f"📁 파일: {sample_key}",
    f"{'='*40}",
    f"🔹 원본 벡터 L2 norm:     {original_l2_norm:.6f}",
    f"🔹 정규화 후 L2 norm:     {normalized_l2_norm:.6f}",
    f"{'='*40}",
    f"📐 벡터 차원: {original_emb.shape[0]}",
    f"📉 축소 비율: {normalized_l2_norm / original_l2_norm:.6f}",
    sep="\n",
)


📁 파일: 31-Vochop.mp3
🔹 원본 벡터 L2 norm:     1.000000
🔹 정규화 후 L2 norm:     1.000000
📐 벡터 차원: 512
📉 축소 비율: 1.000000


## Write Embeddings to supabase

In [29]:
# Reduce each key to its numeric id and convert the embedding to a plain list,
# a form that is easy to store in Supabase.
supabase_data = []
seen_ids = set()

for filename, emb in embeddings.items():
    # "31-Vochop.mp3" -> "31"
    music_id = filename.split('-', 1)[0]
    try:
        record_id = int(music_id)
    except ValueError:
        # Same exception type as a bare int() call, but naming the offending file.
        raise ValueError(
            f"cannot derive a numeric id from file name {filename!r}"
        ) from None

    # Two files sharing a prefix would otherwise yield two records with the
    # same id without any warning.
    if record_id in seen_ids:
        raise ValueError(f"duplicate id {record_id} (from file name {filename!r})")
    seen_ids.add(record_id)

    # numpy array (1, 512) -> list (512,)
    embedding_list = emb.squeeze().tolist()

    supabase_data.append({
        "id": record_id,
        "embedding": embedding_list
    })

# Sort by id
supabase_data.sort(key=lambda x: x["id"])

# Show the result; the sample is skipped when there is nothing to show so an
# empty run does not raise IndexError.
print(f"총 {len(supabase_data)}개 데이터 변환 완료!")
if supabase_data:
    print(f"\n샘플 데이터:")
    print(f"  - id: {supabase_data[0]['id']}")
    print(f"  - embedding 타입: {type(supabase_data[0]['embedding'])}")
    print(f"  - embedding 길이: {len(supabase_data[0]['embedding'])}")
    print(f"  - embedding[:5]: {supabase_data[0]['embedding'][:5]}")


총 84개 데이터 변환 완료!

샘플 데이터:
  - id: 3
  - embedding 타입: <class 'list'>
  - embedding 길이: 512
  - embedding[:5]: [-0.007643065880984068, 0.0025495439767837524, 0.12741483747959137, -0.03734913840889931, -0.02148122899234295]


In [30]:
# Inspect the first record; the list was sorted by id, so this is the smallest id.
supabase_data[0]

{'id': 3,
 'embedding': [-0.007643065880984068,
  0.0025495439767837524,
  0.12741483747959137,
  -0.03734913840889931,
  -0.02148122899234295,
  0.01883510872721672,
  -0.035042911767959595,
  -0.0370391346514225,
  -0.03975120186805725,
  0.07134031504392624,
  0.05242178589105606,
  -0.014452211558818817,
  -0.09894503653049469,
  0.03277325630187988,
  0.01660280115902424,
  -0.053961288183927536,
  0.06681770086288452,
  0.10210933536291122,
  -0.03371640294790268,
  -0.0314035601913929,
  0.039504047483205795,
  0.032498445361852646,
  0.05458233132958412,
  -0.07313213497400284,
  -0.06302756071090698,
  0.0570218451321125,
  -0.010399495251476765,
  -0.0020528703462332487,
  -0.04695774242281914,
  0.020924076437950134,
  0.04061109200119972,
  0.09715086221694946,
  -0.04617885500192642,
  -0.03415021672844887,
  -0.008046616800129414,
  -0.062143485993146896,
  -0.04601151868700981,
  0.023562749847769737,
  -0.025317568331956863,
  0.03745963051915169,
  0.0952795073390007,
