In [2]:
!pip install qdrant-client
!pip install sentence-transformers tqdm
## 1. Import các thư viện cần thiết

import pandas as pd
import json
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest

## 2. Load the input data from JSON
# NOTE(review): the descriptions are read from "songs_details.json" and the
# details from "songs_dedup.json" — the file names look swapped relative to
# the constant names; confirm this is intentional.
DESCRIPTIONS_PATH = "/kaggle/input/datasong/songs_details.json"
DETAILS_PATH      = "/kaggle/input/datasong/songs_dedup.json"


def _load_json(path):
    """Parse the UTF-8 encoded JSON file at *path* and return its content."""
    with open(path, "r", encoding="utf-8") as fh:
        return json.load(fh)


descriptions = _load_json(DESCRIPTIONS_PATH)
details = _load_json(DETAILS_PATH)

print("Loaded:", len(descriptions), "descriptions and", len(details), "details")

## 3. Join each description with its metadata, matched on 'name'
def _join_if_list(value):
    """Return a list as one comma-separated string; pass anything else through."""
    return ", ".join(value) if isinstance(value, list) else value


# The lookup key is the name stripped of surrounding whitespace and lowercased.
# Entries without a string name are skipped instead of raising; when two
# entries share a key, the later one overwrites the earlier one.
details_map = {
    d["name"].strip().lower(): d
    for d in details
    if isinstance(d.get("name"), str)
}

records = []
for song_id, d in enumerate(descriptions, start=1):
    raw_name = d.get("name")
    # A missing or null name becomes "" so the record is still emitted.
    name = raw_name.strip() if isinstance(raw_name, str) else ""
    meta = details_map.get(name.lower(), {})  # no match -> empty metadata

    records.append({
        "id": song_id,  # 1-based position in the descriptions list
        "name": name,
        "description": d.get("description", ""),
        "metadata": {
            "artist": _join_if_list(meta.get("artist")),
            "release_date": meta.get("release_date"),
            "duration": meta.get("duration"),
            "tempo": meta.get("tempo"),
            "valence": meta.get("valence"),
            "danceability": meta.get("danceability"),
            "producers": _join_if_list(meta.get("producers")),
            "lyrics": meta.get("lyrics"),
            "tags": meta.get("tags", []),
            "summary": meta.get("summary"),
            "emotion": meta.get("emotion"),
            "context": meta.get("context"),
        },
    })

# Guarded so that an empty descriptions list does not raise IndexError here.
print("DEBUG first record:", records[0] if records else None)

## 4. Compute an embedding for every song description
model = SentenceTransformer("all-MiniLM-L6-v2")

batch_size = 64
# Encode the descriptions chunk by chunk and store each resulting vector on
# its record as a plain Python list under the "vector" key.
for start in range(0, len(records), batch_size):
    chunk = records[start:start + batch_size]
    vectors = model.encode(
        [rec["description"] for rec in chunk],
        show_progress_bar=False,
        normalize_embeddings=True,
    )
    # The slice holds the same dict objects as `records`, so this mutates them.
    for rec, vec in zip(chunk, vectors):
        rec["vector"] = vec.tolist()

print("Done embeddings. Example dim:", len(records[0]["vector"]))

## 5. Connect to Qdrant Cloud
import os

# SECURITY: credentials must not live in the notebook source. The API key is
# read from the QDRANT_API_KEY environment variable (for example populated
# from the platform's secret store before this cell runs). Any key that has
# ever been stored in plain text in a shared notebook should be treated as
# leaked and rotated.
QDRANT_URL = os.environ.get(
    "QDRANT_URL",
    "https://a47da1cb-9f4e-42f6-9de8-3df23de9d559.europe-west3-0.gcp.cloud.qdrant.io",
)
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
if not QDRANT_API_KEY:
    # Fail early with a clear message instead of an opaque auth error later.
    raise RuntimeError(
        "QDRANT_API_KEY is not set; export it (or load it from your secret "
        "store) before connecting to Qdrant."
    )

client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

## 6. Create the collection and upload the data
collection_name = "songs"
# The vector size is taken from the first record's embedding.
vector_size = len(records[0]["vector"])

# `recreate_collection` is deprecated in qdrant-client; perform the same
# drop-then-create sequence explicitly so the collection always starts empty.
if client.collection_exists(collection_name=collection_name):
    client.delete_collection(collection_name=collection_name)
client.create_collection(
    collection_name=collection_name,
    vectors_config=rest.VectorParams(size=vector_size, distance=rest.Distance.COSINE),
)
print(f"Collection '{collection_name}' created with vector size = {vector_size}")

batch_size_upsert = 64
points_batch = []
# Turn every record into a Qdrant point and send the points in fixed-size batches.
for rec in tqdm(records, desc="Preparing points"):
    meta = rec["metadata"]

    # Normalise tags to a list: keep a list as is, wrap any other truthy
    # value in a one-element list, and fall back to an empty list otherwise.
    raw_tags = meta.get("tags")
    if isinstance(raw_tags, list):
        tags = raw_tags
    elif raw_tags:
        tags = [raw_tags]
    else:
        tags = []

    payload = {
        "name": rec["name"],
        "description": rec["description"],
        "artist": meta.get("artist"),
        "tags": tags,
    }
    # Remaining fields are copied from the metadata unchanged (None if absent).
    payload.update(
        (key, meta.get(key))
        for key in (
            "release_date",
            "duration",
            "tempo",
            "valence",
            "danceability",
            "producers",
            "lyrics",
            "summary",
            "emotion",
            "context",
        )
    )

    points_batch.append(
        rest.PointStruct(id=rec["id"], vector=rec["vector"], payload=payload)
    )

    # Keep accumulating until a full batch is ready, then flush it.
    if len(points_batch) < batch_size_upsert:
        continue
    client.upsert(collection_name=collection_name, points=points_batch)
    points_batch = []

# Send whatever is left over after the last full batch.
if points_batch:
    client.upsert(collection_name=collection_name, points=points_batch)

print("Upload finished. Total points:", len(records))

## 7. Create keyword indexes for the filter fields (artist, tags, emotion, context)
indexed_fields = ("artist", "tags", "emotion", "context")
for field_name in indexed_fields:
    # Each field gets a KEYWORD payload index on the collection.
    client.create_payload_index(
        collection_name=collection_name,
        field_name=field_name,
        field_schema=rest.PayloadSchemaType.KEYWORD,
    )
print("Indexes created for artist, tags, emotion, context.")

## 8. Read the user's query and display the results
query_text = input("Nhập mô tả bài hát (ví dụ: 'romantic piano music'): ").strip()
encoded_query = model.encode([query_text], normalize_embeddings=True)
query_vector = encoded_query[0].tolist()

# Optional filters; an empty answer means "do not filter on this field".
artist_filter = input("Nhập artist cần filter (Enter nếu bỏ qua): ").strip()
tags_filter = input("Nhập tags (cách nhau bằng dấu phẩy, Enter nếu bỏ qua): ").strip()

must_conditions = []
if artist_filter:
    artist_condition = rest.FieldCondition(
        key="artist",
        match=rest.MatchValue(value=artist_filter),
    )
    must_conditions.append(artist_condition)

# Split the comma-separated tags, dropping blank entries.
tags_list = [tag for tag in map(str.strip, tags_filter.split(",")) if tag]
if tags_list:
    tags_condition = rest.FieldCondition(
        key="tags",
        match=rest.MatchAny(any=tags_list),
    )
    must_conditions.append(tags_condition)

# Only pass a filter object when at least one condition was requested.
if must_conditions:
    query_filter = rest.Filter(must=must_conditions)
else:
    query_filter = None

res = client.query_points(
    collection_name=collection_name,
    query=query_vector,
    limit=5,
    query_filter=query_filter,
    with_payload=True,
)

# Convert the hits into a DataFrame for a readable tabular display.
data = [
    {
        "Name": hit.payload.get("name"),
        "Artist": hit.payload.get("artist"),
        "Tags": ", ".join(hit.payload.get("tags", [])),
        "Release": hit.payload.get("release_date"),
        "Emotion": hit.payload.get("emotion"),
        "Score": round(hit.score, 4),
    }
    for hit in res.points
]

df = pd.DataFrame(data)
print(df)



# ================================
# MOCK TEST 3 – KAGGLE PIPELINE LOGIC TEST
# ================================
# Smoke test of the uploaded collection: reads one stored point, re-embeds its
# description, checks the payload keys, and runs one unfiltered and one
# filtered search. Every assertion runs before the value it guards is used, so
# a failure surfaces as an AssertionError with a message, not an IndexError.

print("=== MOCK TEST 3: KAGGLE VECTOR PIPELINE ===")

# Fetch one point straight from Qdrant (deliberately not from `records`).
scroll = client.scroll(
    collection_name=collection_name,
    limit=1,
    with_payload=True,
    with_vectors=True
)

assert len(scroll[0]) > 0, "scroll returned no points"
point = scroll[0][0]

print("\n[1] Loaded point from Qdrant")
print("Point ID:", point.id)

# EMBEDDING LOGIC
sample_desc = point.payload.get("description", "")
vec = model.encode([sample_desc], normalize_embeddings=True)[0]
has_nan = any(v != v for v in vec)  # NaN is the only value not equal to itself

print("\n[2] Embedding Test")
print("Vector dim:", len(vec))
print("Has NaN:", has_nan)

assert len(vec) == len(point.vector), "embedding size differs from stored vector"
assert not has_nan, "embedding contains NaN"

# QDRANT PAYLOAD STRUCTURE
print("\n[3] Payload Structure Test")
print("Payload keys:", point.payload.keys())

assert "name" in point.payload, "payload is missing 'name'"
assert "description" in point.payload, "payload is missing 'description'"

# SEARCH FLOW
print("\n[4] Search Flow Test")
query = "sad emotional song"
q_vec = model.encode([query], normalize_embeddings=True)[0].tolist()

res = client.query_points(
    collection_name=collection_name,
    query=q_vec,
    limit=3,
    with_payload=True
)

print("Returned results:", len(res.points))
# Check for results before indexing into them.
assert len(res.points) > 0, "search returned no results"
assert hasattr(res.points[0], "score"), "search result has no score"
print("Sample score:", res.points[0].score)

# FILTER LOGIC
print("\n[5] Filter Logic Test")
wanted_emotion = "melancholic"
res_filter = client.query_points(
    collection_name=collection_name,
    query=q_vec,
    limit=3,
    query_filter=rest.Filter(
        must=[
            rest.FieldCondition(
                key="emotion",
                match=rest.MatchValue(value=wanted_emotion)
            )
        ]
    ),
    with_payload=True
)

print("Filtered result count:", len(res_filter.points))
assert res_filter.points is not None
# Every returned point must satisfy the filter: its emotion is either the
# requested value or a list that contains it. Passes trivially on no results.
for hit in res_filter.points:
    emotion = (hit.payload or {}).get("emotion")
    assert emotion == wanted_emotion or (
        isinstance(emotion, list) and wanted_emotion in emotion
    ), f"filtered result has unexpected emotion: {emotion!r}"

print("\nMOCK TEST 3 PASSED – KAGGLE PIPELINE OK")


Collecting qdrant-client
  Downloading qdrant_client-1.16.2-py3-none-any.whl.metadata (11 kB)
Collecting portalocker<4.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading qdrant_client-1.16.2-py3-none-any.whl (377 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m377.2/377.2 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, qdrant-client
Successfully installed portalocker-3.2.0 qdrant-client-1.16.2
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-

2026-02-01 09:23:28.077084: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769937808.446507      38 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769937808.542352      38 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loaded: 9665 descriptions and 9665 details
DEBUG first record: {'id': 1, 'name': 'VOY A LLeVARTE PA PR', 'description': "'VOY A LLeVARTE PA PR' is a reggaeton, perreo, latin urban, party track around 96 BPM. It feels enthusiasm, fiery, party. A lively and energetic reggaeton song, Bad Bunny invites listeners to experience the party atmosphere and perreo in Puerto Rico. 'VOY A LLeVARTE PA PR' is an invitation to experience the cultural party and reggaeton typical of Puerto Rico, with catchy beats and dance floor energy.", 'metadata': {'artist': 'Bad Bunny', 'release_date': None, 'duration': 204000, 'tempo': 105, 'valence': 0.5, 'danceability': 0.638, 'producers': None, 'lyrics': '23 Contributors Translations Deutsch English Français Русский (Russian) Türkçe Português Italiano VOY A LLeVARTE PA PR Lyrics “VOY A LLeVARTE PA PR” sirve como el primer reggaetón presentado en el álbum, en el que Benito realiza una invitación a una mujer con la que se quiere involucrar a visitar Puerto Rico co

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Done embeddings. Example dim: 384


  client.recreate_collection(


Collection 'songs' created with vector size = 384


Preparing points: 100%|██████████| 9665/9665 [00:26<00:00, 360.14it/s]


Upload finished. Total points: 9665
Indexes created for artist, tags, emotion, context.


Nhập mô tả bài hát (ví dụ: 'romantic piano music'):  love song


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Nhập artist cần filter (Enter nếu bỏ qua):  
Nhập tags (cách nhau bằng dấu phẩy, Enter nếu bỏ qua):  


                     Name          Artist  \
0              Love Songs          Clairo   
1               Love song  Sara Bareilles   
2            Love For You     LOVELI LORI   
3            Love Splash!             Joy   
4  fall in love with you.    Montell Fish   

                                                Tags Release  \
0     cover, Love, female vocalists, Olivia, in love    None   
1  pop, female vocalists, piano, sara bareilles, ...    None   
2         dance-pop, dream pop, jersey club, pluggnb    None   
3                                                pop    None   
4                                                pop    None   

                            Emotion   Score  
0  romantic, melancholic, nostalgic  0.7090  
1                          romantic  0.6347  
2               romantic, nostalgic  0.6074  
3                          romantic  0.6007  
4                          romantic  0.5903  
=== MOCK TEST 3: KAGGLE VECTOR PIPELINE ===

[1] Loaded point from Q

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[2] Embedding Test
Vector dim: 384
Has NaN: False

[3] Payload Structure Test
Payload keys: dict_keys(['name', 'description', 'artist', 'tags', 'release_date', 'duration', 'tempo', 'valence', 'danceability', 'producers', 'lyrics', 'summary', 'emotion', 'context'])

[4] Search Flow Test


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Returned results: 3
Sample score: 0.6263037

[5] Filter Logic Test
Filtered result count: 3

MOCK TEST 3 PASSED – KAGGLE PIPELINE OK
