# Semantic Retrieval Model Pipeline

Notebook ini membangun model semantic retrieval berbasis FAISS dan SentenceTransformer dari dua dataset MBPP (real & clone).

**Alur utama:**
1. **Data Loading:** Gabungkan dua dataset JSON ke DataFrame.
2. **Embedding:** Buat embedding prompt dengan ensemble tiga model multilingual (prompt berbahasa Indonesia diterjemahkan ke bahasa Inggris terlebih dahulu).
3. **Indexing:** Normalisasi embedding dan buat FAISS IndexFlatIP untuk similarity search.
4. **Model Class:** Bungkus DataFrame, index, dan model ke dalam class `SemanticRetrievalModel`.
5. **Simpan Model:** Simpan retrieval model ke file `semantic_retrieval_model.pkl`.
6. **Evaluasi:** Pipeline evaluasi (precision, recall, f1, accuracy) dan hyperparameter tuning.
7. **Contoh Penggunaan:** Cara menggunakan model hasil PKL.

---

## Penjelasan Komponen Penting
- **FAISS IndexFlatIP:**
  - Index similarity berbasis inner product (dot product) yang efisien untuk pencarian embedding.
- **Normalisasi Embedding:**
  - Membuat vektor embedding menjadi unit norm agar inner product setara dengan cosine similarity.
- **SentenceTransformer:**
  - Model pre-trained untuk menghasilkan embedding kalimat multibahasa.
- **joblib:**
  - Untuk serialisasi (save/load) model Python ke file PKL.
- **search(query, top_k):**
  - Fungsi utama untuk mencari top-k prompt paling mirip secara semantik.
- **Pipeline Evaluasi:**
  - Mengukur performa retrieval dengan metrik klasifikasi (f1, precision, recall, accuracy).
- **Hyperparameter tuning:**
  - Mencari nilai top_k terbaik untuk retrieval.


In [None]:
# Install required packages
%pip install sentence-transformers faiss-cpu joblib sentencepiece transformers langdetect

In [None]:
# 1. Load and merge datasets with deduplication and normalization
import pandas as pd
import json
import re
from langdetect import detect, LangDetectException

# Read the "real" and "clone" MBPP JSON files into memory, one after the other.
with open('mbpp_real.json', encoding='utf-8') as real_file:
    real_data = json.loads(real_file.read())
with open('mbpp_clone.json', encoding='utf-8') as clone_file:
    clone_data = json.loads(clone_file.read())

def to_dataframe(data):
    """Build a DataFrame from decoded JSON.

    A list is handed to ``pd.DataFrame`` unchanged; for a dict only its
    values are used (the keys are discarded).

    Raises:
        ValueError: if ``data`` is neither a list nor a dict.
    """
    if isinstance(data, dict):
        data = list(data.values())
    elif not isinstance(data, list):
        raise ValueError('Unknown data format')
    return pd.DataFrame(data)

def normalize_text(text):
    """Lowercase ``text``, trim both ends, and collapse each whitespace run to one space."""
    trimmed = text.lower().strip()
    return re.sub(r'\s+', ' ', trimmed)

# Keep only the prompt/code columns of each source, stack them, normalise the
# prompt text, and keep the first row for every distinct normalised prompt.
df_real, df_clone = (
    to_dataframe(raw)[['prompt', 'code']] for raw in (real_data, clone_data)
)
df = (
    pd.concat([df_real, df_clone], ignore_index=True)
    .assign(prompt=lambda frame: frame['prompt'].apply(normalize_text))
    .drop_duplicates(subset='prompt')
    .reset_index(drop=True)
)
print(f'Loaded {len(df)} unique prompts after deduplication.')

In [None]:
# Load DataFrame & embeddings langsung dari JSON (tanpa generate ulang)
import pandas as pd
import numpy as np
import json

with open('mbpp_all_with_embedding.json', encoding='utf-8') as cache_file:
    data = json.loads(cache_file.read())
df = pd.DataFrame(data)
# The 'embedding' column is expected to exist and to hold one vector (a list)
# per row, so stacking the cells gives a 2-D array.
embeddings = np.array(list(df['embedding']))
print(f"Loaded {len(df)} rows, embedding shape: {embeddings.shape}")

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# 2a. Load three best embedding models and translation pipeline (with language detection)
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import torch
import gc

# Instantiate the three sentence encoders used by the ensemble, in this order.
model1, model2, model3 = (
    SentenceTransformer(checkpoint)
    for checkpoint in (
        'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
        'sentence-transformers/LaBSE',
        'intfloat/multilingual-e5-base',
    )
)
# Indonesian -> English translation pipeline; device 0 is used when CUDA is
# available, otherwise -1.
translator = pipeline(
    task='translation',
    model='Helsinki-NLP/opus-mt-id-en',
    device=0 if torch.cuda.is_available() else -1,
)
gc.collect()

In [None]:
# 2. Generate prompt embeddings (ensemble 3 model + auto-translate Indo->En)
import numpy as np

def translate_if_needed(text):
    """Return ``text`` run through ``translator`` when it is detected as Indonesian.

    Text detected as any other language is returned unchanged. If language
    detection raises ``LangDetectException`` the text is treated as English
    and therefore also returned unchanged.
    """
    try:
        is_indonesian = detect(text) == 'id'
    except LangDetectException:
        is_indonesian = False
    if not is_indonesian:
        return text
    translated = translator(text)
    return translated[0]['translation_text']

def get_ensemble_embedding(text):
    """Embed one text with all three encoders and join the results.

    The text first goes through ``translate_if_needed``; each of ``model1``,
    ``model2`` and ``model3`` then encodes it as a one-item batch, and the
    three outputs are concatenated along axis 1.
    """
    text_en = translate_if_needed(text)
    per_model = [
        encoder.encode([text_en], convert_to_numpy=True)
        for encoder in (model1, model2, model3)
    ]
    # The ensemble is a plain feature-wise concatenation, not an average.
    return np.concatenate(per_model, axis=1)

# Embed the prompts one at a time and stack the per-prompt rows into a single
# matrix whose row order follows `prompts`.
prompts = list(df['prompt'])
embeddings = np.vstack(list(map(get_ensemble_embedding, prompts)))

In [None]:
# Simpan embedding ke file JSON baru (gabungan real+clone+embedding)
import json
# Store each row's vector as a plain list so the frame can be written as JSON;
# rows of `embeddings` must be in the same order as the rows of `df`.
df['embedding'] = embeddings.tolist()
with open('mbpp_all_with_embedding.json', 'w', encoding='utf-8') as out_file:
    json.dump(
        df.to_dict(orient='records'),
        out_file,
        ensure_ascii=False,
        indent=2,
    )

In [None]:
# 3. Normalize embeddings and build FAISS index (ensemble)
import numpy as np
import faiss

# FAISS works on C-contiguous float32 matrices, while embeddings rebuilt from
# JSON lists come back as float64, so cast explicitly before indexing.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
# Scale every row to unit length so inner product equals cosine similarity.
# The lower bound on the norm keeps an all-zero row at zero instead of NaN.
embeddings = embeddings / np.maximum(
    np.linalg.norm(embeddings, axis=1, keepdims=True),
    np.finfo(np.float32).tiny,
)
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

In [None]:
# 4. Define SemanticRetrievalModel class (ensemble 3 model + auto-translate)
import joblib

class SemanticRetrievalModel:
    """Semantic prompt search backed by a vector index.

    Attributes (stored exactly as passed to the constructor):
        df: DataFrame with at least ``prompt`` and ``code`` columns; row
            position ``i`` must correspond to vector ``i`` of ``index``.
        index: object exposing ``search(queries, k)`` that returns a
            ``(scores, positions)`` pair, e.g. a ``faiss.IndexFlatIP``.
        embeddings: the embedding matrix; kept for reference only, it is not
            read by ``search``.
        encoder_func: callable turning one query string into an embedding
            array (one row per query).
        best_k: default number of hits returned by ``search``.

    NOTE(review): when an instance is pickled, a plain function passed as
    ``encoder_func`` is stored by reference, so the same function must be
    defined/importable in the session that loads the pickle.
    """

    def __init__(self, df, index, embeddings, encoder_func, best_k=5):
        self.df = df
        self.index = index
        self.embeddings = embeddings
        self.encoder_func = encoder_func
        self.best_k = best_k

    def search(self, query: str, top_k: int = None):
        """Return the most similar prompts for ``query``.

        Args:
            query: text to look up.
            top_k: number of hits wanted; ``None`` means ``self.best_k``.

        Returns:
            DataFrame with columns ``prompt``, ``score`` and ``code``. Rows
            keep their labels from ``self.df`` and follow the order returned
            by the index. Fewer than ``top_k`` rows are returned when the
            index holds fewer vectors.

        Raises:
            ValueError: if ``top_k`` is smaller than 1.
        """
        if top_k is None:
            top_k = self.best_k
        top_k = int(top_k)
        if top_k < 1:
            raise ValueError(f'top_k must be >= 1, got {top_k}')

        # Never request more neighbours than the index holds (when it reports
        # its size); an empty index yields an empty result.
        ntotal = getattr(self.index, 'ntotal', None)
        if ntotal is not None:
            top_k = min(top_k, int(ntotal))
        if top_k == 0:
            empty = self.df.iloc[0:0].copy()
            empty['score'] = np.empty(0, dtype=np.float32)
            return empty[['prompt', 'score', 'code']]

        emb = np.asarray(self.encoder_func(query), dtype=np.float32)
        if emb.ndim == 1:
            emb = emb.reshape(1, -1)
        # Unit-normalise the query; a zero vector is left as-is instead of
        # being turned into NaN by a division by zero.
        norm = np.linalg.norm(emb, axis=1, keepdims=True)
        emb = np.ascontiguousarray(emb / np.where(norm > 0, norm, 1.0), dtype=np.float32)

        D, I = self.index.search(emb, top_k)
        scores = np.asarray(D[0])
        positions = np.asarray(I[0])
        # FAISS marks missing neighbours with position -1; without this filter
        # ``iloc[-1]`` would silently return the last row as a bogus hit.
        found = positions >= 0
        results = self.df.iloc[positions[found]].copy()
        results['score'] = scores[found]
        return results[['prompt', 'score', 'code']]

# Bundle the DataFrame, index, embedding matrix and ensemble encoder into one
# object; best_k keeps the constructor's default.
retrieval_model = SemanticRetrievalModel(
    df=df,
    index=index,
    embeddings=embeddings,
    encoder_func=get_ensemble_embedding,
)

In [None]:



from tqdm import tqdm
import numpy as np
import random
import matplotlib.pyplot as plt

N = 3  # number of neighbours kept per query; adjust as needed
relevance_dict = {}     # query label -> labels of its top-N neighbours
similarity_scores = []  # mean neighbour score per query, in df row order
topN_prompts = []       # neighbour prompt texts per query, in df row order

for idx, query in tqdm(df['prompt'].items(), total=len(df)):
    # Ask for one extra hit because the query itself is stored in the index.
    hits = retrieval_model.search(query, top_k=N + 1)
    neighbours = list(zip(hits.index.tolist(), hits['score'].tolist()))
    # Drop the first occurrence of the query's own row, if it was returned.
    own_position = next(
        (pos for pos, (label, _) in enumerate(neighbours) if label == idx),
        None,
    )
    if own_position is not None:
        del neighbours[own_position]
    kept = neighbours[:N]
    kept_labels = [label for label, _ in kept]
    relevance_dict[idx] = kept_labels
    similarity_scores.append(np.mean([score for _, score in kept]))
    topN_prompts.append(df.loc[kept_labels, 'prompt'].tolist())

# Attach the results to the DataFrame for manual inspection.
df['relevant_indices'] = df.index.map(relevance_dict)
df['avg_topN_similarity'] = similarity_scores
df['topN_prompts'] = topN_prompts

# Print a few randomly chosen queries with their neighbours and mean score.
print('Contoh mapping relevansi (prompt, relevant prompts, avg similarity):')
sample_labels = random.sample(range(len(df)), min(5, len(df)))
for i in sample_labels:
    prompt = df.at[i, 'prompt']
    relevant_prompts = df.at[i, 'topN_prompts']
    avg_sim = df.at[i, 'avg_topN_similarity']
    print(f'Query: {prompt}\nRelevant: {relevant_prompts}\nAvg sim: {avg_sim:.3f}\n---')

# Histogram of the per-query mean similarity.
plt.figure(figsize=(7, 4))
plt.hist(df['avg_topN_similarity'], bins=20, color='royalblue', alpha=0.7)
plt.title('Distribution of Average Top-N Similarity per Query')
plt.xlabel('Average Top-N Similarity')
plt.ylabel('Frequency')
plt.grid(True, linestyle='--', alpha=0.5)
plt.show()

In [None]:
# --- Save DataFrame with relevance mapping to JSON (for fast reload, skip recompute) ---
import json

# Work on a copy so the conversions below do not touch `df` itself.
df_to_save = df.copy()
# Make the values JSON-native: neighbour labels become built-in ints and the
# mean similarity a float column. `topN_prompts` is written as it is.
df_to_save['relevant_indices'] = df_to_save['relevant_indices'].apply(
    lambda labels: [int(label) for label in labels]
)
df_to_save['avg_topN_similarity'] = df_to_save['avg_topN_similarity'].astype(float)

with open('mbpp_all_with_embedding_and_relevance.json', 'w', encoding='utf-8') as out_file:
    json.dump(
        df_to_save.to_dict(orient='records'),
        out_file,
        ensure_ascii=False,
        indent=2,
    )

print('Saved DataFrame with relevance mapping to mbpp_all_with_embedding_and_relevance.json')

In [None]:
# --- Enhanced Evaluation: Retrieval Metrics, Error Analysis, and Diversity ---
if 'relevance_dict' not in globals():
    raise RuntimeError('Jalankan cell Automatic Relevance Mapping (cell sebelumnya) terlebih dahulu!')

from sklearn.metrics import f1_score, precision_score, recall_score
from collections import Counter

# NOTE(review): relevance_dict was produced by this same retriever, so the
# scores below measure self-consistency, not accuracy against independent
# labels. Every query is also labelled positive (y_true is all ones), which
# makes F1 a function of the hit rate only.
best_f1 = 0
best_k = 1
results_dict = {}
error_cases = []       # one entry per missed query, tagged with its top_k
diversity_scores = []  # number of distinct hits per (top_k, query) pair
for top_k in range(1, 11):
    y_true = []
    y_pred = []
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        query = row['prompt']
        relevant_indices = relevance_dict[idx]
        # The query itself is stored in the index and was removed from its own
        # relevance list, so fetch one extra hit and drop the query's row.
        # Otherwise top_k=1 could only ever return the query and always miss.
        results = retrieval_model.search(query, top_k=top_k + 1)
        result_indices = [i for i in results.index.tolist() if i != idx][:top_k]
        match = not set(relevant_indices).isdisjoint(result_indices)
        y_true.append(1)
        y_pred.append(1 if match else 0)
        # Error analysis: log false negatives
        if not match:
            error_cases.append({
                'top_k': top_k,
                'query': query,
                'relevant': [df.loc[i, 'prompt'] for i in relevant_indices],
                'retrieved': [df.loc[i, 'prompt'] for i in result_indices],
            })
        # Diversity: unique prompt count in retrieval
        diversity_scores.append(len(set(result_indices)))
    f1 = f1_score(y_true, y_pred)
    results_dict[top_k] = f1
    # Strict comparison keeps the smallest top_k among equal F1 scores.
    if f1 > best_f1:
        best_f1 = f1
        best_k = top_k
print(f'Best top_k: {best_k} with F1-score: {best_f1:.4f}')
print('F1-scores by top_k:', results_dict)
print(f'Average retrieval diversity (unique prompts per query): {np.mean(diversity_scores):.2f}')
if error_cases:
    print('Example error case:')
    print('Query:', error_cases[0]['query'])
    print('Relevant:', error_cases[0]['relevant'])
    print('Retrieved:', error_cases[0]['retrieved'])

In [None]:
# 5. Save the model (with best_k from tuning)
# best_k must already be defined and tqdm must be importable before this cell
# is run; both are checked below so a missing prerequisite fails with a hint.
try:
    _ = best_k
except NameError as err:
    # Chain explicitly so the traceback shows the original NameError as cause.
    raise RuntimeError('Jalankan cell evaluasi hyperparameter (yang mendefinisikan best_k) terlebih dahulu!') from err
try:
    from tqdm import tqdm
except ImportError as err:
    raise ImportError('tqdm belum terinstall. Jalankan !pip install tqdm atau %pip install tqdm') from err

retrieval_model.best_k = best_k
import joblib
# NOTE(review): this pickles the whole object, including the index and a
# reference to the encoder function; confirm the file loads in a fresh session.
joblib.dump(retrieval_model, 'semantic_retrieval_model.pkl')


In [None]:
# 6. Load and test the saved model with a user query
import joblib

# The PKL path must match the location of the file written by the dump step.
loaded_model = joblib.load('semantic_retrieval_model.pkl')
user_query = "find the maximum value in a list"
results = loaded_model.search(query=user_query, top_k=3)
print(results)


In [None]:
# Reload the saved model and query it with an Indonesian sentence.
loaded_model = joblib.load('semantic_retrieval_model.pkl')
user_query = "Temukan nilai maksimum dalam sebuah list"
results = loaded_model.search(query=user_query, top_k=3)
print(results)
