# Baseline Approach v1.0: GTE Embeddings with Sentence Chunking

Hit@5 Score: 0.10

This notebook implements the initial RAG pipeline using GTE embeddings with sentence-level chunking.

In [None]:
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
import faiss
import re

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print(f'Device: {device}')

In [None]:
# Load the evaluation questions and the website corpus from the working
# directory, then report how many rows each table holds.
questions = pd.read_csv('questions_clean.csv')
websites = pd.read_csv('websites_update.csv')
for _label, _frame in (('Questions', questions), ('Websites', websites)):
    print(f'{_label}: {len(_frame)}')

In [None]:
def chunk_by_sentences(text, max_chars=1000):
    """Greedily pack the sentences of *text* into chunks of bounded length.

    Sentences are detected by terminal punctuation (``.``, ``!`` or ``?``)
    followed by whitespace. The punctuation stays attached to its sentence,
    and tokens such as ``3.14`` or ``example.com`` are not split because no
    whitespace follows the dot.

    Args:
        text: The string to split.
        max_chars: Target upper bound on the length of each chunk, counting
            the single spaces used to join sentences.

    Returns:
        A list of non-empty chunk strings in document order. A single
        sentence longer than ``max_chars`` is kept whole as its own chunk
        rather than being cut mid-sentence, so only such chunks can exceed
        the limit. Empty or whitespace-only input yields ``[]``.
    """
    # Split *after* the terminator (look-behind) so it is not discarded.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = []
    current_length = 0  # length of ' '.join(current_chunk)

    for sent in sentences:
        sent = sent.strip()
        if not sent:
            continue
        # Account for the joining space when the chunk already has content,
        # otherwise the joined chunk can overshoot max_chars.
        added = len(sent) + (1 if current_chunk else 0)
        if current_chunk and current_length + added > max_chars:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sent]
            current_length = len(sent)
        else:
            current_chunk.append(sent)
            current_length += added

    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

# Build two parallel lists: doc_texts[i] is a chunk of page text and
# doc_ids[i] is the web_id of the page that chunk came from.
doc_ids = []
doc_texts = []
for idx, row in websites.iterrows():
    # NOTE(review): the row labelled 0 is deliberately excluded here; the
    # reason is not visible in this notebook -- confirm it is still wanted.
    if idx == 0:
        continue
    text = row['text']
    # A missing text cell is read as NaN; str(NaN) would otherwise put the
    # literal chunk "nan" into the index for that page.
    if pd.isna(text):
        continue
    for chunk in chunk_by_sentences(str(text)):
        if chunk.strip():
            doc_ids.append(row['web_id'])
            doc_texts.append(chunk)

print(f'Total chunks: {len(doc_texts)}')

In [None]:
# Load the embedding model onto the selected device and encode every chunk
# into a float32 matrix with one row per chunk.
# NOTE(review): the published usage for this checkpoint passes
# trust_remote_code=True when loading -- confirm whether that is required in
# this environment before relying on this cell.
model = SentenceTransformer('Alibaba-NLP/gte-multilingual-base', device=device)
embeddings = model.encode(
    doc_texts,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
).astype('float32')
print(f'Embeddings shape: {embeddings.shape}')

In [None]:
# Index every chunk embedding in a flat L2 index sized to the embedding width.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Encode the questions with the same model, then fetch the 100 nearest
# chunks for each query.
query_embeddings = model.encode(
    questions['query'].tolist(),
    batch_size=64,
    convert_to_numpy=True,
)
query_embeddings = query_embeddings.astype('float32')
distances, indices = index.search(query_embeddings, 100)

print(f'Search completed. Shape: {indices.shape}')

In [None]:
# Map each query's five nearest chunks back to the web_id of their source
# page and write one row per question to the submission file.
doc_ids_array = np.array(doc_ids)
predictions = []
for i in range(len(questions)):
    top_5_chunks = indices[i][:5]
    # FAISS fills missing neighbours with -1 when the index holds fewer
    # vectors than requested; drop them so -1 does not wrap around to the
    # last chunk.
    top_5_chunks = top_5_chunks[top_5_chunks >= 0]
    # .tolist() yields native Python values. Without it, str() of a list of
    # NumPy scalars renders as "[np.int64(1), ...]" on NumPy >= 2 and
    # corrupts the web_list column.
    top_5_web_ids = doc_ids_array[top_5_chunks].tolist()
    # NOTE(review): several of the top chunks can belong to the same page, so
    # this list may contain repeated web_ids -- confirm whether the metric
    # expects distinct pages.
    predictions.append(top_5_web_ids)

# NOTE(review): q_id is generated as 1..N in row order; this assumes the
# questions file is ordered that way -- verify against its own id column.
results = pd.DataFrame({
    'q_id': range(1, len(predictions) + 1),
    'web_list': [str(p) for p in predictions]
})

results.to_csv('submission_baseline_v1.csv', index=False)
print('Submission saved')