In [1]:
# @title Phase 0 (Corrected, v4): Universal Language Loading
# -----------------------------------------------------------------------------
# This version incorporates the new requirement to load ALL documents,
# including English and other languages, to create a comprehensive knowledge base.
# We have removed all path-based language filtering.
# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
# 1. INSTALL DEPENDENCIES
# -----------------------------------------------------------------------------
print("STEP 1: Installing all required libraries...")
# Using -q for a quieter installation in Colab
print("Installing core ML/NLP libraries...")
!pip install -q -U transformers sentence-transformers

print("Installing vector database and search libraries...")
!pip install -q faiss-cpu

print("Installing the OpenAI library for API interaction...")
!pip install -q openai

print("Installing utility libraries for data processing...")
# We do not need langdetect
!pip install -q markdown beautifulsoup4 jieba pandas tqdm

print("\n‚úÖ All dependencies installed successfully!")

# -----------------------------------------------------------------------------
# 2. CONFIGURE API KEY AND CLIENT
# -----------------------------------------------------------------------------
print("\nSTEP 2: Configuring DeepSeek Client...")
import os
from google.colab import userdata
from openai import OpenAI

try:
    DEEPSEEK_API_KEY = userdata.get('DEEPSEEK_API_KEY')
    print("‚úÖ Successfully loaded DEEPSEEK_API_KEY from Colab secrets.")

    client = OpenAI(
        api_key=DEEPSEEK_API_KEY,
        base_url="https://api.deepseek.com/v1"
    )
    print("‚úÖ DeepSeek client initialized successfully via OpenAI compatibility layer.")

except (ImportError, userdata.SecretNotFoundError):
    print("üõë Error: Secret 'DEEPSEEK_API_KEY' not found.")
    print("Please ensure you have added your API key to Colab's secrets manager.")
    client = None
except Exception as e:
    print(f"üõë An error occurred during client initialization: {e}")
    client = None

# -----------------------------------------------------------------------------
# 3. CLONE THE DEEPIN WIKI REPOSITORY
# -----------------------------------------------------------------------------
print("\nSTEP 3: Cloning the Deepin Wiki repository...")
repo_url = "https://github.com/linuxdeepin/wiki.deepin.org"
local_repo_path = "deepin_wiki"

if not os.path.exists(local_repo_path):
    print(f"Cloning wiki repository from {repo_url}...")
    !git clone {repo_url} {local_repo_path}
    print("\n‚úÖ Repository cloned successfully.")
else:
    print(f"‚úÖ Repository already exists at '{local_repo_path}'. Skipping clone.")

# -----------------------------------------------------------------------------
# 4. LOAD ALL DOCUMENTS (UNIVERSAL METHOD)
# -----------------------------------------------------------------------------
print("\nSTEP 4: Loading ALL documents from the repository...")
from tqdm.notebook import tqdm
import pandas as pd

def load_all_documents(root_path):
    raw_documents = []
    base_url = "https://wiki.deepin.org/"

    file_paths = []
    for dirpath, _, filenames in os.walk(root_path):
        # We should also exclude git metadata files
        if '.git' in dirpath:
            continue
        for filename in filenames:
            if filename.endswith(".md"):
                file_paths.append(os.path.join(dirpath, filename))

    print(f"Found {len(file_paths)} total .md files. Loading all of them...")

    for file_path in tqdm(file_paths, desc="Processing files"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # We still keep the minimum length check to avoid empty/stub files.
            if len(content.strip()) < 50:
                continue

            # **CHANGE**: The language-based path filtering has been completely removed.
            # Every valid markdown file will now be loaded.

            relative_path = os.path.relpath(file_path, root_path).replace('\\', '/')
            if relative_path.endswith('.md'):
                relative_path = relative_path[:-3]
            if os.path.basename(relative_path) == 'README':
                relative_path = os.path.dirname(relative_path)

            url = f"{base_url}{relative_path}" if relative_path != '.' else base_url

            raw_documents.append({
                "content": content,
                "meta": {
                    "source_path": file_path,
                    "url": url,
                    "page_title": os.path.basename(relative_path) or "Home"
                }
            })
        except Exception as e:
            print(f"Skipping file due to error processing {file_path}: {e}")

    return raw_documents

# --- Execution ---
raw_docs = load_all_documents(local_repo_path)
print(f"\n‚úÖ Processing complete. Loaded {len(raw_docs)} documents into our knowledge base.")

# Display a sample of the loaded data
if raw_docs:
    print("\n--- Sample Document ---")
    # Find a non-empty doc for a better sample
    sample_doc = next((doc for doc in raw_docs if doc['content']), raw_docs[0])
    print(f"Page Title: {sample_doc['meta']['page_title']}")
    print(f"URL: {sample_doc['meta']['url']}")
    print(f"Content Preview:\n{sample_doc['content'][:400]}...")



STEP 1: Installing all required libraries...
Installing core ML/NLP libraries...
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m40.1/40.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m11.6/11.6 MB[0m [31m88.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m486.6/486.6 kB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling vector database and search libraries...
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m31.4/31.4 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling the OpenAI library for API interaction...
Inst

Processing files:   0%|          | 0/1168 [00:00<?, ?it/s]


‚úÖ Processing complete. Loaded 1168 documents into our knowledge base.

--- Sample Document ---
Page Title: home
URL: https://wiki.deepin.org/home
Content Preview:
---
title: deepin Wiki - Ê∑±Â∫¶ÁôæÁßë
description: deepinÔºàÊ∑±Â∫¶ÔºâÁ≥ªÁªü‰ª•ÂèäÁ§æÂå∫ÁöÑÁÆÄÁü≠‰ªãÁªç
published: true
date: 2025-02-28T07:30:27.927Z
tags: deepinÂéÜÂè≤, deepinÁ§æÂå∫, deepinÁ≥ªÁªü, deepinÁªÑÁªáÁªìÊûÑ, deepinË¥°ÁåÆ
editor: markdown
dateCreated: 2022-06-08T09:01:18.650Z
---

![deepin_logo_1.png](/deepin_logo_2.png)

# ‰∏Ä„ÄÅÁ§æÂå∫ÁÆÄ‰ªã

## 1.1 deepinÔºàÊ∑±Â∫¶ÔºâÁ§æÂå∫
deepinÁ§æÂå∫Ëã±ÊñáÂêçÁß∞Ôºödeepin communityÔºà‰ª•‰∏ãÁÆÄÁß∞‚ÄúÁ§æÂå∫‚ÄùÔºâÊòØÁî±‰ºÅ‰∏öÂçï‰Ωç„ÄÅ‰∫ã‰∏öÂçï‰Ωç„ÄÅÁ§æ‰ºöÂõ¢‰Ωì„ÄÅ‰∏™‰∫∫ÂºÄÂèëËÄÖÁ≠âÔºåÂü∫‰∫éÂÖ±Âª∫„ÄÅÂÖ±Ê≤ª„ÄÅÂÖ±‰∫´ÂéüÂàôÁªÑÂª∫ÁöÑÂºÄÊ∫êÁ§æÂå∫„ÄÇ

## 1.2 Á§æÂå∫ÂéÜÂè≤
- ...


In [2]:
# @title GPU Verification Cell
import torch

print("--- Verifying GPU Access ---")

# Check 1: Is a GPU visible to the underlying system?
# This command talks directly to the NVIDIA drivers.
!nvidia-smi

print("\n--- Verifying GPU Access for PyTorch ---")

# Check 2: Can the PyTorch library see and use the GPU?
is_available = torch.cuda.is_available()
print(f"Is CUDA (GPU) available for PyTorch? -> {is_available}")

if is_available:
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
else:
    print("WARNING: PyTorch cannot find the GPU. Do not proceed. Try another Factory Reset.")


--- Verifying GPU Access ---
Mon Sep 22 12:32:12 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   52C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                   

In [3]:
# @title Phase 0.5: Split Documents into Chunks
# -----------------------------------------------------------------------------
# 1. IMPORT THE NECESSARY TEXT SPLITTER
# -----------------------------------------------------------------------------
print("STEP 1: Importing text splitting libraries...")
# We need to install the library that contains the text splitter
!pip install -q langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter
print("‚úÖ Text splitting libraries imported.")

# -----------------------------------------------------------------------------
# 2. CONFIGURE THE TEXT SPLITTER
# -----------------------------------------------------------------------------
print("\nSTEP 2: Configuring the text splitter for Markdown...")
# RecursiveCharacterTextSplitter is excellent for code and markdown.
# It tries to split on markdown-native separators first (like headers, code blocks).
# - chunk_size: The maximum size of each chunk (in characters).
# - chunk_overlap: How many characters to overlap between chunks. This helps
#   maintain context so that a sentence isn't split awkwardly between two chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=700,
    chunk_overlap=100,
    # These are common markdown separators, in order of importance
    separators=["\n\n", "\n", " ", "", "„ÄÇ", "Ôºå", "„ÄÅ"]
)
print(f"‚úÖ Splitter configured with chunk_size={text_splitter._chunk_size} and chunk_overlap={text_splitter._chunk_overlap}.")

# -----------------------------------------------------------------------------
# 3. PROCESS THE DOCUMENTS AND CREATE SPLITS
# -----------------------------------------------------------------------------
print("\nSTEP 3: Splitting all loaded documents...")
all_splits = []
# Using tqdm for a progress bar
from tqdm.notebook import tqdm

# We iterate through each document we loaded in Phase 0
for doc in tqdm(raw_docs, desc="Splitting documents"):
    # The main splitting operation
    chunks = text_splitter.split_text(doc['content'])

    # For each chunk, we create a new entry with the original metadata
    for i, chunk_text in enumerate(chunks):
        all_splits.append({
            "content": chunk_text,
            "meta": {
                **doc['meta'], # Copy original metadata (URL, title, etc.)
                "chunk_num": i + 1 # Add the chunk number for reference
            }
        })

print(f"\n‚úÖ Splitting complete. Created {len(all_splits)} document splits from {len(raw_docs)} raw documents.")

# Display a sample of the created splits
if all_splits:
    print("\n--- Sample Document Split (Chunk) ---")
    sample_split = all_splits[10] # Show the 11th chunk as a sample
    print(f"Original Page Title: {sample_split['meta']['page_title']}")
    print(f"URL: {sample_split['meta']['url']}")
    print(f"Chunk Number: {sample_split['meta']['chunk_num']}")
    print(f"Chunk Content Preview:\n---\n{sample_split['content']}\n---")


STEP 1: Importing text splitting libraries...
‚úÖ Text splitting libraries imported.

STEP 2: Configuring the text splitter for Markdown...
‚úÖ Splitter configured with chunk_size=700 and chunk_overlap=100.

STEP 3: Splitting all loaded documents...


Splitting documents:   0%|          | 0/1168 [00:00<?, ?it/s]


‚úÖ Splitting complete. Created 7683 document splits from 1168 raw documents.

--- Sample Document Split (Chunk) ---
Original Page Title: home
URL: https://wiki.deepin.org/home
Chunk Number: 11
Chunk Content Preview:
---
## 5.3 ÂèÇ‰∏éÂÜÖÊµã
Âç≥‰æø‰Ω†‰∏çÂñÑ‰∫éÁ†îÂèëÂ∑•‰ΩúÔºå‰πüÊó†ÂøÉ‰∫éÊñáÊ°£Âª∫ËÆæÔºåÂè™Ë¶Å‰Ω†Âú®‰ΩøÁî®deepinÁ§æÂå∫ÁöÑ‰∫ßÂìÅÔºåÂ∞±ÂèØ‰ª•ÂèÇ‰∏éÊµãËØïË¥°ÁåÆÔºåÁõÆÂâçÂèÇ‰∏éÊµãËØïË¥°ÁåÆÁöÑÈÄîÂæÑÊúâÔºö
- Âä†ÂÖ•ÂÆòÊñπÂÜÖÊµãÁæ§ÔºåÈÄöËøáÂÜÖÊµãÁæ§Áõ¥Êé•ÂèçÈ¶àÈóÆÈ¢òÂíåÂª∫ËÆÆÔºõ
![image.png](https://wiki.deepin.org/06_%E5%85%B3%E4%BA%8EDeepin/img-20230906163238.png)
- Âä†ÂÖ•deepinËÆ∫ÂùõÔºåÂú®ËÆ∫ÂùõÂÜÖÁõ¥Êé•ÂèëÂ∏ñÊåáÂá∫ÈóÆÈ¢òÂíåÊèêÂá∫Âª∫ËÆÆÔºõ
- ÈÄöËøáÊ∑±Â∫¶‰πãÂÆ∂Â∫îÁî®ÂèçÈ¶àÈóÆÈ¢òÂíåÂª∫ËÆÆ Ôºõ

## 5.4 ÂèÇ‰∏éÁ§æÂå∫Ê¥ªÂä®
ÁõÆÂâçdeepinÁ§æÂå∫ÊúÄÈáçË¶ÅÁöÑÊ¥ªÂä®ÊòØ‰∏ÄÂπ¥‰∏ÄÂ∫¶ÁöÑ [DDUC](https://wiki.deepin.org/zh/06_%E5%85%B3%E4%BA%8EDeepin/Deepin%E6%B4%BB%E5%8A%A8/DDUC)ÔºàDeepin Developer&User ConferenceÔºâÂ§ß‰ºö„ÄÇÂêåÊó∂ÔºådeepinÁ§æÂå∫Â∑≤ÂùöÊåÅÂ§öÂπ¥ÔºåÂü∫Êú¨‰∏äÊØè‰∏™ÊúàÈÉΩÂú®Ê≠¶Ê±âÂú∞Âå

In [4]:
# @title Phase 1 (Corrected): Build the Hybrid Search Index
# -----------------------------------------------------------------------------
# This phase creates the two powerful search indices we need:
# 1. Dense Index (FAISS): Understands the *meaning* of the query.
# 2. Sparse Index (BM25): Finds exact *keyword* matches.
# -----------------------------------------------------------------------------
import numpy as np
import faiss
import jieba
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
import torch

# -----------------------------------------------------------------------------
# 1. SETUP EMBEDDING MODEL
# -----------------------------------------------------------------------------
print("STEP 1: Loading the sentence embedding model...")

# Check if GPU is available and set the device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# We'll use a powerful, open-source model optimized for Chinese text.
# BAAI/bge-large-zh-v1.5 is a top-tier choice.
model_name = 'BAAI/bge-large-zh-v1.5'
try:
    embedding_model = SentenceTransformer(model_name, device=device)
    print(f"‚úÖ Successfully loaded '{model_name}' model.")
except Exception as e:
    print(f"üõë Error loading model: {e}")
    embedding_model = None

# -----------------------------------------------------------------------------
# 2. BUILD THE DENSE INDEX (FAISS)
# -----------------------------------------------------------------------------
# This index stores the semantic vectors of all document chunks.

if embedding_model:
    print("\nSTEP 2: Building the Dense Vector Index (FAISS)...")

    # Extract the text content from all our splits
    corpus_content = [split['content'] for split in all_splits]

    print(f"Generating embeddings for {len(corpus_content)} chunks. This may take a few minutes...")

    # Generate embeddings in batches for efficiency
    # The model will automatically use the GPU if available.
    embeddings = embedding_model.encode(
        corpus_content,
        batch_size=32, # Adjust batch size based on Colab GPU memory
        show_progress_bar=True,
        normalize_embeddings=True # Normalizing is good practice for similarity search
    )

    print("\nEmbeddings generated successfully.")
    print(f"Shape of embeddings matrix: {embeddings.shape}")

    # Create the FAISS index
    embedding_dim = embeddings.shape[1]
    # We use IndexFlatL2, a standard and effective index for dense vectors.
    dense_index = faiss.IndexFlatL2(embedding_dim)

    # Add the generated vectors to the index
    dense_index.add(embeddings.astype('float32'))

    print(f"‚úÖ FAISS index built successfully. Total vectors in index: {dense_index.ntotal}")
else:
    print("\nSkipping Dense Index creation due to model loading failure.")
    dense_index = None

# -----------------------------------------------------------------------------
# 3. BUILD THE SPARSE INDEX (BM25)
# -----------------------------------------------------------------------------
# This index is for keyword-based search.
print("\nSTEP 3: Building the Sparse Keyword Index (BM25)...")

# --- FIX: INSTALL THE MISSING LIBRARY ---
print("Installing rank_bm25 library...")
!pip install -q rank_bm25

from rank_bm25 import BM25Okapi

# We need to tokenize the text for BM25. Jieba is a great Chinese tokenizer.
print("Tokenizing corpus for BM25 using jieba...")
tokenized_corpus = [list(jieba.cut(doc['content'])) for doc in tqdm(all_splits, desc="Tokenizing")]

# Create and train the BM25 index
sparse_index = BM25Okapi(tokenized_corpus)

print("‚úÖ BM25 index built successfully.")

# -----------------------------------------------------------------------------
# FINAL STATUS
# -----------------------------------------------------------------------------
print("\n--- üèÅ PHASE 1 COMPLETE ---")
if dense_index and sparse_index:
    print("‚úÖ Both Dense (FAISS) and Sparse (BM25) indices are built and ready for querying.")
else:
    print("üõë One or more indices failed to build. Please review the errors above.")


  re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
  re_skip_default = re.compile("(\r\n|\s)", re.U)
  re_skip = re.compile("([a-zA-Z0-9]+(?:\.\d+)?%?)")


STEP 1: Loading the sentence embedding model...
Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.30G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.30G [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

‚úÖ Successfully loaded 'BAAI/bge-large-zh-v1.5' model.

STEP 2: Building the Dense Vector Index (FAISS)...
Generating embeddings for 7683 chunks. This may take a few minutes...


Batches:   0%|          | 0/241 [00:00<?, ?it/s]


Embeddings generated successfully.
Shape of embeddings matrix: (7683, 1024)
‚úÖ FAISS index built successfully. Total vectors in index: 7683

STEP 3: Building the Sparse Keyword Index (BM25)...
Installing rank_bm25 library...
Tokenizing corpus for BM25 using jieba...


Tokenizing:   0%|          | 0/7683 [00:00<?, ?it/s]

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.829 seconds.
DEBUG:jieba:Loading model cost 0.829 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


‚úÖ BM25 index built successfully.

--- üèÅ PHASE 1 COMPLETE ---
‚úÖ Both Dense (FAISS) and Sparse (BM25) indices are built and ready for querying.


In [5]:
# @title Phase 2: Hybrid Retrieval and Reranking Pipeline
# -----------------------------------------------------------------------------
# This phase sets up the functions to perform a smart search.
# 1. Hybrid Search: Combines keyword and semantic search results.
# 2. Reranking: Uses a powerful model to refine the search results for maximum relevance.
# -----------------------------------------------------------------------------
from sentence_transformers import CrossEncoder
import torch

# -----------------------------------------------------------------------------
# 1. SETUP THE RERANKER MODEL
# -----------------------------------------------------------------------------
print("STEP 1: Loading the Reranker model...")

# Check for GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# BAAI/bge-reranker-large is a state-of-the-art model for this task.
# A reranker (or CrossEncoder) is different from the embedding model. It takes
# a query and a document and directly outputs a relevance score (e.g., 0.98).
# It's much more accurate than the initial search but too slow to run on all documents.
try:
    reranker_model = CrossEncoder('BAAI/bge-reranker-large', max_length=512, device=device)
    print("‚úÖ Successfully loaded 'BAAI/bge-reranker-large' model.")
except Exception as e:
    print(f"üõë Error loading reranker model: {e}")
    reranker_model = None

# -----------------------------------------------------------------------------
# 2. IMPLEMENT THE HYBRID SEARCH FUNCTION (with RRF)
# -----------------------------------------------------------------------------
print("\nSTEP 2: Defining the Hybrid Search function...")

def hybrid_search(query, dense_index, sparse_index, embedding_model, all_splits, k_dense=30, k_sparse=30, rrf_k=60):
    """
    Performs a hybrid search using both dense and sparse indices, then fuses the results.
    """
    # --- 1. Dense Search (FAISS) ---
    query_embedding = embedding_model.encode([query], normalize_embeddings=True)
    _, dense_indices = dense_index.search(query_embedding.astype('float32'), k_dense)
    dense_indices = dense_indices[0]

    # --- 2. Sparse Search (BM25) ---
    tokenized_query = list(jieba.cut(query))
    sparse_scores = sparse_index.get_scores(tokenized_query)
    # Get top k sparse results, but keep all scores for potential overlap
    sparse_indices_scores = sorted(enumerate(sparse_scores), key=lambda x: x[1], reverse=True)[:k_sparse]
    sparse_indices = [item[0] for item in sparse_indices_scores]

    # --- 3. Reciprocal Rank Fusion (RRF) ---
    # RRF is a simple and effective way to combine ranked lists without needing to tune weights.
    # It prioritizes documents that appear high up in *either* list.
    fused_scores = {}

    # Process dense results
    for rank, doc_id in enumerate(dense_indices):
        if doc_id not in fused_scores:
            fused_scores[doc_id] = 0
        fused_scores[doc_id] += 1 / (rrf_k + rank)

    # Process sparse results
    for rank, doc_id in enumerate(sparse_indices):
        if doc_id not in fused_scores:
            fused_scores[doc_id] = 0
        fused_scores[doc_id] += 1 / (rrf_k + rank)

    # Sort the fused results by their combined score
    reranked_results = sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)

    # Return just the document indices
    final_indices = [doc_id for doc_id, score in reranked_results]
    return final_indices

print("‚úÖ Hybrid Search function is ready.")

# -----------------------------------------------------------------------------
# 3. IMPLEMENT THE RERANKING FUNCTION
# -----------------------------------------------------------------------------
print("\nSTEP 3: Defining the Reranking function...")

def rerank_documents(query, retrieved_indices, all_splits, reranker_model, top_n=5):
    """
    Reranks the retrieved documents using a powerful CrossEncoder model.
    """
    if not reranker_model:
        print("‚ö†Ô∏è Reranker model not loaded. Returning top_n results from initial retrieval.")
        return [all_splits[i] for i in retrieved_indices[:top_n]]

    # Create pairs of [query, document_content] for the model
    pairs = []
    for doc_id in retrieved_indices:
        pairs.append([query, all_splits[doc_id]['content']])

    # Predict scores for all pairs. show_progress_bar is helpful for seeing progress.
    scores = reranker_model.predict(pairs, show_progress_bar=False)

    # Combine documents with their new scores
    scored_docs = []
    for i, doc_id in enumerate(retrieved_indices):
        doc = all_splits[doc_id]
        doc['rerank_score'] = scores[i]
        scored_docs.append(doc)

    # Sort documents by the new rerank score in descending order
    sorted_docs = sorted(scored_docs, key=lambda x: x['rerank_score'], reverse=True)

    # Return the top N documents
    return sorted_docs[:top_n]

print("‚úÖ Reranking function is ready.")

# -----------------------------------------------------------------------------
# 4. TEST THE FULL PIPELINE
# -----------------------------------------------------------------------------
print("\nSTEP 4: Running a test query through the full pipeline...")

# A sample query relevant to the Deepin Wiki
test_query = "Â¶Ç‰ΩïËÆæÁΩÆ‰ªªÂä°Ê†è"

# --- Execute the pipeline ---
# 1. Get initial candidates from our hybrid search
hybrid_results_indices = hybrid_search(
    test_query, dense_index, sparse_index, embedding_model, all_splits
)

# 2. Rerank these candidates to get the best final results
final_reranked_docs = rerank_documents(
    test_query, hybrid_results_indices, all_splits, reranker_model, top_n=5
)

print(f"\n--- ‚úÖ Test Complete. Top 5 Reranked Results for Query: '{test_query}' ---")

# --- Display the results nicely ---
if final_reranked_docs:
    for i, doc in enumerate(final_reranked_docs):
        print(f"\n--- Result {i+1} ---")
        print(f"Relevance Score: {doc['rerank_score']:.4f}")
        print(f"Source Page: {doc['meta']['page_title']}")
        print(f"URL: {doc['meta']['url']}")
        print(f"Chunk Content:\n---\n{doc['content']}\n---")
else:
    print("\nNo results found for the test query.")



STEP 1: Loading the Reranker model...
Using device: cuda


config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

‚úÖ Successfully loaded 'BAAI/bge-reranker-large' model.

STEP 2: Defining the Hybrid Search function...
‚úÖ Hybrid Search function is ready.

STEP 3: Defining the Reranking function...
‚úÖ Reranking function is ready.

STEP 4: Running a test query through the full pipeline...

--- ‚úÖ Test Complete. Top 5 Reranked Results for Query: 'Â¶Ç‰ΩïËÆæÁΩÆ‰ªªÂä°Ê†è' ---

--- Result 1 ---
Relevance Score: 0.9638
Source Page: Ê∑±Â∫¶Ê°åÈù¢‰ªãÁªç
URL: https://wiki.deepin.org/ÂæÖÂàÜÁ±ª/01_deepinÂÖ•Èó®/Ê∑±Â∫¶Ê°åÈù¢‰ªãÁªç
Chunk Content:
---
### ÂàáÊç¢ÊòæÁ§∫Ê®°Âºè

‰ªªÂä°Ê†èÊèê‰æõ‰∏§ÁßçÊòæÁ§∫Ê®°ÂºèÔºöÊó∂Â∞öÊ®°ÂºèÂíåÈ´òÊïàÊ®°Âºè„ÄÇ

- **Êó∂Â∞öÊ®°Âºè**Ôºö‰ªªÂä°Ê†èÁöÑÊòæÁ§∫È£éÊ†ºÁ±ª‰ºº‰∫éMac OSÔºå‰ª•ÊâòÁõòÁöÑÂΩ¢ÂºèÂÅúÈù†Âú®Â±èÂπï‰∏ãÊñπ„ÄÇ‰ªªÂä°Ê†è‰∏ä‰ºöÊòæÁ§∫ÊâÄÊúâÂõ∫ÂÆöÂú®‰ªªÂä°Ê†èÁöÑÂ∫îÁî®ÂõæÊ†á„ÄÇ
- **È´òÊïàÊ®°Âºè**Ôºö‰ªªÂä°Ê†èÁöÑÊòæÁ§∫È£éÊ†ºÁ±ª‰ºº‰∫éWindows 7Ôºå‰ª•Â∞èÈïøÊù°ÁöÑÂΩ¢ÂºèÊòæÁ§∫Âú®Â±èÂπï‰∏ãÊñπ„ÄÇÂõ∫ÂÆöÂú®‰ªªÂä°Ê†è‰∏äÁöÑÂ∫îÁî®ÂõæÊ†áÈªòËÆ§‰∏∫Â∞èÂõæÊ†áÊòæÁ§∫ÔºåËÄå‰∏îÁõ∏ÂêåÁ±ªÂûãÁöÑÂõæÊ†á‰ºöÂêàÂ

In [6]:
# @title Phase 3: Generation with LLM and Final Application
# -----------------------------------------------------------------------------
# This is the final phase where we generate a human-readable answer.
# 1. We take the top-ranked context from Phase 2.
# 2. We construct a precise prompt for the LLM.
# 3. We call the DeepSeek API to get the final answer, citing sources.
# -----------------------------------------------------------------------------
import os
from openai import OpenAI
from google.colab import userdata
from IPython.display import display, Markdown

# -----------------------------------------------------------------------------
# 1. SETUP THE LLM CLIENT (DEEPSEEK)
# -----------------------------------------------------------------------------
print("STEP 1: Setting up the DeepSeek LLM Client...")

# Securely get the API key from Colab secrets or environment variables
try:
    # For Google Colab
    DEEPSEEK_API_KEY = userdata.get('DEEPSEEK_API_KEY')
except (ImportError, userdata.SecretNotFoundError):
    # For local environment
    print("Could not find Colab userdata. Falling back to environment variables.")
    DEEPSEEK_API_KEY = os.environ.get('DEEPSEEK_API_KEY')

if not DEEPSEEK_API_KEY:
    print("üõë CRITICAL: DEEPSEEK_API_KEY not found.")
    print("Please add it to your Colab Secrets (under the üîë icon) or set it as an environment variable.")
    llm_client = None
else:
    try:
        llm_client = OpenAI(
            api_key=DEEPSEEK_API_KEY,
            base_url="https://api.deepseek.com/v1"
        )
        print("‚úÖ DeepSeek client initialized successfully.")
    except Exception as e:
        print(f"üõë Error initializing DeepSeek client: {e}")
        llm_client = None

# -----------------------------------------------------------------------------
# 2. DEFINE THE PROMPT TEMPLATE
# -----------------------------------------------------------------------------
print("\nSTEP 2: Creating the prompt template...")

# This prompt is engineered to force the LLM to use only the provided context
# and to cite its sources, which is crucial for a reliable RAG system.
PROMPT_TEMPLATE = """
**Role:** You are a Deepin OS technical support expert. Your sole purpose is to answer user questions accurately based on the official Deepin Wiki documentation provided.

**Instructions:**
1.  Carefully read the user's **Query** and the **Provided Context** below.
2.  Formulate a clear, concise, and helpful answer to the query.
3.  **Crucially, you must base your answer *exclusively* on the information within the Provided Context.** Do not use any prior knowledge or information from outside this context.
4.  After providing the answer, you **must** cite the sources you used from the context. List them under a "Sources:" heading. Use the format: `[Source X]: [Page Title] ([URL])`.
5.  If the Provided Context does not contain enough information to answer the query, you must respond with: "Êä±Ê≠âÔºåÊ†πÊçÆÊèê‰æõÁöÑÊñáÊ°£‰ø°ÊÅØÔºåÊàëÊó†Ê≥ïÂõûÁ≠îÊÇ®ÁöÑÈóÆÈ¢ò„ÄÇ" (Sorry, based on the provided document information, I cannot answer your question.) Do not try to guess.

---
**Provided Context:**
{context_string}
---
**User Query:** {query}
---
**Your Answer:**
"""

print("‚úÖ Prompt template is ready.")

# -----------------------------------------------------------------------------
# 3. CREATE THE FINAL RAG APPLICATION FUNCTION
# -----------------------------------------------------------------------------
print("\nSTEP 3: Building the final 'ask' function...")

def ask_deepin_wiki(query: str):
    """
    The main function that orchestrates the entire RAG pipeline.
    """
    if not llm_client:
        print("Cannot proceed: LLM client is not initialized.")
        return

    print(f"üîç Processing query: '{query}'")

    # 1. Hybrid Search (Phase 2)
    print("   - Step 1: Performing hybrid search...")
    hybrid_indices = hybrid_search(
        query, dense_index, sparse_index, embedding_model, all_splits, k_dense=50, k_sparse=50
    )

    # 2. Reranking (Phase 2)
    print("   - Step 2: Reranking search results...")
    final_docs = rerank_documents(
        query, hybrid_indices, all_splits, reranker_model, top_n=5
    )

    if not final_docs:
        print("   - ‚ÄºÔ∏è No relevant documents found after reranking.")
        display(Markdown("Êä±Ê≠âÔºåÊú™ËÉΩÂú®Áü•ËØÜÂ∫ì‰∏≠ÊâæÂà∞‰∏éÊÇ®ÈóÆÈ¢òÁõ∏ÂÖ≥ÁöÑ‰ø°ÊÅØ„ÄÇ"))
        return

    # 3. Context & Prompt Construction
    print("   - Step 3: Constructing prompt for LLM...")
    context_string = ""
    # Create a unique list of sources for citation
    sources = {}
    for i, doc in enumerate(final_docs):
        context_string += f"--- Context Snippet {i+1} ---\n"
        context_string += f"Source URL: {doc['meta']['url']}\n"
        context_string += f"Page Title: {doc['meta']['page_title']}\n"
        context_string += f"Content: {doc['content']}\n\n"

        # Add to sources dictionary to avoid duplicates
        source_key = doc['meta']['url']
        if source_key not in sources:
            sources[source_key] = f"[{len(sources)+1}]: {doc['meta']['page_title']} ({doc['meta']['url']})"

    final_prompt = PROMPT_TEMPLATE.format(context_string=context_string, query=query)

    # 4. LLM Generation
    print("   - Step 4: Sending request to DeepSeek and streaming response...")
    print("\n" + "="*50)
    print("Deepin OS ‰∏ìÂÆ∂ÂõûÁ≠îÔºö")
    print("="*50)

    try:
        response_stream = llm_client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": final_prompt}
            ],
            stream=True
        )

        full_response = ""
        for chunk in response_stream:
            content = chunk.choices[0].delta.content
            if content:
                print(content, end='', flush=True)
                full_response += content

        # Add the sources at the end
        print("\n\n**Sources:**")
        for source_line in sources.values():
            print(source_line)

        print("\n" + "="*50 + "\n")
        return full_response

    except Exception as e:
        print(f"\nüõë An error occurred while communicating with the DeepSeek API: {e}")
        return None

print("‚úÖ Final application function 'ask_deepin_wiki' is ready to use.")



STEP 1: Setting up the DeepSeek LLM Client...
‚úÖ DeepSeek client initialized successfully.

STEP 2: Creating the prompt template...
‚úÖ Prompt template is ready.

STEP 3: Building the final 'ask' function...
‚úÖ Final application function 'ask_deepin_wiki' is ready to use.


In [8]:
# @title üöÄ Execute: Ask the Deepin Wiki!
# -----------------------------------------------------------------------------
# This is the final step. Simply enter your question about Deepin OS
# in the `my_question` variable and run this cell.
# -----------------------------------------------------------------------------

my_question = "‰ªÄ‰πàÊòØÊ∑±Â∫¶Â£ÅÁ∫∏Ôºü"

# Ask the question and get an answer from our RAG system
ask_deepin_wiki(my_question)


üîç Processing query: '‰ªÄ‰πàÊòØÊ∑±Â∫¶Â£ÅÁ∫∏Ôºü'
   - Step 1: Performing hybrid search...
   - Step 2: Reranking search results...
   - Step 3: Constructing prompt for LLM...
   - Step 4: Sending request to DeepSeek and streaming response...

Deepin OS ‰∏ìÂÆ∂ÂõûÁ≠îÔºö
Ê∑±Â∫¶Â£ÅÁ∫∏ÊòØDeepinÊìç‰ΩúÁ≥ªÁªü‰∏≠ÁöÑÊ°åÈù¢ËÉåÊôØÂõæÁâáÔºåÁî®‰∫é‰∏™ÊÄßÂåñÁîµËÑëÊ°åÈù¢„ÄÇÂÆÉÂèØ‰ª•ËÆ©ÁîµËÑëÁúãËµ∑Êù•Êõ¥Â•ΩÁúã„ÄÅÊõ¥ÊºÇ‰∫ÆÂíåÊõ¥Êúâ‰∏™ÊÄß„ÄÇ

Ê†πÊçÆÊñáÊ°£‰ø°ÊÅØÔºåÊ∑±Â∫¶Â£ÅÁ∫∏ÁöÑÁõ∏ÂÖ≥ÂÜÖÂÆπÂåÖÊã¨Ôºö
- Á≥ªÁªüÈªòËÆ§Â£ÅÁ∫∏ÁõÆÂΩïÔºö`/usr/share/backgrounds`
- Ê∑±Â∫¶Ê°åÈù¢ÁéØÂ¢ÉÂ£ÅÁ∫∏ÁõÆÂΩïÔºö`/usr/share/personalization/themes/Deepin/wallpapers`

Sources:
[Source 2]: Ê∑±Â∫¶Â£ÅÁ∫∏ (https://wiki.deepin.org/01_ËΩØ‰ª∂wiki/00_GUIËΩØ‰ª∂/01_deepinÂºÄÂèëÁöÑËΩØ‰ª∂/Ê∑±Â∫¶Â£ÅÁ∫∏)

**Sources:**
[1]: ËΩØ‰ª∂ÂåÖÂàÜÁ±ª‰∏éÁÆÄ‰ªã (https://wiki.deepin.org/ÂæÖÂàÜÁ±ª/02_deepinÊ∑±ÂÖ•/02_DDEÁõ∏ÂÖ≥/00_DDEÊ°åÈù¢ÁßªÊ§ç/ËΩØ‰ª∂ÂåÖÂàÜÁ±ª‰∏éÁÆÄ‰ªã)
[2]: Ê∑±Â∫¶Â£ÅÁ∫∏ (https://wiki.deepin.org/01_ËΩØ‰ª∂wiki/00_GUIËΩØ‰ª∂/01_deepinÂºÄÂèëÁöÑËΩØ‰

'Ê∑±Â∫¶Â£ÅÁ∫∏ÊòØDeepinÊìç‰ΩúÁ≥ªÁªü‰∏≠ÁöÑÊ°åÈù¢ËÉåÊôØÂõæÁâáÔºåÁî®‰∫é‰∏™ÊÄßÂåñÁîµËÑëÊ°åÈù¢„ÄÇÂÆÉÂèØ‰ª•ËÆ©ÁîµËÑëÁúãËµ∑Êù•Êõ¥Â•ΩÁúã„ÄÅÊõ¥ÊºÇ‰∫ÆÂíåÊõ¥Êúâ‰∏™ÊÄß„ÄÇ\n\nÊ†πÊçÆÊñáÊ°£‰ø°ÊÅØÔºåÊ∑±Â∫¶Â£ÅÁ∫∏ÁöÑÁõ∏ÂÖ≥ÂÜÖÂÆπÂåÖÊã¨Ôºö\n- Á≥ªÁªüÈªòËÆ§Â£ÅÁ∫∏ÁõÆÂΩïÔºö`/usr/share/backgrounds`\n- Ê∑±Â∫¶Ê°åÈù¢ÁéØÂ¢ÉÂ£ÅÁ∫∏ÁõÆÂΩïÔºö`/usr/share/personalization/themes/Deepin/wallpapers`\n\nSources:\n[Source 2]: Ê∑±Â∫¶Â£ÅÁ∫∏ (https://wiki.deepin.org/01_ËΩØ‰ª∂wiki/00_GUIËΩØ‰ª∂/01_deepinÂºÄÂèëÁöÑËΩØ‰ª∂/Ê∑±Â∫¶Â£ÅÁ∫∏)'