In [1]:
from docs_preprocess import DocumentProcessor
from VecStore import VectorStore
from langchain.llms import HuggingFacePipeline
from pathlib import Path


In [2]:
from typing import List, Optional, Any
from langchain_core.language_models.llms import LLM

class DeepSeekLLM(LLM):
    model: Any
    tokenizer: Any
    max_new_tokens: int = 512
    temperature: float = 0.0           # 默认给 RAG 用

    @property
    def _llm_type(self) -> str:
        return "deepseek_hf"

    def _call(self, prompt: str,
              stop: Optional[List[str]] = None,
              **kwargs) -> str:
        # 1. 把普通 prompt 包装进官方 chat_template
        messages = [{"role": "user", "content": prompt}]
        input_ids = self.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(self.model.device)

        # 2. 生成
        outputs = self.model.generate(
            input_ids,
            max_new_tokens=kwargs.get("max_new_tokens", self.max_new_tokens),
            temperature=kwargs.get("temperature", self.temperature),
        )
        text = self.tokenizer.decode(
            outputs[0][input_ids.shape[1]:],
            skip_special_tokens=True,
        )

        # 3. 手动处理 stop tokens
        if stop:
            for s in stop:
                if s in text:
                    text = text.split(s)[0]

        return text.strip()


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

model_dir = "/home/lyus4/yuheng/All_in_LLM/deepseek-llm-7b-chat"
tok   = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_dir,
                                             trust_remote_code=True,
                                             torch_dtype="auto",
                                             device_map="auto")



  from .autonotebook import tqdm as notebook_tqdm


[2025-06-14 13:04:12,416] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/lyus4/anaconda3/envs/rag_env/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/lyus4/anaconda3/envs/rag_env/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.68s/it]
We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


In [4]:
llm = DeepSeekLLM(model=model, tokenizer=tok, max_new_tokens=512, temperature=0.2)
print(llm("你好，请自我介绍一下。"))

  print(llm("你好，请自我介绍一下。"))
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


你好！我是一个人工智能助手，名为DeepSeek Chat。我由中国的DeepSeek团队开发，旨在帮助用户解答问题、提供信息和执行各种任务。我能够处理多种主题，包括但不限于科学、数学、历史、文化、技术等。如果你有任何问题，欢迎随时向我提问。


In [5]:
print(llm("In Wireless communication, what is RB"))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


In wireless communication, RB stands for Resource Block. It is a basic unit of data in the GSM (Global System for Mobile communication) and UMTS (Universal Mobile Telecommunication System) standards. A Resource Block contains information about the data to be transmitted, such as the data itself, error-correction codes, and synchronization information.

Resource Blocks are used to encapsulate and transmit data in the air interface of wireless communication systems. They are also used to allocate resources, such as bandwidth and power, to different users or services.

In GSM, a Resource Block is typically 128 bytes in size, while in UMTS, it can be up to 64 KB. The size of a Resource Block can vary depending on the specific requirements of the communication system and the type of data being transmitted.


In [6]:
# 路径配置
MODEL_PATH =  "/home/lyus4/yuheng/All_in_LLM/all-MiniLM-L6-v2"
INDEX_PATH = Path("../vector_store/faiss_ivfflat_100")  # 无需加 .index 后缀
VECTORS_PATH = Path("../index/vectors.npy")
DOC_JSON_PATH = Path("output_chunks.json")

In [7]:
vs = VectorStore(model_path=MODEL_PATH, db_path=INDEX_PATH)
vs.load_documents_and_metadata(json_path=DOC_JSON_PATH)
vs.describe()


[INFO] 初始化 VectorStore -> ../vector_store/faiss_ivfflat_100
[INFO] 索引加载成功: ../vector_store/faiss_ivfflat_100
[INFO] 类型: IndexIVFFlat, 维度: 384, 数量: 18343
[INFO] 加载文档 18343 条

[INFO] VectorStore 状态描述：
- 文档数: 18343
- 向量数: 0
- 索引类型: IndexIVFFlat
- 向量维度: 384
- 向量总数: 18343


In [8]:

results = vs.search("what is OFDMA?", k=5, score_mode="reciprocal")

  self.model = HuggingFaceEmbeddings(
No sentence-transformers model found with name /home/lyus4/yuheng/All_in_LLM/all-MiniLM-L6-v2. Creating a new one with mean pooling.


[INFO] 嵌入模型已加载


In [9]:
results

[('Orthogonal frequency division multiple access (OFDMA), which is the multi-user\nversion of OFDM that was discussed in Section 9.12. In OFDMA multiple access is\naccomplished through the assignment of subchannels (subcarriers) to individual users.',
  0.6585590018011712,
  {'document': '../documents/Digital-Communication-Systems.pdf',
   'page': 587,
   'chunk_idx': 5}),
 ('Chapter\nSignaling over Fading Channels\nNaturally, OFDMA inherits the distinctive features of OFDM. In particular, OFDMA\nis well suited for high data-rate transmissions over delay-dispersive channels, realized\nby exploiting the principle of divide and conquer. Accordingly, OFDMA is\ncomputationally efficient in using the FFT algorithm. Moreover, OFDMA lends itself\nto the combined use of MIMO, hence the ability to improve spectral efficiency and\ntake advantage of channel flexibility.\nCode-division multiple access (CDMA), which distinguishes itself by exploiting the\nunderlying principle of spread spectrum sig

In [10]:
from langchain_core.prompts import PromptTemplate
from utils import format_context_grouped, expand_acronyms
# 定义 Prompt 模板

template_test = """
<Role>
You are a 5G wireless communication expert.

<Goal>
Answer the question using the information in the context below.
If the context is insufficient, reply exactly: **"I don't know"**.

<Context>
{context}

<Question>
{question}

<Instructions>
1. Explain simply and clearly, as if to a non-expert.  
2. Use the following **answer format strictly**:  
   - Start with a 1-2 sentence summary.  
   - Then, list the explanation in **numbered bullet points** (1., 2., 3., etc.).  
3. Give the reference.
4. Conform to 3GPP protocol.

<Answer>

"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template_test
)

# 执行检索
question = "What is beam management?"
question = expand_acronyms(question)

top_k_results = vs.search(question, k=3, score_mode="reciprocal")
context = format_context_grouped(top_k_results, with_metadata=True, with_score=True)

# 构造 Prompt 输入
prompt_input = prompt.format(context=context, question=question)

# 输出 prompt（可传入 LLM）
print(prompt_input)





<Role>
You are a 5G wireless communication expert.

<Goal>
Answer the question using the information in the context below.
If the context is insufficient, reply exactly: **"I don't know"**.

<Context>
[Document: ../documents/3GPP_38/38214-i60.docx, Page: 1]
(score=0.495) to the UE reported threshold beamSwitchTiming when the reported value is one of the values {14,28,48} and when enableBeamSwitchTiming is not provided or the NZP-CSI-RS-ResourceSet is configured with the higher layer parameter trs-Info , aperiodic CSI-RS in a NZP-CSI-RS-ResourceSet configured with the higher layer parameter repetition set to 'off' or configured without the higher layer parameters repetition and trs-Info scheduled with offset larger than or equal to 48 when the UE provides beamSwitchTiming-r16 and enableBeamSwitchTiming is provided, aperiodic CSI-RS in a NZP-CSI-RS-ResourceSet configured with the higher layer parameter repetition set to 'on' scheduled with offset larger than or equal to the UE reported 

In [11]:
response = llm.invoke(prompt_input)

     

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


In [12]:
print(response)

Beam management refers to the process of controlling and optimizing the use of beams in a 5G wireless communication system. This includes tasks such as beamforming, beam switching, and beam allocation. Beamforming is the process of directing radio signals towards a specific user or group of users by using multiple antennas at the transmitter and receiver. Beam switching is the process of switching between different beams to improve signal quality or to avoid interference from other beams. Beam allocation is the process of assigning users to specific beams based on their location and the available beams in the system. Beam management is an important aspect of 5G wireless communication as it helps to improve the efficiency and performance of the system by optimizing the use of radio resources.

*Beam management is a process that involves controlling and optimizing the use of beams in a 5G wireless communication system.*

*Beamforming is the process of directing radio signals towards a sp

In [13]:
response_raw = llm.invoke("In 5G wireless communications, What is beam management?")
print(response_raw)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


Beam management is a key technology in 5G wireless communications that allows for the efficient use of radio frequency (RF) resources and improved network performance. It involves the use of multiple antennas at the base station and the mobile device to create a beam of radio waves that is directed towards the device, rather than broadcasting in all directions. This allows for a more focused and efficient use of RF resources, which can lead to improved data rates, reduced latency, and increased capacity in the network. Beam management can also be used to improve the signal quality for devices that are located in areas with poor signal strength, by directing the beam towards the device and reducing interference from other sources.


In [27]:
# ── 依赖 ─────────────────────────────────────────────────────────────
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import (
    Runnable, RunnableLambda, RunnableWithMessageHistory
)
from langchain_core.output_parsers import StrOutputParser
from langchain_core.chat_history import InMemoryChatMessageHistory
import textwrap
from utils import format_context_grouped, expand_acronyms

# 你的llm和vs初始化略

# ── 1. Prompt 模板 ──────────────────────────────────────────────
template_test = """
<Role>
You are a 5G wireless communication expert.

<Goal>
Answer the question using the information in the context below.
If the context is insufficient, reply exactly: **"I don't know"**.

<Context>
{context}

<Question>
{question}

<Instructions>
1. Explain simply and clearly, as if to a non-expert.  
2. Give the reference.

<Answer>

"""

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", template_test),
    MessagesPlaceholder(variable_name="history"),  # 多轮历史插入
    ("user", "<Question>\n{question}")
])

# ── 2. 构造上下文（支持历史问句合并） ─────────────────────────────
def build_inputs(inputs: dict):
    question = inputs["input"]
    history_msgs = inputs.get("history", [])

    # 合并历史中所有 HUMAN 消息作为检索 query
    history_text = "\n".join([msg.content for msg in history_msgs if getattr(msg, "type", getattr(msg, "role", "")).lower() in ["human", "user"]])
    combined_query = f"{history_text}\n{question}" if history_text else question
    q_exp = expand_acronyms(combined_query)

    # 检索上下文
    top_k_results = vs.search(q_exp, k=3, score_mode="reciprocal")
    ctx = format_context_grouped(top_k_results, with_metadata=True, with_score=True)

    # 传递prompt模板要求的变量名
    output = dict(inputs)
    output.update({
        "context": ctx,
        "question": question
    })
    return output

context_retriever = RunnableLambda(build_inputs)

# ── 3. 构建完整对话链 ─────────────────────────────────────────────
base_chain: Runnable = (
    context_retriever |
    chat_prompt |
    llm |
    StrOutputParser()
)

# ── 4. 历史工厂（支持缓存）────────────────────────────────────────
history_store = {}

def history_factory(session_id: str):
    if session_id not in history_store:
        history_store[session_id] = InMemoryChatMessageHistory()
    return history_store[session_id]

# ── 5. 构建带历史支持的 chatbot ─────────────────────────────────
chatbot = RunnableWithMessageHistory(
    base_chain,
    history_factory,
    input_messages_key="input",      # 本轮问题字段
    history_messages_key="history"   # 多轮消息字段
)

MAX_HISTORY_MESSAGES = 6

def trim_history(session_id: str, max_messages=MAX_HISTORY_MESSAGES):
    history = history_factory(session_id).messages
    if len(history) > max_messages:
        history[:] = history[-max_messages:]

# ── 6. 示例调用（支持多轮）──────────────────────────────────────
session_id = "user_42"

def wrap_text(text, width=120):
    return textwrap.fill(text, width=width)

def print_qa_round(question: str, response: str):
    print("\n====== 对话回合 ======")
    print(f"[🧑‍💬] {wrap_text(question)}")
    print(f"[🤖] {wrap_text(response)}")

def print_chat_history(session_id: str):
    print("\n====== 历史记录 ======")
    history = history_factory(session_id).messages
    for i in range(0, len(history), 2):
        q = wrap_text(history[i].content) if i < len(history) else ""
        a = wrap_text(history[i+1].content) if i+1 < len(history) else ""
        print(f"\n🧑‍💬 Q{i//2 + 1}: {q}")
        print(f"🤖 A{i//2 + 1}: {a}")

# 第 1 轮
question1 = "What is beam management?"
response1 = chatbot.invoke(
    {"input": question1},
    config={"configurable": {"session_id": session_id}}
)
trim_history(session_id)
print_qa_round(question1, response1)

# 第 2 轮
question2 = "And why is it important?"
response2 = chatbot.invoke(
    {"input": question2},
    config={"configurable": {"session_id": session_id}}
)
trim_history(session_id)
print_qa_round(question2, response2)

# 打印历史问答
print_chat_history(session_id)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.



[🧑‍💬] What is beam management?
[🤖] Beam management refers to the process of controlling and optimizing the use of beams in a 5G wireless communication
system. This includes tasks such as configuring and managing beam configurations, adjusting beam patterns, and managing
beam switching between different cells or serving areas. The goal of beam management is to improve the overall
performance and efficiency of the communication system by ensuring that signals are transmitted and received as
effectively as possible.  <Reference> The concept of beam management is not directly mentioned in the provided context
documents. However, it is related to the overall management of beams in a 5G system, which is discussed in more detail
in the "Fundamentals of Wireless Communication" document.

[🧑‍💬] And why is it important?
[🤖] AI: Beam management is important because it helps to optimize the use of beams in a 5G wireless communication system,
which can improve the overall performance and efficienc