## 0. Install

### Install Python libraries

In [1]:
%pip install --quiet --upgrade langchain-text-splitters langchain-community langgraph langchain-ollama langchain-huggingface beautifulsoup4 sentence-transformers langchain langchain-openai ipywidgets jupyter

^C
Note: you may need to restart the kernel to use updated packages.


Restart the kernel after the installation finishes so the upgraded packages are picked up.

Download [Ollama](https://ollama.com/), start it, and pull the model used below (for example `ollama pull mistral`).


### LangSmith (Optional)

In [8]:
import getpass
import os

# LangSmith tracing settings. This section is optional: skip the cell to run
# the notebook without tracing.
LANGCHAIN_TRACING_V2 = "true"
LANGCHAIN_ENDPOINT = "https://api.smith.langchain.com"
LANGCHAIN_PROJECT = "RAG_SELAB"

# Never store the API key in the notebook itself. Use the value already
# exported in the environment, otherwise prompt for it without echoing.
# NOTE: any key that was previously committed in this file must be treated as
# compromised and rotated in the LangSmith settings.
LANGCHAIN_API_KEY = os.environ.get("LANGCHAIN_API_KEY") or getpass.getpass(
    "LangSmith API key: "
)

# Export the settings so the LangChain/LangSmith client can read them.
os.environ["LANGCHAIN_TRACING_V2"] = LANGCHAIN_TRACING_V2
os.environ["LANGCHAIN_ENDPOINT"] = LANGCHAIN_ENDPOINT
os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY
os.environ["LANGCHAIN_PROJECT"] = LANGCHAIN_PROJECT


## 1. Create LLM

In [9]:
from langchain_ollama import OllamaLLM

# Name of the model served by the local Ollama instance.
# Alternative that was tried: "llama3.2"
model = "mistral"

# Client for the Ollama server listening on localhost:11434.
llm = OllamaLLM(base_url="http://localhost:11434", model=model)


In [None]:
# Smoke test: send a single plain-string prompt to the LLM created above.
messages = "我在測試專案"
# Alternative chat-style input, kept commented out for experimentation:
# messages = [
#     {"role": "system", "content": "You are a helpful assistant."},
#     {"role": "user", "content": "你好，請問你能幫我什麼？"}
# ]
response = llm.invoke(messages)

# Show whatever llm.invoke returned.
print(response)

## 2. Create Embeddings model

English:  
sentence-transformers/all-MiniLM-L6-v2  
intfloat/multilingual-e5-small  


Chinese model (see [https://ihower.tw/blog/archives/12167](https://ihower.tw/blog/archives/12167)):  
BAAI/bge-m3  

In [10]:
from langchain_huggingface import HuggingFaceEmbeddings

# Hugging Face model id for the embedding model; BAAI/bge-m3 is the model
# listed above for Chinese text.
model_name = "BAAI/bge-m3"
# Embedding object handed to the vector store in the next cell.
embeddings = HuggingFaceEmbeddings(model_name=model_name)


In [11]:
from langchain_core.vectorstores import InMemoryVectorStore

# In-memory vector index built on the embedding model created above.
vector_store = InMemoryVectorStore(embedding=embeddings)

## 3. Load Doc

### Web

In [None]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Restrict HTML parsing to the elements that carry the post itself.
post_only = bs4.SoupStrainer(class_=("post-content", "post-title", "post-header"))

# Load the contents of the blog post.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": post_only},
)
docs = loader.load()
print(docs)

### PDF

In [5]:
from langchain_community.document_loaders import PyPDFLoader

# Path of the PDF to load, relative to the notebook's working directory.
pdf_path = "docs/SElab_Industry_Academia_Collaboration.pdf"

# Rebinding `loader` and `docs` replaces whatever a previously run loader
# cell left in those names.
loader = PyPDFLoader(pdf_path)
docs = loader.load()
print(docs)

[Document(metadata={'source': 'docs/SElab_Industry_Academia_Collaboration.pdf', 'page': 0}, page_content='實驗室簡介\n簡短介紹\n我們是軟體⼯程實驗室，主要在做軟體測試相關的研究，包含  Android 爬蟲測試、網⾴\n表單爬蟲測試、對 RESTful API 做模糊測試以及微服務等\n另外也還有和振興醫院合作的醫療影像相關研究，分別是⼼臟彩⾊超⾳波和主動脈，是\n和尤信程教授共同指導的。\n⽬前實驗室多數研究都在結合  LLM 來進⾏改良\n各項研究說明\nAndroid 爬蟲測試 ( 建宏老師 )\n⽬的：\n現今有越來越多  Android 程式被開發出來，如何確保程式品質，測試是不可或缺的⼀環\nACE\n我們先前開發出了⼀款名為  ACE (Android CrawlEr) 的  Android GUI ⾃動化測試⼯具\nACE 主要透過爬⾏  GUI (Graphical User Interface) 畫⾯⾃動探索應⽤程式\n⽀援不同的爬⾏策略，包含強化學習和好奇⼼驅動兩種演算法\nAAD\nAAD (Android Anomaly Detector) 則是延伸  ACE ，使⽤從 ACE 爬⾏產⽣的路徑進⾏操作注\n入\n負責偵測  APP 發⽣  Anomaly （異常）的情況\nAnomaly 是指程式⾏為和預期 (Expect) 不同\n例如：我們預期程式在來回翻轉畫⾯前後⾏為 ( 畫⾯ ) 相同\n但有些程式因為開發時的缺失，造成翻轉畫⾯後對話框消失\n使⽤像是來回翻轉畫⾯或是離開  APP 再進去等⽅式來偵測\nATAS\nATAS (Android Test Automation Service) 是⼀個測試暨服務平台\n整合  ACE 、 AAD 與  Android Emulator 並進⾏容器化\n讓測試⼯具在無需實體機器的情況下，即可進⾏  APP 測試\n2025/1/10 下午 4:20 實驗室簡介  - HackMD\nhttps://hackmd.io/@fNmP5OVcSRmw0KivmULuHg/B1-cFT6gkl 3/10'), Document(metadata={'source': 'docs/SEla

### Split doc and create graph

In [12]:
from langchain import hub
from langchain_core.documents import Document
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict

# Split the loaded documents into 200-character chunks with 40 characters of
# overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=40)
all_splits = text_splitter.split_documents(docs)

# Index chunks into a fresh store. add_documents appends, so re-running this
# cell (for example after switching between the Web and PDF loaders) would
# otherwise leave duplicate or stale chunks in the index.
vector_store = InMemoryVectorStore(embeddings)
_ = vector_store.add_documents(documents=all_splits)

# Define prompt for question-answering
prompt = hub.pull("rlm/rag-prompt")


# Define state for application
class State(TypedDict):
    """State dictionary shared by the steps of the graph."""

    # Question supplied when the graph is invoked.
    question: str
    # Documents stored by the `retrieve` step.
    context: List[Document]
    # Value stored by the `generate` step.
    answer: str


# Define application steps
def retrieve(state: State):
    """Find the indexed chunks most similar to the question.

    Reads ``state["question"]``, runs a similarity search against the
    notebook-level ``vector_store`` and returns the hits as a partial state
    update under the ``"context"`` key.
    """
    question = state["question"]
    matches = vector_store.similarity_search(question)
    return {"context": matches}


def generate(state: State):
    """Produce an answer from the question and the retrieved context.

    Joins the ``page_content`` of every document in ``state["context"]`` with
    blank lines, fills the notebook-level ``prompt`` with that text and the
    question, sends the result to ``llm`` and returns the reply as a partial
    state update under the ``"answer"`` key.
    """
    passages = [doc.page_content for doc in state["context"]]
    prompt_input = {"question": state["question"], "context": "\n\n".join(passages)}
    filled_prompt = prompt.invoke(prompt_input)
    return {"answer": llm.invoke(filled_prompt)}


# Assemble the graph: START -> retrieve -> generate, then compile it.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

## 4. Ask question

In [14]:
# Run the compiled graph for one question and print only the generated answer.
question = "實驗室的產學合作有哪些?"
response = graph.invoke({"question": question})
print(response["answer"])

實驗室有Sunbird全端開發、Sunbird效能測試計劃、助教等產業合作。新生會在8月中 ~ 底進實驗室，之後可以參與每周一次的論文Group Meeting。碩論題主要接受學費姐的研究，且需要有定義的程式基礎。

在實驗室，學生必須完成作業，若遇到問題可以來信询问学长，我们会提供相应的建议。實驗室安排上，碩一期主要修課，視需求參與產業合作。
