# RAG & Faiss & LLaMA 3.0

## PDF 파일에서 텍스트 추출

In [7]:
import pdfplumber
import streamlit as st

In [8]:
# Extract the text of a PDF (input: a PDF file chosen through the Streamlit uploader)
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")


if uploaded_file is not None:
    # Open the uploaded PDF and collect the text of each page
    with pdfplumber.open(uploaded_file) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]

    # Skip pages that produced no text (None or "") so concatenation cannot fail,
    # and separate pages with a newline so words are not fused across page breaks.
    text = "\n".join(page_text for page_text in page_texts if page_text)

    st.write("Extracted Text:", text)
    model_input = text

2024-07-23 11:51:46.009 
  command:

    streamlit run /opt/anaconda3/envs/llama3rag/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]


In [3]:
import gradio as gr
import bs4
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
import ollama

  from .autonotebook import tqdm as notebook_tqdm
USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
# Function to load, split, and retrieve documents
def load_and_retrieve_docs(url):
    """Build a retriever over the content of a single web page.

    The page is fetched with ``WebBaseLoader``, split by
    ``RecursiveCharacterTextSplitter`` (``chunk_size=1000``,
    ``chunk_overlap=200``), embedded with the Ollama model
    ``mxbai-embed-large`` and stored in a Chroma vector store.

    Args:
        url: Address of the page to load.

    Returns:
        The object returned by ``vectorstore.as_retriever()``.
    """
    import uuid  # local import: only used to name the per-call collection

    loader = WebBaseLoader(
        web_paths=(url,),
        bs_kwargs=dict()
    )
    docs = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)
    embeddings = OllamaEmbeddings(model="mxbai-embed-large")
    # Give every call its own collection.
    # NOTE(review): without an explicit name, repeated calls in one process are
    # believed to reuse Chroma's default collection, so chunks from previously
    # queried URLs could be retrieved - confirm against the installed version.
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        collection_name=f"rag-{uuid.uuid4().hex}",
    )
    return vectorstore.as_retriever()

# Function to format documents
def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line."""
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)

# Function that defines the RAG chain
def rag_chain(url, question):
    """Answer ``question`` using context retrieved from the page at ``url``.

    A retriever is built for the URL, the documents it returns for the
    question are joined into one context string, and that context is sent
    together with the question to the ``llama3`` model through ``ollama.chat``.

    Args:
        url: Address of the page to use as the knowledge source.
        question: The user's question.

    Returns:
        The ``['message']['content']`` field of the chat response.
    """
    context = format_docs(load_and_retrieve_docs(url).invoke(question))

    system_message = {
        "role": "system",
        "content": "You are a helpful assistant. Check the url content and answer the question. Translate the answer in Korean with emoji.",
    }
    user_message = {
        "role": "user",
        "content": f"Question: {question}\n\nContext: {context}",
    }

    reply = ollama.chat(model='llama3', messages=[system_message, user_message])
    return reply['message']['content']

In [5]:
 # Gradio interface
# The two text inputs map positionally onto rag_chain(url, question)
iface = gr.Interface(
    title="LLAMA 3: RAG Chain Question Answering",
    description="Enter a URL and a query to get answers from the RAG chain.",
    fn=rag_chain,
    inputs=["text", "text"],
    outputs="text",
)

# Launch the app
iface.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [9]:
import gradio as gr
import pdfplumber
from transformers import LlamaForCausalLM, LlamaTokenizer

# Initialise the tokenizer and the model from the same checkpoint
model_name = "decapoda-research/llama-13b"
tokenizer = LlamaTokenizer.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(model_name)

def process_pdf(pdf):
    """Extract the text of an uploaded PDF and run it through the Llama model.

    Args:
        pdf: The value supplied by the Gradio file input; it is handed
            directly to ``pdfplumber.open``. ``None`` when nothing was uploaded.

    Returns:
        str: The decoded output of ``model.generate`` for the extracted text,
        or a short message when no file was given or no text was found.
    """
    if pdf is None:
        return "No PDF file was uploaded."

    # Extract the text of every page of the PDF
    with pdfplumber.open(pdf) as pdf_file:
        page_texts = [page.extract_text() for page in pdf_file.pages]

    # Skip pages that produced no text so concatenation cannot fail, and keep
    # a newline between pages so words are not fused across page breaks.
    text = "\n".join(page_text for page_text in page_texts if page_text)
    if not text:
        return "No text found in the PDF file."

    # Use the extracted text as the model input
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model.generate(**inputs)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return result

# Configure the Gradio interface
iface = gr.Interface(
    fn=process_pdf,
    # `gr.inputs.*` is the legacy component namespace (removed in Gradio 4);
    # use the top-level component and its default `type` instead.
    inputs=gr.File(label="Upload PDF"),
    outputs="text",
    title="PDF to Text with Llama",
    description="Upload a PDF file and get the generated response using Llama model."
)

iface.launch()

ModuleNotFoundError: No module named 'transformers'

In [64]:
# Two fixed slices of the extracted text: characters [0, 1000) and [1000, 2020)
paragraphs = [pdf_text[:1000], pdf_text[1000:2020]]

In [65]:
# Show the second chunk as plain text
print(paragraphs[1])

 Strong command-line skills in Linux OS for system administration and software deployment
& Web Dev • Proficient in Git/Github for streamlined version control
• Experienced in full-stack web development processes using HTML/CSS/Boostrap5/JavaScript/Flask
English • Fluent in both written and spoken English, confident to engage in international activities
& Soft Skills • A people person - ensuring happy customers, whilst working well with internal teams
Certifications
TOEIC (score : 970) Mar 16, 2024 YBM TOEIC
OPIC (level : AL) Feb 05, 2024 ACTFL OPIc
AWS CCP (Certified Cloud Practitioner) May 04, 2024 AWS (Amazon Web Service)
Elementary Teacher Certification 1st : Feb 16, 2022 1st : Seoul Metropolitan Office of Education
(1st & 2nd Grade Certified Teacher) 2nd : Feb 24, 2017 2nd : Seoul National University of Education
Blogs
https://shorturl.at/ Fluent in English and dedicated to maintaining a global perspective, I
🌎 Notion (English)
cixX1 pursued all my courses and curated my Notion ex

## 텍스트 임베딩 생성

In [None]:
from transformers import AutoTokenizer, LlamaForCausalLM
import torch

# Load the Llama 3 model and its tokenizer from one checkpoint identifier
LLAMA3_CHECKPOINT = "meta-llama/Meta-Llama-3-8B-Instruct"
model = LlamaForCausalLM.from_pretrained(LLAMA3_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(LLAMA3_CHECKPOINT)


In [None]:
def get_embedding(text):
    """Embed ``text`` as the mean of the model's last hidden-state layer.

    The text is tokenized with truncation at ``max_length=512`` and passed
    through the module-level ``model`` with ``output_hidden_states=True``;
    the last entry of ``hidden_states`` is averaged over ``dim=1``.

    Args:
        text: Text to embed.

    Returns:
        numpy.ndarray: The averaged hidden state. Presumably shape
        ``(1, hidden_size)`` for a single string - TODO confirm.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: skip autograd bookkeeping, matching get_question_embedding.
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    hidden_states = outputs.hidden_states

    # .cpu() keeps the NumPy conversion valid if the model is not on the CPU.
    return hidden_states[-1].mean(dim=1).detach().cpu().numpy()

## FAISS 벡터 데이터베이스에 임베딩 벡터 추가

In [None]:
import faiss
import numpy as np

# One embedding per paragraph, in paragraph order
embeddings = list(map(get_embedding, paragraphs))

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


In [55]:
# Stack the per-paragraph embeddings row-wise into a single array
# (this rebinds `embeddings`, which was the list built in the previous cell)
embeddings = np.vstack(embeddings)

# Display the stacked array
embeddings

array([[-0.01163484, -0.03499083,  0.13591035, ...,  0.14383522,
         0.02660656, -0.7701082 ],
       [-0.5290672 , -0.6349337 ,  0.7247939 , ..., -0.6703838 ,
         0.392003  , -0.1834573 ]], dtype=float32)

In [57]:
# Create a FAISS IndexFlatL2 whose dimensionality equals the number of
# embedding columns, then add every embedding row to it
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Display the dimensionality, the index object and the stored vectors
dimension, index, embeddings

(4096,
 <faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x1094af180> >,
 array([[-0.01163484, -0.03499083,  0.13591035, ...,  0.14383522,
          0.02660656, -0.7701082 ],
        [-0.5290672 , -0.6349337 ,  0.7247939 , ..., -0.6703838 ,
          0.392003  , -0.1834573 ]], dtype=float32))

## 검색

In [58]:
def search_question(question, index, model, k=3):
    """Return the index positions of the vectors nearest to ``question``.

    Args:
        question: Query text, embedded with ``model.encode``.
        index: Vector index exposing ``ntotal`` and ``search(queries, k)``.
        model: Object whose ``encode`` method turns text into a vector.
        k: Maximum number of neighbours to return.

    Returns:
        The ``indices`` array returned by ``index.search`` (one row for the
        single query). At most ``index.ntotal`` columns are returned.
    """
    question_embedding = model.encode(question)
    # One float32 row, whether encode() returned a flat vector or a 1xD array.
    question_embedding = np.asarray(question_embedding, dtype='float32').reshape(1, -1)

    # Asking FAISS for more neighbours than it stores pads the result with -1,
    # which callers would then misread as "last element" when indexing a list.
    k = min(k, index.ntotal)
    if k <= 0:
        return np.empty((1, 0), dtype='int64')

    distances, indices = index.search(question_embedding, k)
    return indices

In [61]:

def get_question_embedding(question, tokenizer, model):
    """Embed ``question`` as the mean of the model's last hidden-state layer.

    Args:
        question: Query text.
        tokenizer: Callable that tokenizes the text into model inputs.
        model: Callable returning an object with ``hidden_states`` when
            invoked with ``output_hidden_states=True``.

    Returns:
        numpy.ndarray: The last hidden state averaged over ``dim=1`` with the
        leading batch axis removed when it has size 1.
    """
    # Same truncation settings as get_embedding so that questions and
    # paragraphs are embedded under identical conditions.
    inputs = tokenizer(question, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    # Use the last layer's hidden state as the embedding
    hidden_states = outputs.hidden_states[-1]
    # squeeze(0) drops only the batch axis; .cpu() keeps the NumPy conversion
    # valid if the model is not on the CPU.
    embeddings = hidden_states.mean(dim=1).squeeze(0).cpu().numpy()
    return embeddings


# Define the search function
def search_question(question, index, tokenizer, model, k=3):
    """Return the index positions of the vectors nearest to ``question``.

    Args:
        question: Query text.
        index: Vector index exposing ``ntotal`` and ``search(queries, k)``.
        tokenizer: Tokenizer forwarded to ``get_question_embedding``.
        model: Model forwarded to ``get_question_embedding``.
        k: Maximum number of neighbours to return.

    Returns:
        The ``indices`` array returned by ``index.search`` (one row for the
        single query). At most ``index.ntotal`` columns are returned.
    """
    question_embedding = get_question_embedding(question, tokenizer, model)
    # The index is searched with a single float32 query row.
    question_embedding = np.asarray(question_embedding, dtype='float32').reshape(1, -1)

    # Asking FAISS for more neighbours than it stores pads the result with -1,
    # which callers would then misread as "last element" when indexing a list.
    k = min(k, index.ntotal)
    if k <= 0:
        return np.empty((1, 0), dtype='int64')

    distances, indices = index.search(question_embedding, k)
    return indices


# Example usage
question = "What certifications does Seul Kim have?"
top_k_indices = search_question(question, index, tokenizer, model)
# FAISS reports -1 for neighbours it could not find (e.g. fewer than k vectors
# are stored). Skip those: paragraphs[-1] would silently return the last chunk.
top_k_paragraphs = [paragraphs[i] for i in top_k_indices[0] if i >= 0]


In [14]:
import gradio as gr
import pdfplumber
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
import ollama

# Function to load, split, and retrieve documents from a PDF file
def load_and_retrieve_docs(file):
    """Build a retriever over the text of a PDF file.

    The text of every page is extracted with pdfplumber, split by
    ``RecursiveCharacterTextSplitter`` (``chunk_size=1000``,
    ``chunk_overlap=200``), embedded with the Ollama model
    ``mxbai-embed-large`` and stored in a Chroma vector store.

    Args:
        file: The PDF to read; passed directly to ``pdfplumber.open``.

    Returns:
        The object returned by ``vectorstore.as_retriever()`` on success, or a
        ``str`` describing the problem when the PDF cannot be read or
        contains no text. Callers must check for the string case.
    """
    import uuid  # local import: only used to name the per-call collection

    try:
        with pdfplumber.open(file) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
    except Exception as e:
        # Reported as text so the UI can show it instead of a stack trace.
        return f"Error reading PDF file: {e}"

    # Skip pages without text and keep a newline between pages so that words
    # are not fused across page breaks.
    text = "\n".join(page_text for page_text in page_texts if page_text)
    if not text:
        return "No text found in the PDF file."

    docs = [Document(page_content=text)]
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)
    embeddings = OllamaEmbeddings(model="mxbai-embed-large")
    # Give every call its own collection.
    # NOTE(review): without an explicit name, repeated calls in one process are
    # believed to reuse Chroma's default collection, so chunks from previously
    # uploaded PDFs could be retrieved - confirm against the installed version.
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        collection_name=f"rag-{uuid.uuid4().hex}",
    )
    return vectorstore.as_retriever()

# Function to format documents
def format_docs(docs):
    """Join the ``page_content`` of the given documents with blank lines between them."""
    pieces = []
    for doc in docs:
        pieces.append(doc.page_content)
    return "\n\n".join(pieces)

# Function that defines the RAG chain
def rag_chain(file, question):
    """Answer ``question`` using context retrieved from the PDF ``file``.

    Args:
        file: The PDF forwarded to ``load_and_retrieve_docs``.
        question: The user's question.

    Returns:
        The ``['message']['content']`` field of the ``ollama.chat`` response,
        or the error text returned by ``load_and_retrieve_docs``.
    """
    retriever = load_and_retrieve_docs(file)

    # load_and_retrieve_docs signals failure by returning the error text itself
    if not isinstance(retriever, str):
        context = format_docs(retriever.invoke(question))
        messages = [
            {
                "role": "system",
                "content": "You are a helpful assistant. Check the pdf content and answer the question.",
            },
            {
                "role": "user",
                "content": f"Question: {question}\n\nContext: {context}",
            },
        ]
        answer = ollama.chat(model='llama3', messages=messages)
        return answer['message']['content']

    return retriever

# Gradio interface: the file and text inputs map positionally onto rag_chain(file, question)
iface = gr.Interface(
    title="LLAMA 3: RAG Chain Question Answering",
    description="Upload a PDF and enter a query to get answers from the RAG chain.",
    fn=rag_chain,
    inputs=["file", "text"],
    outputs="text",
)

# Launch the app
iface.launch()


Running on local URL:  http://127.0.0.1:7866

To create a public link, set `share=True` in `launch()`.




In [None]:
# Append the extracted text of every page of `file` to `text`, skipping pages
# for which extract_text() returns nothing.
# NOTE(review): neither `file` nor `text` is assigned at top level in the
# visible cells (they appear only as locals of load_and_retrieve_docs) -
# confirm both are defined before running this cell.
with pdfplumber.open(file) as pdf:
    for page in pdf.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text

In [62]:
# Display the chunks retrieved for the example question
top_k_paragraphs

['Seul Kim\nDesired position Data Scientist / Data Analyst (0 year experience)\nEmail niceonesuri@gmail.com\nSelf-driven learner\nProactive problem solver\nMobile (+82) 010-4415-3388\nSolid team player\nStrength\nCloud Expertise • Certified AWS Cloud Practitioner with hands-on experience in various AWS services\n• Planning to be certified as the AWS Certified Data Engineer - Associate by early June\nDatabase • Advanced proficiency in MySQL database management\nManagement • Deep understanding of the differences between various AWS database services, such as DynamoDB,\nAurora, and RDS, enabling informed decisions on selecting the most appropriate service based on\nspecific use cases and requirements\nData Analysis • Confident in applying AI/ML techniques to solve problems using TensorFlow & PyTorch\n& Big Data • Skilled in handling large datasets and conducting complex data analysis and visualisation, using R,\nPython, and Tableau, deriving actionable insights and enabling data-driven de