<a href="https://colab.research.google.com/github/1010836/portfolio/blob/main/hackathon/server/_projects/rag/LlamaRAGCegid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# setup

In [None]:
!pip install llama-index

In [None]:
# Install PyTorch from the CUDA 11.7 (cu117) wheel index.
!pip install torch --index-url https://download.pytorch.org/whl/cu117
# Install the remaining libraries; transformers and langchain are imported by later cells.
# NOTE(review): bitsandbytes/accelerate are presumably needed for the 8-bit model load below - confirm.
!pip install langchain einops accelerate transformers bitsandbytes

# definitions

In [None]:
# Hub identifier of the Llama 2 chat checkpoint loaded by the cells below.
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Hugging Face access token, passed as `token=` when loading the model and tokenizer.
# Never hard-code the token in the notebook: read it from the HF_TOKEN
# environment variable and fall back to an interactive (hidden) prompt.
import os
from getpass import getpass

auth_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# model

In [None]:
# torch supplies the dtype constant; transformers supplies the causal-LM loader.
import torch
from transformers import AutoModelForCausalLM

# Options forwarded to from_pretrained, collected in one place for readability.
_model_load_kwargs = dict(
    cache_dir='./model/',
    token=auth_token,
    torch_dtype=torch.float16,
    rope_scaling={"type": "dynamic", "factor": 2},
    load_in_8bit=True,
)

# Load the checkpoint named by `model_name` with the options above.
model = AutoModelForCausalLM.from_pretrained(model_name, **_model_load_kwargs)

# tokenizer

In [None]:
from transformers import AutoTokenizer, TextStreamer

# What a tokenizer does:
#   - splits strings into sub-word tokens, converts tokens to ids and back
#     (encoding/decoding = tokenizing + converting to integers);
#   - adds new tokens to the vocabulary independently of the underlying
#     structure (BPE, SentencePiece, ...);
#   - manages special tokens (mask, beginning-of-sentence, ...) and makes sure
#     they are not split during tokenization.
# Reference: https://huggingface.co/transformers/v3.0.2/main_classes/tokenizer.html
tokenizer = AutoTokenizer.from_pretrained(model_name, token=auth_token, cache_dir='./model/')

# prompt

In [None]:
from llama_index.prompts.prompts import SimpleInputPrompt

In [None]:
# SimpleInputPrompt (llama-index's prompt wrapper) is imported in the previous cell.

# System prompt in the Llama 2 chat format: the system text is wrapped in
# <<SYS>> ... <</SYS>> inside the opening [INST] block. The matching [/INST]
# is appended after the user query by `query_wrapper_prompt` below.
# NOTE(review): the product name is spelled "Talentsof" here - confirm whether
# "Talentsoft" is intended.
system_prompt = """[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as
helpfully as possible, while being safe. Your answers should not include
any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.
Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain
why instead of answering something not correct. If you don't know the answer
to a question, please don't share false information.

Your goal is to provide answers relating to the software CEGID Talentsof.<</SYS>>
"""

# Wrapper that places the user query before the closing [/INST] tag.
query_wrapper_prompt = SimpleInputPrompt("{query_str} [/INST]")

# Sanity check: show the wrapper filled in with a sample query.
query_wrapper_prompt.format(query_str='hello')

# LLM

In [None]:
# llama-index wrapper around a Hugging Face model.
from llama_index.llms import HuggingFaceLLM

# Wrap the model and tokenizer loaded above, together with the prompt pieces,
# so llama-index can use them as its LLM.
llm = HuggingFaceLLM(
    model=model,
    tokenizer=tokenizer,
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    context_window=4096,
    max_new_tokens=256,
)

# embeddings

In [None]:
# langchain is already installed by the setup cell; repeated here so this section can run on its own.
!pip install langchain
# Needed by the HuggingFaceEmbeddings model ("all-MiniLM-L6-v2") created in the next cell.
!pip install sentence_transformers

In [None]:
# Hugging Face embeddings (via langchain) and the llama-index adapter for them.
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings import LangchainEmbedding

# Embedding model used to represent document chunks.
_hf_embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Adapt the langchain embedder to the llama-index embedding interface.
embeddings = LangchainEmbedding(_hf_embedder)


# context

In [None]:
# Service context: bundles the LLM, the embedding model and the chunk size.
from llama_index import ServiceContext, set_global_service_context

service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embeddings,
    chunk_size=1024,
)

# Register it globally so later llama-index calls use it by default.
set_global_service_context(service_context)

# documents

In [None]:
import requests

# Raw Q&A text file hosted on the Hugging Face Hub.
link = "https://huggingface.co/datasets/E1010836/rag/raw/main/cegid_talentsof_QeA.txt"

# Fail fast on network problems or HTTP errors instead of hanging, or
# silently saving an error page as the knowledge base.
qa_response = requests.get(link, timeout=30)
qa_response.raise_for_status()

# Decode the body as UTF-16, overriding the encoding requests would guess.
qa_response.encoding = 'UTF-16'
documents = qa_response.text

# Write the local copy with an explicit encoding so the result does not
# depend on the platform default.
with open('q&a.txt', 'w', encoding='utf-8') as qa_file:
    qa_file.write(documents)

In [None]:
from pathlib import Path
from llama_index import download_loader

# Local file written by the previous cell.
qa_path = Path('./q&a.txt')

# Fetch the UnstructuredReader loader class (refreshing any cached copy),
# instantiate it and parse the local file into llama-index documents.
UnstructuredReader = download_loader("UnstructuredReader", refresh_cache=True)
loader = UnstructuredReader()
documents = loader.load_data(file=qa_path)

print(documents)

# index

In [None]:
from llama_index import VectorStoreIndex

# Build a vector index over the documents loaded above.
# The Flask route further down queries it through index.as_query_engine().
index = VectorStoreIndex.from_documents(documents)

# API

In [None]:
# Dependencies for the API cell below:
# pyngrok opens the public tunnel, flask serves the endpoint, flask-cors enables CORS on it.
!pip install pyngrok
!pip install flask
# -U upgrades flask-cors if an older version is already installed.
!pip install -U flask-cors

In [None]:
# @title
import os
import threading
from getpass import getpass
from pyngrok import ngrok
from flask import Flask
from flask import jsonify
from flask import request
from flask_cors import CORS
import json
import requests

# ngrok credentials must never be stored in the notebook: read the token from
# the NGROK_AUTHTOKEN environment variable, or ask for it with a hidden prompt.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_authtoken)

# Flask application with CORS enabled.
app = Flask(__name__)
CORS(app)
port = 5000

# Open a ngrok tunnel to the HTTP server
public_url = ngrok.connect(port).public_url
print(" * ngrok tunnel \"{}\" -> \"http://127.0.0.1:{}\"".format(public_url, port))

# Registry endpoint that stores the current public address of this server.
url = 'http://51.20.133.14/store'

# Payload: publish the tunnel URL under the key "address".
data = {
    "key": "address",
    "value": public_url
}

# Convert the data to JSON format
data_json = json.dumps(data)

# Request headers
headers = {
    'Content-Type': 'application/json',
}

# Send the POST request; the timeout keeps the cell from hanging forever
# if the registry endpoint is unreachable.
response = requests.post(url, data=data_json, headers=headers, timeout=10)

# Print the response; fall back to status and raw text when the body is not JSON.
try:
    print(response.json())
except ValueError:
    print(response.status_code, response.text)

# Update any base URLs to use the public ngrok URL
app.config["BASE_URL"] = public_url

# Define Flask routes
@app.route("/")
def text():
    """Answer the ``question`` query parameter using the vector index.

    Returns:
        A JSON response ``{"answer": <text>}`` where double quotes and blank
        lines (``\\n\\n``) have been stripped from the model output. If the
        ``question`` parameter is missing or blank, a JSON ``{"error": ...}``
        body with HTTP status 400 is returned instead of querying the index.
    """
    question = request.args.get('question')

    # Without this guard a missing parameter would pass None to the query
    # engine and surface as an opaque 500 error.
    if question is None or not question.strip():
        return jsonify({"error": "missing required query parameter 'question'"}), 400

    # Query the module-level `index` built earlier in the notebook.
    query_engine = index.as_query_engine()
    response = query_engine.query(question)

    response = str(response)
    cleanResponse = response.replace('"', '')
    cleanResponse = cleanResponse.replace('\n\n', '')

    # Log the cleaned answer to the cell output.
    print("")
    print("-------------------------------------------------------")
    print(cleanResponse)
    print("-------------------------------------------------------")

    result = jsonify({"answer": cleanResponse})

    # Log the Flask response object that is about to be returned.
    print("")
    print("-------------------------------------------------------")
    print(result)
    print("-------------------------------------------------------")

    return result

# Start the Flask development server on the same port the ngrok tunnel
# forwards to. The port is passed explicitly so the two cannot drift apart.
# This call blocks the cell until the server is stopped.
app.run(port=port)