In [1]:
import os
import getpass
import sqlite3
import pandas as pd
import time
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint, HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from langchain_chroma import Chroma
import chromadb
import yaml

In [2]:
conn = sqlite3.connect("data/database.db")
tickers = pd.read_sql_query("SELECT distinct Security FROM master_ticker", conn)["Security"].tolist()

In [3]:
with open("keys.yaml") as keys:
    try:
        api_keys = yaml.safe_load(keys)
    except yaml.YAMLError as exc:
        print(exc)

In [4]:
os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_keys["hf_model"]

llm = HuggingFaceEndpoint(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    task="text-generation",
    max_new_tokens=512,
    do_sample=False,
    temperature=0,
    repetition_penalty=1.03,
)

chat_model = ChatHuggingFace(llm=llm)


  from .autonotebook import tqdm as notebook_tqdm


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to C:\Users\feder\.cache\huggingface\token
Login successful



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_community.llms.huggingface_text_gen_inference import (  # type: ignore[import-not-found]


### PDF Loader

In [5]:
loader = PyPDFDirectoryLoader("data/wikipedia/")
docs = loader.load()

In [6]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=HuggingFaceEmbeddings())



In [7]:
from langchain import hub
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
model_name = 'HuggingFaceH4/zephyr-7b-beta'
bnb_config = BitsAndBytesConfig(
 load_in_4bit=True,
 bnb_4bit_use_double_quant=True,
 bnb_4bit_quant_type="nf4",
 bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(model_name)


`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 8/8 [00:27<00:00,  3.43s/it]


In [1]:
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain.chains import LLMChain

text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=400,
)
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

prompt_template = """
<|system|>
Answer the question based on your knowledge. Use the following context to help:

{context}


<|user|>
{question}

<|assistant|>
"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

llm_chain = LLMChain(llm=llm, prompt=prompt)

# Step 4: Query the vectorstore for relevant context
def get_relevant_context(question, top_k=3):
    """
    Perform similarity search to retrieve the most relevant chunks from the vectorstore.
    """
    results = vectorstore.similarity_search(question, k=top_k)
    context = "\n\n".join([result.page_content for result in results])
    return context

# Step 5: Integrate everything for answering questions
def ask_question(question):
    """
    Retrieve relevant context from the vectorstore and use the LLMChain to answer the question.
    """
    context = get_relevant_context(question)
    result = llm_chain.run({"context": context, "question": question})
    return result

# Example usage
question = "What is AES?"
answer = ask_question(question)
print(answer)

  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'model' is not defined

: 

In [16]:
def get_db_schema(connection):
    """
    Extract the database schema as a string.
    """
    schema = []
    cursor = connection.cursor()
    for table in cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall():
        table_name = table[0]
        schema.append(f"Table: {table_name}")
        columns = cursor.execute(f"PRAGMA table_info({table_name});").fetchall()
        for col in columns:
            schema.append(f"    Column: {col[1]} ({col[2]})")
    return "\n".join(schema)

schema = get_db_schema(conn)


In [None]:
db_schema_prompt = """
<|system|>
You are an assistant that translates user questions into SQL queries for an SQLite database. 
The database schema is as follows:
Price database: contains all prices from SP500 companies.
Income statements: contains financial information from SP500 companies.

{schema}

<|user|>
{question}

<|assistant|>
Here is the corresponding SQL query:
"""


In [27]:
def text_to_sql(question):
    """
    Generate and execute a SQL query for the given question.
    """
    # Prepare the inputs for the LLMChain
    inputs = {
        "context": schema,  # Provide the database schema as context
        "question": question
    }
    
    # Generate the SQL query
    sql_query = llm_chain.run(inputs)
    
    # Execute the SQL query on the database
    try:
        cursor = conn.cursor()
        cursor.execute(sql_query)
        result = cursor.fetchall()
        return {"query": sql_query, "result": result}
    except sqlite3.Error as e:
        return {"query": sql_query, "error": str(e)}


In [None]:
# Example usage
question = "List all securities with a market value greater than $1,000,000."
output = text_to_sql(question)

print("Generated SQL Query:")
print(output["query"])

if "error" in output:
    print("Error executing query:", output["error"])
else:
    print("Query Result:")
    for row in output["result"]:
        print(row)




Generated SQL Query:

<|system|>
Answer the question based on your knowledge. Use the following context to help:

Table: income_statements
    Column: date (TEXT)
    Column: symbol (TEXT)
    Column: reportedCurrency (TEXT)
    Column: cik (TEXT)
    Column: fillingDate (TEXT)
    Column: acceptedDate (TEXT)
    Column: calendarYear (TEXT)
    Column: period (TEXT)
    Column: revenue (INTEGER)
    Column: costOfRevenue (INTEGER)
    Column: grossProfit (INTEGER)
    Column: grossProfitRatio (REAL)
    Column: researchAndDevelopmentExpenses (INTEGER)
    Column: generalAndAdministrativeExpenses (INTEGER)
    Column: sellingAndMarketingExpenses (INTEGER)
    Column: sellingGeneralAndAdministrativeExpenses (INTEGER)
    Column: otherExpenses (INTEGER)
    Column: operatingExpenses (INTEGER)
    Column: costAndExpenses (INTEGER)
    Column: interestIncome (INTEGER)
    Column: interestExpense (INTEGER)
    Column: depreciationAndAmortization (INTEGER)
    Column: ebitda (INTEGER)
    Col

: 