<a href="https://colab.research.google.com/github/Anand-G-Murugan/LLM-document-QA/blob/main/Llama_Langchain_CSV_QA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CSV QA Bot using OpenAI Langchain and LlamaIndex

* The program uses LlamaIndex to read and index the data.
* We then send it to OpenAI Embeddings to get the embeddings for the data.
* The data is then used to make a query engine that allows us to query the data and get OpenAI GPT-driven responses about our CSV file!

Dataset used as example: https://www.kaggle.com/datasets/geomack/spotifyclassification

Note: I have taken only the first 30 rows to save on tokens.

## How to use

* Upload your data to Google Colab as data.csv
* Copy and paste the code into your own Colab Notebook
* Update the OpenAI API Key in your code with your own API Key
* Replace the question variable with your own question
* That's it! You're done.

In [None]:
import os
# Expose the OpenAI API key through the process environment; the embeddings
# cell later reads os.environ["OPENAI_API_KEY"]. Replace the placeholder with
# your own key, and avoid saving or sharing the notebook with a real key in it.
os.environ["OPENAI_API_KEY"] = 'YOUR_API_KEY_HERE'

In [None]:
# Install LlamaIndex and LangChain quietly (-q). No versions are pinned, so pip
# installs whatever release is current at run time.
# NOTE(review): the import cell relies on top-level `llama_index` names such as
# ServiceContext and LLMPredictor — confirm the installed release still exposes them.
! pip install -q llama-index langchain

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m574.7/574.7 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m85.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from pathlib import Path
from langchain.llms import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from llama_index import (LLMPredictor, ServiceContext, download_loader, GPTVectorStoreIndex, LangchainEmbedding)

In [None]:
# Getting the CSV Loader from llamaindex (refer: https://llama-hub-ui.vercel.app)
# NOTE(review): the class is bound to the name PagedCSVReader, but the loader
# requested from download_loader is "PandasCSVReader" — the variable name is misleading.
PagedCSVReader = download_loader("PandasCSVReader")
# Instantiate the loader; no constructor arguments are passed.
loader = PagedCSVReader()

In [None]:
# Load the uploaded CSV into LlamaIndex documents. The path is passed as a
# pathlib.Path (already imported in the imports cell), matching the `file: Path`
# parameter the hub loader documents, rather than as a bare string.
documents = loader.load_data(Path("./data.csv"))

In [None]:
# Inspect the first loaded document to see what the loader produced.
# In the output shown below, the document's text holds several CSV rows
# separated by "\n".
documents[0]

Document(id_='84aafb70-9323-4a30-aec6-c8001e4f2071', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='c97b3284dc4d276ad313e41280f8fc0cb36087e05bb6a5a02f10a3078225f9d2', text="0, 0.0102, 0.833, 204600, 0.434, 0.0219, 2, 0.165, -8.795, 1, 0.431, 150.062, 4.0, 0.286, 1, Mask Off, Future\n1, 0.199, 0.743, 326933, 0.359, 0.00611, 1, 0.137, -10.401, 1, 0.0794, 160.083, 4.0, 0.588, 1, Redbone, Childish Gambino\n2, 0.0344, 0.838, 185707, 0.412, 0.000234, 2, 0.159, -7.148, 1, 0.289, 75.044, 4.0, 0.173, 1, Xanny Family, Future\n3, 0.604, 0.494, 199413, 0.338, 0.51, 5, 0.0922, -15.236, 1, 0.0261, 86.468, 4.0, 0.23, 1, Master Of None, Beach House\n4, 0.18, 0.678, 392893, 0.561, 0.512, 5, 0.439, -11.648, 0, 0.0694, 174.004, 4.0, 0.904, 1, Parallel Lines, Junior Boys\n5, 0.00479, 0.804, 251333, 0.56, 0.0, 8, 0.164, -6.682, 1, 0.185, 85.023, 4.0, 0.264, 1, Sneakin’, Drake\n6, 0.0145, 0.739, 241400, 0.472, 7.27e-06, 1, 0.207, -11.204,

In [None]:
# Initializing OpenAI LLM
# The LangChain OpenAI wrapper is pointed at the "text-davinci-003" model.
# NOTE(review): confirm this model is still served by OpenAI; it may have been
# retired, in which case a currently supported model name is needed here.
openai = OpenAI(model_name="text-davinci-003")

# Alternate model
# openai = OpenAI(model_name="gpt-3.5-turbo")

In [None]:
# Initialize the Predictor for LlamaIndex: wraps the LangChain LLM created above
# so it can be handed to the service context and the query engine.
llm_predictor = LLMPredictor(llm=openai)

In [None]:
# Create the embeddings using OpenAI Embeddings.
# LangchainEmbedding wraps the LangChain OpenAIEmbeddings object so it can be
# passed as `embed_model` to the service context. The API key is read from the
# OPENAI_API_KEY environment variable (a KeyError is raised if it is unset).
embeddings = LangchainEmbedding(OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"]))

In [None]:
# Bundle the LLM predictor and the embedding model into a LlamaIndex
# service context, with chunk_size set to 400.
service_context = ServiceContext.from_defaults(
    embed_model=embeddings,
    llm_predictor=llm_predictor,
    chunk_size=400,
)

In [None]:
# Build the vector store index over the loaded documents, using the
# service context configured earlier.
index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)

In [None]:
# Turn the index into a LlamaIndex query engine (verbose enabled, same LLM predictor).
query_engine = index.as_query_engine(llm_predictor=llm_predictor, verbose=True)

In [None]:
# Helper that runs a single question through the query engine
def query(payload):
    """Answer the question stored at ``payload["inputs"]["text"]``.

    The text is sent to the module-level ``query_engine``; the response is
    converted to ``str`` and any leading newline characters are stripped
    before it is returned.
    """
    question_text = payload["inputs"]["text"]
    answer = str(query_engine.query(question_text))
    return answer.lstrip("\n")

In [None]:
# Input your question here: keep exactly one `question = ...` line uncommented.
# The commented-out lines are a placeholder and further example questions.
# question = "YOUR_QUESTION_HERE"
# question = "Name 3 music artists"
# question = "Name a song by the artist Future"
question = "Who made the song Redbone?"

In [None]:
# Run this cell to get the answer to your question with the help of GPT!
# The question is wrapped in the {"inputs": {"text": ...}} shape that query()
# reads, and the returned string is printed.
output = query({"inputs": {"text": question, }})
print(output)

Childish Gambino
