In [None]:
# 1. Install dependencies (Colab cell)
!pip install langchain langchain-community sentence_transformers faiss-cpu ipywidgets pandas

In [None]:
# 2. Imports
import pandas as pd
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import HuggingFaceHub
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
import ipywidgets as widgets
from IPython.display import display, clear_output

In [None]:
# 3. Hugging Face API Token (REQUIRED for HuggingFaceHub LLMs)
import os
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

In [4]:
# 4. CSV Upload and Preprocessing
import io
upload = widgets.FileUpload(accept='.csv', multiple=False)
display(upload)

def get_dataframe_from_upload(upload_widget):
    if not upload_widget.value:
        return None
    file_info = next(iter(upload_widget.value.values()))
    content = io.BytesIO(file_info['content'])
    try:
        df = pd.read_csv(content)
    except Exception:
        content.seek(0)
        df = pd.read_csv(content, delimiter=';')
    return df

# Wait for upload
import time
while not upload.value:
    time.sleep(1)
df = get_dataframe_from_upload(upload)
print("CSV loaded. Shape:", df.shape)
display(df.head())


FileUpload(value={}, accept='.csv', description='Upload')

KeyboardInterrupt: 

In [None]:
# 5. Convert CSV to Documents
def df_to_documents(df):
    docs = []
    for idx, row in df.iterrows():
        text = "; ".join([f"{col}: {row[col]}" for col in df.columns])
        docs.append(text)
    return docs

documents = df_to_documents(df)

In [None]:
# 6. Chunking (optional, for very wide/long rows)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = []
for doc in documents:
    docs.extend(text_splitter.split_text(doc))

In [None]:
# 7. Embedding and Vector Store
embed_model = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5")
vectorstore = FAISS.from_texts(docs, embedding=embed_model)

In [None]:
# 8. LLM Setup (Hugging Face public chat LLM)
llm = HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-beta",  # You may use mistralai/Mistral-7B-Instruct-v0.2 or meta-llama/Meta-Llama-3-8B-Instruct
    model_kwargs={"temperature": 0.2, "max_new_tokens": 512}
)

In [None]:
# 9. Prompt Template for Retrieval-Augmented Generation
prompt = ChatPromptTemplate.from_template(
    """
You are a data analyst. Use the following extracted data from a CSV to answer the user's question.
If the answer requires comparison or aggregation, show your reasoning.
If the answer is not in the data, say "Not found in data."

Context:
{context}

Question: {input}
"""
)

In [None]:
# 10. Retrieval Chain
retriever = vectorstore.as_retriever(search_kwargs={"k": 6})
rag_chain = create_retrieval_chain(retriever, llm, prompt=prompt)

In [None]:
# 11. ipywidgets Chat Interface
chat_history = []

input_box = widgets.Text(
    value='',
    placeholder='Ask a question about your CSV...',
    description='Query:',
    disabled=False,
    layout=widgets.Layout(width='80%')
)
output_area = widgets.Output()
send_button = widgets.Button(description="Send", button_style='primary')

def on_send_clicked(b):
    query = input_box.value
    input_box.value = ''
    with output_area:
        print(f"\nUser: {query}")
    # Run RAG chain
    result = rag_chain.invoke({"input": query})
    answer = result['answer'] if 'answer' in result else result
    with output_area:
        print(f"Bot: {answer}")

send_button.on_click(on_send_clicked)

chat_ui = widgets.VBox([input_box, send_button, output_area])
display(chat_ui)