In [2]:
!pip uninstall -y langchain langchain-core langchain-community
!pip install --upgrade langchain faiss-cpu sentence-transformers streamlit langchain-community pyngrok

import os
import pandas as pd
import string
import nltk
# Tokenizer models and the stop-word list used by preprocess_text.
nltk.download('punkt')
# Recent NLTK releases load word_tokenize's models from 'punkt_tab' rather than
# the pickled 'punkt' package, so fetch both to work across versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# SECURITY: never store an API token in the notebook source. The token that was
# previously hardcoded on this line has been exposed and should be revoked in
# the Hugging Face account settings.
# Reuse a token already present in the environment; otherwise ask for it
# interactively without echoing it into the cell output.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    from getpass import getpass

    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ").strip()

# Read the custom corpus CSV into a DataFrame.
# NOTE(review): the path lives under /content/drive, so this presumably needs
# Google Drive to be mounted first — confirm an earlier cell does that.
CORPUS_CSV_PATH = '/content/drive/MyDrive/context-aware chatbot/custom_corpus.csv'
corpus_df = pd.read_csv(CORPUS_CSV_PATH)

# Text preprocessing
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)  # deletes ASCII punctuation
_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Normalise one corpus entry for indexing.

    The text is lower-cased, stripped of ASCII punctuation, tokenised with
    NLTK's ``word_tokenize`` and filtered against the English stop-word list.

    Args:
        text: The raw cell value. Missing values (``None`` or float NaN, which
            is how pandas represents empty CSV cells) yield an empty string;
            any other non-string value is converted with ``str()`` first.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    normalised = str(text).lower().translate(_PUNCTUATION_TABLE)
    stop_words = _english_stop_words()
    return ' '.join(token for token in word_tokenize(normalised) if token not in stop_words)

# Add a normalised copy of every 'text' entry alongside the raw column.
raw_text_column = corpus_df['text']
corpus_df['cleaned_text'] = raw_text_column.map(preprocess_text)


Found existing installation: langchain 1.2.0
Uninstalling langchain-1.2.0:
  Successfully uninstalled langchain-1.2.0
Found existing installation: langchain-core 1.2.5
Uninstalling langchain-core-1.2.5:
  Successfully uninstalled langchain-core-1.2.5
Found existing installation: langchain-community 0.4.1
Uninstalling langchain-community-0.4.1:
  Successfully uninstalled langchain-community-0.4.1
Collecting langchain
  Using cached langchain-1.2.0-py3-none-any.whl.metadata (4.9 kB)
Collecting langchain-community
  Using cached langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-core<2.0.0,>=1.2.1 (from langchain)
  Using cached langchain_core-1.2.5-py3-none-any.whl.metadata (3.7 kB)
Using cached langchain-1.2.0-py3-none-any.whl (102 kB)
Using cached langchain_community-0.4.1-py3-none-any.whl (2.5 MB)
Using cached langchain_core-1.2.5-py3-none-any.whl (484 kB)
Installing collected packages: langchain-core, langchain-community, langchain
Successfully installe

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Tunables for chunking, embedding and persistence.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
FAISS_INDEX_DIR = "faiss_index"

# Wrap each cleaned corpus row in a Document, then cut the Documents into
# overlapping chunks.
documents = [
    Document(page_content=cleaned_row)
    for cleaned_row in corpus_df['cleaned_text']
]
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)
docs = text_splitter.split_documents(documents)

# Embed the chunks, index them with FAISS and persist the index to disk.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
vectorstore = FAISS.from_documents(docs, embeddings)
vectorstore.save_local(FAISS_INDEX_DIR)


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
%%writefile app.py
# app.py
import os
import streamlit as st
import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEndpoint

# -----------------------------
# HUGGINGFACE TOKEN
# -----------------------------
# The token must be supplied through the environment of the process that starts
# the app; it is deliberately not written in this file. Assigning a placeholder
# here would overwrite a valid token inherited from the parent process.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN", "").strip():
    raise RuntimeError(
        "HUGGINGFACEHUB_API_TOKEN is not set. Export your Hugging Face token "
        "in the environment before starting the app."
    )

# -----------------------------
# NLTK setup
# -----------------------------
# Streamlit re-executes this script on every interaction, so only download a
# resource when it is actually missing instead of contacting the NLTK server
# on each rerun. 'punkt_tab' is what recent NLTK releases load for
# word_tokenize; 'punkt' is kept for older releases.
_NLTK_RESOURCES = (
    ("tokenizers/punkt", "punkt"),
    ("tokenizers/punkt_tab", "punkt_tab"),
    ("corpora/stopwords", "stopwords"),
)
for _resource_path, _package_name in _NLTK_RESOURCES:
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)

# -----------------------------
# Streamlit page config
# -----------------------------
# Browser-tab title and the heading rendered at the top of the page.
PAGE_TITLE = "Context-Aware Chatbot"
PAGE_HEADING = "Context-Aware Chatbot (CSV + FAISS)"

st.set_page_config(page_title=PAGE_TITLE)
st.title(PAGE_HEADING)

# -----------------------------
# Load CSV corpus
# -----------------------------
# NOTE(review): the path lives under /content/drive, so this presumably needs
# Google Drive to be mounted in the environment running the app — confirm.
CORPUS_CSV_PATH = '/content/drive/MyDrive/context-aware chatbot/custom_corpus.csv'
corpus_df = pd.read_csv(CORPUS_CSV_PATH)

# -----------------------------
# Text preprocessing
# -----------------------------
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)  # deletes ASCII punctuation
_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once per run."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Normalise one corpus entry for indexing.

    The text is lower-cased, stripped of ASCII punctuation, tokenised with
    NLTK's ``word_tokenize`` and filtered against the English stop-word list.

    Args:
        text: The raw cell value. Missing values (``None`` or float NaN, which
            is how pandas represents empty CSV cells) yield an empty string;
            any other non-string value is converted with ``str()`` first.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    normalised = str(text).lower().translate(_PUNCTUATION_TABLE)
    stop_words = _english_stop_words()
    return ' '.join(token for token in word_tokenize(normalised) if token not in stop_words)

# Add a normalised copy of every 'text' entry alongside the raw column.
raw_text_column = corpus_df['text']
corpus_df['cleaned_text'] = raw_text_column.map(preprocess_text)

# -----------------------------
# Build the FAISS vector store
# -----------------------------
@st.cache_resource
def _build_vectorstore(cleaned_texts):
    """Chunk and embed the cleaned corpus, returning a FAISS vector store.

    Streamlit re-executes the whole script on every interaction; caching the
    result means the embedding model is loaded and the corpus embedded only
    when ``cleaned_texts`` changes, not on every rerun.

    Args:
        cleaned_texts: Tuple of preprocessed corpus strings. It is a tuple so
            Streamlit can hash it as the cache key.

    Returns:
        The in-memory FAISS vector store built from the chunked texts.
    """
    documents = [Document(page_content=text) for text in cleaned_texts]
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_documents(documents)

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    store = FAISS.from_documents(chunks, embeddings)

    # Keep writing the on-disk copy, but do not read it straight back: the
    # in-memory store is already complete, and skipping the reload avoids the
    # pickle-based deserialization that FAISS.load_local requires opting into.
    store.save_local("faiss_index")
    return store


vectorstore = _build_vectorstore(tuple(corpus_df['cleaned_text']))

# -----------------------------
# Load LLM
# -----------------------------
# Remote text2text model reached through a Hugging Face inference endpoint,
# authenticated with the token taken from the environment.
# NOTE(review): confirm this endpoint URL and model are still served.
LLM_ENDPOINT_URL = "https://api-inference.huggingface.co/models/google/flan-t5-base"
LLM_TASK = "text2text-generation"

llm = HuggingFaceEndpoint(
    task=LLM_TASK,
    huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
    endpoint_url=LLM_ENDPOINT_URL,
)

# -----------------------------
# Initialize chat history
# -----------------------------
# Create the per-session list of (speaker, message) pairs on first run only;
# an existing history is left untouched.
st.session_state.setdefault("chat_history", [])

# -----------------------------
# User input
# -----------------------------
# A form reports a submission only on the rerun that the submission itself
# triggers. With a bare text_input the previous question stays in the widget,
# so any unrelated rerun (e.g. pressing another button) would send it to the
# LLM again and append a duplicate exchange.
with st.form("chat_form", clear_on_submit=True):
    query = st.text_input("You:")
    submitted = st.form_submit_button("Send")

if submitted and query.strip():
    # Retrieve top 3 relevant chunks (a separate name keeps the module-level
    # `docs` list from being overwritten)
    retrieved_docs = vectorstore.similarity_search(query, k=3)
    context = "\n".join(doc.page_content for doc in retrieved_docs)

    # Prepare prompt
    prompt = f"Use the following context to answer the question:\n\n{context}\n\nQuestion: {query}\nAnswer:"

    # Generate response. Calling the LLM object directly (`llm(prompt)`) is the
    # deprecated `__call__` interface; `invoke` is the supported entry point.
    response = llm.invoke(prompt)

    # Save to chat history
    st.session_state.chat_history.append(("You", query))
    st.session_state.chat_history.append(("Bot", response))

# -----------------------------
# Clear chat
# -----------------------------
# The button returns True only on the rerun caused by its own click.
clear_requested = st.button("Clear Chat")
if clear_requested:
    st.session_state["chat_history"] = []

# -----------------------------
# Display chat history
# -----------------------------
# Render each exchange as "**<label>:** <message>"; anything not said by "You"
# is labelled as the bot.
for speaker, message in st.session_state.chat_history:
    label = "You" if speaker == "You" else "Bot"
    st.markdown(f"**{label}:** {message}")


Overwriting app.py
