<a href="https://colab.research.google.com/github/Amisha1019/Gen-AI-Customer-Service-Bot-Internship-Task/blob/main/Task_4_of_Gen_Ai_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install streamlit sentence-transformers faiss-cpu transformers torch pandas scikit-learn


Collecting streamlit
  Downloading streamlit-1.51.0-py3-none-any.whl.metadata (9.5 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.51.0-py3-none-any.whl (10.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu, pydeck, streamlit
Successfully installed fais

In [60]:
# Colab-only helper: opens a browser file picker and stores the result of
# `files.upload()` in `uploaded`.
from google.colab import files
uploaded = files.upload()


Saving streamlit_arxiv_chatbot.py to streamlit_arxiv_chatbot (1).py


In [62]:
# Start the Streamlit app from streamlit_arxiv_chatbot.py on port 8501 as a
# background shell job (trailing `&`); `&>/dev/null` discards its output.
!streamlit run streamlit_arxiv_chatbot.py --server.port 8501 &>/dev/null&


In [63]:
!pip install kagglehub[pandas-datasets]



In [64]:
import kagglehub
from kagglehub import KaggleDatasetAdapter


In [12]:
# NOTE(review): despite the name, this is the URL of the Kaggle dataset page,
# not a path to a local file. No other cell visible in this notebook reads it.
file_path = "https://www.kaggle.com/datasets/Cornell-University/arxiv"

In [16]:
import os
import sys
import argparse
import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from sklearn.preprocessing import normalize
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import torch
import json
from typing import List, Tuple

In [18]:
@st.cache_resource
def load_sentence_model(name: str = 'all-MiniLM-L6-v2'):
    """Instantiate a sentence-transformers encoder.

    Args:
        name: Model identifier handed straight to ``SentenceTransformer``.

    Returns:
        The constructed ``SentenceTransformer`` instance. The function is
        wrapped in ``st.cache_resource``.
    """
    encoder = SentenceTransformer(name)
    return encoder

In [20]:
@st.cache_data
def load_arxiv_csv(path: str, filter_prefix: str = 'cs') -> pd.DataFrame:
    """Read the arXiv metadata CSV and keep only the rows/columns of interest.

    Args:
        path: Location of the CSV file, passed to ``pd.read_csv``.
        filter_prefix: When the file has a ``categories`` column, only rows
            whose categories string starts with this prefix are kept
            (missing values are treated as the empty string).

    Returns:
        A DataFrame restricted to whichever of the known metadata columns
        are present in the file. If rows were filtered, the index is reset.
    """
    wanted_columns = ('id', 'title', 'abstract', 'categories', 'authors', 'doi', 'submit_date', 'pdf_url')

    papers = pd.read_csv(path)

    if 'categories' in papers.columns:
        category_text = papers['categories'].fillna('')
        keep_rows = category_text.str.startswith(filter_prefix)
        papers = papers[keep_rows].reset_index(drop=True)

    available = [column for column in wanted_columns if column in papers.columns]
    return papers[available]



In [22]:
@st.cache_data
def compute_embeddings(texts: List[str], model_name: str = 'all-MiniLM-L6-v2', batch_size: int = 64) -> np.ndarray:
    """Encode ``texts`` with a sentence-transformers model and normalise them.

    Args:
        texts: Strings to embed.
        model_name: Encoder name, resolved through ``load_sentence_model``.
        batch_size: Batch size forwarded to ``encode``.

    Returns:
        The encoder output after scikit-learn's ``normalize`` (called with
        its default settings).
    """
    encoder = load_sentence_model(model_name)
    raw_vectors = encoder.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
    )
    return normalize(raw_vectors)



In [84]:
@st.cache_resource
def build_faiss_index(embeddings: np.ndarray) -> Tuple[faiss.IndexFlatIP, np.ndarray]:
    """Create a flat inner-product FAISS index holding ``embeddings``.

    Args:
        embeddings: 2-D array; its second dimension is used as the index
            dimensionality.

    Returns:
        A ``(index, embeddings)`` tuple. Callers that only need the index
        must unpack the result (``index, _ = build_faiss_index(...)``).
    """
    vector_dim = embeddings.shape[1]
    flat_index = faiss.IndexFlatIP(vector_dim)
    flat_index.add(embeddings)
    return flat_index, embeddings

In [66]:
def retrieve(query: str, index: faiss.IndexFlatIP, embedder: SentenceTransformer, df: pd.DataFrame, top_k: int = 5) -> pd.DataFrame:
    """Return the rows of ``df`` most similar to ``query``.

    Args:
        query: Free-text query to embed.
        index: FAISS index to search (positions are used as row positions
            into ``df``).
        embedder: Encoder used to embed the query.
        df: Paper metadata, positionally aligned with the indexed vectors.
        top_k: Maximum number of neighbours to request.

    Returns:
        A copy of the matching rows with an added ``score`` column holding
        the similarity returned by the index. May contain fewer than
        ``top_k`` rows when the index holds fewer vectors.
    """
    query_vec = embedder.encode([query], convert_to_numpy=True)
    query_vec = normalize(query_vec)
    scores, positions = index.search(query_vec, top_k)

    # FAISS pads the result with id -1 when it cannot find `top_k` neighbours.
    # `df.iloc[-1]` would silently return the *last* row as a bogus hit, so
    # drop those placeholder entries before indexing.
    found = positions[0] >= 0
    hits = df.iloc[positions[0][found]].copy()
    hits['score'] = scores[0][found]
    return hits

In [67]:
@st.cache_resource
def get_summarizer(model_name: str = 'facebook/bart-large-cnn'):
    """Build a Hugging Face ``summarization`` pipeline.

    Args:
        model_name: Checkpoint passed as the pipeline's ``model``.

    Returns:
        The pipeline object, placed on device ``0`` when CUDA is available
        and on ``-1`` otherwise.
    """
    if torch.cuda.is_available():
        target_device = 0
    else:
        target_device = -1
    return pipeline('summarization', model=model_name, device=target_device)

In [68]:
class ExplanationEngine:
    """Holds the optional local HF model and/or OpenAI key used for answers."""

    def __init__(self, hf_model: str = None, openai_key: str = None):
        """Store the configuration and try to load the HF model, if requested.

        Args:
            hf_model: Hugging Face causal-LM name; falsy means "do not load".
            openai_key: OpenAI API key kept for later use; may be ``None``.

        A load failure is reported through ``st.warning`` and leaves
        ``self.model`` as ``None`` instead of raising.
        """
        cuda_ready = torch.cuda.is_available()

        self.hf_model = hf_model
        self.openai_key = openai_key
        self.device = 0 if cuda_ready else 'cpu'
        self.tokenizer = None
        self.model = None

        # Nothing to load when no local model was requested.
        if not hf_model:
            return

        try:
            self.tokenizer = AutoTokenizer.from_pretrained(hf_model)
            self.model = AutoModelForCausalLM.from_pretrained(
                hf_model,
                torch_dtype=torch.float16 if cuda_ready else torch.float32,
                device_map='auto' if cuda_ready else None,
            )
        except Exception as err:
            st.warning(f"Failed to load HF model {hf_model}: {err}")
            self.model = None

In [69]:
def explain(self, prompt: str, max_tokens: int = 256, temperature: float = 0.2) -> str:
    """Generate an answer for ``prompt`` with the local model or OpenAI.

    Args:
        self: Object exposing ``model``, ``tokenizer``, ``device`` and
            ``openai_key`` (an ``ExplanationEngine``).
        prompt: Full prompt text.
        max_tokens: Upper bound on newly generated tokens.
        temperature: Sampling temperature.

    Returns:
        The generated text. If neither backend is configured, or the OpenAI
        call fails, a human-readable message is returned instead of raising.
    """
    if self.model is not None:
        prompt_ids = self.tokenizer(prompt, return_tensors='pt').input_ids.to(self.device)
        outputs = self.model.generate(prompt_ids, max_new_tokens=max_tokens, do_sample=True, temperature=temperature)
        # A causal LM returns prompt + continuation; decode only the new
        # tokens so the answer does not echo the whole prompt back.
        completion_ids = outputs[0][prompt_ids.shape[-1]:]
        return self.tokenizer.decode(completion_ids, skip_special_tokens=True)

    if not self.openai_key:
        return "No explanation model available. Provide a Hugging Face model name or an OpenAI API key."

    try:
        import openai

        messages = [
            {'role': 'system', 'content': 'You are a helpful scientific research assistant.'},
            {'role': 'user', 'content': prompt},
        ]
        if hasattr(openai, 'OpenAI'):
            # openai>=1.0 removed `openai.ChatCompletion`; use the client API.
            client = openai.OpenAI(api_key=self.openai_key)
            resp = client.chat.completions.create(
                model='gpt-4o-mini',
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
            )
            return resp.choices[0].message.content

        # Legacy (<1.0) SDK path.
        openai.api_key = self.openai_key
        resp = openai.ChatCompletion.create(
            model='gpt-4o-mini',
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        return resp['choices'][0]['message']['content']
    except Exception as e:
        return f"OpenAI fallback failed: {e}"


# `explain` is written as a method (it takes `self`) but lives in its own cell,
# outside the class body, so `engine.explain(...)` would raise AttributeError.
# Attach it to the class here; the class cell must have been run first.
if 'ExplanationEngine' in globals():
    ExplanationEngine.explain = explain

In [70]:
def build_rag_prompt(query: str, retrieved_papers: pd.DataFrame) -> str:
    """Assemble the retrieval-augmented prompt sent to the explanation model.

    Args:
        query: The user's question, inserted verbatim after the preamble.
        retrieved_papers: One row per retrieved paper; ``title``,
            ``categories`` and ``abstract`` are read when present.

    Returns:
        Preamble + one ``Title/Categories/Abstract`` section per paper
        (each terminated by ``---``) + closing instructions.
    """
    preamble = "You are an expert research assistant. The user asks:\n" + query + "\n\nUse the following retrieved paper abstracts as context. Cite paper titles where helpful.\n\n"
    closing = "\nAnswer thoroughly, explain concepts step-by-step, and suggest follow-up readings from the provided papers."

    sections = [
        f"Title: {paper.get('title','<no title>')}\nCategories: {paper.get('categories','')}\nAbstract: {paper.get('abstract','')}\n---\n"
        for _, paper in retrieved_papers.iterrows()
    ]
    return preamble + ''.join(sections) + closing

In [83]:
def main():
    """Render the Streamlit app: settings, index building, retrieval, answers.

    Flow on every Streamlit rerun:
      1. Read settings from the sidebar and load the filtered arXiv CSV.
      2. Load cached embeddings from ``embeddings.npz`` (if present) and build
         the FAISS index, or rebuild both when the user presses the button.
      3. On "Retrieve": show the top papers, a summary of their abstracts and
         a generated explanation.
    """
    st.set_page_config(page_title='ArXiv Domain Expert Chatbot', layout='wide')
    st.title('ArXiv Domain Expert Chatbot — Prototype')

    st.sidebar.header('Settings & Data')
    data_path = st.sidebar.text_input('Path to arXiv CSV (Kaggle)', value='./arxiv-metadata-oai-snapshot.csv')
    filter_prefix = st.sidebar.text_input('Filter category prefix', value='cs')
    model_name = st.sidebar.text_input('Embedding model (sentence-transformers)', value='all-MiniLM-L6-v2')
    hf_explain_model = st.sidebar.text_input('HF explanation model (optional)', value='')
    openai_key = st.sidebar.text_input('OpenAI API key (optional)', value='', type='password')

    st.sidebar.markdown('**Notes:** If you have a local GPU and a HF chat/causal model, set its name above to use it for explanations. Otherwise provide an OpenAI key for fallback. For production, consider hosted vector DBs and model serving.')

    df = None
    if not os.path.exists(data_path):
        st.warning('arXiv CSV not found at the given path. Please download from Kaggle: https://www.kaggle.com/datasets/Cornell-University/arxiv and point to the CSV file.')
    else:
        with st.spinner('Loading dataset...'):
            df = load_arxiv_csv(data_path, filter_prefix=filter_prefix)
            st.success(f'Loaded {len(df)} papers (filtered by {filter_prefix}*).')

    embeddings = None
    index = None
    if os.path.exists('embeddings.npz'):
        # Context manager closes the underlying .npz archive after reading.
        with np.load('embeddings.npz') as arr:
            embeddings = arr['embeddings']
        # build_faiss_index returns (index, embeddings); binding the whole
        # tuple to `index` would break `index.search` inside `retrieve`.
        index, _ = build_faiss_index(embeddings)

    if df is not None and st.button('Build / Rebuild embeddings & index'):
        with st.spinner('Computing embeddings... This may take a while on CPU.'):
            texts = (df['title'].fillna('') + '. ' + df['abstract'].fillna('')).tolist()
            embeddings = compute_embeddings(texts, model_name=model_name)
            np.savez_compressed('embeddings.npz', embeddings=embeddings)
            index, _ = build_faiss_index(embeddings)
            st.success('Index built and embeddings saved to embeddings.npz')

    # Index positions are used as row positions into `df`; embeddings cached
    # for a different CSV or category filter would return the wrong papers.
    if df is not None and embeddings is not None and len(embeddings) != len(df):
        st.warning('Saved embeddings do not match the loaded dataset (row counts differ). Rebuild embeddings & index.')
        index = None

    st.header('Search papers / Ask a question')
    query = st.text_input('Enter a research question or keywords', value='Transformer explainability methods')
    top_k = st.slider('Number of retrieved papers', 1, 10, 5)

    if st.button('Retrieve'):
        if index is None:
            st.error('No FAISS index found. Build embeddings first.')
        elif df is None:
            # embeddings.npz can exist while the CSV path is wrong/missing.
            st.error('Dataset not loaded. Provide a valid arXiv CSV path before retrieving.')
        else:
            embedder = load_sentence_model(model_name)
            hits = retrieve(query, index, embedder, df, top_k=top_k)
            st.subheader('Top retrieved papers')
            for _, row in hits.iterrows():
                # A missing abstract is NaN (a float) and cannot be sliced.
                abstract = row.get('abstract', '')
                abstract = '' if pd.isna(abstract) else str(abstract)
                st.markdown(f"**{row['title']}** ")
                st.markdown(f"*Categories:* {row.get('categories','')} ")
                st.markdown(f"{abstract[:800]}{'...' if len(abstract) > 800 else ''}")
                st.markdown(f"Score: {row['score']:.4f}")
                st.markdown('---')

            # Summarise the retrieved abstracts (capped at 2000 characters).
            summarizer = get_summarizer()
            concat_abstracts = ' '.join(hits['abstract'].fillna('').tolist())[:2000]
            if concat_abstracts.strip():
                with st.spinner('Generating summary...'):
                    summary = summarizer(concat_abstracts, max_length=200, min_length=60, do_sample=False)[0]['summary_text']
                st.subheader('Retrieved-context summary')
                st.write(summary)

            # Blank sidebar fields are passed on as "not provided" (None).
            engine = ExplanationEngine(hf_model=hf_explain_model if hf_explain_model.strip() else None, openai_key=openai_key if openai_key.strip() else None)
            rag_prompt = build_rag_prompt(query, hits)
            with st.spinner('Generating explanation...'):
                explanation = engine.explain(rag_prompt, max_tokens=400)
            st.subheader('Explanation / Step-by-step answer')
            st.write(explanation)

    st.sidebar.header('Developer / Next steps')
    st.sidebar.markdown('- Persist embeddings and metadata to a vector DB (Milvus, Weaviate, Pinecone) for scale.\n- Use a hosted LLM or GPU server for large HF models.\n- Add user authentication and paper download links.\n- Add citations with DOIs and an explanation of confidence.\n')

In [72]:
# Scratch copy of the "build index" step from main(); it relies on `df` and
# `model_name` already existing in the notebook namespace.
if st.button('Build / Rebuild embeddings & index'):
    with st.spinner('Computing embeddings... This may take a while on CPU.'):
        texts = (df['title'].fillna('') + '. ' + df['abstract'].fillna('')).tolist()
        embeddings = compute_embeddings(texts, model_name=model_name)
        np.savez_compressed('embeddings.npz', embeddings=embeddings)
        # build_faiss_index returns (index, embeddings); keep only the index so
        # `index.search` works in retrieve().
        index, _ = build_faiss_index(embeddings)
        st.success('Index built and embeddings saved to embeddings.npz')




In [73]:
# On "Retrieve": run the search and stash the hits in session state, or report
# that no index is available yet.
if st.button('Retrieve'):
    if index is not None:
        embedder = load_sentence_model(model_name)
        hits = retrieve(query, index, embedder, df, top_k=top_k)
        st.session_state['hits'] = hits  # store for later use
    else:
        st.error('No FAISS index found. Build embeddings first.')




In [74]:
# Make sure the session-state slot exists before it is read below.
if 'hits' not in st.session_state:
    st.session_state['hits'] = None

if st.button('Retrieve'):
    if index is None:
        st.error('No FAISS index found. Build embeddings first.')
    else:
        # Load the encoder here: `embedder` is otherwise only assigned inside
        # a conditional branch of another cell and may not exist.
        embedder = load_sentence_model(model_name)
        st.session_state['hits'] = retrieve(query, index, embedder, df, top_k=top_k)

hits = st.session_state['hits']
if hits is not None:
    st.subheader('Top retrieved papers')




In [75]:
# Scratch copy of the search section of main(); it relies on `df`,
# `model_name`, `hf_explain_model` and `openai_key` already existing in the
# notebook namespace.
embeddings = None
index = None
if os.path.exists('embeddings.npz'):
    arr = np.load('embeddings.npz')
    embeddings = arr['embeddings']
    # build_faiss_index returns (index, embeddings); keep only the index.
    index, _ = build_faiss_index(embeddings)


st.header('Search papers / Ask a question')
query = st.text_input('Enter a research question or keywords', value='Transformer explainability methods')
top_k = st.slider('Number of retrieved papers', 1, 10, 5)


if st.button('Retrieve'):
    if index is None:
        st.error('No FAISS index found. Build embeddings first.')
    else:
        embedder = load_sentence_model(model_name)
        hits = retrieve(query, index, embedder, df, top_k=top_k)
        st.subheader('Top retrieved papers')
        for i, row in hits.iterrows():
            st.markdown(f"**{row['title']}** ")
            st.markdown(f"*Categories:* {row.get('categories','')} ")
            st.markdown(f"{row.get('abstract','')[:800]}{'...' if len(row.get('abstract',''))>800 else ''}")
            st.markdown(f"Score: {row['score']:.4f}")
            st.markdown('---')

        # Summary and explanation need `hits`, so they belong inside this
        # branch (the summarizer is also created once, not per result row).
        summarizer = get_summarizer()
        # Abstracts are joined with a space, not the literal word 'abstracts'.
        concat_abstracts = ' '.join(hits['abstract'].fillna('').tolist())[:2000]
        if concat_abstracts.strip():
            with st.spinner('Generating summary...'):
                summary = summarizer(concat_abstracts, max_length=200, min_length=60, do_sample=False)[0]['summary_text']
            st.subheader('Retrieved-context summary')
            st.write(summary)

        # Explanation generation
        engine = ExplanationEngine(hf_model=hf_explain_model if hf_explain_model.strip() else None, openai_key=openai_key if openai_key.strip() else None)
        rag_prompt = build_rag_prompt(query, hits)
        with st.spinner('Generating explanation...'):
            explanation = engine.explain(rag_prompt, max_tokens=400)
        st.subheader('Explanation / Step-by-step answer')
        st.write(explanation)


st.sidebar.header('Developer / Next steps')
st.sidebar.markdown('- Persist embeddings and metadata to a vector DB (Milvus, Weaviate, Pinecone) for scale.\n- Use a hosted LLM or GPU server for large HF models.\n- Add user authentication and paper download links.\n- Add citations with DOIs and an explanation of confidence.\n')



In [82]:
# Scratch copy of the search section of main(); it relies on `df`,
# `model_name`, `hf_explain_model` and `openai_key` already existing in the
# notebook namespace.
embeddings = None
index = None
if os.path.exists('embeddings.npz'):
    arr = np.load('embeddings.npz')
    embeddings = arr['embeddings']
    # build_faiss_index returns (index, embeddings); keep only the index.
    index, _ = build_faiss_index(embeddings)


st.header('Search papers / Ask a question')
query = st.text_input('Enter a research question or keywords', value='Transformer explainability methods')
top_k = st.slider('Number of retrieved papers', 1, 10, 5)


if st.button('Retrieve'):
    if index is None:
        st.error('No FAISS index found. Build embeddings first.')
    else:
        embedder = load_sentence_model(model_name)
        hits = retrieve(query, index, embedder, df, top_k=top_k)
        st.subheader('Top retrieved papers')
        for i, row in hits.iterrows():
            st.markdown(f"**{row['title']}** ")
            st.markdown(f"*Categories:* {row.get('categories','')} ")
            st.markdown(f"{row.get('abstract','')[:800]}{'...' if len(row.get('abstract',''))>800 else ''}")
            st.markdown(f"Score: {row['score']:.4f}")
            st.markdown('---')

        # summarization example
        # Kept inside the `else` branch: `hits` only exists when an index was
        # available, so running this after the error path raises NameError.
        summarizer = get_summarizer()
        concat_abstracts = ' '.join(hits['abstract'].fillna('').tolist())[:2000]
        if concat_abstracts.strip():
            with st.spinner('Generating summary...'):
                summary = summarizer(concat_abstracts, max_length=200, min_length=60, do_sample=False)[0]['summary_text']
            st.subheader('Retrieved-context summary')
            st.write(summary)

        engine = ExplanationEngine(hf_model=hf_explain_model if hf_explain_model.strip() else None, openai_key=openai_key if openai_key.strip() else None)
        rag_prompt = build_rag_prompt(query, hits)
        with st.spinner('Generating explanation...'):
            explanation = engine.explain(rag_prompt, max_tokens=400)
        st.subheader('Explanation / Step-by-step answer')
        st.write(explanation)


st.sidebar.header('Developer / Next steps')
st.sidebar.markdown('- Persist embeddings and metadata to a vector DB (Milvus, Weaviate, Pinecone) for scale.\n- Use a hosted LLM or GPU server for large HF models.\n- Add user authentication and paper download links.\n- Add citations with DOIs and an explanation of confidence.\n')



DeltaGenerator(_root_container=1, _parent=DeltaGenerator())

In [80]:
# Entry point: build the Streamlit page when this module is run as a script.
if __name__ == '__main__':
    main()

