# In [3]:
import streamlit as st
import pandas as pd
import re

# In [None]:
# Location of the arXiv metadata snapshot file that load_arxiv_data reads.
DATASET_PATH = r"C:\Users\Roopesh P\OneDrive\Pictures\Documents\Task 4\arxiv-metadata-oai-snapshot.json"  # replace with the path to your own copy of the file

def load_arxiv_data(json_path: str):
    """Read the arXiv metadata file at ``json_path`` into a DataFrame.

    The file is parsed as line-delimited JSON (one JSON object per line).
    If reading fails for any reason, the error is shown in the Streamlit UI
    and an empty DataFrame is returned instead of the exception propagating.

    Args:
        json_path: Path of the JSON file to load.

    Returns:
        The parsed records, or an empty ``pd.DataFrame`` on failure.
    """
    try:
        return pd.read_json(json_path, lines=True)
    except Exception as exc:
        # Best-effort load: surface the problem to the user and hand the
        # caller an empty frame rather than crashing the app.
        st.error(f"Error loading data: {exc}")
        return pd.DataFrame()

def basic_search(query: str, data: pd.DataFrame, field: str = 'title', max_results: int = 5):
    """Return up to ``max_results`` rows whose ``field`` matches ``query``.

    ``query`` is interpreted as a case-insensitive regular expression. If it
    is not a valid regular expression (for example an unbalanced ``(``), it
    is searched for as literal text instead of raising ``re.error``.

    Args:
        query: Search text or regular expression.
        data: Frame to search.
        field: Name of the column to match against.
        max_results: Maximum number of matching rows to return.

    Returns:
        The first ``max_results`` matching rows of ``data``, or an empty
        ``pd.DataFrame`` when ``field`` is not a column of ``data``.
    """
    if field not in data.columns:
        return pd.DataFrame()
    try:
        pattern = re.compile(query, re.IGNORECASE)
    except re.error:
        # Free-text input such as "(QCD" is not valid regex syntax; match it
        # literally rather than crashing the caller.
        pattern = re.compile(re.escape(query), re.IGNORECASE)
    # na=False treats missing values in the column as non-matches.
    matches = data[field].str.contains(pattern, na=False)
    return data[matches].head(max_results)

def dummy_summarize_text(text: str, max_words: int = 50, max_chars: int = 200) -> str:
    """Return a crude "summary" of ``text`` by truncating long input.

    Text with more than ``max_words`` whitespace-separated words is cut to
    its first ``max_chars`` characters and suffixed with ``"..."``. The
    ellipsis is only added when characters were actually dropped, so text
    that already fits in ``max_chars`` is returned unchanged.

    Args:
        text: Text to summarize. Non-string values (e.g. ``None`` or a NaN
            placeholder for a missing value) yield an empty string.
        max_words: Word count above which truncation is considered.
        max_chars: Number of leading characters kept when truncating.

    Returns:
        The original text, or its truncated form followed by ``"..."``.
    """
    if not isinstance(text, str):
        return ""
    # The word count decides whether to shorten, but the cut is made in
    # characters, so also require that the cut removes something.
    if len(text.split()) <= max_words or len(text) <= max_chars:
        return text
    return text[:max_chars] + "..."

def main():
    """Render the Streamlit page: dataset loading, search box, and results.

    The dataset is loaded on demand via the "Load" button and kept in
    ``st.session_state["arxiv_data"]``. Once data is present, a text input
    is shown and each non-empty query is run against the ``title`` column
    with ``basic_search``; matches are rendered with title, authors,
    categories, and a truncated abstract.
    """
    st.title("ArXiv Expert Chatbot")
    st.write("This application allows you to explore scientific papers from the arXiv dataset.")

    st.subheader("Load Dataset")
    if st.button("Load"):
        st.write("Loading data...")
        data = load_arxiv_data(DATASET_PATH)
        if data.empty:
            # load_arxiv_data returns an empty frame when reading fails, so
            # do not report success or overwrite previously loaded data.
            st.warning("No records were loaded. Check DATASET_PATH and try again.")
        else:
            st.session_state["arxiv_data"] = data
            st.success("Dataset loaded successfully!")

    if "arxiv_data" in st.session_state:
        st.subheader("Ask a Question")
        user_query = st.text_input("Search term or question about papers (e.g., 'quantum computing'):")

        if user_query:
            results = basic_search(user_query, st.session_state["arxiv_data"], field='title')
            if not results.empty:
                st.write(f"Showing top {len(results)} results:")
                for _, row in results.iterrows():
                    st.markdown(f"### Title: {row.get('title', 'N/A')}")
                    st.markdown(f"**Authors:** {row.get('authors', 'N/A')}")
                    st.markdown(f"**Category:** {row.get('categories', 'N/A')}")

                    abstract_text = row.get('abstract', 'N/A')
                    if not isinstance(abstract_text, str):
                        # A missing abstract may be a non-string placeholder
                        # (e.g. NaN); show the same fallback as an absent key.
                        abstract_text = 'N/A'
                    summary_text = dummy_summarize_text(abstract_text)
                    st.markdown(f"**Summary:** {summary_text}")
                    st.markdown("---")
            else:
                st.write("No matching papers found. Try a different search term.")

# Script entry point: build the page only when this file is run directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()