In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [None]:
import os
import faiss
import torch
import numpy as np
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
import kagglehub

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"🚀 Using device: {device}")

🚀 Using device: cuda


In [None]:
# Fetch the NIPS papers dataset through kagglehub (already imported in the
# notebook's import cell) and report where the files were placed.
path = kagglehub.dataset_download("benhamner/nips-papers")
print("Path to dataset files:", path)

df = pd.read_csv(os.path.join(path, "papers.csv"))
# Fill missing paper bodies in the frame itself: later cells read
# df['paper_text'] directly, and a NaN there would break string processing.
df['paper_text'] = df['paper_text'].fillna('')
# Kept so any cell that refers to `texts` continues to work.
texts = df['paper_text']

Path to dataset files: /kaggle/input/nips-papers


In [None]:
def clean_text(text):
    """Normalise raw text for downstream vectorising, embedding and summarising.

    Steps:
      1. Drop every square-bracketed span (for example ``[12]``), including
         spans that contain line breaks.
      2. Collapse each run of whitespace (spaces, tabs, newlines) to one space.
      3. Trim leading and trailing whitespace.

    Bracket removal runs before whitespace collapsing so that deleting a
    bracketed span cannot leave a double space behind.

    Args:
        text: The raw text. Any non-string value (``None``, a float NaN from
            pandas, ...) is treated as missing.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'\[[^\]]*\]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Keep the raw body in 'paper_text' and store its cleaned form alongside it.
df['clean_text'] = (
    df['paper_text']
    .apply(clean_text)
)

In [None]:
print("📊 Training LDA model...")

# Vectorise the cleaned papers: English stop words removed, vocabulary capped
# at 1000 features.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
)
tfidf_matrix = vectorizer.fit_transform(df['clean_text'])

# Fit a 10-topic LDA model with a fixed seed; `lda_matrix` is what
# fit_transform returns for the same input matrix.
lda = LatentDirichletAllocation(
    random_state=42,
    n_components=10,
)
lda_matrix = lda.fit_transform(tfidf_matrix)

📊 Training LDA model...


In [None]:
def display_topics(model, feature_names, top_words=5):
    """Print the highest-weighted features of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic weight
            arrays (each must support ``.argsort()``).
        feature_names: Sequence mapping a feature index to its display string.
        top_words: How many of the highest-weighted features to list per topic.

    Returns:
        None. One line per topic is written to stdout, numbered from 1.
    """
    for number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:top_words]
        labels = ', '.join(feature_names[j] for j in ranked)
        print(f"🧠 Topic {number}: {labels}")

# List the default number of top terms for each fitted LDA topic.
display_topics(
    model=lda,
    feature_names=vectorizer.get_feature_names_out(),
)

🧠 Topic 1: policy, regret, reward, action, state
🧠 Topic 2: tensor, pca, manifold, principal, rank
🧠 Topic 3: model, data, posterior, distribution, models
🧠 Topic 4: image, training, model, learning, images
🧠 Topic 5: algorithm, matrix, data, learning, theorem
🧠 Topic 6: visual, stimulus, cells, motion, model
🧠 Topic 7: topic, document, word, words, dirichlet
🧠 Topic 8: network, input, neural, neurons, networks
🧠 Topic 9: tree, node, nodes, trees, belief
🧠 Topic 10: ci, 32, kl, ii, ij


In [None]:
print("🔗 Generating embeddings...")

# Sentence-embedding model placed on the device chosen earlier in the notebook.
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# Encode every cleaned paper in batches of 32, requesting normalised NumPy
# vectors and a progress bar.
embeddings = model.encode(
    df['clean_text'].tolist(),
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

🔗 Generating embeddings...


Batches:   0%|          | 0/227 [00:00<?, ?it/s]

In [None]:
print("🧠 Building FAISS index...")

# Called for its effect on `embeddings` (the return value is not used); the
# vectors were already requested normalised from the encoder.
faiss.normalize_L2(embeddings)

# Vector width is the last axis of the embedding matrix.
dimension = embeddings.shape[-1]

# HNSW index over the stored vectors; 32 is the second constructor argument
# (presumably the HNSW neighbour count M — confirm against the FAISS docs).
index = faiss.IndexHNSWFlat(dimension, 32)
index.add(embeddings)

🧠 Building FAISS index...


In [None]:
def retrieve_papers(query, top_k=3):
    """Return the papers whose embeddings are closest to ``query``.

    The query is embedded with the notebook-level ``model`` and searched
    against the notebook-level FAISS ``index``; matching rows are taken from
    ``df`` by position.

    Args:
        query: Free-text search string.
        top_k: Maximum number of papers to return.

    Returns:
        A DataFrame with the ``title`` and ``paper_text`` columns of the
        matched rows, in the order FAISS returned them. It may hold fewer than
        ``top_k`` rows (or none) when the index cannot supply that many hits.
    """
    columns = ['title', 'paper_text']

    # Never request more neighbours than the index holds, and short-circuit a
    # non-positive request instead of passing it to FAISS.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return df.iloc[[]][columns]

    query_embedding = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    # The encoder was asked for normalised output; this is kept as a cheap
    # safeguard so the query is treated exactly like the indexed vectors.
    faiss.normalize_L2(query_embedding)

    _, idxs = index.search(query_embedding, k)
    hits = idxs[0]
    # FAISS pads missing results with -1; passed to iloc that would silently
    # select the LAST row of df, so drop those placeholders.
    hits = hits[hits >= 0]
    return df.iloc[hits][columns]

In [None]:
print("📝 Loading summarization model...")

# The pipeline takes an integer device: 0 when the notebook selected "cuda",
# -1 otherwise.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    device=(-1 if device != "cuda" else 0),
)

📝 Loading summarization model...


Device set to use cuda:0


In [None]:
def summarize_text(text):
    """Summarise the opening part of a document with the notebook's summarizer.

    The text is cleaned with ``clean_text``, split on ``'. '`` into sentences,
    and the sentences are packed greedily into chunks of roughly ``max_chunk``
    characters. Only the first ``max_chunks`` chunks are summarised, so
    chunking stops as soon as that many are complete.

    Args:
        text: Raw document text.

    Returns:
        The chunk summaries joined by single spaces, or ``''`` when the
        cleaned text is empty.
    """
    max_chunk = 500   # soft character budget per chunk
    max_chunks = 2    # only the leading chunks are summarised

    text = clean_text(text)
    if not text:
        # Nothing to summarise; avoid sending an empty string to the model.
        return ''

    chunks = []
    current_chunk = ''
    for sentence in text.split('. '):
        sentence = sentence.strip()
        if not sentence:
            continue
        # Flush only a non-empty chunk, so a first sentence longer than the
        # budget does not produce an empty leading chunk.
        if current_chunk and len(current_chunk) + len(sentence) >= max_chunk:
            chunks.append(current_chunk.strip())
            current_chunk = ''
            if len(chunks) == max_chunks:
                break
        # Restore the period removed by split(), without doubling one that is
        # already there (the final sentence keeps its own).
        current_chunk += sentence if sentence.endswith('.') else sentence + '.'
        current_chunk += ' '
    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    summaries = [
        # truncation=True guards against a single oversized "sentence"
        # exceeding the model's input limit.
        summarizer(chunk, max_length=100, min_length=30, do_sample=False, truncation=True)[0]['summary_text']
        for chunk in chunks[:max_chunks]
    ]
    return ' '.join(summaries)

In [None]:
pip install gradio

Collecting gradio
  Downloading gradio-5.26.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.9.0 (from gradio)
  Downloading gradio_client-1.9.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (

In [None]:
import gradio as gr
def gradio_chatbot(query):
    """Build a Markdown report of the papers retrieved for ``query``.

    Each paper returned by ``retrieve_papers`` contributes a numbered heading
    with its title, followed by a bullet line holding the summary produced by
    ``summarize_text``.

    Args:
        query: Free-text research query typed into the UI.

    Returns:
        The concatenated Markdown with surrounding whitespace stripped.
    """
    papers = retrieve_papers(query)
    pieces = []
    pairs = zip(papers['title'], papers['paper_text'])
    for rank, (title, text) in enumerate(pairs, start=1):
        summary = summarize_text(text)
        pieces.append(f"### 📖 {rank}. {title}\n")
        pieces.append(f"🔹 {summary}\n\n")
    return "".join(pieces).strip()

# Gradio Interface
# One text box in, one Markdown panel out; every query is routed through
# gradio_chatbot.
interface = gr.Interface(
    title="🧠 AI-Powered Research Paper Finder",
    description="Enter a query (e.g., 'hidden markov model', 'neural networks') to get relevant NIPS paper summaries.",
    fn=gradio_chatbot,
    inputs=gr.Textbox(
        label="🔍 Research Query",
        lines=2,
        placeholder="Ask about Hidden Markov Models or any topic...",
    ),
    outputs=gr.Markdown(label="📚 Results"),
    theme="default",
)

# Start serving the app.
interface.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5746dd6abb9db651ef.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
if __name__ == "__main__":
    # Quick end-to-end check outside the Gradio UI. The notebook defines
    # `gradio_chatbot` (there is no `chatbot` function in it), and that
    # function returns the Markdown text rather than printing it, so print
    # the result here.
    print(gradio_chatbot("Any paper about machine learning?"))


🔍 Searching for relevant research...

📖 1. Using Machine Learning to Break Visual Human Interaction Proofs (HIPs)
   🔹 Summary:  Using Machine Learning to Break Visual Guidelines, we look for tasks where machine learning algorithms are not as good as humans with the hope of gaining insight into their current limitations .

📖 2. Online Classification on a Budget
   🔹 Summary:  In this paper we describe and analyze the simpiness of online algorithms for classification on a budget . The paper is published at the Hebrew University of Jerusalem and the University of London .

📖 3. Temporal Dynamics of Generalization in Neural Networks
   🔹 Summary:  The paper presents a rigorous characterization of how a general.nonlinear learning machine generalizes during the training process . It is trained on a random sample using a gradient .
