# Importing the required libraries

In [None]:
# Install necessary libraries
!pip install transformers faiss-cpu sentence-transformers pandas numpy
!pip install scholarly
!pip install PyMuPDF
!pip install nltk
!pip install scikit-learn


Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m74.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sentence_transformers-3.1.1-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.3/245.3 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu, sentence-transformers
Successfully installed faiss-cpu-1.8.0.post1 sentence-transformers-3.1.1
Collecting scholarly
  Downloading scholarly-1.7.11-py3-none-any.whl.metadata (7.4 kB)
Collecting arrow (from scholarly)
  Downloading arrow-1.3.0-py3-none-any.whl.metadata (7.5 kB)
Collecting bibtex

In [None]:
import os
import json
import numpy as np
import pandas as pd
import faiss
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
import torch
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
import fitz
from scholarly import scholarly
import re


In [None]:
# Download the NLTK 'punkt' and 'stopwords' resources.
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Directories that later cells write into (raw/processed data, saved models, outputs).
project_dirs = ['data/raw', 'data/processed', 'models', 'outputs']

# Named `project_dir` rather than `dir` so the builtin dir() is not shadowed
# for the remainder of the notebook session.
for project_dir in project_dirs:
    os.makedirs(project_dir, exist_ok=True)


In [None]:
# Configuration parameters
# Name of the sentence-embedding model.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'  # Efficient for semantic search
# Name of the seq2seq model used for answer generation.
GENERATION_MODEL_NAME = 't5-base'
TOP_K = 5  # Number of documents to retrieve
CHUNK_SIZE = 500  # Number of words per document chunk
# Literature sources for the project.
# NOTE(review): only ArXiv ingestion is visible in this part of the notebook — confirm where 'pubmed' is handled.
DATA_SOURCES = ['arxiv', 'pubmed']


# Data Collection and Preprocessing

In [None]:
!pip install arxiv

Collecting arxiv
  Downloading arxiv-2.1.3-py3-none-any.whl.metadata (6.1 kB)
Collecting feedparser~=6.0.10 (from arxiv)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser~=6.0.10->arxiv)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading arxiv-2.1.3-py3-none-any.whl (11 kB)
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6047 sha256=f1548b2033aac7250825e20c0048b37b5cdb0d48f6c59ab74c2d2a8466cc001a
  Stored in directory: /root/.cache/pip/wheels/f0/69/93/a47e9d621be168e9e33c7ce60524393c0b92ae83cf6c6e89c5
Successfully built sgmllib3k
Installing collected packag

In [None]:
import arxiv

In [None]:
# Define search parameters
ARXIV_QUERY = "machine learning"  # Query string passed to fetch_arxiv_papers
MAX_RESULTS_ARXIV = 100  # Number of papers to fetch


In [None]:
def fetch_arxiv_papers(query, max_results=100):
    """Search arXiv and return metadata for the most relevant papers.

    Args:
        query: arXiv search query string.
        max_results: Maximum number of results to request.

    Returns:
        A list of dicts, one per paper, with the keys 'title', 'authors'
        (list of names), 'abstract', 'published' (formatted YYYY-MM-DD),
        'pdf_url', 'categories' and 'id' (the short arXiv identifier).
    """
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance
    )
    # Search.results() is deprecated in arxiv 2.x and emits a DeprecationWarning;
    # results are meant to be fetched through a Client instead.
    client = arxiv.Client()
    return [
        {
            'title': result.title,
            'authors': [author.name for author in result.authors],
            'abstract': result.summary,
            'published': result.published.strftime('%Y-%m-%d'),
            'pdf_url': result.pdf_url,
            'categories': result.categories,
            'id': result.get_short_id()
        }
        for result in client.results(search)
    ]

# Fetch ArXiv papers
arxiv_papers = fetch_arxiv_papers(ARXIV_QUERY, MAX_RESULTS_ARXIV)
print(f"Fetched {len(arxiv_papers)} papers from ArXiv.")


  for result in search.results():


Fetched 100 papers from ArXiv.


In [None]:
# Save ArXiv metadata to JSON
with open('data/raw/arxiv_papers.json', 'w') as f:
    json.dump(arxiv_papers, f, indent=2)


In [None]:
import fitz  # PyMuPDF
import requests


In [None]:
def download_pdf(url, save_path):
    """Download ``url`` to ``save_path``, reporting success as a boolean.

    The body is streamed into a temporary ``<save_path>.part`` file and only
    renamed to ``save_path`` once the download completed, so an interrupted
    transfer never leaves a truncated PDF that a later "already downloaded"
    check would mistake for a valid file.

    Args:
        url: Address of the PDF.
        save_path: Destination file path.

    Returns:
        True when the file was written completely, False on any failure
        (the error is printed, not raised).
    """
    tmp_path = f"{save_path}.part"
    try:
        # The context manager releases the connection even when streaming fails.
        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()
            with open(tmp_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        os.replace(tmp_path, save_path)
        return True
    except Exception as e:
        print(f"Failed to download PDF from {url}: {e}")
        # Best-effort cleanup of the partial download.
        try:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        except OSError:
            pass
        return False


In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    Any error raised while opening or reading the document is printed and an
    empty string is returned instead of propagating the exception.
    """
    try:
        with fitz.open(pdf_path) as document:
            page_texts = [page.get_text() for page in document]
        return "".join(page_texts)
    except Exception as e:
        print(f"Failed to extract text from {pdf_path}: {e}")
        return ""


In [None]:
def process_arxiv_papers(papers, download_dir='data/raw/arxiv_pdfs/'):
    """Download each paper's PDF (unless already on disk) and attach its text.

    Every dict in ``papers`` is updated in place with a 'full_text' key. The
    value is an empty string when the PDF could not be downloaded or no text
    could be extracted, so all records share the same schema.

    Args:
        papers: List of paper dicts providing at least 'pdf_url' and 'id'.
        download_dir: Directory in which the PDFs are stored.

    Returns:
        The same ``papers`` list, for convenience.
    """
    os.makedirs(download_dir, exist_ok=True)
    for paper in papers:
        # Old-style arXiv identifiers (e.g. "quant-ph/0201082v1") contain '/',
        # which os.path.join would otherwise turn into a missing sub-directory.
        file_stem = paper['id'].replace('/', '_')
        pdf_path = os.path.join(download_dir, f"{file_stem}.pdf")

        # Download PDF if not already downloaded
        if not os.path.exists(pdf_path) and not download_pdf(paper['pdf_url'], pdf_path):
            # Record the failure explicitly instead of leaving the key absent.
            paper['full_text'] = ""
            continue

        # extract_text_from_pdf already returns "" when extraction fails.
        paper['full_text'] = extract_text_from_pdf(pdf_path)

    return papers

# Process ArXiv papers
arxiv_papers = process_arxiv_papers(arxiv_papers)
print("Completed downloading and extracting PDFs from ArXiv.")


Failed to download PDF from http://arxiv.org/pdf/2206.07090v2: 404 Client Error: Not Found for url: http://arxiv.org/pdf/2206.07090v2
Failed to download PDF from http://arxiv.org/pdf/1607.02450v2: 500 Server Error: Internal Server Error for url: http://arxiv.org/pdf/1607.02450v2
Failed to download PDF from http://arxiv.org/pdf/1207.4676v2: 500 Server Error: Internal Server Error for url: http://arxiv.org/pdf/1207.4676v2
Completed downloading and extracting PDFs from ArXiv.


In [None]:
# Save enriched ArXiv data to JSON
with open('data/raw/arxiv_papers_full.json', 'w') as f:
    json.dump(arxiv_papers, f, indent=2)


In [None]:
# Load ArXiv enriched data
with open('data/raw/arxiv_papers_full.json', 'r') as f:
    arxiv_papers_full = json.load(f)

# Display the first ArXiv paper's information
print("Sample ArXiv Paper:")
print(json.dumps(arxiv_papers_full[0], indent=2))


Sample ArXiv Paper:
{
  "title": "Lecture Notes: Optimization for Machine Learning",
  "authors": [
    "Elad Hazan"
  ],
  "abstract": "Lecture notes on optimization for machine learning, derived from a course at\nPrinceton University and tutorials given in MLSS, Buenos Aires, as well as\nSimons Foundation, Berkeley.",
  "published": "2019-09-08",
  "pdf_url": "http://arxiv.org/pdf/1909.03550v1",
  "categories": [
    "cs.LG",
    "stat.ML"
  ],
  "id": "1909.03550v1",
  "full_text": "lecture notes:\nOptimization for Machine Learning\nversion 0.57\nAll rights reserved.\nElad Hazan 1\n1www.cs.princeton.edu/~ehazan\narXiv:1909.03550v1  [cs.LG]  8 Sep 2019\nii\nPreface\nThis text was written to accompany a series of lectures given at the Machine\nLearning Summer School Buenos Aires, following a lecture series at the\nSimons Center for Theoretical Computer Science, Berkeley. It was extended\nfor the course COS 598D - Optimization for Machine Learning, Princeton\nUniversity, Spring 2019.\n

In [None]:
import json
import os

# Location of the enriched ArXiv dump (metadata plus extracted full text).
arxiv_full_path = 'data/raw/arxiv_papers_full.json'

# Check if the file exists
if not os.path.exists(arxiv_full_path):
    print(f"File {arxiv_full_path} does not exist.")
else:
    # Load the data
    with open(arxiv_full_path, 'r') as f:
        arxiv_papers_full = json.load(f)
    print(f"Loaded {len(arxiv_papers_full)} ArXiv papers.")



Loaded 100 ArXiv papers.


In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Download NLTK data if not already done
nltk.download('punkt')
nltk.download('stopwords')

def clean_text(text):
    """Normalize raw text into lowercase, letters-only, space-separated words.

    Steps, in order: drop non-ASCII characters, strip URLs, remove apostrophes,
    replace every other non-letter character with a space, lowercase, and
    collapse runs of whitespace.

    Punctuation and digits are replaced by a space rather than deleted so that
    tokens separated only by punctuation ("real-valued", "a/b") stay separate
    words instead of being fused together.

    Args:
        text: Raw text; ``None`` or an empty string is accepted.

    Returns:
        The cleaned text, or "" when there is nothing to clean.
    """
    if not text:
        return ""

    # Remove non-ASCII characters
    text = text.encode("ascii", errors="ignore").decode()

    # Remove URLs (both scheme-prefixed and bare "www." forms)
    text = re.sub(r'(?:https?://|www\.)\S+', ' ', text)

    # Drop apostrophes without inserting a gap so contractions stay one word
    text = text.replace("'", "")

    # Replace special characters and digits with a space
    text = re.sub(r'[^A-Za-z\s]', ' ', text)

    # Convert to lowercase
    text = text.lower()

    # Remove extra whitespace
    return re.sub(r'\s+', ' ', text).strip()

# Attach a 'cleaned_text' field to every paper; papers without extracted text get "".
for paper in arxiv_papers_full:
    raw_text = paper.get('full_text')
    paper['cleaned_text'] = clean_text(raw_text) if raw_text else ""


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def split_into_chunks(text, max_words=500, overlap=50):
    """Split ``text`` into overlapping windows of whitespace-separated words.

    Consecutive chunks share ``overlap`` words to maintain context. Splitting
    stops as soon as a window reaches the end of the text, so no trailing
    chunk is emitted that is already fully contained in the previous one.

    Args:
        text: Text to split.
        max_words: Maximum number of words per chunk.
        overlap: Number of words shared by consecutive chunks.

    Returns:
        A list of chunk strings (empty when ``text`` has no words).

    Raises:
        ValueError: If ``overlap`` is not smaller than ``max_words``; the
            window would never advance.
    """
    step = max_words - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than max_words")

    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = start + max_words
        chunks.append(' '.join(words[start:end]))
        if end >= len(words):
            # This window already covers the tail of the text.
            break
        start += step  # Overlap to maintain context
    return chunks

# Apply segmentation to all papers
arxiv_chunks = []
for paper in arxiv_papers_full:
    cleaned_text = paper.get('cleaned_text')
    if not cleaned_text:
        # Papers without usable text contribute no chunks.
        continue

    # The chunk size comes from the configuration cell instead of a repeated literal.
    chunks = split_into_chunks(cleaned_text, max_words=CHUNK_SIZE, overlap=50)
    for idx, chunk in enumerate(chunks, start=1):
        arxiv_chunks.append({
            'paper_id': paper['id'],
            'title': paper['title'],
            'authors': paper['authors'],
            'published': paper['published'],
            'categories': paper['categories'],
            'chunk_id': f"{paper['id']}_chunk_{idx}",
            'text': chunk
        })

print(f"Created {len(arxiv_chunks)} text chunks from ArXiv papers.")


Created 1861 text chunks from ArXiv papers.


In [None]:
processed_arxiv_path = 'data/processed/arxiv_chunks.json'

# Save the chunks to a JSON file
with open(processed_arxiv_path, 'w') as f:
    json.dump(arxiv_chunks, f, indent=2)

print(f"Saved processed data to {processed_arxiv_path}.")


Saved processed data to data/processed/arxiv_chunks.json.


In [None]:
# Load the processed data
with open(processed_arxiv_path, 'r') as f:
    arxiv_chunks = json.load(f)

# Display a sample chunk
sample_chunk = arxiv_chunks[0]
print("Sample Processed Chunk:")
print(json.dumps(sample_chunk, indent=2))


Sample Processed Chunk:
{
  "paper_id": "1909.03550v1",
  "title": "Lecture Notes: Optimization for Machine Learning",
  "authors": [
    "Elad Hazan"
  ],
  "published": "2019-09-08",
  "categories": [
    "cs.LG",
    "stat.ML"
  ],
  "chunk_id": "1909.03550v1_chunk_1",
  "text": "lecture notes optimization for machine learning version all rights reserved elad hazan wwwcsprincetoneduehazan arxivv cslg sep ii preface this text was written to accompany a series of lectures given at the machine learning summer school buenos aires following a lecture series at the simons center for theoretical computer science berkeley it was extended for the course cos d optimization for machine learning princeton university spring i am grateful to paula gradu for proofreading parts of this manuscript im also thankful for the help of the following students and colleagues for corrections and suggestions to this text udaya ghai john hallman noe pion xinyi chen iii iv preface figure professor arkadi nemiro

# Document Embedding and Indexing

In [None]:
import json

# Load the processed ArXiv chunks
with open('data/processed/arxiv_chunks.json', 'r') as f:
    arxiv_chunks = json.load(f)

print(f"Loaded {len(arxiv_chunks)} ArXiv text chunks for embedding.")


Loaded 1861 ArXiv text chunks for embedding.


In [None]:
from sentence_transformers import SentenceTransformer
import torch

# Check if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Initialize the embedding model from the configuration constant so the model
# name is defined in a single place.
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME, device=device)


Using device: cuda


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
import numpy as np
from tqdm import tqdm

def generate_embeddings(chunks, model, batch_size=32):
    """Encode the 'text' of every chunk into one embedding matrix.

    Args:
        chunks: List of dicts, each with a 'text' entry.
        model: Embedding model exposing ``encode`` and
            ``get_sentence_embedding_dimension``.
        batch_size: Number of texts encoded per call to ``model.encode``.

    Returns:
        A 2-D NumPy array with one row per chunk. For empty input an empty
        array with zero rows is returned instead of raising.
    """
    texts = [chunk['text'] for chunk in chunks]
    if not texts:
        # np.vstack([]) raises ValueError; return a well-formed empty matrix.
        dim = model.get_sentence_embedding_dimension() or 0
        return np.empty((0, dim), dtype=np.float32)

    batches = [
        model.encode(
            texts[start:start + batch_size],
            batch_size=batch_size,
            convert_to_numpy=True,
            show_progress_bar=False,
        )
        for start in tqdm(range(0, len(texts), batch_size), desc="Generating Embeddings")
    ]
    return np.vstack(batches)

# Generate embeddings
arxiv_embeddings = generate_embeddings(arxiv_chunks, embedding_model, batch_size=32)
print(f"Generated embeddings with shape: {arxiv_embeddings.shape}")


Generating Embeddings: 100%|██████████| 59/59 [00:09<00:00,  6.22it/s]

Generated embeddings with shape: (1861, 384)





In [None]:
import faiss

# Width of the embedding matrix, i.e. the length of each embedding vector.
embedding_dim = arxiv_embeddings.shape[1]

# Initialize a FAISS index
index = faiss.IndexFlatL2(embedding_dim)  # Using L2 distance

# Add embeddings to the index
index.add(arxiv_embeddings)
print(f"FAISS index contains {index.ntotal} vectors.")


FAISS index contains 1861 vectors.


In [None]:
# Save the FAISS index
faiss.write_index(index, 'models/arxiv_faiss.index')
print("Saved FAISS index to models/arxiv_faiss.index.")

# Save the embeddings
np.save('models/arxiv_embeddings.npy', arxiv_embeddings)
print("Saved embeddings to models/arxiv_embeddings.npy.")


Saved FAISS index to models/arxiv_faiss.index.
Saved embeddings to models/arxiv_embeddings.npy.


In [None]:
# Read the persisted index back from disk.
loaded_index = faiss.read_index('models/arxiv_faiss.index')

# Example: use the first chunk's own embedding as the query and list its closest chunks
k = 5  # Number of nearest neighbors
query_embedding = arxiv_embeddings[0].reshape(1, -1)

distances, indices = loaded_index.search(query_embedding, k)
print(f"Top {k} nearest neighbors for the first chunk:")
for rank, neighbor_idx in enumerate(indices[0][:k], start=1):
    neighbor_chunk = arxiv_chunks[neighbor_idx]
    print(f"{rank}. Chunk ID: {neighbor_chunk['chunk_id']}, Title: {neighbor_chunk['title']}")


Top 5 nearest neighbors for the first chunk:
1. Chunk ID: 1909.03550v1_chunk_1, Title: Lecture Notes: Optimization for Machine Learning
2. Chunk ID: 1909.03550v1_chunk_44, Title: Lecture Notes: Optimization for Machine Learning
3. Chunk ID: 1909.03550v1_chunk_53, Title: Lecture Notes: Optimization for Machine Learning
4. Chunk ID: 1906.06821v2_chunk_1, Title: A Survey of Optimization Methods from a Machine Learning Perspective
5. Chunk ID: 1906.06821v2_chunk_49, Title: A Survey of Optimization Methods from a Machine Learning Perspective


# Retrieval Module Development

In [None]:
# Load the FAISS index that was written to disk earlier in the notebook
index = faiss.read_index('models/arxiv_faiss.index')

# Load the processed ArXiv chunks
with open('data/processed/arxiv_chunks.json', 'r') as f:
    arxiv_chunks = json.load(f)

print("Loaded FAISS index and ArXiv chunks for retrieval.")


Loaded FAISS index and ArXiv chunks for retrieval.


In [None]:
def retrieve_similar_chunks(query, model, index, chunks, top_k=5):
    """Return the chunks whose embeddings are closest to ``query``.

    Args:
        query: Query text.
        model: Embedding model exposing ``encode``.
        index: Vector index exposing ``search(embeddings, k)``.
        chunks: Chunk records aligned with the vectors stored in ``index``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunk records in the order returned by the index.
        Fewer are returned when the index holds fewer vectors; an empty list
        is returned when ``chunks`` is empty or ``top_k`` is not positive.
    """
    if top_k <= 0 or not chunks:
        return []

    # Generate embedding for the query
    query_embedding = model.encode([query], convert_to_numpy=True)

    # Search in FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # FAISS pads missing neighbours with -1; indexing chunks[-1] would silently
    # return the last chunk, so those placeholders are skipped.
    return [chunks[idx] for idx in indices[0] if idx >= 0]


In [None]:
sample_query = "What is Gradient Descent algorithm?"

# Retrieve similar chunks
retrieved = retrieve_similar_chunks(sample_query, embedding_model, index, arxiv_chunks, top_k=5)

# Display the retrieved chunks
for i, chunk in enumerate(retrieved, 1):
    print(f"--- Chunk {i} ---")
    print(f"Title: {chunk['title']}")
    print(f"Authors: {', '.join(chunk['authors'])}")
    print(f"Published: {chunk['published']}")
    print(f"Categories: {', '.join(chunk['categories'])}")
    print(f"Chunk ID: {chunk['chunk_id']}")
    print(f"Text: {chunk['text'][:200]}...")
    print("\n")


--- Chunk 1 ---
Title: A comprehensive review of Quantum Machine Learning: from NISQ to Fault Tolerance
Authors: Yunfei Wang, Junyu Liu
Published: 2024-01-21
Categories: quant-ph, cs.AI, cs.LG, stat.ML
Chunk ID: 2401.11351v2_chunk_13
Text: more computationally practical the essence of this section can be summarized as follows when presented with an objective function or loss function denoted as l our goal is to identify its minima the s...


--- Chunk 2 ---
Title: Category Theory in Machine Learning
Authors: Dan Shiebler, Bruno Gavranović, Paul Wilson
Published: 2021-06-13
Categories: cs.LG
Chunk ID: 2106.07032v1_chunk_4
Text: parameter updates and learning finally we discuss how lensbased formalisms for learning capture the various machine learning algorithms used in practice since pixels are not actually realvalued we may...


--- Chunk 3 ---
Title: A Survey of Optimization Methods from a Machine Learning Perspective
Authors: Shiliang Sun, Zehui Cao, Han Zhu, Jing Zhao
Published: 201

# Generation Module Integration

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Initialize the tokenizer and model.
# The model name comes from the configuration cell so it is defined in one place.
generation_model_name = GENERATION_MODEL_NAME  # We can use 't5-large' for better performance
generation_tokenizer = AutoTokenizer.from_pretrained(generation_model_name)
generation_model = AutoModelForSeq2SeqLM.from_pretrained(generation_model_name).to(device)


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
def generate_answer(query, retrieved_chunks, tokenizer, model, max_length=200):
    """Generate an answer to ``query`` conditioned on the retrieved chunks.

    Args:
        query: The user question.
        retrieved_chunks: Chunk records; their 'text' values are joined with
            spaces to form the context.
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        model: Generation model exposing ``generate`` and ``device``.
        max_length: Maximum length passed to ``model.generate``.

    Returns:
        The decoded text of the first generated sequence.
    """
    # Concatenate the texts from retrieved chunks
    context = " ".join(chunk['text'] for chunk in retrieved_chunks)

    # Add the retrieved context to the question
    input_text = f"question: {query} context: {context}"

    # Tokenize the input (truncated to 512 tokens, so any context beyond that is dropped).
    # The tensor is moved to the model's own device rather than a notebook-level
    # `device` global, so the function works with whatever model it is given.
    input_ids = tokenizer.encode(
        input_text, return_tensors='pt', truncation=True, max_length=512
    ).to(model.device)

    # Generate the answer (we can use OpenAI's GPT models for better performance)
    outputs = model.generate(input_ids, max_length=max_length, num_beams=5, early_stopping=True)

    # Decode the generated tokens
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
# Sample query
sample_query = "What is machine learning?"

# Retrieve similar chunks
retrieved_chunks = retrieve_similar_chunks(sample_query, embedding_model, index, arxiv_chunks, top_k=5)

# Generate an answer
answer = generate_answer(sample_query, retrieved_chunks, generation_tokenizer, generation_model)

print(f"Generated Answer:\n{answer}")


Generated Answer:
the process in which computers learn to make decisions based on the given data set


In [None]:
!zip -r rag_pipeline_project.zip data/ models/ outputs/ app.py
# Add any other directories or files as needed


  adding: data/ (stored 0%)
  adding: data/raw/ (stored 0%)
  adding: data/raw/pubmed_pdfs/ (stored 0%)
  adding: data/raw/pubmed_pdfs/39341632.pdf (deflated 57%)
  adding: data/raw/pubmed_pdfs/39341153.pdf (deflated 56%)
  adding: data/raw/pubmed_pdfs/39341210.pdf (deflated 77%)
  adding: data/raw/pubmed_pdfs/39340586.pdf (deflated 81%)
  adding: data/raw/pubmed_pdfs/39341304.pdf (deflated 57%)
  adding: data/raw/pubmed_pdfs/39338970.pdf (deflated 85%)
  adding: data/raw/pubmed_pdfs/39341638.pdf (deflated 57%)
  adding: data/raw/pubmed_pdfs/39341637.pdf (deflated 57%)
  adding: data/raw/pubmed_pdfs/39340756.pdf (deflated 77%)
  adding: data/raw/pubmed_pdfs/39341499.pdf (deflated 57%)
  adding: data/raw/pubmed_pdfs/39341043.pdf (deflated 57%)
  adding: data/raw/pubmed_pdfs/39340015.pdf (deflated 81%)
  adding: data/raw/pubmed_pdfs/39340739.pdf (deflated 78%)
  adding: data/raw/pubmed_pdfs/39341876.pdf (deflated 79%)
  adding: data/raw/pubmed_papers.json (deflated 68%)
  adding: data/ra

In [None]:
from google.colab import files

files.download('rag_pipeline_project.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>