In [1]:
# New notebook cell - PDF processing
import requests
import PyPDF2
import openai
from io import BytesIO

def extract_text_from_pdf_url(pdf_url):
    """Download PDF and extract text"""
    # Download PDF
    response = requests.get(pdf_url)
    
    # Read PDF content
    pdf_file = BytesIO(response.content)
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    
    # Extract text from all pages
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()
    
    return text

def chunk_text(text, chunk_size=1000, overlap=200):
    """Split text into overlapping chunks"""
    chunks = []
    start = 0
    
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        chunks.append(chunk)
        start = end - overlap  # Overlap for continuity
    
    return chunks
    
def process_article_fully(article_id):
    """Take an article from metadata_only to fully_processed"""
    # 1. Get article info
    article = supabase.table('articles').select('*').eq('id', article_id).execute()
    article = article.data[0]
    
    # 2. Extract PDF text
    text = extract_text_from_pdf_url(article['pdf_url'])
    
    # 3. Chunk text
    chunks = chunk_text(text)
    
    # 4. Process each chunk
    for i, chunk in enumerate(chunks):
        # Generate embedding
        embedding = openai.Embedding.create(
            input=chunk,
            model="text-embedding-ada-002"
        )['data'][0]['embedding']
        
        # Store chunk
        chunk_data = {
            'article_id': article_id,
            'chunk_text': chunk,
            'chunk_index': i,
            'embedding': embedding
        }
        supabase.table('article_chunks').insert(chunk_data).execute()
    
    # 5. Update article status
    supabase.table('articles').update({'processing_status': 'fully_processed'}).eq('id', article_id).execute()

In [2]:
pdf_url = "https://arxiv.org/pdf/1905.11833"
x = extract_text_from_pdf_url(pdf_url)

In [4]:
len(x)

61036

In [7]:
x[:150].strip()

'Interpreting and improving natural-language\nprocessing (in machines) with natural\nlanguage-processing (in the brain)\nMariya Toneva\nNeuroscience Instit'

In [6]:
y = chunk_text(x[:1500])

In [10]:
len(y)

2

In [13]:
y[0], "...", "...", y[1]

('Interpreting and improving natural-language\nprocessing (in machines) with natural\nlanguage-processing (in the brain)\nMariya Toneva\nNeuroscience Institute\nDepartment of Machine Learning\nCarnegie Mellon University\nmariya@cmu.eduLeila Wehbe\nNeuroscience Institute\nDepartment of Machine Learning\nCarnegie Mellon University\nlwehbe@cmu.edu\nAbstract\nNeural networks models for NLP are typically implemented without the explicit\nencoding of language rules and yet they are able to break one performance record\nafter another. This has generated a lot of research interest in interpreting the\nrepresentations learned by these networks. We propose here a novel interpretation\napproach that relies on the only processing system we have that does understand\nlanguage: the human brain. We use brain imaging recordings of subjects reading\ncomplex natural text to interpret word and sequence embeddings from 4recent\nNLP models - ELMo, USE, BERT and Transformer-XL. We study how their\nrepresent

In [14]:
embedding = openai.Embedding.create(input=y[0], model="text-embedding-ada-002")['data'][0]['embedding']

APIRemovedInV1: 

You tried to access openai.Embedding, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742


In [3]:
import os
from openai import OpenAI
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Create OpenAI client
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Test with your chunk
def generate_embedding(text):
    """Generate embedding using new OpenAI API"""
    response = client.embeddings.create(
        input=text,
        model="text-embedding-ada-002"
    )
    return response.data[0].embedding

# Test it
test_text = "This is a test sentence for embedding generation."
embedding = generate_embedding(test_text)

print(f"✅ Generated embedding with {len(embedding)} dimensions")
print(f"First 5 values: {embedding[:5]}")

✅ Generated embedding with 1536 dimensions
First 5 values: [-0.025302225723862648, -0.0005367214907892048, -0.0003945132193621248, 0.0063355653546750546, -0.0010434165596961975]


In [4]:
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

def generate_embedding(text):
    """Generate embedding using OpenAI"""
    response = client.embeddings.create(
        input=text,
        model="text-embedding-ada-002"
    )
    return response.data[0].embedding

In [8]:
x = generate_embedding(y[0])

In [9]:
x[:10]

[-0.03363104164600372,
 0.009249209426343441,
 0.023493262007832527,
 -0.03346948325634003,
 0.00556029612198472,
 0.012776564806699753,
 0.010097390040755272,
 0.01740790158510208,
 -0.02598395198583603,
 -0.04103579372167587]

In [10]:
import requests
import PyPDF2
from io import BytesIO
from openai import OpenAI

# Initialize OpenAI client
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

def extract_text_from_pdf_url(pdf_url):
    """Download PDF and extract text"""
    print(f"Downloading PDF from: {pdf_url}")
    response = requests.get(pdf_url)
    
    pdf_file = BytesIO(response.content)
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    
    text = ""
    for page_num, page in enumerate(pdf_reader.pages):
        text += page.extract_text()
        print(f"  Extracted page {page_num + 1}/{len(pdf_reader.pages)}")
    
    return text

def chunk_text(text, chunk_size=1000, overlap=200):
    """Split text into overlapping chunks"""
    chunks = []
    start = 0
    
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        chunks.append(chunk)
        start = end - overlap
    
    return chunks

def generate_embedding(text):
    """Generate embedding using OpenAI"""
    response = client.embeddings.create(
        input=text,
        model="text-embedding-ada-002"
    )
    return response.data[0].embedding

def process_article_fully(article_id):
    """Process an article: extract PDF, chunk, embed, store"""
    print(f"\n{'='*60}")
    print(f"Processing article: {article_id}")
    print('='*60)
    
    # 1. Get article from database
    result = supabase.table('articles').select('*').eq('id', article_id).execute()
    if not result.data:
        print("❌ Article not found")
        return
    
    article = result.data[0]
    print(f"📄 Title: {article['title']}")
    
    # 2. Extract text from PDF
    print("\n📥 Extracting PDF text...")
    text = extract_text_from_pdf_url(article['pdf_url'])
    print(f"✅ Extracted {len(text)} characters")
    
    # 3. Chunk the text
    print("\n✂️  Chunking text...")
    chunks = chunk_text(text, chunk_size=1000, overlap=200)
    print(f"✅ Created {len(chunks)} chunks")
    
    # 4. Process each chunk
    print("\n🔄 Generating embeddings and storing chunks...")
    for i, chunk in enumerate(chunks):
        print(f"  Processing chunk {i+1}/{len(chunks)}...", end='\r')
        
        # Generate embedding
        embedding = generate_embedding(chunk)
        
        # Store in database
        chunk_data = {
            'article_id': article_id,
            'chunk_text': chunk,
            'chunk_index': i,
            'embedding': embedding
        }
        
        supabase.table('article_chunks').insert(chunk_data).execute()
    
    print(f"\n✅ Stored {len(chunks)} chunks")
    
    # 5. Update article status
    print("\n📝 Updating article status...")
    supabase.table('articles')\
        .update({'processing_status': 'fully_processed'})\
        .eq('id', article_id)\
        .execute()
    
    print("✅ Article fully processed!")
    return len(chunks)

# Test with one of your saved papers
print("Getting a paper to process...")
result = supabase.table('articles')\
    .select('*')\
    .eq('processing_status', 'metadata_only')\
    .limit(1)\
    .execute()

if result.data:
    paper = result.data[0]
    print(f"Found paper: {paper['title'][:60]}...")
    
    # Process it
    num_chunks = process_article_fully(paper['id'])
    
    # Verify chunks were stored
    chunks_result = supabase.table('article_chunks')\
        .select('*')\
        .eq('article_id', paper['id'])\
        .execute()
    
    print(f"\n📊 Verification: {len(chunks_result.data)} chunks in database")
else:
    print("No unprocessed papers found")

Getting a paper to process...


NameError: name 'supabase' is not defined