In [14]:
import requests

# List of PDF URLs
# NOTE(review): the third entry is byte-for-byte identical to the second
# (Section C, technical, iss09), so document_2.pdf and document_3.pdf end up
# with the same content. Presumably a different section was intended for the
# third slot -- TODO confirm and replace the URL.
pdf_urls = [
    "https://www.fia.com/sites/default/files/fia_2026_f1_regulations_-_section_b_sporting_-_iss01_-_2024-10-17.pdf",
    "https://www.fia.com/sites/default/files/fia_2026_f1_regulations_-_section_c_technical_-_iss09_-_2024-10-17.pdf",
    "https://www.fia.com/sites/default/files/fia_2026_f1_regulations_-_section_c_technical_-_iss09_-_2024-10-17.pdf"
]

def download_pdf(url, filename, timeout=30):
    """Download the resource at `url` and save the response body to `filename`.

    Args:
        url: Address to fetch with an HTTP GET request.
        filename: Path of the local file to write (opened in binary mode).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests.get` can block indefinitely on a stalled
            connection, which would hang the notebook cell.

    Returns:
        True if the server answered with status 200 and the file was written,
        False otherwise. Failures are reported with `print` instead of being
        raised, so a loop over several URLs keeps going after one bad download.
    """
    try:
        response = requests.get(url, timeout=timeout)

        # Anything other than 200 is treated as a failed download.
        if response.status_code != 200:
            print(f"Failed to download {filename}. Status code: {response.status_code}")
            return False

        # The body is already fully in memory here, so the file is only
        # created once the whole download has succeeded.
        with open(filename, 'wb') as file:
            file.write(response.content)
    except (requests.RequestException, OSError) as e:
        # Network problems (including timeouts) and file-system errors.
        print(f"Error occurred while downloading {filename}: {str(e)}")
        return False

    print(f"Downloaded: {filename}")
    return True

# Fetch every URL in order; files are numbered from 1 (document_1.pdf, ...)
for position, pdf_url in enumerate(pdf_urls, start=1):
    download_pdf(pdf_url, f"document_{position}.pdf")


Downloaded: document_1.pdf
Downloaded: document_2.pdf
Downloaded: document_3.pdf


In [16]:
pip install boto3


Note: you may need to restart the kernel to use updated packages.


In [18]:
import boto3
import os

# Step 3: Set up boto3 client
# No access key or secret is passed here on purpose: secrets must not be stored
# in the notebook. boto3 resolves credentials through its default chain
# (e.g. AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables,
# the shared ~/.aws/credentials file, or an attached IAM role).
s3 = boto3.client('s3', region_name='us-east-2')

bucket_name = "f1poc"

# Upload PDFs to S3
# The object key in the bucket is the same as the local file name.
for idx in range(1, 4):  # Assuming you have three PDFs downloaded
    pdf_filename = f"document_{idx}.pdf"
    if os.path.exists(pdf_filename):
        s3.upload_file(pdf_filename, bucket_name, pdf_filename)
        print(f"Uploaded {pdf_filename} to S3 bucket {bucket_name}")
    else:
        print(f"File {pdf_filename} not found for upload.")


Uploaded document_1.pdf to S3 bucket f1poc
Uploaded document_2.pdf to S3 bucket f1poc
Uploaded document_3.pdf to S3 bucket f1poc


In [20]:
pip install PyPDF2


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.


In [36]:
# API keys are read from environment variables so that no secret is ever saved
# inside the notebook file. Set PINECONE_API_KEY and OPENAI_API_KEY in the
# environment before starting the kernel.
import os


def _require_env(name):
    """Return the value of environment variable `name`, or raise if it is unset or empty."""
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(f"Environment variable {name} is not set")
    return value


PINECONE_API_KEY = _require_env("PINECONE_API_KEY")
OPENAI_API_KEY = _require_env("OPENAI_API_KEY")

In [46]:
import os
import tiktoken
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI

# Initialize Pinecone
# PINECONE_API_KEY must already be defined (it is not set in this cell).
pc = Pinecone(api_key=PINECONE_API_KEY)

# Initialize OpenAI
# `client` is used further down to request embeddings; OPENAI_API_KEY must
# already be defined as well.
client = OpenAI(api_key=OPENAI_API_KEY)

def chunk_text(text, max_tokens=8000):
    """Split `text` into consecutive pieces of at most `max_tokens` tokens each.

    The text is tokenized with tiktoken's "cl100k_base" encoding, the token
    list is cut into back-to-back, non-overlapping windows of `max_tokens`
    tokens, and every window is decoded back into a string.

    Args:
        text: The string to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.

    Returns:
        A list of strings in document order. An empty token list yields an
        empty list.

    Raises:
        ValueError: If `max_tokens` is zero or negative. A negative value
            would otherwise produce an empty range and silently return no
            chunks at all, dropping the whole text.
    """
    # Validate before doing any tokenization work.
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens}")

    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)

    # NOTE(review): windows are cut purely by token count, so a boundary can
    # fall in the middle of a word or sentence -- confirm that is acceptable
    # for retrieval quality.
    return [
        encoding.decode(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]

# Make sure the Pinecone index exists, creating it on first use
index_name = "f1-regulations"
existing_index_names = {existing.name for existing in pc.list_indexes()}

if index_name not in existing_index_names:
    try:
        # 1536 is the vector dimension for text-embedding-ada-002
        serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
        pc.create_index(
            name=index_name,
            dimension=1536,
            metric='cosine',
            spec=serverless_spec,
        )
        print(f"Index {index_name} created successfully")
    except Exception as e:
        # Report the problem, then re-raise: nothing below can work without the index
        print(f"Failed to create index: {str(e)}")
        raise

# Handle used for all subsequent operations on the index
index = pc.Index(index_name)

# Process documents: embed every chunk of each text file and store it in Pinecone.
# Vectors are collected per document and upserted in batches instead of one
# network round trip per chunk. The batch size is kept small because each
# vector carries 1536 floats and a single upsert request is size-limited.
UPSERT_BATCH_SIZE = 100

for idx in range(1, 4):
    text_filename = f"document_{idx}.txt"
    if not os.path.exists(text_filename):
        print(f"{text_filename} not found")
        continue

    try:
        with open(text_filename, 'r') as text_file:
            text = text_file.read()

        chunks = chunk_text(text)

        vectors = []
        for chunk_idx, chunk in enumerate(chunks):
            response = client.embeddings.create(input=chunk, model="text-embedding-ada-002")
            embedding = response.data[0].embedding

            # IDs are deterministic (file name + chunk position), so re-running
            # the cell overwrites existing vectors instead of duplicating them.
            chunk_id = f"{text_filename}_chunk_{chunk_idx}"
            vectors.append((chunk_id, embedding, {"source": text_filename}))

        # Upsert in slices; an empty document results in no upsert call at all.
        for start in range(0, len(vectors), UPSERT_BATCH_SIZE):
            index.upsert(vectors[start:start + UPSERT_BATCH_SIZE])

        print(f"Processed {text_filename} in {len(chunks)} chunks")
    except Exception as e:
        # Best effort: report the failure and continue with the next document.
        print(f"Error processing {text_filename}: {str(e)}")

print("Processing complete.")

Index f1-regulations created successfully
Processed document_1.txt in 8 chunks
Processed document_2.txt in 21 chunks
Processed document_3.txt in 21 chunks
Processing complete.
