# <h1 align="center"><font color="green">Building reliable apps with GPT-4o and structured outputs</font></h1>

<font color="pink">Senior Data Scientist: Dr. Eddy Giusepe Chirinos Isidro</font>

Link de estudo:

* [Weights & Biases](https://wandb.ai/byyoung3/ML_NEWS3/reports/Building-reliable-apps-with-GPT-4o-and-structured-outputs--Vmlldzo5NjM3MDU5?utm_source=send&utm_medium=email&utm_campaign=weave_newsletter&mkt_tok=MjYxLVFIUC04MjIAAAGWVTi_9ukc1NwoPluktU9oVF5Gheg-LxEy_B2xoCs4t_i8_wZVyi4J7kaogJxOF8BM9X2-cn1sxz5WnzLqtNXO-Vv5dgYzps2hmSj6NFOL)

In [1]:
# Standard library
import json
import os
import shutil

# Third-party
import arxiv
import openai
import weave
from dotenv import load_dotenv, find_dotenv
from openai import OpenAI
from PyPDF2 import PdfReader

# Read the local .env file first, so that anything below which consults the
# environment already sees the variables defined there.
_ = load_dotenv(find_dotenv())

# Raises KeyError immediately if the key is not configured.
Eddy_key_openai = os.environ['OPENAI_API_KEY']

# Initialize Weave for the "paper_classification" project.
weave.init("paper_classification")

weave version 0.51.14 has been recalled!  (Evaluation framework is mis-reporting the inputs for prediction records)  Please upgrade.
Logged in as Weights & Biases user: eddygiusepe.
View Weave data at https://wandb.ai/eddygiusepe/paper_classification/weave


In [2]:
# OpenAI model used for every categorization request
# ("gpt-4o-mini" is the alternative the author noted).
model = "gpt-4o-2024-08-06"
client = OpenAI(api_key=Eddy_key_openai)


# Directory to download and categorize papers
download_dir = "./arxiv_papers"
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(download_dir, exist_ok=True)


In [22]:
# Closed list of machine-learning topics a paper can be assigned to.
# categorize_paper uses it twice: as the `enum` of the response JSON schema
# and, joined with commas, inside the prompt text.
categories = [
    "Supervised Learning", "Unsupervised Learning", "Reinforcement Learning", "Deep Learning", 
    "Natural Language Processing", "Computer Vision", "Graph Neural Networks", "Transfer Learning", 
    "Meta-Learning", "Few-Shot Learning", "Self-Supervised Learning", "Representation Learning", 
    "Multi-Modal Learning", "Generative Adversarial Networks (GANs)", "Bayesian Methods", 
    "Probabilistic Models", "Federated Learning", "Privacy-Preserving ML", "Fairness and Bias in ML", 
    "Explainable AI", "Optimization Algorithms", "Adversarial Robustness", "Causal Inference", 
    "Anomaly Detection", "Time Series Analysis", "Graph-Based Learning", "Knowledge Graphs", 
    "Ontology Learning", "Recommender Systems", "Information Retrieval", "Domain Adaptation", 
    "Semi-Supervised Learning", "Data Augmentation Techniques", "Multi-Agent Systems", 
    "Human-in-the-Loop Learning", "Curriculum Learning", "Active Learning", "Imitation Learning", 
    "Inverse Reinforcement Learning", "Policy Optimization", "Robustness to Distribution Shifts", 
    "Neural Architecture Search (NAS)", "Hyperparameter Optimization", "Neurosymbolic AI", 
    "Neural Ordinary Differential Equations", "Memory-Augmented Networks", "Recurrent Neural Networks (RNNs)", 
    "Long Short-Term Memory (LSTM)", "Transformer Models", "Attention Mechanisms", 
    "Pre-trained Language Models (e.g., BERT, GPT)", "Contrastive Learning", "Energy-Based Models", 
    "Neural Style Transfer", "Object Detection", "Segmentation Models", "Image Generation", "3D Vision", 
    "Motion Prediction", "Speech Recognition", "Speech Synthesis", "Emotion Recognition", 
    "Text Generation", "Summarization", "Machine Translation", "Question Answering", "Dialogue Systems", 
    "Conversational AI", "Autonomous Systems", "Robotics and Control", "Game Theory in ML", 
    "Synthetic Data Generation", "Biomedical Data Analysis", "Bioinformatics", "Healthcare Applications of ML", 
    "Drug Discovery", "Predictive Maintenance", "Financial Modeling", "Climate Modeling", 
    "Physics-Informed Learning", "Chemistry Applications", "Material Science Applications", 
    "Social Network Analysis", "Sentiment Analysis", "Text Mining", "Data Mining", "Complex Systems", 
    "Ensemble Methods", "Evolutionary Algorithms", "Quantum Machine Learning", "ML System Performance Optimization", 
    "ML in Edge Computing", "ML for Internet of Things (IoT)", "Multi-Task Learning", "Continual Learning", 
    "Neural-Symbolic Learning", "Vision-Language Models", "Zero-Shot Learning", "Learning from Demonstration", 
    "Neural Network Pruning"
]


In [4]:
# Define a function to read the first 1000 characters of a PDF
def read_pdf_first_1000_chars(pdf_path, max_chars=1000):
    """Return the leading text of a PDF, truncated to ``max_chars`` characters.

    Pages are read in order and reading stops as soon as enough text has been
    collected. A document whose total text is shorter than ``max_chars`` is
    returned in full rather than being treated as unreadable.

    Args:
        pdf_path: Path of the PDF file to read.
        max_chars: Maximum number of characters to return (default 1000).

    Returns:
        The extracted text, at most ``max_chars`` characters long, or ``""``
        when the file cannot be read or yields no text.
    """
    pieces = []
    collected = 0
    try:
        with open(pdf_path, 'rb') as file:
            reader = PdfReader(file)
            for page in reader.pages:
                # Treat a None/empty extraction result as "no text on this page"
                # instead of letting string concatenation fail.
                page_text = page.extract_text() or ""
                pieces.append(page_text)
                collected += len(page_text)
                if collected >= max_chars:
                    break
    except Exception as e:
        # Best effort: report the problem and signal "no text" to the caller.
        print(f"Failed to read {pdf_path}: {e}")
        return ""
    return "".join(pieces)[:max_chars]


In [5]:
# Define a function to categorize a paper based on its content using structured output
@weave.op
def categorize_paper(text):
    """Classify a research-paper excerpt into one of the entries of ``categories``.

    The request uses a JSON-schema response format whose ``category`` field is
    restricted to the ``categories`` list via ``enum``.

    Args:
        text: Excerpt of the paper (e.g. its first characters) to classify.

    Returns:
        The value of the ``category`` field in the model's JSON reply, or
        ``"Uncategorized"`` when the reply is empty, is not valid JSON, is not
        a JSON object, or has no ``category`` key.
    """
    # Define the JSON schema for structured output with enum categories
    category_schema = {
        "type": "json_schema",
        "json_schema": {
            "name": "paper_category_response",
            # `strict` belongs next to `name`/`schema`; placed inside the schema
            # body it is not the response-format flag and strict mode is not requested.
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "category": {
                        "type": "string",
                        "enum": categories,  # Use the list of categories as enum options
                        "description": "The category of the research paper"
                    }
                },
                "required": ["category"],  # Ensure that the response contains a category
                "additionalProperties": False
            }
        }
    }


    # Create the prompt for categorizing the text
    prompt = f"""
    Com base no texto a seguir de um artigo de pesquisa, categorize-o em um dos seguintes tópicos de aprendizado de máquina: {', '.join(categories)}.
    Por favor, responda com um objeto JSON no formato: {{"category": "Category Name"}}.


    Research Paper Content:
    {text}
    """

    # Make the API request to categorize the text using structured output
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "Você é um assistente de categorização."},
            {"role": "user", "content": prompt}
        ],
        response_format=category_schema,  # Use structured output format with enum
        max_tokens=50,
        temperature=0.3
    )

    # Parse the model's response to extract the category
    raw_reply = response.choices[0].message.content
    if not raw_reply:
        # content may be None or empty (e.g. on a refusal); there is nothing to parse.
        return "Uncategorized"

    try:
        reply_json = json.loads(raw_reply.strip())
    except json.JSONDecodeError:
        # Covers malformed JSON as well as a reply cut short by max_tokens.
        return "Uncategorized"

    if not isinstance(reply_json, dict):
        # Valid JSON that is not an object has no "category" key to read.
        return "Uncategorized"

    return reply_json.get("category", "Uncategorized")


In [6]:
# Define a function to move the PDF to the appropriate category folder
def move_pdf_to_category(pdf_path, category):
    """Move a PDF into the folder of its category under ``download_dir``.

    The folder name is the category with spaces replaced by underscores. Path
    separators are also replaced, and a name that is empty or refers to the
    current/parent directory falls back to ``Uncategorized``, so the target
    always stays a direct child of ``download_dir``.

    Args:
        pdf_path: Current location of the PDF file.
        category: Category label used to derive the folder name.

    Returns:
        None. The destination folder is printed.
    """
    folder_name = category.replace(" ", "_")
    # The label originates from model output: keep it a single path component.
    for separator in ("/", "\\"):
        folder_name = folder_name.replace(separator, "_")
    if folder_name in ("", ".", ".."):
        folder_name = "Uncategorized"

    category_dir = os.path.join(download_dir, folder_name)
    # exist_ok=True replaces the separate existence check and is safe to repeat.
    os.makedirs(category_dir, exist_ok=True)
    shutil.move(pdf_path, os.path.join(category_dir, os.path.basename(pdf_path)))
    print(f"Moved {pdf_path} to {category_dir}")


In [9]:
# Describe the arXiv search: a free-text query, limited to `max_results`
# entries and ordered by submission date. The search object is consumed by the
# download loop in a later cell.
# NOTE(review): ordering by SubmittedDate (rather than relevance) presumably
# favours the newest matches, which may not be the paper the query names —
# confirm this is the intended behaviour.
query = "Paper sobre Atenção é tudo o que precisamos"
max_results = 1  # Change to a larger number as needed
search = arxiv.Search(
    query=query,
    max_results=max_results,
    sort_by=arxiv.SortCriterion.SubmittedDate
)


In [10]:
# Iterate through each result: download the PDF, categorize it from its
# opening text, and move it into the matching category folder.
# arxiv.Client().results(search) replaces the deprecated search.results().
for result in arxiv.Client().results(search):
    print(f"Downloading: {result.title}")
    # The last segment of the entry URL is used as the file stem.
    paper_id = result.entry_id.split('/')[-1]
    filename = f"{paper_id}.pdf"
    result.download_pdf(dirpath=download_dir, filename=filename)

    # Read the first 1000 characters of the downloaded PDF
    pdf_path = os.path.join(download_dir, filename)
    text_snippet = read_pdf_first_1000_chars(pdf_path)

    if text_snippet:
        print(f"Categorizing paper: {filename}")
        # Use the categorize_paper function to get the category
        category = categorize_paper(text_snippet)
        print(f"Assigned Category: {category}")

        # Move the PDF to the appropriate category folder
        move_pdf_to_category(pdf_path, category)
    else:
        # No text could be extracted; the PDF stays in download_dir.
        print(f"Failed to extract text from {filename}")


  for result in search.results():


Downloading: Detection of Undeclared EV Charging Events in a Green Energy Certification Scheme
Categorizing paper: 2410.18971v1.pdf
🍩 https://wandb.ai/eddygiusepe/paper_classification/r/call/0192cf0c-452f-7cd1-95fc-a4f32b93823f
Assigned Category: Supervised Learning
Moved ./arxiv_papers/2410.18971v1.pdf to ./arxiv_papers/Supervised_Learning
