In [44]:
# ## 📦 Step 1: Import Libraries
import os
from openai import OpenAI
import PyPDF2
from dotenv import load_dotenv

print("✅ Libraries imported successfully!")


✅ Libraries imported successfully!


In [45]:
# ## 🔑 Step 2: Load API Key

# Load environment variables from .env file
load_dotenv()

# Get API key
api_key = os.getenv("OPENAI_API_KEY")

if not api_key:
    print("❌ Error: OPENAI_API_KEY not found!")
    print("Please create a .env file with: OPENAI_API_KEY=sk-your-key-here")
else:
    print(f"✅ API Key loaded: {api_key[:10]}...{api_key[-4:]}")

# Initialize OpenAI client
client = OpenAI(api_key=api_key)
print("✅ OpenAI client initialized!")


✅ API Key loaded: sk-proj-Ld...WEEA
✅ OpenAI client initialized!


In [46]:
import os
import PyPDF2

# ## 📖 Step 3: Define PDF Text Extraction Function

def extract_text_from_pdf(pdf_path="test.pdf"):
    """
    Extract text from a PDF file located in the current directory.
    
    Args:
        pdf_path (str): Path to the PDF file (default: 'test.pdf')
        
    Returns:
        str: Extracted text from the PDF
    """
    # Check if file exists in current directory
    if not os.path.isfile(pdf_path):
        print(f"❌ Error: File '{pdf_path}' not found in current directory: {os.getcwd()}")
        return None
    
    print(f"📄 Reading PDF: {pdf_path}")
    text = ""
    
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            num_pages = len(pdf_reader.pages)
            print(f"📖 Found {num_pages} pages")
            
            for i, page in enumerate(pdf_reader.pages):
                page_text = page.extract_text() or ""
                text += page_text
                print(f"✓ Extracted page {i+1}/{num_pages}")
                
    except Exception as e:
        print(f"❌ Error extracting PDF text: {str(e)}")
        return None
    
    if not text.strip():
        print("⚠️  Warning: No text could be extracted from the PDF")
        return None
        
    print(f"✅ Successfully extracted {len(text)} characters\n")
    return text


# Example call
print("✅ Function defined: extract_text_from_pdf()")

# Run the extraction automatically for test.pdf
pdf_text = extract_text_from_pdf()

if pdf_text:
    print("\n✅ Text extraction complete!")
    print("First 500 characters:\n")
    print(pdf_text[:500] + ("..." if len(pdf_text) > 500 else ""))
else:
    print("❌ Failed to extract text from test.pdf")


✅ Function defined: extract_text_from_pdf()
📄 Reading PDF: test.pdf
📖 Found 2 pages
✓ Extracted page 1/2
✓ Extracted page 2/2
✅ Successfully extracted 10127 characters


✅ Text extraction complete!
First 500 characters:

 
CONFIDENTIAL  20.08.2025  
PROJECT NAME: ReUse Prototypes  
 
IDEA DESCRIPTION  
Briefly describe your idea to give a broad 
overview of the concept.  RE:USE  Prototypes is a project submitted by ZI -1 as part of the "Circular Business Models" challenge and deemed valuable by the 
EMC CE circle.  
The implementation of the project allows for the reuse of components before the development vehicle is scrapped, reducing the  
purchase of new parts. This saves costs, materials, and procurement tim...


In [47]:


# ## 🤖 Step 4: Define OpenAI Processing Function (WITH VOLUME)

def process_with_openai(text, model="gpt-4o-mini", max_chars=8000):
    """
    Send extracted text to OpenAI for BMW Market Potential Hackathon context.
    Now includes VOLUME analysis alongside TAM/SAM/SOM.
    
    Args:
        text (str): The extracted text to process
        model (str): OpenAI model to use
        max_chars (int): Maximum characters to send to API
        
    Returns:
        str: AI-generated market size + volume analysis as JSON
    """
    print(f"🤖 Sending to OpenAI ({model})...")

    if len(text) > max_chars:
        print(f"⚠️  Text truncated from {len(text)} to {max_chars} characters")
        text = text[:max_chars]

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are an excellent, meticulous AI business analyst for BMW's corporate innovation team. "
                        "You excel at market sizing (TAM/SAM/SOM analysis) with data-driven insights. You may search the web "
                        "or deduce missing data when needed. Your output MUST be valid JSON (no markdown, no commentary). "
                        "When supplying numbers, provide sources (links) where available or list clear assumptions if deduced."
                    )
                },
                {
                    "role": "user",
                    "content": (
                        "Analyze this project and output a valid JSON object with TAM/SAM/SOM and VOLUME analysis. "
                        "The required schema:\n\n"
                        "{\n"
                        "  \"TAM\": {\n"
                        "    \"description_of_public\": string,  // e.g. 'all women aged 25-44 who commute by motorcycle'\n"
                        "    \"numbers\": {\n"
                        "      \"2024\": number,\n"
                        "      \"2025\": number,\n"
                        "      \"2026\": number,\n"
                        "      \"2027\": number,\n"
                        "      \"2028\": number,\n"
                        "      \"2029\": number,\n"
                        "      \"2030\": number\n"
                        "    },\n"
                        "    \"justification\": string,  // Include numerical reasoning, sources, links, assumptions\n"
                        "    \"industry_example\": {\n"
                        "      \"name\": string,\n"
                        "      \"description\": string,\n"
                        "      \"link\": string\n"
                        "    }\n"
                        "  },\n"
                        "  \"SAM\": {  // Same structure as TAM\n"
                        "    \"description_of_public\": string,  // Subset of TAM that you can realistically reach\n"
                        "    \"numbers\": { \"2024\" through \"2030\" },\n"
                        "    \"justification\": string,\n"
                        "    \"industry_example\": { \"name\", \"description\", \"link\" }\n"
                        "  },\n"
                        "  \"SOM\": {  // Same structure as TAM\n"
                        "    \"description_of_public\": string,  // Subset of SAM you can capture in first few years\n"
                        "    \"numbers\": { \"2024\" through \"2030\" },\n"
                        "    \"justification\": string,\n"
                        "    \"industry_example\": { \"name\", \"description\", \"link\" }\n"
                        "  },\n"
                        "  \"VOLUME\": {\n"
                        "    \"description\": string,  // What is being sold/produced (units, bundles, components, etc.)\n"
                        "    \"numbers\": {\n"
                        "      \"2024\": number,\n"
                        "      \"2025\": number,\n"
                        "      \"2026\": number,\n"
                        "      \"2027\": number,\n"
                        "      \"2028\": number,\n"
                        "      \"2029\": number,\n"
                        "      \"2030\": number\n"
                        "    },\n"
                        "    \"source\": string,  // 'extracted from document' OR 'calculated as 50% of SOM' OR 'industry benchmark'\n"
                        "    \"justification\": string,  // Explain how volume was determined\n"
                        "    \"competitor_benchmark\": {\n"
                        "      \"name\": string,  // Competitor or similar case\n"
                        "      \"volume\": number,  // Their comparable volume\n"
                        "      \"description\": string,\n"
                        "      \"link\": string\n"
                        "    }\n"
                        "  },\n"
                        "  \"sources\": [string]  // Array of all URLs referenced\n"
                        "}\n\n"
                        "Requirements:\n"
                        "1. All numbers must be integers representing number of target people or units\n"
                        "2. Each justification must include numerical reasoning, sources, links\n"
                        "3. Industry examples should be real, comparable cases with links\n"
                        "4. SAM must be a logical subset of TAM, SOM must be subset of SAM\n"
                        "5. VOLUME logic:\n"
                        "   - First, check if the document explicitly mentions production/sales volume targets\n"
                        "   - If found, use those numbers and cite 'extracted from document'\n"
                        "   - If NOT found, calculate as 50% of SOM numbers and cite 'calculated as 50% of SOM (conservative estimate)'\n"
                        "   - Always provide a competitor benchmark with actual volume data from similar products/projects\n"
                        "6. Output strict JSON only, no markdown or other text\n\n"
                        f"Document to analyze:\n{text}\n\n"
                    )
                }
            ],
            max_tokens=1200,  # Increased token limit to accommodate volume section
            temperature=0.6
        )

        summary = response.choices[0].message.content
        print("✅ Market sizing + volume analysis generated successfully!\n")
        return summary

    except Exception as e:
        print(f"❌ Error processing with OpenAI: {str(e)}")
        return None


print("✅ Function updated: process_with_openai() with VOLUME analysis")


✅ Function updated: process_with_openai() with VOLUME analysis


In [48]:
# ## 🚀 Step 5: Process Your PDF

# ⚠️ CHANGE THIS to your PDF file path
pdf_file = "test.pdf"  # <-- Edit this line!

print("="*60)
print("📄 PDF AI PROCESSOR")
print("="*60 + "\n")

# Extract text from PDF
extracted_text = extract_text_from_pdf(pdf_file)

if extracted_text:
    print("✅ Text extraction successful! Proceeding to AI processing...\n")
else:
    print("❌ Failed to extract text. Please check your PDF file.")


📄 PDF AI PROCESSOR

📄 Reading PDF: test.pdf
📖 Found 2 pages
✓ Extracted page 1/2
✓ Extracted page 2/2
✅ Successfully extracted 10127 characters

✅ Text extraction successful! Proceeding to AI processing...



In [49]:
# ## 🤖 Step 6: Get AI Summary

if extracted_text:
    ai_summary = process_with_openai(extracted_text)
    
    if ai_summary:
        print("="*60)
        print("🤖 AI SUMMARY")
        print("="*60)
        print(ai_summary)
        print("\n" + "="*60)
    else:
        print("❌ Failed to get AI summary")
else:
    print("⚠️ No text available to process")


🤖 Sending to OpenAI (gpt-4o-mini)...
⚠️  Text truncated from 10127 to 8000 characters
✅ Market sizing + volume analysis generated successfully!

🤖 AI SUMMARY
{
  "TAM": {
    "description_of_public": "all development vehicles in the BMW Group fleet, approximately 8,000 vehicles per year",
    "numbers": {
      "2024": 8000,
      "2025": 8000,
      "2026": 8000,
      "2027": 8000,
      "2028": 8000,
      "2029": 8000,
      "2030": 8000
    },
    "justification": "The total addressable market (TAM) is defined by the fleet of development vehicles at BMW, which is approximately 8,000 vehicles annually. This figure is based on the project documentation that specifies the development fleet size. (Assumption: No growth in fleet size over the next few years).",
    "industry_example": {
      "name": "Volvo Cars - Circular Business Model",
      "description": "Volvo Cars has implemented a circular business model focused on reusing parts and reducing waste in their manufacturing proces

In [50]:
# ## 📋 Step 7: View Extracted Text Preview

if extracted_text:
    print("="*60)
    print("📋 EXTRACTED TEXT PREVIEW (first 1000 characters)")
    print("="*60)
    print(extracted_text[:1000] + ("..." if len(extracted_text) > 1000 else ""))
    print("\n" + "="*60)
    print(f"Total characters extracted: {len(extracted_text)}")
    print("="*60)
else:
    print("⚠️ No text available to display")


📋 EXTRACTED TEXT PREVIEW (first 1000 characters)
 
CONFIDENTIAL  20.08.2025  
PROJECT NAME: ReUse Prototypes  
 
IDEA DESCRIPTION  
Briefly describe your idea to give a broad 
overview of the concept.  RE:USE  Prototypes is a project submitted by ZI -1 as part of the "Circular Business Models" challenge and deemed valuable by the 
EMC CE circle.  
The implementation of the project allows for the reuse of components before the development vehicle is scrapped, reducing the  
purchase of new parts. This saves costs, materials, and procurement time for our development vehicles. The development fleet 
consi sts of approximately 8,000 vehicles.  
Currently, some cycles are already present at EA (fire extinguishers, DME, charging cables, measuring equipment). The project 's 
goal is to further establish these cycles and expand them to other departments. There are 4 streams evaluated by the project team. 
 
Stream 1: Expansion of the accessory cycle process  
- Currently, there is an establish

In [51]:
# ## 💾 Optional: Save Results to File

if extracted_text and ai_summary:
    output_file = "ai_summary.txt"
    
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("="*60 + "\n")
        f.write("AI SUMMARY\n")
        f.write("="*60 + "\n\n")
        f.write(ai_summary)
        f.write("\n\n" + "="*60 + "\n")
        f.write("EXTRACTED TEXT\n")
        f.write("="*60 + "\n\n")
        f.write(extracted_text)
    
    print(f"✅ Results saved to: {output_file}")
else:
    print("⚠️ No results to save")


✅ Results saved to: ai_summary.txt


In [52]:
# ## ⚙️ Advanced Example (Optional Custom Analysis)

if extracted_text:
    custom_prompt = "Extract the main findings and conclusions from this document:"
    
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # Change to "gpt-4-turbo" for better quality
        messages=[
            {"role": "system", "content": "You are an expert research analyst."},
            {"role": "user", "content": f"{custom_prompt}\n\n{extracted_text[:8000]}"}
        ],
        max_tokens=800,
        temperature=0.5
    )
    
    print("🔍 Custom Analysis:")
    print(response.choices[0].message.content)


🔍 Custom Analysis:
### Main Findings and Conclusions from the Document:

#### Project Overview:
- **Project Name:** ReUse Prototypes, aimed at enhancing circular business models within the BMW Group.
- **Objective:** To implement a project that allows for the reuse of vehicle components before scrapping, thereby reducing costs, material usage, and procurement times for a fleet of approximately 8,000 vehicles.

#### Key Streams of the Project:
1. **Expansion of the Accessory Cycle Process:**
   - Current processes exist for fire extinguishers, charging cables, and measuring equipment.
   - The goal is to identify and include more accessories in this cycle.

2. **Demand-Driven Disassembly:**
   - Lack of transparency regarding vehicle components scheduled for recycling.
   - Aim to create an automated process for better transparency and utilization of components, including those from upgrades and retrofits.

3. **Disposal Optimization of Storage Space:**
   - Over €3 million worth of new