# Installing Dependencies

In [13]:
pip install pdf2image

Collecting pdf2image
  Using cached pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [22]:
pip install google-generativeai PyPDF2 python-dotenv

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# Event Details

In [211]:
import google.generativeai as genai
from pdf2image import convert_from_path
from PIL import Image
import os

# 1. Configure Gemini API
genai.configure(api_key="AIzaSyDDoos-ITDh0hl694HB2um_iqdu36jREAw")

# 2. Extract pages of PDF as a PIL image
def get_page_image(pdf_path, page_num):
    images = convert_from_path(
        pdf_path,
        first_page=page_num,
        last_page=page_num,
        poppler_path=r"C:\Users\ali.zain\Desktop\poppler-24.08.0\Library\bin"
    )
    if images:
        return images[0]  # Return PIL.Image.Image object
    return None

# 3. Run OCR using Gemini 2.0 Flash
def ocr_with_gemini(pil_image, query):
    model = genai.GenerativeModel("gemma-3-4b-it")
    response = model.generate_content([
        query,
        pil_image
    ])
    return response.text.strip()

# 4. Prompt builder for event details
def build_prompt(first_page_text, industry_name):
    return f"""
You are given text from the first page of a PDF (event details) and an extracted "Industry Name".

Rules:
1. "Event name" – Use the event name , after removing first word and last word year.
2. "Event code" – Use the event code exactly as shown.
3. "Event Tagline" – Use the full event name except year .
3. "Event Dates" – Format as "Month, Date1 - Date2, YYYY".
4. "Event Location" – Full location (City, State/Region, Country).
5. "Event year" – 4-digit year from the event date.
6. "Event Currency" – Based on country (USA →  USD, Canada → CAD, Eurozone → EUR).
7. "Event Short Dates" – Format as "Month(In short form), Date1 - Date2, YYYY".
8. "Event Short Location" – Abbreviated form of Event Location contain only state (remove city and country if present, keep state short form).
9. "Event Color Name" – first word of the event name.
10. "Event City Shortcode" – First three letters of the city in uppercase.
11. "Event Postponed" – False unless stated otherwise.
12. Add an extra field called "Industry Name" – value is "{industry_name}".
13. "Previous Agenda" – True unless stated otherwise.
14. "Hubspot Disposition" – Format: disposition_<EventCode in lowercase>_<EventYear>
15. "Hubspot Email Status" – Format: email_status_<EventCode in lowercase>_<EventYear>
16. "Custom Currency Symbol" – leave it blank.
17. "Currency Position" – Always "Top left".


JSON template:
{{
  "Event name": "",  
  "Event code": "",  
  "Event Tagline": "",
  "Event Dates": "",  
  "Event Location": "",  
  "Event year": "",  
  "Event Currency": "", 
  "Event Short Dates": "", 
  "Event Short Location": "", 
  "Event Color Name": "", 
  "Event City Shortcode": "",  
  "Event Postponed": false,  
  "Industry Name": "",
  "Previous Agenda": false,  
  "Hubspot Disposition": "",  
  "Hubspot Email Status": "",  
  "Custom Currency Symbol": "",
  "Currency Position": "Top left"
}}

First page text:
---
{first_page_text}
---

Now return ONLY valid JSON with the filled details.
"""

# 5. Main
if __name__ == "__main__":
    pdf_path = r"C:\Users\ali.zain\Desktop\Content_Extraction\content.pdf"

    # Extract first page and second-last page
    first_page_img = get_page_image(pdf_path, 1)
    industry_page_img = get_page_image(pdf_path, 20)  

    if first_page_img and industry_page_img:
        # Extract text from first page
        first_page_text = ocr_with_gemini(
            first_page_img,
            "Extract all the text exactly as shown from this page."
        )

        # Extract industry name from second-last page
        industry_name = ocr_with_gemini(
            industry_page_img,
            "From the provided form image, identify the industry option that is marked or checked. Return only the industry name (e.g., 'Clean Energy')."
        )

        # Build prompt including industry name
        prompt = build_prompt(first_page_text, industry_name)

        # Run Gemini for final structured JSON
        model = genai.GenerativeModel("gemma-3-4b-it")
        response = model.generate_content([prompt])

        # Save output JSON
        with open("gemini_response.json", "w", encoding="utf-8") as f:
            f.write(response.text)

        print("✅ JSON saved to gemini_response.json")
        print(response.text)

    else:
        print("❌ Could not extract images from PDF.")


✅ JSON saved to gemini_response.json
```json
{
  "Event name": "Enormous Geothermal Systems to 2026",
  "Event code": "AFS",
  "Event Tagline": "Enormous Geothermal Systems to 2026",
  "Event Dates": "March 11 - March 23, 2026",
  "Event Location": "Los Angeles, California, USA",
  "Event year": "2026",
  "Event Currency": "USD",
  "Event Short Dates": "Mar - Mar, 2026",
  "Event Short Location": "CA",
  "Event Color Name": "Enormous",
  "Event City Shortcode": "LA",
  "Event Postponed": false,
  "Industry Name": "Clean Energy",
  "Previous Agenda": false,
  "Hubspot Disposition": "disposition_afs_2026",
  "Hubspot Email Status": "email_afs_2026",
  "Custom Currency Symbol": "",
  "Currency Position": "Top left"
}
```


In [212]:
import json
import os

# Input file path
input_file = r"C:\Users\ali.zain\Desktop\Content_Extraction\research\gemini_response.json"

# Load JSON from file
with open(input_file, "r", encoding="utf-8") as f:
    raw_content = f.read().strip()
clean_content = re.sub(r"^```[a-zA-Z]*\n", "", raw_content)
clean_content = re.sub(r"\n```$", "", clean_content)
event_data = json.loads(clean_content)
# Map event_data keys to payload options
# Format: (json_key, option_name, label)
mapping = [
    ("Event name", "event_name", "Event Name"),
    ("Event code", "event_code", "Event Code"),
    ("Event Tagline", "event_tagline", "Event Tagline"),
    ("Event Dates", "event_dates", "Event Dates"),
    ("Event Location", "event_Location", "Event Location"),
    ("Event year", "event_year", "Event Year"),
    ("Event Currency", "event_currency", "Event Currency"),
    ("Event Short Dates", "event_short_date", "Event Short Date"),
    ("Event Short Location", "event_short_location", "Event Short Location"),
    ("Event Color Name", "event_color_name", "Event Color Name"),
    ("Event City Shortcode", "event_city_shortcode", "Event City Shortcode"),
    ("Event Postponed", "event_postponed", "Event Postponed"),
    ("Industry Name", "industry_name", "Industry Name"),
    ("Previous Agenda", "previous_agenda", "Previous Agenda"),
    ("Hubspot Disposition", "hubspot_disposition", "Hubspot Disposition"),
    ("Hubspot Email Status", "hubspot_email_status", "Hubspot Email Status"),
    ("Custom Currency Symbol", "custom_currency_symbol", "Custom Currency Symbol"),
    ("Currency Position", "currency_postion", "Currency Postion"),
]

# Build payload
payload = {"options": []}

for idx, (json_key, option, label) in enumerate(mapping, start=1):
    value = event_data.get(json_key, None)

    # Convert booleans to lowercase strings to match your sample payload
    if isinstance(value, bool):
        value = str(value).lower()

    payload["options"].append({
        "id": idx,
        "option": option,
        "value": value,
        "label": label
    })

# Output result
output_file = os.path.splitext(input_file)[0] + "_payload.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(payload, f, indent=2, ensure_ascii=False)

print(f"✅ Payload saved to {output_file}")


✅ Payload saved to C:\Users\ali.zain\Desktop\Content_Extraction\research\gemini_response_payload.json


In [213]:
import requests
import json

# API endpoint
url = "https://ai-demo.genetechz.com/api/event-details"

# JSON payload (from file)
with open("gemini_response_payload.json", "r", encoding="utf-8") as f:
    payload = json.load(f)

# Make POST request
try:
    response = requests.post(url, json=payload, timeout=30)  # use `json=` instead of `data=`
    response.raise_for_status()  # raise error for 4xx/5xx responses

    # If response is JSON
    try:
        print("✅ Response JSON:", json.dumps(response.json(), indent=2))
    except json.JSONDecodeError:
        print("⚠️ Response is not valid JSON. Raw text:")
        print(response.text)

except requests.exceptions.RequestException as e:
    print("❌ Request failed:", e)


✅ Response JSON: [
  {
    "id": 1,
    "option": "event_name",
    "value": "Enormous Geothermal Systems to 2026",
    "label": "Event Name",
    "createdAt": "2025-08-25T10:31:46.734Z",
    "updatedAt": "2025-08-25T10:31:46.734Z"
  },
  {
    "id": 2,
    "option": "event_code",
    "value": "AFS",
    "label": "Event Code",
    "createdAt": "2025-08-25T10:31:46.734Z",
    "updatedAt": "2025-08-25T10:31:46.734Z"
  },
  {
    "id": 3,
    "option": "event_tagline",
    "value": "Enormous Geothermal Systems to 2026",
    "label": "Event Tagline",
    "createdAt": "2025-08-25T10:31:46.734Z",
    "updatedAt": "2025-08-25T10:31:46.734Z"
  },
  {
    "id": 4,
    "option": "event_dates",
    "value": "March 11 - March 23, 2026",
    "label": "Event Dates",
    "createdAt": "2025-08-25T10:31:46.734Z",
    "updatedAt": "2025-08-25T10:31:46.734Z"
  },
  {
    "id": 5,
    "option": "event_Location",
    "value": "Los Angeles, California, USA",
    "label": "Event Location",
    "createdAt": "

# Page Home

In [217]:
def build_prompt():
    return f"""
You are an expert information extraction AI. Given the OCR text of a webpage, identify and extract the heading and descriptive text associated with the section featuring a video element. The heading and descriptive text are presented near the following lines:
Event Logo
EVENT DETAILS SPEAKERS
SPONSORS VENUE MEDIA CONTACT US
REGISTER
and a video element.
Return the extracted heading and description in JSON format, where the keys are 'heading' and 'description', and the values are the corresponding text strings from the document. Ensure the 'description' value includes all text intended to describe the video section, presented in a concise and readable manner with original linebreaks.
If a video section cannot be confidently identified, return an empty JSON object: 
"""


In [221]:
from PIL import Image
import os
import google.generativeai as genai

def resize_image(pil_image, max_size=2000, save_path="resized_page.png"):
    """Resize image to max_size while keeping aspect ratio. Save for inspection."""
    width, height = pil_image.size
    if max(width, height) > max_size:
        if width > height:
            new_width = max_size
            new_height = int(height * (max_size / width))
        else:
            new_height = max_size
            new_width = int(width * (max_size / height))
        pil_image = pil_image.resize((new_width, new_height), Image.LANCZOS)
    # Save to disk so you can verify what Gemini sees
    pil_image.save(save_path)
    print(f"📸 Resized image saved at {os.path.abspath(save_path)} ({pil_image.size[0]}x{pil_image.size[1]})")
    return pil_image

def ocr_with_gemini(pil_image):
    model = genai.GenerativeModel("gemini-2.0-flash")
    query = build_prompt()

    # Resize before sending
    safe_img = resize_image(pil_image)

    response = model.generate_content([query, safe_img])
    return response.text


In [222]:
import google.generativeai as genai
from pdf2image import convert_from_path
from PIL import Image
import os

# 1. Configure Gemini API
genai.configure(api_key="AIzaSyDDoos-ITDh0hl694HB2um_iqdu36jREAw")

# 2. Extract pages of PDF as a PIL image
def get_page_image(pdf_path, page_num):
    images = convert_from_path(
        pdf_path,
        first_page=page_num,
        last_page=page_num,
        poppler_path=r"C:\Users\ali.zain\Desktop\poppler-24.08.0\Library\bin"
    )
    if images:
        return images[0]  # Return PIL.Image.Image object
    return None

# 3. Prompt builder for event details

# 5. Main
if __name__ == "__main__":
    pdf_path = r"C:\Users\ali.zain\Desktop\Content_Extraction\content.pdf"

    first_page_img = get_page_image(pdf_path, 2)

    if first_page_img :
        # Extract text from first page
        first_page_text = ocr_with_gemini(
            first_page_img
            # "From the provided form image, identify the industry option that is marked or checked. Return only the industry name (e.g., 'Clean Energy')."
        )
        print("Extracted text from first page image:\n")
        
        # Save output JSON
        with open("Page_home.json", "w", encoding="utf-8") as f:
            f.write(first_page_text)

        print("✅ JSON saved to gemini_response.json")
        print(first_page_text)
    # else:
    #     print("❌ Could not extract images from PDF.")



📸 Resized image saved at c:\Users\ali.zain\Desktop\Content_Extraction\research\resized_page.png (450x2000)
Extracted text from first page image:

✅ JSON saved to gemini_response.json
```json
{
  "heading": "ADVANCING TECHNOLOGIES IN THE GEOTHERMAL ENERGY\nSECTOR",
  "description": "Welcome to Enhanced Geothermal Systems 2026, where\nindustry leaders will examine the potential of next-generation\ngeothermal power to transform the U.S. energy landscape.\n\nThe energy sector faces challenges with conventional\ngeothermal systems, including limited hydrothermal\nresources, drilling technologies, production declines, and\nheat-to-electricity conversion limitations. This event will\nspotlight enhanced geothermal systems, focusing on\nhydraulic, chemical, thermal, and explosive stimulation\nmethods that support innovation in energy production.\n\nJoin us to explore advanced solutions, connect with industry\nexperts, and contribute to the future of geothermal energy"
}
```


In [223]:
import json
import os
import requests

# Path to your JSON file
input_file = r"C:\Users\ali.zain\Desktop\Content_Extraction\research\Page_home.json"

# Read file as text
with open(input_file, "r", encoding="utf-8") as f:
    raw_text = f.read().strip()

# Remove markdown fences if present
if raw_text.startswith("```"):
    raw_text = raw_text.strip("`")
    if raw_text.lower().startswith("json"):
        raw_text = raw_text[4:].strip()

# Load as JSON
data = json.loads(raw_text)

# Extract values
heading = data.get("heading", "")
description = data.get("description", "")

# Keep formatting:
# - double newlines → new paragraphs
# - single newline → line breaks
paragraphs = description.split("\n\n")

description_html = "".join(
    f"<p>{p_clean}</p>"
    for p in paragraphs if (p_clean := p.replace("\n", "<br>").strip())
)

# Example video embed
video_html = '<figure class="media"><oembed url="https://player.vimeo.com/video/236701630"></oembed></figure>'

# Build payload
payload = {
    "options": [
        {
            "id": 1,
            "option": "heading",
            "value": heading,
            "label": "Heading"
        },
        {
            "id": 2,
            "option": "paragraph",
            "value": description_html,
            "label": "Paragraph"
        },
        {
            "id": 3,
            "option": "Video",
            "value": video_html,
            "label": "Video Link"
        }
    ]
}

# Save payload
output_file = os.path.splitext(input_file)[0] + "_payload.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(payload, f, indent=2, ensure_ascii=False)

print(f"✅ Payload saved to {output_file}")

# --- Post request (optional) ---
url = "https://ai-demo.genetechz.com/api/home-page"
headers = {"Content-Type": "application/json"}
response = requests.post(url, headers=headers, data=json.dumps(payload))

print("Status Code:", response.status_code)
try:
    print("Response JSON:", response.json())
except:
    print("Response Text:", response.text)


✅ Payload saved to C:\Users\ali.zain\Desktop\Content_Extraction\research\Page_home_payload.json
Status Code: 200
Response JSON: [{'id': 1, 'option': 'heading', 'value': 'ADVANCING TECHNOLOGIES IN THE GEOTHERMAL ENERGY\nSECTOR', 'label': 'Heading', 'createdAt': '2025-08-25T10:48:28.040Z', 'updatedAt': '2025-08-25T10:48:28.040Z'}, {'id': 2, 'option': 'paragraph', 'value': '<p>Welcome to Enhanced Geothermal Systems 2026, where<br>industry leaders will examine the potential of next-generation<br>geothermal power to transform the U.S. energy landscape.</p><p>The energy sector faces challenges with conventional<br>geothermal systems, including limited hydrothermal<br>resources, drilling technologies, production declines, and<br>heat-to-electricity conversion limitations. This event will<br>spotlight enhanced geothermal systems, focusing on<br>hydraulic, chemical, thermal, and explosive stimulation<br>methods that support innovation in energy production.</p><p>Join us to explore advanced solu

# Key Points


In [81]:
from PyPDF2 import PdfReader
import json

# Load PDF and extract fields
reader = PdfReader(r"C:\Users\ali.zain\Desktop\Content_Extraction\content.pdf")
fields = reader.get_fields()

# Filter fields that start with "Text" and have numeric suffix
text_fields = {}
for name, data in fields.items():
    if name.startswith("Text"):
        try:
            num = int("".join(filter(str.isdigit, name)))
            text_fields[num] = data.get("/V")
        except ValueError:
            pass  # skip if not a valid number

results = []

# Loop through expected range
for i in range(19, 31):
    value = text_fields.get(i, None)
    if i % 2 == 1:  # odd -> Title
        current_item = {"Title": value, "Description": None}
        results.append(current_item)
    else:  # even -> Description
        if results:  # assign description to last item
            results[-1]["Description"] = value

# Wrap results inside "key_topics"
output = {"key_topics": results}

# Save to JSON file
with open("output.json", "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2, ensure_ascii=False)

print("Saved to output.json")


Saved to output.json


In [82]:
import requests
import json

# API base URL for updates
base_url = "https://ai-demo.genetechz.com/api/key-topics/update"

# Payload with new data
with open("output.json", "r") as f:
    payload = json.load(f)

headers = {
    "Content-Type": "application/json"
}

# Loop through each topic and update by ID (1–6)
for i, topic in enumerate(payload["key_topics"], start=1):
    url = f"{base_url}/{i}"   # e.g. /update/1, /update/2 ...
    response = requests.post(url, json=topic, headers=headers)

    print(f"Updating ID {i}")
    print("Title:", topic["Title"])
    print("Status Code:", response.status_code)
    print("Response:", response.text)
    print("-" * 50)


Updating ID 1
Title: Evaluating the economic impact of geothermal energy projects across the US
Status Code: 200
Response: {"message":"Key Topics Updated"}
--------------------------------------------------
Updating ID 2
Title: Key technical aspects and core designs for geothermal plant operations
Status Code: 200
Response: {"message":"Key Topics Updated"}
--------------------------------------------------
Updating ID 3
Title: Latest geothermal regulatory updates on frameworks, permits, and incentives
Status Code: 200
Response: {"message":"Key Topics Updated"}
--------------------------------------------------
Updating ID 4
Title: Advanced EGS technologies for resource optimization and energy production 
Status Code: 200
Response: {"message":"Key Topics Updated"}
--------------------------------------------------
Updating ID 5
Title: Innovative ways to integrate geothermal power plants with the power grid
Status Code: 200
Response: {"message":"Key Topics Updated"}
---------------------

# Statistics


In [224]:
def build_prompt():
    return f"""
As a data extraction task, meticulously examine the provided image (original and cropped segments). The objective is to precisely identify the count or value associated with the following specific metrics displayed within the visual:
Total Industry Topics
Number of Networking Events
Quantity of Leading Experts
Number of Q&A Sessions
Output the extracted data points clearly in json format 
"""


In [227]:
from PIL import Image
import os
import google.generativeai as genai

def resize_image(pil_image, max_size=2000, save_path="resized_page.png"):
    """Resize image to max_size while keeping aspect ratio. Save for inspection."""
    width, height = pil_image.size
    if max(width, height) > max_size:
        if width > height:
            new_width = max_size
            new_height = int(height * (max_size / width))
        else:
            new_height = max_size
            new_width = int(width * (max_size / height))
        pil_image = pil_image.resize((new_width, new_height), Image.LANCZOS)
    # Save to disk so you can verify what Gemini sees
    pil_image.save(save_path)
    print(f"📸 Resized image saved at {os.path.abspath(save_path)} ({pil_image.size[0]}x{pil_image.size[1]})")
    return pil_image

def ocr_with_gemini(pil_image):
    model = genai.GenerativeModel("gemma-3-4b-it")
    query = build_prompt()

    # Resize before sending
    safe_img = resize_image(pil_image)

    response = model.generate_content([query, safe_img])
    return response.text


In [228]:
import google.generativeai as genai
from pdf2image import convert_from_path
from PIL import Image
import os

# 1. Configure Gemini API
genai.configure(api_key="AIzaSyDDoos-ITDh0hl694HB2um_iqdu36jREAw")

# 2. Extract pages of PDF as a PIL image
def get_page_image(pdf_path, page_num):
    images = convert_from_path(
        pdf_path,
        first_page=page_num,
        last_page=page_num,
        poppler_path=r"C:\Users\ali.zain\Desktop\poppler-24.08.0\Library\bin"
    )
    if images:
        return images[0]  # Return PIL.Image.Image object
    return None

# 3. Prompt builder for event details

# 5. Main
if __name__ == "__main__":
    pdf_path = r"C:\Users\ali.zain\Desktop\Content_Extraction\content.pdf"

    first_page_img = get_page_image(pdf_path, 2)

    if first_page_img :
        # Extract text from first page
        first_page_text = ocr_with_gemini(
            first_page_img
            # "From the provided form image, identify the industry option that is marked or checked. Return only the industry name (e.g., 'Clean Energy')."
        )
        print("Extracted text from first page image:\n")
        
        # Save output JSON
        with open("Statistics.json", "w", encoding="utf-8") as f:
            f.write(first_page_text)

        print("✅ JSON saved to gemini_response.json")
        print(first_page_text)

    # else:
    #     print("❌ Could not extract images from PDF.")

📸 Resized image saved at c:\Users\ali.zain\Desktop\Content_Extraction\research\resized_page.png (450x2000)
Extracted text from first page image:

✅ JSON saved to gemini_response.json
```json
{
  "Total Industry Topics": "100+",
  "Number of Networking Events": "8+",
  "Quantity of Leading Experts": "50+",
  "Number of Q&A Sessions": "20+"
}
```


In [229]:
import json

file_path = "Statistics.json"

# Step 1: Read raw file text
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read().strip()

# Step 2: Remove wrapping triple quotes if present
if raw_text.startswith("```json"):
    raw_text = raw_text[len("```json"):].strip()
if raw_text.endswith("```"):
    raw_text = raw_text[:-3].strip()

# Step 3: Load cleaned JSON
data = json.loads(raw_text)

# Step 4: Transform into payload format
payload = []
for key, value in data.items():
    figure = value.replace("+", "")   # strip '+' for figure
    caption = key.upper()             # uppercase caption
    isplus = "yes" if "+" in value else "no"

    payload.append({
        "figure": figure,
        "caption": caption,
        "isplus": isplus
    })

# Step 5: Save back to same file (or new file if you prefer)
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(payload, f, indent=2, ensure_ascii=False)

print("✅ Updated payload saved to:", file_path)


✅ Updated payload saved to: Statistics.json


In [230]:
import requests

# Your API base URL
base_url = "https://ai-demo.genetechz.com/api/statistics/update/"

# The payload you provided
with open("Statistics.json", "r", encoding="utf-8") as f:
    payload = json.load(f)

# Loop over payload and send POST requests
for i, item in enumerate(payload, start=1):
    url = f"{base_url}{i}"   # auto increment id in URL
    response = requests.post(url, json=item)

    print(f"Request to {url}")
    print("Payload:", item)
    print("Status Code:", response.status_code)
    try:
        print("Response:", response.json())
    except:
        print("Response Text:", response.text)
    print("-" * 50)


Request to https://ai-demo.genetechz.com/api/statistics/update/1
Payload: {'figure': '100', 'caption': 'TOTAL INDUSTRY TOPICS', 'isplus': 'yes'}
Status Code: 200
Response: {'message': 'Statistics Updated'}
--------------------------------------------------
Request to https://ai-demo.genetechz.com/api/statistics/update/2
Payload: {'figure': '8', 'caption': 'NUMBER OF NETWORKING EVENTS', 'isplus': 'yes'}
Status Code: 200
Response: {'message': 'Statistics Updated'}
--------------------------------------------------
Request to https://ai-demo.genetechz.com/api/statistics/update/3
Payload: {'figure': '50', 'caption': 'QUANTITY OF LEADING EXPERTS', 'isplus': 'yes'}
Status Code: 200
Response: {'message': 'Statistics Updated'}
--------------------------------------------------
Request to https://ai-demo.genetechz.com/api/statistics/update/4
Payload: {'figure': '20', 'caption': 'NUMBER OF Q&A SESSIONS', 'isplus': 'yes'}
Status Code: 200
Response: {'message': 'Statistics Updated'}
--------------

# Expert Speaker

In [93]:
pip install google-generativeai

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [232]:
def build_prompt():
    return """
As a data extraction task, meticulously examine the provided image 
The objective is to precisely identify the value associated with the Expert Speakers within the visual:
- Speaker name 
- Company name

There are total 3 expert speakers and their company name  .
write a proper name if there is any typo in the image.
Output the extracted data points clearly in JSON format:

{
  "expert_speakers": [
    {
      "name": "",
      "company": ""
    },
    {
      "name": "",
      "company": ""
    },
    {
      "name": "",
      "company": ""
    }
  ]
}
"""

In [233]:
from PIL import Image
import google.generativeai as genai
import io

# 1. Configure API key
genai.configure(api_key="AIzaSyDDoos-ITDh0hl694HB2um_iqdu36jREAw")

# 2. Load the model
model = genai.GenerativeModel("gemini-2.5-flash")

# 3. Open image and convert to bytes
with Image.open(r"C:\Users\ali.zain\Desktop\Content_Extraction\research\crop_image.png") as img:
    img_bytes = io.BytesIO()
    img.save(img_bytes, format="PNG")
    img_bytes.seek(0)

query=build_prompt()
# 4. Ask Gemini with image + text prompt
response = model.generate_content([
    {
        "mime_type": "image/png",
        "data": img_bytes.read()
    },
    query
])
# 5. Print response
print(response.text)
with open("Expert_Speaker.json", "w", encoding="utf-8") as f:
    f.write(response.text)  


```json
{
  "expert_speakers": [
    {
      "name": "Agung Setyadi",
      "company": "Geo Dipa Energ"
    },
    {
      "name": "Karl Farrow",
      "company": "CeraPhi Energy"
    },
    {
      "name": "Mazda Irani",
      "company": "Ashaw Energy"
    }
  ]
}
```


In [234]:
import json

file_path = r"C:\Users\ali.zain\Desktop\Content_Extraction\research\Expert_Speaker.json"
# Step 1: Read raw file text
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read().strip()

# Step 2: Remove wrapping triple quotes if present
if raw_text.startswith("```json"):
    raw_text = raw_text[len("```json"):].strip()
if raw_text.endswith("```"):
    raw_text = raw_text[:-3].strip()

# Step 3: Load cleaned JSON
data = json.loads(raw_text)

In [None]:
import json
import requests

# Your API endpoint


# Loop through expert speakers and send POST request for each
count=11
for speaker in data["expert_speakers"]:
    url = "https://ai-demo.genetechz.com/api/expert-speakers/update"
    payload = {
        "name": speaker["name"],
        "company": speaker["company"]
    }
    url = f"{url}/{count}"
    count+=1
    print(url)
    response = requests.post(url, json=payload)

    print(f"Sending: {payload}")
    print(f"Status Code: {response.status_code}")
    print(f"Response: {response.text}")
    print("-" * 50)


https://ai-demo.genetechz.com/api/expert-speakers/update/11
Sending: {'name': 'Agung Setyadi', 'company': 'Geo Dipa Energ'}
Status Code: 200
Response: {"message":"Expert Speakers Updated"}
--------------------------------------------------
https://ai-demo.genetechz.com/api/expert-speakers/update/11/12
Sending: {'name': 'Karl Farrow', 'company': 'CeraPhi Energy'}
Status Code: 404
Response: <!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Error</title>
</head>
<body>
<pre>Cannot POST /expert-speakers/update/11/12</pre>
</body>
</html>

--------------------------------------------------
https://ai-demo.genetechz.com/api/expert-speakers/update/11/12/13
Sending: {'name': 'Mazda Irani', 'company': 'Ashaw Energy'}
Status Code: 404
Response: <!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Error</title>
</head>
<body>
<pre>Cannot POST /expert-speakers/update/11/12/13</pre>
</body>
</html>

--------------------------------------------------


# Crop Section From PDF For Past Attendees AndExpert Speaker

In [117]:
from pdf2image import convert_from_path
from PIL import Image

def get_page_image(pdf_path, page_num):
    images = convert_from_path(
        pdf_path,
        first_page=page_num,
        last_page=page_num,
        poppler_path=r"C:\Users\ali.zain\Desktop\poppler-24.08.0\Library\bin"
    )
    if images:
        return images[0]  # Return PIL.Image.Image object
    return None
    
file_path = r"C:\Users\ali.zain\Desktop\Content_Extraction\content.pdf"
img = get_page_image(file_path, 2)

if img:
# Example coordinates (left, top, right, bottom)
    crop_box = (886, 11100, 3250, 12347)
    cropped = img.crop(crop_box)

    cropped.show()
cropped.save(r"C:\Users\ali.zain\Desktop\Content_Extraction\research\crop_image.jpg")

# Past Attendees

In [136]:
def build_prompt():
    return """
    You are given an image that contains different sections, including "Expert Speakers" on the left and "Past Attendees" on the right.

Your task is to carefully analyze the image and extract the names listed under the "Past Attendees" section only.

There are exactly  past attendees in this section.

Ignore other sections such as "Expert Speakers" or registration information.

Present the extracted names in a structured JSON format as follows:

{
  "Past Attendees": [
    { "name": "Attendee 1" },
    { "name": "Attendee 2" },
    { "name": "Attendee 3" },
    { "name": "Attendee 4" },
    { "name": "Attendee 5" },
    { "name": "Attendee 6" },
    { "name": "Attendee 7" },
    { "name": "Attendee 8" },
    { "name": "Attendee 9" }
  ]
}

Ensure the names are captured exactly as they appear in the image without alterations.
"""

In [137]:
from PIL import Image
import google.generativeai as genai
import io

# 1. Configure API key
genai.configure(api_key="AIzaSyDDoos-ITDh0hl694HB2um_iqdu36jREAw")

# 2. Load the model
model = genai.GenerativeModel("gemini-2.5-flash")

# 3. Open image and convert to bytes
with Image.open(r"C:\Users\ali.zain\Desktop\Content_Extraction\research\crop_image.jpg") as img:
    img_bytes = io.BytesIO()
    img.save(img_bytes, format="PNG")
    img_bytes.seek(0)

query=build_prompt()
# 4. Ask Gemini with image + text prompt
response = model.generate_content([
    {
        "mime_type": "image/png",
        "data": img_bytes.read()
    },
    query
])
# 5. Print response
print(response.text)
with open("past_attendees.json", "w", encoding="utf-8") as f:
    f.write(response.text)

```json
{
  "Past Attendees": [
    { "name": "Ali" },
    { "name": "Pioneer Natural Resources" },
    { "name": "CG Thermal" },
    { "name": "Chevron" },
    { "name": "ExxonMobil" },
    { "name": "TotalEnergies" },
    { "name": "Technip Energies" },
    { "name": "Zain" },
    { "name": "SLB" }
  ]
}
```


In [138]:
import json

file_path = r"C:\Users\ali.zain\Desktop\Content_Extraction\research\past_attendees.json"
# Step 1: Read raw file text
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read().strip()

# Step 2: Remove wrapping triple quotes if present
if raw_text.startswith("```json"):
    raw_text = raw_text[len("```json"):].strip()
if raw_text.endswith("```"):
    raw_text = raw_text[:-3].strip()

# Step 3: Load cleaned JSON
data = json.loads(raw_text)

In [142]:
import json
import requests

# Your API endpoint
url = "https://ai-demo.genetechz.com/api/past-attendences"


# Loop through expert speakers and send POST request for each
for speaker in data["Past Attendees"]:
    payload = {
        "attendees": speaker["name"],
    }

    response = requests.post(url, json=payload)

    print(f"Sending: {payload}")
    print(f"Status Code: {response.status_code}")
    print(f"Response: {response.text}")
    print("-" * 50)

Sending: {'attendees': 'Ali'}
Status Code: 200
Response: {"msg":"Past Attendees Added"}
--------------------------------------------------
Sending: {'attendees': 'Pioneer Natural Resources'}
Status Code: 200
Response: {"msg":"Past Attendees Added"}
--------------------------------------------------
Sending: {'attendees': 'CG Thermal'}
Status Code: 200
Response: {"msg":"Past Attendees Added"}
--------------------------------------------------
Sending: {'attendees': 'Chevron'}
Status Code: 200
Response: {"msg":"Past Attendees Added"}
--------------------------------------------------
Sending: {'attendees': 'ExxonMobil'}
Status Code: 200
Response: {"msg":"Past Attendees Added"}
--------------------------------------------------
Sending: {'attendees': 'TotalEnergies'}
Status Code: 200
Response: {"msg":"Past Attendees Added"}
--------------------------------------------------
Sending: {'attendees': 'Technip Energies'}
Status Code: 200
Response: {"msg":"Past Attendees Added"}
---------------

# Testimonal

In [None]:
from pdf2image import convert_from_path
from PIL import Image

def get_page_image(pdf_path, page_num):
    images = convert_from_path(
    pdf_path,
    first_page=page_num,
    last_page=page_num,
    poppler_path=r"C:\Users\ali.zain\Desktop\poppler-24.08.0\Library\bin"
    )
    if images:
        return images[0]
    return None

# Load page
file=r"C:\Users\ali.zain\Desktop\Content_Extraction\content.pdf"
img = get_page_image(file, 2)

if img:
    crop_box = (974, 13078, 4347, 17505)
    cropped = img.crop(crop_box)
cropped.save(r"C:\Users\ali.zain\Desktop\Content_Extraction\research\Testimonial.png")

In [144]:
def build_prompt():
    return """
    You are given an image that contains multiple testimonials. Each testimonial consists of three elements:

The testimonial text (inside quotes).

The name of the person.

The company/organization name (immediately below the name).

Your task:

Extract all testimonials in the exact order specified below.

The order of extraction is strictly top-right first → then middle section → then bottom-left last.

Do not shuffle, reorder, or skip any testimonial. Follow the sequence exactly as it appears by position.

Output format: Return the results as a JSON array with the following structure:

{
  "testimonial": [
    { "name": "Attendee 1", "company": "Company 1", "text": "Testimonial text 1" },
    { "name": "Attendee 2", "company": "Company 2", "text": "Testimonial text 2" },
    { "name": "Attendee 3", "company": "Company 3", "text": "Testimonial text 3" },
    { "name": "Attendee 4", "company": "Company 4", "text": "Testimonial text 4" },
    { "name": "Attendee 5", "company": "Company 5", "text": "Testimonial text 5" },
    { "name": "Attendee 6", "company": "Company 6", "text": "Testimonial text 6" }
  ]
}
Important:
Preserve the exact spelling, formatting, and wording of names and companies as shown in the image.
Do not modify or normalize text.
Ensure every testimonial is included in the correct order.
"""

In [145]:
from PIL import Image
import google.generativeai as genai
import io

# 1. Configure API key
genai.configure(api_key="AIzaSyDDoos-ITDh0hl694HB2um_iqdu36jREAw")

# 2. Load the model
model = genai.GenerativeModel("gemma-3-4b-it")

# 3. Open image and convert to bytes
with Image.open(r"C:\Users\ali.zain\Desktop\Content_Extraction\research\Testimonial.png") as img:
    img_bytes = io.BytesIO()
    img.save(img_bytes, format="PNG")
    img_bytes.seek(0)

query=build_prompt()
# 4. Ask Gemini with image + text prompt
response = model.generate_content([
    {
        "mime_type": "image/png",
        "data": img_bytes.read()
    },
    query
])
# 5. Print response
print(response.text)
with open("Testimonial.json", "w", encoding="utf-8") as f:
    f.write(response.text)

```json
{
  "testimonial": [
    {
      "name": "Jennifer Zwarch",
      "company": "Alberta Energy Regulator",
      "text": "Incredible learnings into various technologies and the state of the industry."
    },
    {
      "name": "Don Mack",
      "company": "Siemens Industry, Inc.",
      "text": "Great insights into the industry from speakers and highly engaging panel sessions."
    },
    {
      "name": "John Agele",
      "company": "Weatherford",
      "text": "Very informative event; enjoyed discovering different approaches and perspectives."
    },
    {
      "name": "Josh Brownlow",
      "company": "Pioneer Natural Resources",
      "text": "Excellent chance to engage with current industry leaders and experts."
    },
    {
      "name": "Pater Vaet",
      "company": "Go2Lithium",
      "text": "A remarkable event addressing a range of key industry issues."
    },
    {
      "name": "Michelle Mashava",
      "company": "E3 Lithium",
      "text": "Highly knowledgeable 

In [146]:
import json

file_path = r"C:\Users\ali.zain\Desktop\Content_Extraction\research\Testimonial.json"
# Step 1: Read raw file text
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read().strip()

# Step 2: Remove wrapping triple quotes if present
if raw_text.startswith("```json"):
    raw_text = raw_text[len("```json"):].strip()
if raw_text.endswith("```"):
    raw_text = raw_text[:-3].strip()

# Step 3: Load cleaned JSON
data = json.loads(raw_text)

In [147]:
import json
import requests

# Your API endpoint

count=1
# Loop through expert speakers and send POST request for each
for speaker in data["testimonial"]:
    url = "https://ai-demo.genetechz.com/api/testimonials/update"

    payload = {
        "name": speaker["name"],
        "company": speaker["company"],
        "testimonial": speaker["text"],
        "title": speaker.get("title", "Lorem"),   # keep optional if not always available
        "ishome": speaker.get("ishome", "1")
    }
    url+=f"/{count}"
    print(url)
    response = requests.post(url, json=payload)
    count+=1

    print(f"Sending: {payload}")
    print(f"Status Code: {response.status_code}")
    print(f"Response: {response.text}")
    print("-" * 50)


https://ai-demo.genetechz.com/api/testimonials/update/1
Sending: {'name': 'Jennifer Zwarch', 'company': 'Alberta Energy Regulator', 'testimonial': 'Incredible learnings into various technologies and the state of the industry.', 'title': 'Lorem', 'ishome': '1'}
Status Code: 200
Response: {"message":"Testimonials Updated"}
--------------------------------------------------
https://ai-demo.genetechz.com/api/testimonials/update/2
Sending: {'name': 'Don Mack', 'company': 'Siemens Industry, Inc.', 'testimonial': 'Great insights into the industry from speakers and highly engaging panel sessions.', 'title': 'Lorem', 'ishome': '1'}
Status Code: 200
Response: {"message":"Testimonials Updated"}
--------------------------------------------------
https://ai-demo.genetechz.com/api/testimonials/update/3
Sending: {'name': 'John Agele', 'company': 'Weatherford', 'testimonial': 'Very informative event; enjoyed discovering different approaches and perspectives.', 'title': 'Lorem', 'ishome': '1'}
Status C

# related events in the series

In [148]:
from pdf2image import convert_from_path
from PIL import Image

def get_page_image(pdf_path, page_num):
    images = convert_from_path(
    pdf_path,
    first_page=page_num,
    last_page=page_num,
    poppler_path=r"C:\Users\ali.zain\Desktop\poppler-24.08.0\Library\bin"
    )
    if images:
        return images[0]
    return None

# Load page
file=r"C:\Users\ali.zain\Desktop\Content_Extraction\content.pdf"
img = get_page_image(file, 2)

if img:
    crop_box = (955, 20260, 4413, 21854)
    cropped = img.crop(crop_box)

cropped.show()
cropped.save(r"C:\Users\ali.zain\Desktop\Content_Extraction\research\RelatedEvents.png")



In [None]:
def build_prompt():
    return """
    You are an expert at extracting structured data from images. Analyze the provided image, which shows a section of upcoming events displayed as cards. There are multiple event cards visible.
For each event card, extract the following information and output it as a JSON array of objects. Each object should represent one event and include exactly these keys:

"eventname": The full name of the event, including any year or edition (e.g., "Direct Lithium Extraction USA 2025"). If no full name is visible, use the abbreviation or code as a fallback.
"eventlocation": The location in the format "City, State, Country" (e.g., "Orange County, California, USA"). If incomplete, use what's available.
"eventlink": If a hyperlink or URL is visible or inferable from the image, include it; otherwise, set to null.
"eventdate": The date in the format "MM DD - DD, YYYY" if specific days are given, or a more general format like "December 1 - 2,2025" if that's what's shown. Include any ordinal indicators like "1st" or "OR" alternatives if present, but prioritize the most complete date.
"image": If an image path or filename is visible or referenced in the card, include it; otherwise, set to null.
"hoverimage": If a hover image path or reference is visible, include it; otherwise, set to null.

Output only the JSON array, nothing else. Ensure the JSON is valid and well-formatted. If any field cannot be determined, set it to null. If there are multiple events, list them in the order they appear from left to right.
data = [
#   {
#     "eventname": ,
#     "eventlocation": ",
#     "eventlink": "",
#     "eventdate": ,
#     "image": None,
#     "hoverimage": None
#   },
#   {
#     "eventname":,
#     "eventlocation": ,
#     "eventlink": " ",
#     "eventdate": ,
#     "image": None,
#     "hoverimage": None
#   },
#   {
#     "eventname": ,
#     "eventlocation": ,
#     "eventlink": "",
#     "eventdate": ,
#     "image": None,
#     "hoverimage": None
#   }
# ]
"""

In [194]:
from PIL import Image
import google.generativeai as genai
import io

# 1. Configure API key
genai.configure(api_key="AIzaSyDDoos-ITDh0hl694HB2um_iqdu36jREAw")

# 2. Load the model
model = genai.GenerativeModel("gemma-3-4b-it")

# 3. Open image and convert to bytes
with Image.open(r"C:\Users\ali.zain\Desktop\Content_Extraction\research\RelatedEvents.png") as img:
    img_bytes = io.BytesIO()
    img.save(img_bytes, format="PNG")
    img_bytes.seek(0)

query=build_prompt()
# 4. Ask Gemini with image + text prompt
response = model.generate_content([
    {
        "mime_type": "image/png",
        "data": img_bytes.read()
    },
    query
])
# 5. Print response
print(response.text)
with open("RelatedEvents.json", "w", encoding="utf-8") as f:
    f.write(response.text)

```json
[
  {
    "eventname": "Direct Lithium Extraction USA 2025",
    "eventlocation": "Orange County, California, USA",
    "eventlink": null,
    "eventdate": "December 1st-2nd, 2025",
    "image": null,
    "hoverimage": null
  },
  {
    "eventname": "Reservoir Simulation 2025",
    "eventlocation": "Houston, Texas, USA",
    "eventlink": null,
    "eventdate": "December 3rd-4th, 2025",
    "image": null,
    "hoverimage": null
  },
  {
    "eventname": "Frac Sand USA 2026",
    "eventlocation": "Houston, Texas, USA",
    "eventlink": null,
    "eventdate": "December 11th-12th, 2025",
    "image": null,
    "hoverimage": null
  }
]
```


In [195]:
import json

file_path = r"C:\Users\ali.zain\Desktop\Content_Extraction\research\RelatedEvents.json"
# Step 1: Read raw file text
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read().strip()

# Step 2: Remove wrapping triple quotes if present
if raw_text.startswith("```json"):
    raw_text = raw_text[len("```json"):].strip()
if raw_text.endswith("```"):
    raw_text = raw_text[:-3].strip()

# Step 3: Load cleaned JSON
data = json.loads(raw_text)

In [200]:
import json
import requests

count = 4
for speaker in data:
    url = f"https://ai-demo.genetechz.com/api/upcoming-events/update/{count}"

    payload = {
        "eventname": speaker["eventname"],
        "eventlocation": speaker["eventlocation"],
        "eventlink": speaker["eventlink"],
        "eventdate": speaker["eventdate"],
        "image": speaker["image"],
        "hoverimage": speaker["hoverimage"]
    }

    print(f"Sending request to: {url}")
    response = requests.post(url, json=payload)

    if response.status_code == 200:
        print(f"✅ Success (Event {count})")
    else:
        print(f"❌ Failed (Event {count})")

    print(f"Payload: {payload}")
    print(f"Response: {response.text}")
    print("-" * 50)

    count += 1


Sending request to: https://ai-demo.genetechz.com/api/upcoming-events/update/4
✅ Success (Event 4)
Payload: {'eventname': 'Direct Lithium Extraction USA 2025', 'eventlocation': 'Orange County, California, USA', 'eventlink': None, 'eventdate': 'December 1st-2nd, 2025', 'image': None, 'hoverimage': None}
Response: {"message":"Upcoming Events Updated"}
--------------------------------------------------
Sending request to: https://ai-demo.genetechz.com/api/upcoming-events/update/5
✅ Success (Event 5)
Payload: {'eventname': 'Reservoir Simulation 2025', 'eventlocation': 'Houston, Texas, USA', 'eventlink': None, 'eventdate': 'December 3rd-4th, 2025', 'image': None, 'hoverimage': None}
Response: {"message":"Upcoming Events Updated"}
--------------------------------------------------
Sending request to: https://ai-demo.genetechz.com/api/upcoming-events/update/6
✅ Success (Event 6)
Payload: {'eventname': 'Frac Sand USA 2026', 'eventlocation': 'Houston, Texas, USA', 'eventlink': None, 'eventdate'

# Streamlit website