# 3-Stage Training Document Generation

This notebook implements a 3-stage process for generating training documents:
1. Use Gemini to analyze video and extract knowledge points
2. Use Gemini to select timestamps for screenshots (3 separate API calls)
3. Use OpenAI GPT-4o to curate screenshots and captions
4. Generate final DOCX document

In [1]:
# Import necessary libraries
import os
import json
import time
from google import genai
from openai import OpenAI
from dotenv import load_dotenv
import base64
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
import re
from IPython.display import display, Image

# Load environment variables with API keys
load_dotenv()

# Initialize API clients
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

gemni_client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))
openai_client = OpenAI(api_key=OPENAI_API_KEY)

# Import prompts
from prompts_Three_Stage import stage_1_prompt100, stage_2_prompt100, stage_3_prompt100

## Configure Paths and Settings

In [2]:
# Configure paths and settings
video_path = "/Users/chaozhang/Downloads/AI KT/KT Recording/modify table in EDW using git.mp4"
job_id = int(time.time())  # Generate a unique ID for this job

# Create directories for outputs
base_folder = f"training_job_{job_id}"
os.makedirs(base_folder, exist_ok=True)

# Folders for the 3 API attempts in stage 2
screenshots_folders = [
    os.path.join(base_folder, f"screenshots_attempt_{i+1}") for i in range(3)
]
for folder in screenshots_folders:
    os.makedirs(folder, exist_ok=True)

# Final output paths
output_json_path = os.path.join(base_folder, "training_data.json")
output_docx_path = os.path.join(base_folder, "training_document.docx")

print(f"Job ID: {job_id}")
print(f"Output folder: {base_folder}")

Job ID: 1744000241
Output folder: training_job_1744000241


## Helper Functions

In [5]:
# Helper functions

def extract_screenshots(video_path, timestamps, output_folder, knowledge_point_index, api_attempt_index):
    """Extract screenshots from video at given timestamps"""
    screenshot_paths = []
    
    try:
        # Open the video file
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            print(f"Could not open video file {video_path}")
            return []
        
        # Get video properties
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        duration = total_frames / fps if fps > 0 else 0
        
        print(f"Video properties: Duration={duration:.2f}s, FPS={fps:.2f}, Total frames={total_frames}")
        
        # Process each timestamp
        for screenshot_index, timestamp in enumerate(timestamps):
            try:
                # Parse timestamp (assuming format like "1:30")
                if ':' in timestamp:
                    minutes, seconds = timestamp.split(':')
                    time_in_seconds = int(minutes) * 60 + float(seconds)
                else:
                    # If only seconds are provided
                    time_in_seconds = float(timestamp)
                
                # Skip if timestamp is beyond video duration
                if duration > 0 and time_in_seconds > duration:
                    print(f"Timestamp {timestamp} exceeds video duration of {duration:.2f}s")
                    continue
                
                # Set the frame position
                cap.set(cv2.CAP_PROP_POS_MSEC, time_in_seconds * 1000)
                
                # Read the frame
                success, frame = cap.read()
                if success:
                    # Generate filename using the specified format
                    screenshot_filename = f"{knowledge_point_index+1}_{screenshot_index+1}_{api_attempt_index+1}.png"
                    screenshot_path = os.path.join(output_folder, screenshot_filename)
                    
                    # Save the frame
                    cv2.imwrite(screenshot_path, frame)
                    screenshot_paths.append(screenshot_path)
                    print(f"Saved screenshot: {screenshot_path}")
                else:
                    print(f"Failed to capture screenshot at timestamp {timestamp}")
            
            except Exception as e:
                print(f"Error processing timestamp {timestamp}: {str(e)}")
        
        # Release the video capture
        cap.release()
        
    except Exception as e:
        print(f"Error in extract_screenshots: {str(e)}")
    
    return screenshot_paths

def image_to_base64(image_path):
    """Convert an image file to base64 encoded string"""
    try:
        with open(image_path, "rb") as image_file:
            encoded_string = base64.b64encode(image_file.read()).decode('utf-8')
            return encoded_string
    except Exception as e:
        print(f"Error encoding image to base64: {str(e)}")
        return None

def parse_gemini_response(response_text):
    """Parse the response from Gemini to extract JSON"""
    try:
        # Try to parse the entire response as JSON
        return json.loads(response_text)
    except json.JSONDecodeError:
        # If that fails, look for JSON content in markdown code blocks
        json_pattern = r'```(?:json)?\s*([\s\S]*?)\s*```'
        matches = re.findall(json_pattern, response_text)
        
        if matches:
            try:
                return json.loads(matches[0])
            except json.JSONDecodeError:
                print(f"Failed to parse JSON from code block: {matches[0]}")
                
        # If no code blocks or parsing failed, try to find JSON-like structures
        start_idx = response_text.find('{')
        end_idx = response_text.rfind('}')
        if start_idx >= 0 and end_idx > start_idx:
            try:
                json_str = response_text[start_idx:end_idx+1]
                return json.loads(json_str)
            except json.JSONDecodeError:
                print(f"Failed to parse JSON-like structure: {json_str}")
                
        print("Could not extract valid JSON from response")
        print("Response text:")
        print(response_text)
        return None

## Stage 1: Use Gemini to Extract Knowledge Points from Video

In [6]:
from ai_service import generate_training_document

# Run Stage 1
user_prompt = "Create the training document for this video"
stage1_result_response = generate_training_document(stage_1_prompt100, user_prompt, video_path)

try:
    stage1_result = json.loads(stage1_result_response)
    with open(os.path.join(base_folder, f"stage1_result.json"), 'w') as f:
        json.dump(stage1_result, f, indent=2)
except:
    print("Failed to parse stage 1 result")
    stage1_result = stage1_result_response
    with open(os.path.join(base_folder, f"stage1_result_raw.txt"), 'w') as f:
        f.write(stage1_result)

In [8]:
len(stage1_result['knowledge_points'])

29

## Stage 2: Use Gemini to Select Timestamps (3 Attempts)

In [9]:
# Stage 2: Use Gemini to select timestamps for knowledge points
from ai_service import generate_training_document

def stage2_select_timestamps(video_path, stage1_result):
    print("=== Stage 2: Selecting Timestamps for Screenshots ===")
    
    # Check if we have the required data from Stage 1
    if not stage1_result or 'knowledge_points' not in stage1_result:
        print("Error: Missing required data from Stage 1")
        return None

    # Create prompt with video and knowledge points
    knowledge_points = stage1_result['knowledge_points']
    
    # Replace placeholder in prompt template
    prompt = stage_2_prompt100.replace("{{summary_from_stage_1}}", json.dumps(stage1_result.get('summary', '')))
      
    # Run 3 separate API calls and collect timestamps
    all_attempt_results = []
    all_screenshot_paths = []
    
    for attempt in range(3):
        print(f"\nAttempt {attempt+1}/3: Calling Gemini API for timestamp selection...")
        
        # Call the Gemini API
        user_prompt = f'''
        Provide timestamps of screenshots for demonstrating each knowledge point in below list 
        {knowledge_points}
        '''
        response_text = generate_training_document(prompt, user_prompt, video_path)
        
        # Save the raw response
        raw_response_path = os.path.join(base_folder, f"stage2_raw_response_attempt_{attempt+1}.txt")
        with open(raw_response_path, 'w') as f:
            f.write(response_text)
        
        timestamps_data = json.loads(response_text)
        
        if not timestamps_data or not isinstance(timestamps_data, list):
            print(f"Attempt {attempt+1}: Failed to parse response from Gemini")
            continue
        
        # Save parsed response
        parsed_response_path = os.path.join(base_folder, f"stage2_parsed_response_attempt_{attempt+1}.json")
        with open(parsed_response_path, 'w') as f:
            json.dump(timestamps_data, f, indent=2)
        
        print(f"Attempt {attempt+1}: Successfully parsed response")
        
        # Extract timestamps for each knowledge point
        attempt_screenshots = []
        
        # Extract screenshots based on timestamps
        for knowlege_point in timestamps_data:
            for knowledge_point_index, timestamps in knowlege_point.items():
                try:
                    # Convert string index to integer if needed
                    if isinstance(knowledge_point_index, str) and knowledge_point_index.isdigit():
                        knowledge_point_index = int(knowledge_point_index)
                    
                        # Extract screenshots
                        print(f"Extracting screenshots for knowledge point {knowledge_point_index+1}")
                        screenshot_paths = extract_screenshots(
                            video_path, timestamps, screenshots_folders[attempt],
                            knowledge_point_index, attempt
                        )
                        
                        attempt_screenshots.append({
                            "knowledge_point_index": knowledge_point_index,
                            "timestamps": timestamps,
                            "screenshot_paths": screenshot_paths
                        })
                except Exception as e:
                    print(f"Error processing knowledge point {knowledge_point_index}: {str(e)}")
        
        all_attempt_results.append(attempt_screenshots)
        
        # Collect all screenshot paths
        all_paths = []
        for item in attempt_screenshots:
            all_paths.extend(item["screenshot_paths"])
        all_screenshot_paths.extend(all_paths)
        
        print(f"Attempt {attempt+1}: Extracted {len(all_paths)} screenshots")
    
    # Save combined results
    stage2_result = {
        "attempt_results": all_attempt_results,
        "all_screenshot_paths": all_screenshot_paths
    }
    
    stage2_result_path = os.path.join(base_folder, "stage2_result.json")
    with open(stage2_result_path, 'w') as f:
        json.dump(stage2_result, f, indent=2)
    
    print(f"\nStage 2 completed with {len(all_screenshot_paths)} total screenshots across 3 attempts")
    return stage2_result

# Run Stage 2
stage2_result = stage2_select_timestamps(video_path, stage1_result)

=== Stage 2: Selecting Timestamps for Screenshots ===

Attempt 1/3: Calling Gemini API for timestamp selection...


Response is not valid JSON after cleaning: Extra data: line 6 column 4 (char 49)
Invalid JSON response (first 500 chars): {
    "0": [
      "0:16",
      "0:21"
    ]
  },
  {
    "1": [
      "0:24",
      "0:42"
    ]
  },
  {
    "2": [
      "0:58",
      "1:02"
    ]
  },
  {
    "3": [
      "1:06",
      "1:17"
    ]
  },
  {
    "4": [
      "1:33",
      "1:55"
    ]
  },
  {
    "5": [
      "2:01",
      "2:23"
    ]
  },
  {
    "6": [
      "2:33",
      "2:39"
    ]
  },
  {
    "7": [
      "2:40",
      "2:44"
    ]
  },
  {
    "8": [
      "2:44",
      "2:47"
    ]
  },
  {
    "9": [
      "3:0


JSONDecodeError: Extra data: line 6 column 4 (char 49)

## Stage 3: Use o1 to Curate Screenshots

In [54]:
knowledge_points = stage1_result_json['knowledge_points']
curated_results = []


# Process each knowledge point
for knowledge_point_index, knowledge_point in enumerate(knowledge_points):
    
    # Collect all screenshots for this knowledge point from all attempts
    point_screenshots = []
    point_screenshot_ids = []
    
    for attempt_index, attempt_result in enumerate(stage2_result['attempt_results']):
        for item in attempt_result:
            if item["knowledge_point_index"] == knowledge_point_index:
                for screenshot_index, path in enumerate(item.get("screenshot_paths", [])):
                    # Extract base filename as ID
                    filename = os.path.basename(path)
                    name_without_ext = os.path.splitext(filename)[0]
                    print(name_without_ext)
                    
                    point_screenshots.append(path)
                    point_screenshot_ids.append(name_without_ext)

point_screenshot_ids

1_1_1
1_2_1
1_1_2
1_2_2
1_1_3
1_2_3
2_1_1
2_2_1
2_1_2
2_2_2
2_1_3
2_2_3
3_1_1
3_2_1
3_1_2
3_2_2
3_1_3
3_2_3
4_1_1
4_2_1
4_1_2
4_2_2
4_1_3
4_2_3
5_1_1
5_2_1
5_1_2
5_2_2
5_1_3
5_2_3
6_1_1
6_2_1
6_1_2
6_2_2
6_1_3
6_2_3
7_1_1
7_2_1
7_1_2
7_2_2
7_1_3
7_2_3
8_1_1
8_2_1
8_1_2
8_2_2
8_1_3
8_2_3
9_1_1
9_2_1
9_1_2
9_2_2
9_1_3
9_2_3
10_1_1
10_2_1
10_1_2
10_2_2
10_1_3
10_2_3
11_1_1
11_2_1
11_1_2
11_2_2
11_1_3
11_2_3
12_1_1
12_2_1
12_1_2
12_2_2
12_1_3
12_2_3
13_1_1
13_2_1
13_1_2
13_2_2
14_1_1
14_2_1
14_1_2
14_2_2
15_1_1
15_2_1
15_1_2
15_2_2
16_1_1
16_2_1
16_1_2
16_2_2


[]

In [57]:
# Stage 3: Use GPT-4o to curate screenshots

def stage3_curate_screenshots(stage1_result, stage2_result):
    print("=== Stage 3: Curating Screenshots with o1 ===")
    
    # Check if we have the required data from Stages 1 and 2
    if not stage1_result or 'knowledge_points' not in stage1_result:
        print("Error: Missing required data from Stage 1")
        return None
    
    if not stage2_result or 'attempt_results' not in stage2_result:
        print("Error: Missing required data from Stage 2")
        return None
    
    # Organize screenshots by knowledge point
    knowledge_points = stage1_result['knowledge_points']
    curated_results = []
    
    # Replace placeholder in prompt template
    prompt_template = stage_3_prompt100.replace("{{summary_from_stage_1}}", stage1_result.get('Summary', ''))
    
    # Process each knowledge point
    for knowledge_point_index, knowledge_point in enumerate(knowledge_points):
        print(f"\nProcessing knowledge point {knowledge_point_index+1}/{len(knowledge_points)}")
        print(f"Knowledge point: {knowledge_point[:100]}..." if len(knowledge_point) > 100 else knowledge_point)
        
        # Collect all screenshots for this knowledge point from all attempts
        point_screenshots = []
        point_screenshot_ids = []
        
        for attempt_result in stage2_result['attempt_results']:
            for item in attempt_result:
                if item["knowledge_point_index"] == knowledge_point_index:
                    for screenshot_index, path in enumerate(item.get("screenshot_paths", [])):
                        # Extract base filename as ID
                        filename = os.path.basename(path)
                        name_without_ext = os.path.splitext(filename)[0]
                        
                        point_screenshots.append(path)
                        point_screenshot_ids.append(name_without_ext)
        
        # If no screenshots found, skip this knowledge point
        if not point_screenshots:
            print(f"No screenshots found for knowledge point {knowledge_point_index+1}")
            curated_results.append({
                "knowledge_point_index": knowledge_point_index,
                "knowledge_point": knowledge_point,
                "screenshots": [],
                "captions": []
            })
            continue
        
        print(f"Found {len(point_screenshots)} screenshots for curation")
        
        # Check if there's a reasonable number of screenshots to process
        if len(point_screenshots) > 20:
            print(f"Warning: Large number of screenshots ({len(point_screenshots)}). Processing may take time.")
        
        # Create the input for GPT-4o
        user_content = [
            {"type": "text", "text": f"Knowledge point: {knowledge_point}\n\nBelow are screenshots to curate:"}
        ]
        
        # Add images to the content
        for path, screenshot_id in zip(point_screenshots, point_screenshot_ids):
            base64_image = image_to_base64(path)
            if base64_image:
                user_content.append({
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{base64_image}"
                    }
                })
                user_content.append({"type": "text", "text": f"Image ID: {screenshot_id}"})  
        
        # Call GPT-4o
        print(f"Calling o1 API for curation...")
        try:
            response = openai_client.chat.completions.create(
                model="o1",
                messages=[
                    {"role": "system", "content": prompt_template},
                    {"role": "user", "content": user_content}
                ],
                #max_tokens=4000
                reasoning_effort="low"
            )
            
            response_text = response.choices[0].message.content
            
            # Save the raw response
            raw_response_path = os.path.join(base_folder, f"stage3_raw_response_point_{knowledge_point_index+1}.txt")
            with open(raw_response_path, 'w') as f:
                f.write(response_text)
            
            # Parse the response to get curated screenshots
            curated_data = parse_gemini_response(response_text)  # Reusing the same parsing function
            
            if not curated_data or not isinstance(curated_data, dict):
                print(f"Failed to parse GPT-4o response for point {knowledge_point_index+1}")
                curated_results.append({
                    "knowledge_point_index": knowledge_point_index,
                    "knowledge_point": knowledge_point,
                    "screenshot_groups": [],
                    "selected_screenshots": [],
                    "captions": []
                })
                continue
            
            # Extract selected screenshots
            selected_ids = curated_data.get("selected_indexes", [])
            captions = curated_data.get("captions", [])
            groups = curated_data.get("groups", [])
            
            # Map IDs back to file paths
            selected_paths = []
            for selected_id in selected_ids:
                # Find the matching screenshot path
                found = False
                for i, id_val in enumerate(point_screenshot_ids):
                    if id_val == selected_id:
                        selected_paths.append(point_screenshots[i])
                        found = True
                        break
                
                if not found:
                    print(f"Warning: Selected ID {selected_id} not found in screenshots")
            
            print(f"o1 selected {len(selected_paths)} out of {len(point_screenshots)} screenshots")
            
            # Add results for this knowledge point
            curated_results.append({
                "knowledge_point_index": knowledge_point_index,
                "knowledge_point": knowledge_point,
                "screenshot_groups": groups,
                "selected_screenshots": selected_paths,
                "selected_ids": selected_ids,
                "captions": captions
            })
            
        except Exception as e:
            print(f"Error calling GPT-4o API: {str(e)}")
            # Add empty result for this knowledge point
            curated_results.append({
                "knowledge_point_index": knowledge_point_index,
                "knowledge_point": knowledge_point,
                "screenshot_groups": [],
                "selected_screenshots": [],
                "captions": []
            })
    
    # Save combined results
    stage3_result = {
        "curated_knowledge_points": curated_results,
    }
    
    stage3_result_path = os.path.join(base_folder, "stage3_result.json")
    with open(stage3_result_path, 'w') as f:
        json.dump(stage3_result, f, indent=2)
    
    # Count total selected screenshots
    total_selected = sum(len(item.get("selected_screenshots", [])) for item in curated_results)
    print(f"\nStage 3 completed with {total_selected} selected screenshots across {len(curated_results)} knowledge points")
    return stage3_result

# Run Stage 3
stage3_result = stage3_curate_screenshots(stage1_result_json, stage2_result)

=== Stage 3: Curating Screenshots with o1 ===

Processing knowledge point 1/25
Open Visual Studio Code.
Found 6 screenshots for curation
Calling o1 API for curation...
GPT-4o selected 2 out of 6 screenshots

Processing knowledge point 2/25
Open the Git repository folder in Visual Studio Code.
Found 6 screenshots for curation
Calling o1 API for curation...
GPT-4o selected 2 out of 6 screenshots

Processing knowledge point 3/25
Switch to the develop branch.
Found 6 screenshots for curation
Calling o1 API for curation...
GPT-4o selected 2 out of 6 screenshots

Processing knowledge point 4/25
Pull the latest version of the develop branch to get the latest changes.
Found 6 screenshots for curation
Calling o1 API for curation...
GPT-4o selected 1 out of 6 screenshots

Processing knowledge point 5/25
Create a new branch from the develop branch for making changes.
Found 6 screenshots for curation
Calling o1 API for curation...
GPT-4o selected 2 out of 6 screenshots

Processing knowledge point 

In [63]:
curated_results

[]

In [None]:

# Display a sample of the curated results
if stage3_result and 'curated_knowledge_points' in stage3_result:
    curated_points = stage3_result['curated_knowledge_points']
    if curated_points:
        # Find a point with screenshots to display
        for point in curated_points:
            if point.get('selected_screenshots'):
                print(f"\nSample Curated Point - Knowledge Point {point['knowledge_point_index']+1}:")
                print(f"Knowledge point: {point['knowledge_point'][:100]}..." if len(point['knowledge_point']) > 100 else point['knowledge_point'])
                print(f"Selected {len(point['selected_screenshots'])} screenshots")
                
                # Display first screenshot and caption
                if point['selected_screenshots'] and point['captions']:
                    sample_path = point['selected_screenshots'][0]
                    caption = point['captions'][0] if 0 < len(point['captions']) else "No caption"
                    print(f"Caption: {caption}")
                    display(Image(filename=sample_path, width=400))
                break

## Generate Final DOCX Document

In [60]:
# Generate final DOCX document

def generate_final_document(stage1_result, stage3_result):
    print("=== Generating Final DOCX Document ===")
    
    # Check if we have the required data
    if not stage1_result or 'Summary' not in stage1_result:
        print("Error: Missing required data from Stage 1")
        return None
    
    if not stage3_result or 'curated_knowledge_points' not in stage3_result:
        print("Error: Missing required data from Stage 3")
        return None
    
    curated_points = stage3_result['curated_knowledge_points']
    
    # Create document
    document = Document()
    
    # Add title
    title = document.add_heading('Training Document', 0)
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER
    
    # Add Summary section
    document.add_heading('Summary', level=1)
    document.add_paragraph(stage1_result['Summary'])
    document.add_paragraph('')  # Add some space
    
    # Add knowledge points with screenshots
    document.add_heading('Knowledge Points', level=1)
    
    for point in curated_points:
        point_index = point['knowledge_point_index']
        knowledge_point = point['knowledge_point']
        
        # Add knowledge point as heading
        document.add_heading(f"{point_index+1}. {knowledge_point}", level=2)
        
        # Add screenshots with captions
        selected_screenshots = point.get('selected_screenshots', [])
        captions = point.get('captions', [])
        
        if not selected_screenshots:
            paragraph = document.add_paragraph("No relevant screenshots available for this knowledge point.")
            paragraph.style = 'Intense Quote'
            continue
            
        # Add screenshots with captions
        for i, (screenshot_path, caption) in enumerate(zip(selected_screenshots, captions)):
            try:
                # Add a separator between screenshots if not the first one
                if i > 0:
                    document.add_paragraph('')  # Add blank line between screenshots
                
                # Add the screenshot
                document.add_picture(screenshot_path, width=Inches(6.0))
                
                # Add the caption below the screenshot
                caption_paragraph = document.add_paragraph(f"Figure {point_index+1}.{i+1}: {caption}")
                caption_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
                caption_paragraph.style = 'Caption'
                
            except Exception as e:
                print(f"Error adding screenshot {screenshot_path}: {str(e)}")
                error_paragraph = document.add_paragraph(f"Error: Could not add screenshot {os.path.basename(screenshot_path)}")
                error_paragraph.style = 'Intense Quote'
    
    # Add a footer with timestamp
    section = document.sections[0]
    footer = section.footer
    footer_paragraph = footer.paragraphs[0] if footer.paragraphs else footer.add_paragraph()
    footer_paragraph.text = f"Generated on {time.strftime('%Y-%m-%d %H:%M:%S')}"
    footer_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
    
    # Save the document
    try:
        document.save(output_docx_path)
        print(f"Document successfully saved: {output_docx_path}")
        return output_docx_path
    except Exception as e:
        print(f"Error saving document: {str(e)}")
        return None

# Run Stage 4: Generate Final Document
if 'stage1_result_json' in locals() and 'stage3_result' in locals():
    output_path = generate_final_document(stage1_result_json, stage3_result)
    if output_path:
        print(f"\nFinal document generated and saved to: {output_path}")
else:
    print("Missing required data to generate final document. Please run all previous stages first.")

=== Generating Final DOCX Document ===
Document successfully saved: training_job_1743966405/training_document.docx

Final document generated and saved to: training_job_1743966405/training_document.docx


In [59]:
generate_final_document(stage1_result_json, stage3_result)

=== Generating Final DOCX Document ===
