# PDF Generation Workflow - Cleaned Version

This notebook processes resume-job matches and generates PDFs for different treatment types.

## Features:
- Configurable number of files to process
- Loops through all treatment types (control, Type_I, Type_II, Type_III)
- Saves the generated PDF links to a CSV file, together with all of the original input columns
- Appends new records to the existing CSV when the notebook is run again

## Configuration:
- Set `num_files_to_process` to control how many files to process
- Set `test_url` and `authorization` for the API endpoint
- Results are saved to `pdf_generation_results.csv`

In [None]:
import requests
import json
import pandas as pd
import os
from datetime import datetime

In [None]:
# Configuration
#
# All tunable settings for the notebook live in this cell.
num_files_to_process = 5  # Set this to control how many files to process

# Endpoint URL and the (username, password) pair passed as `auth` to requests.
# Each value can be overridden through an environment variable so that secrets
# do not have to be edited into the notebook; the literals are only fallbacks,
# which keeps the notebook working exactly as before when nothing is set.
# NOTE(review): the fallback credentials are still stored in plain text here --
# consider rotating them and removing the fallbacks once the variables are set.
test_url = os.environ.get(
    "PDF_WEBHOOK_URL",
    "https://prayag-is-dummy.app.n8n.cloud/webhook-test/ff2b5512-8aea-4eaf-b395-1c607c68d1a8",
)
authorization = (
    os.environ.get("PDF_WEBHOOK_USER", "prayag_purohit"),
    os.environ.get("PDF_WEBHOOK_PASSWORD", "Resumeaudit"),
)
output_csv = "pdf_generation_results.csv"  # File the result records are saved to

# Treatment types to process
treatment_types = ['control', 'Type_I', 'Type_II', 'Type_III']

# Echo the effective settings (credentials are intentionally not printed)
print("Configuration:")
print(f"- Files to process: {num_files_to_process}")
print(f"- Treatment types: {treatment_types}")
print(f"- Output CSV: {output_csv}")

In [None]:
# Load the job matches data
file_path = "Resume_study.resume_job_matches_filtered.csv"
job_matches_df = pd.read_csv(file_path)

# Rename columns to match our endpoint requirements.
# The original mapping only listed 'tile', which looks like a typo for 'title';
# DataFrame.rename silently skips keys that are not present, so a 'title' column
# would never have become 'job_title'. Both spellings are accepted here.
column_aliases = {
    'description': 'job_description',
    'tile': 'job_title',
    'title': 'job_title',
}

# Only rename a source column when it exists and its target name is not already
# taken, so two aliases can never produce duplicate column names.
rename_map = {}
for source_col, target_col in column_aliases.items():
    if source_col not in job_matches_df.columns:
        continue
    if target_col in job_matches_df.columns or target_col in rename_map.values():
        continue
    rename_map[source_col] = target_col
job_matches_df = job_matches_df.rename(columns=rename_map)

# Make a missing endpoint column visible instead of failing silently
missing_columns = sorted(set(column_aliases.values()) - set(job_matches_df.columns))
if missing_columns:
    print(f"Warning: expected column(s) not found after renaming: {missing_columns}")

print(f"Loaded {len(job_matches_df)} job matches")
print(f"Columns: {list(job_matches_df.columns)}")
job_matches_df.head()

In [None]:
# Collapse the job matches to a single row per file_id
unique_files_df = job_matches_df.drop_duplicates(subset=['file_id'])
print(f"Total unique files: {len(unique_files_df)}")

# Keep only as many files as the configuration asks for
files_to_process = unique_files_df.iloc[:num_files_to_process]
print("Files to process:")
for _, file_record in files_to_process.iterrows():
    print(f"  {file_record['file_id']}")

In [None]:
# Load existing results if available, so this run's records can be added to them
existing_results = []
if os.path.exists(output_csv):
    try:
        existing_results = pd.read_csv(output_csv).to_dict('records')
    except pd.errors.EmptyDataError:
        # The file exists but holds no columns at all (e.g. it was written from
        # an empty DataFrame); treat that the same as having no earlier results.
        print(f"{output_csv} exists but is empty. Starting with no existing results.")
    else:
        print(f"Loaded {len(existing_results)} existing results from {output_csv}")
else:
    print(f"No existing results found. Will create new {output_csv}")

# Initialize results list
new_results = []

In [None]:
# Process each file with each treatment type
#
# For every selected file, one POST request per treatment type is sent to the
# endpoint. Every attempt -- success, non-200 response, or exception -- appends
# exactly one record to `new_results`, so one bad request never aborts the run.

# (connect, read) timeouts in seconds. Without a timeout, requests waits
# indefinitely on an unresponsive server. The read timeout is deliberately
# generous. NOTE(review): tune it to how long the endpoint really needs.
request_timeout = (10, 300)


def _build_result_record(file_row, treatment_type, status, response_status,
                         pdf_link='', download_link='', google_drive_id='',
                         error_message=None):
    """Return one result row: outcome fields first, then the file's own columns.

    `error_message` is only added to the record when it is provided, so
    successful records carry no such key.
    """
    record = {
        'timestamp': datetime.now().isoformat(),
        'file_id': file_row['file_id'],
        'treatment_type': treatment_type,
        'pdf_link': pdf_link,
        'download_link': download_link,
        'google_drive_id': google_drive_id,
        'status': status,
        'response_status': response_status,
    }
    if error_message is not None:
        record['error_message'] = error_message

    # Add all original data from the file row without overwriting outcome fields
    for col, value in file_row.items():
        record.setdefault(col, value)
    return record


def _extract_links(response_data):
    """Return (pdf_link, download_link, google_drive_id) from the decoded JSON body.

    Raises ValueError when the body is not a JSON object, so the caller records
    a clear error instead of an incidental AttributeError.
    """
    if not isinstance(response_data, dict):
        raise ValueError(
            f"Unexpected response payload type: {type(response_data).__name__}"
        )
    if 'webViewLink' in response_data:
        return (
            response_data['webViewLink'],
            response_data.get('webContentLink', ''),
            response_data.get('id', ''),
        )
    # Fallback for older API response format
    return response_data.get('download_url', ''), '', ''


total_operations = len(files_to_process) * len(treatment_types)
current_operation = 0

print(f"Starting processing of {total_operations} operations...")
print("=" * 60)

for file_idx, (_, file_row) in enumerate(files_to_process.iterrows()):
    print(f"\nProcessing file {file_idx + 1}/{len(files_to_process)}: {file_row['file_id']}")
    print("-" * 40)

    for treatment_idx, treatment_type in enumerate(treatment_types):
        current_operation += 1
        print(f"  Treatment {treatment_idx + 1}/{len(treatment_types)}: {treatment_type}")

        try:
            # Create request body: the whole input row plus the treatment type
            request_body = file_row.to_dict()
            request_body['treatment_type'] = treatment_type

            # Add required fields if not present
            request_body.setdefault('name', 'Test User')
            request_body.setdefault('email', 'test@example.com')
            request_body.setdefault('phone', '123-456-7890')

            # Send request
            response = requests.post(
                test_url,
                json=request_body,
                auth=authorization,
                timeout=request_timeout,
            )

            if response.status_code == 200:
                pdf_link, download_link, google_drive_id = _extract_links(response.json())
                result_record = _build_result_record(
                    file_row,
                    treatment_type,
                    'success',
                    response.status_code,
                    pdf_link=pdf_link,
                    download_link=download_link,
                    google_drive_id=google_drive_id,
                )
                print(f"    ✓ Success: {pdf_link}")
            else:
                print(f"    ✗ Failed: HTTP {response.status_code}")
                result_record = _build_result_record(
                    file_row,
                    treatment_type,
                    'failed',
                    response.status_code,
                    error_message=response.text,
                )

        except Exception as e:
            # Deliberately broad: this is the per-request boundary, and any
            # failure (network, timeout, bad JSON) is recorded rather than raised.
            print(f"    ✗ Error: {e}")
            result_record = _build_result_record(
                file_row,
                treatment_type,
                'error',
                '',
                error_message=str(e),
            )

        new_results.append(result_record)

        # Progress update
        print(f"    Progress: {current_operation}/{total_operations} ({current_operation/total_operations*100:.1f}%)")

print("\n" + "=" * 60)
print(f"Processing completed! Generated {len(new_results)} new results.")

In [None]:
# Combine existing and new results
all_results = existing_results + new_results
results_df = pd.DataFrame(all_results)

preview_columns = ['timestamp', 'file_id', 'treatment_type', 'status', 'pdf_link']

if results_df.empty:
    # With no records there is no 'status' column to summarise, and writing the
    # empty frame would leave a CSV that pd.read_csv cannot parse on the next run.
    print(f"No results to save; {output_csv} was not written.")
    results_preview = results_df
else:
    # Save to CSV (the file is rewritten with the old and the new records)
    results_df.to_csv(output_csv, index=False)
    print(f"Results saved to {output_csv}")
    print(f"Total records: {len(results_df)}")
    print(f"New records added: {len(new_results)}")

    # Display summary: count each status once instead of filtering three times
    status_counts = results_df['status'].value_counts()
    print("\nSummary:")
    print(f"- Success: {status_counts.get('success', 0)}")
    print(f"- Failed: {status_counts.get('failed', 0)}")
    print(f"- Errors: {status_counts.get('error', 0)}")

    # Show first few results
    print("\nFirst few results:")
    results_preview = results_df[preview_columns].head(10)

# Last expression of the cell, so the notebook renders it as the cell output
results_preview

In [None]:
# Optional: print the per-treatment outcome for the first file handled in this run.
# The rows are taken from results_df, matched on that file's file_id.
if new_results:
    print("Detailed results for the first processed file:")
    first_file_id = new_results[0]['file_id']
    matches_first_file = results_df['file_id'] == first_file_id
    file_results = results_df[matches_first_file]

    for _, result_row in file_results.iterrows():
        print(f"\nTreatment: {result_row['treatment_type']}")
        print(f"Status: {result_row['status']}")
        if result_row['status'] != 'success':
            # Any non-success row reports its stored error message
            print(f"Error: {result_row.get('error_message', 'Unknown error')}")
        else:
            print(f"PDF Link: {result_row['pdf_link']}")