# PDF Generation Workflow - Cleaned Version

This notebook processes resume-job matches and generates PDFs for different treatment types.

## Features:
- Configurable number of files to process
- Loops through all treatment types (control, Type_I, Type_II, Type_III)
- Saves output PDF links to CSV with all initial data
- Appends new records when running again

## Configuration:
- Set `num_files_to_process` to control how many files to process
- Set `test_url` and `authorization` for the API endpoint
- Results are saved to `pdf_generation_results.csv`

In [2]:
import requests
import json
import pandas as pd
import os
from datetime import datetime
import time  # Add this line

In [3]:
# Configuration
num_files_to_process = 5  # Set this to control how many files to process
test_url = "https://prayag-is-dummy.app.n8n.cloud/webhook/9eb0c4bc-f2a4-4f23-bb71-26422deedf55"
authorization = ("prayag_purohit", "Resumeaudit")
output_csv = "pdf_generation_results.csv"

# Treatment types to process
treatment_types = ['control', 'Type_I', 'Type_II', 'Type_III']

print(f"Configuration:")
print(f"- Files to process: {num_files_to_process}")
print(f"- Treatment types: {treatment_types}")
print(f"- Output CSV: {output_csv}")

Configuration:
- Files to process: 5
- Treatment types: ['control', 'Type_I', 'Type_II', 'Type_III']
- Output CSV: pdf_generation_results.csv


In [4]:
# Load the job matches data
file_path = "Resume_study.resume_job_matches_filtered.csv"
job_matches_df = pd.read_csv(file_path)

# Rename columns to match our endpoint requirements
job_matches_df.rename(
    columns={
        'description': 'job_description',
        'tile': 'job_title'
    }, inplace=True)

print(f"Loaded {len(job_matches_df)} job matches")
print(f"Columns: {list(job_matches_df.columns)}")
job_matches_df.head()

Loaded 1226 job matches
Columns: ['_id', 'job_posting_id', 'title', 'job_description', 'file_id', 'key_metrics.basics.likely_home_country', 'match_score']


Unnamed: 0,_id,job_posting_id,title,job_description,file_id,key_metrics.basics.likely_home_country,match_score
0,689ce96155b9b4e9132a214a,68866fd668d6a4c9cb19f9a2,Technical Business Analyst,Requisition ID: 229966 \n\n \n\nJoin a purpo...,ITC resume 17.pdf,India,95
1,68a29ca54105e44264b851f7,689d5acce78d625301071376,Database Developer (Software Developer),Job Description\nDatabase Developer\nThis is a...,ITC resume 20.pdf,India,90
2,68a29cf6ff6560afd37a01a3,689d5acce78d62530107137b,"Application Developer, D365 Finance & Operations",Sporting Life Group is a proudly Canadian fami...,ITC resume 14.pdf,Saudi Arabia,75
3,68a29cf7ff6560afd37a01a4,689d5acce78d62530107137c,Backend Developer (Python),**Please note before applying:** \n\n* We’re ...,ITC resume 18.pdf,Eritrea,92
4,68a29d01ff6560afd37a01a6,689d5acce78d62530107137d,Full Stack Developer,**Please note before applying:** \n\n* We’re ...,ITC resume 18.pdf,Eritrea,90


In [5]:
# Get unique files to process
unique_files_df = job_matches_df.drop_duplicates(subset='file_id')
print(f"Total unique files: {len(unique_files_df)}")

# Limit to the number of files specified in configuration
files_to_process = unique_files_df.head(num_files_to_process)
print(f"Files to process:")
for idx, row in files_to_process.iterrows():
    print(f"  {row['file_id']}")

Total unique files: 18
Files to process:
  ITC resume 17.pdf
  ITC resume 20.pdf
  ITC resume 14.pdf
  ITC resume 18.pdf
  ITC resume 09.pdf


In [6]:
# Load existing results if available
existing_results = []
if os.path.exists(output_csv):
    existing_results = pd.read_csv(output_csv).to_dict('records')
    print(f"Loaded {len(existing_results)} existing results from {output_csv}")
else:
    print(f"No existing results found. Will create new {output_csv}")

# Initialize results list
new_results = []

No existing results found. Will create new pdf_generation_results.csv


In [7]:
# Process each file with each treatment type
total_operations = len(files_to_process) * len(treatment_types)
current_operation = 0
stop_processing = False

print(f"Starting processing of {total_operations} operations...")
print("=" * 60)

for file_idx, (_, file_row) in enumerate(files_to_process.iterrows()):
    if stop_processing:
        break
    
    print(f"\nProcessing file {file_idx + 1}/{len(files_to_process)}: {file_row['file_id']}")
    print("-" * 40)
    
    for treatment_idx, treatment_type in enumerate(treatment_types):
        if stop_processing:
            break
            
        current_operation += 1
        print(f"  Treatment {treatment_idx + 1}/{len(treatment_types)}: {treatment_type}")
        
        try:
            # Create request body
            request_body = file_row.to_dict()
            request_body['treatment_type'] = treatment_type
            
            # Add required fields if not present
            if 'name' not in request_body:
                request_body['name'] = 'Test User'
            if 'email' not in request_body:
                request_body['email'] = 'test@example.com'
            if 'phone' not in request_body:
                request_body['phone'] = '123-456-7890'
            
            # Send request
            print(f"    📤 Sending request for {treatment_type}...")
            response = requests.post(test_url, json=request_body, auth=authorization)
            
            if response.status_code == 200:
                response_data = response.json()
                
                # Extract PDF link
                if 'webViewLink' in response_data:
                    pdf_link = response_data['webViewLink']
                    download_link = response_data.get('webContentLink', '')
                    file_id = response_data.get('id', '')
                else:
                    # Fallback for older API response format
                    pdf_link = response_data.get('download_url', '')
                    download_link = ''
                    file_id = ''
                
                # Create result record
                result_record = {
                    'timestamp': datetime.now().isoformat(),
                    'file_id': file_row['file_id'],
                    'treatment_type': treatment_type,
                    'pdf_link': pdf_link,
                    'download_link': download_link,
                    'google_drive_id': file_id,
                    'status': 'success',
                    'response_status': response.status_code
                }
                
                # Add all original data from the file row
                for col, value in file_row.items():
                    if col not in result_record:
                        result_record[col] = value
                
                new_results.append(result_record)
                print(f"    ✓ Success: {pdf_link}")
                
            else:
                print(f"    ✗ Failed: HTTP {response.status_code}")
                
                # Check if it's a 404 error and stop processing
                if response.status_code == 404:
                    print(f"    �� 404 Error detected. Stopping all processing.")
                    print(f"    Last processed: File {file_row['file_id']}, Treatment {treatment_type}")
                    
                    # Create error record for the failed operation
                    result_record = {
                        'timestamp': datetime.now().isoformat(),
                        'file_id': file_row['file_id'],
                        'treatment_type': treatment_type,
                        'pdf_link': '',
                        'download_link': '',
                        'google_drive_id': '',
                        'status': 'failed_404_stopped',
                        'response_status': response.status_code,
                        'error_message': response.text
                    }
                    
                    # Add all original data from the file row
                    for col, value in file_row.items():
                        if col not in result_record:
                            result_record[col] = value
                    
                    new_results.append(result_record)
                    
                    # Set flag to stop processing and break out of inner loop
                    stop_processing = True
                    break
                
                # Create error record for other HTTP errors
                result_record = {
                    'timestamp': datetime.now().isoformat(),
                    'file_id': file_row['file_id'],
                    'treatment_type': treatment_type,
                    'pdf_link': '',
                    'download_link': '',
                    'google_drive_id': '',
                    'status': 'failed',
                    'response_status': response.status_code,
                    'error_message': response.text
                }
                
                # Add all original data from the file row
                for col, value in file_row.items():
                    if col not in result_record:
                        result_record[col] = value
                
                new_results.append(result_record)
                
        except Exception as e:
            print(f"    ✗ Error: {str(e)}")
            
            # Create error record
            result_record = {
                'timestamp': datetime.now().isoformat(),
                'file_id': file_row['file_id'],
                'treatment_type': treatment_type,
                'pdf_link': '',
                'download_link': '',
                'google_drive_id': '',
                'status': 'error',
                'response_status': '',
                'error_message': str(e)
            }
            
            # Add all original data from the file row
            for col, value in file_row.items():
                if col not in result_record:
                    result_record[col] = value
            
            new_results.append(result_record)
        
        # Progress update
        print(f"    Progress: {current_operation}/{total_operations} ({current_operation/total_operations*100:.1f}%)")
        
        # Add delay between requests to avoid overwhelming n8n
        if not stop_processing:
            print(f"    ⏳ Waiting 3 seconds before next request...")
            time.sleep(3)  # Wait 3 seconds between requests
            print(f"    ▶️ Continuing to next request...")

print("\n" + "=" * 60)
print(f"Processing completed! Generated {len(new_results)} new results.")

Starting processing of 20 operations...

Processing file 1/5: ITC resume 17.pdf
----------------------------------------
  Treatment 1/4: control
    📤 Sending request for control...
    ✓ Success: https://drive.google.com/file/d/10ViuPPxuSPlJW5iDHc7fW6okTNQpr-q1/view?usp=drivesdk
    Progress: 1/20 (5.0%)
    ⏳ Waiting 3 seconds before next request...
    ▶️ Continuing to next request...
  Treatment 2/4: Type_I
    📤 Sending request for Type_I...
    ✓ Success: https://drive.google.com/file/d/17Kauj7gr9FMsKvQtCJrda34avGJI1SM_/view?usp=drivesdk
    Progress: 2/20 (10.0%)
    ⏳ Waiting 3 seconds before next request...
    ▶️ Continuing to next request...
  Treatment 3/4: Type_II
    📤 Sending request for Type_II...
    ✓ Success: https://drive.google.com/file/d/1QaIJkJtlyUnKMd3b0wZbojqRFk1bU8mR/view?usp=drivesdk
    Progress: 3/20 (15.0%)
    ⏳ Waiting 3 seconds before next request...
    ▶️ Continuing to next request...
  Treatment 4/4: Type_III
    📤 Sending request for Type_III...
  

In [8]:
# Combine existing and new results
all_results = existing_results + new_results
results_df = pd.DataFrame(all_results)

# Save to CSV
results_df.to_csv(output_csv, index=False)
print(f"Results saved to {output_csv}")
print(f"Total records: {len(results_df)}")
print(f"New records added: {len(new_results)}")

# Display summary
print("\nSummary:")
print(f"- Success: {len(results_df[results_df['status'] == 'success'])}")
print(f"- Failed: {len(results_df[results_df['status'] == 'failed'])}")
print(f"- Errors: {len(results_df[results_df['status'] == 'error'])}")

# Show first few results
print("\nFirst few results:")
results_df[['timestamp', 'file_id', 'treatment_type', 'status', 'pdf_link']].head(10)

Results saved to pdf_generation_results.csv
Total records: 20
New records added: 20

Summary:
- Success: 20
- Failed: 0
- Errors: 0

First few results:


Unnamed: 0,timestamp,file_id,treatment_type,status,pdf_link
0,2025-08-21T16:04:46.491375,ITC resume 17.pdf,control,success,https://drive.google.com/file/d/10ViuPPxuSPlJW...
1,2025-08-21T16:05:34.696647,ITC resume 17.pdf,Type_I,success,https://drive.google.com/file/d/17Kauj7gr9FMsK...
2,2025-08-21T16:06:28.159364,ITC resume 17.pdf,Type_II,success,https://drive.google.com/file/d/1QaIJkJtlyUnKM...
3,2025-08-21T16:07:23.891111,ITC resume 17.pdf,Type_III,success,https://drive.google.com/file/d/18fqIcoDfwJtdf...
4,2025-08-21T16:07:57.974356,ITC resume 20.pdf,control,success,https://drive.google.com/file/d/1htQ7xeqqF55UZ...
5,2025-08-21T16:08:37.520049,ITC resume 20.pdf,Type_I,success,https://drive.google.com/file/d/1Sj5xheRcx7tiB...
6,2025-08-21T16:09:10.207873,ITC resume 20.pdf,Type_II,success,https://drive.google.com/file/d/1N_k-6EjZeS69_...
7,2025-08-21T16:09:50.551174,ITC resume 20.pdf,Type_III,success,https://drive.google.com/file/d/17NsJuWDKhz72H...
8,2025-08-21T16:10:42.449092,ITC resume 14.pdf,control,success,https://drive.google.com/file/d/1pMgRmrNGNYecA...
9,2025-08-21T16:11:31.783429,ITC resume 14.pdf,Type_I,success,https://drive.google.com/file/d/11uswsbbAnXnrX...


In [9]:
# Optional: Display detailed results for a specific file
if len(new_results) > 0:
    print("Detailed results for the first processed file:")
    first_file_id = new_results[0]['file_id']
    file_results = results_df[results_df['file_id'] == first_file_id]
    
    for _, row in file_results.iterrows():
        print(f"\nTreatment: {row['treatment_type']}")
        print(f"Status: {row['status']}")
        if row['status'] == 'success':
            print(f"PDF Link: {row['pdf_link']}")
        else:
            print(f"Error: {row.get('error_message', 'Unknown error')}")

Detailed results for the first processed file:

Treatment: control
Status: success
PDF Link: https://drive.google.com/file/d/10ViuPPxuSPlJW5iDHc7fW6okTNQpr-q1/view?usp=drivesdk

Treatment: Type_I
Status: success
PDF Link: https://drive.google.com/file/d/17Kauj7gr9FMsKvQtCJrda34avGJI1SM_/view?usp=drivesdk

Treatment: Type_II
Status: success
PDF Link: https://drive.google.com/file/d/1QaIJkJtlyUnKMd3b0wZbojqRFk1bU8mR/view?usp=drivesdk

Treatment: Type_III
Status: success
PDF Link: https://drive.google.com/file/d/18fqIcoDfwJtdfKd2FTfOhHLn2DWIKDPN/view?usp=drivesdk
