In [None]:
import openai
import pandas as pd
import time
import os

# Read the API key from the environment instead of embedding it in the
# notebook, so the secret is never stored in saved or shared copies of this
# file. If OPENAI_API_KEY is unset this passes None, and the OpenAI client is
# expected to report the missing credential itself.
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Load the first two rows of the validation set; they are embedded in the
# prompt below as worked examples (columns Q1, Q2 and Q3 are read).
val_df = pd.read_csv("/Users/arashalborz/Desktop/Data/val_data.csv")
example1 = val_df.iloc[0]
example2 = val_df.iloc[1]

# Interview questions + prompt with real examples
prompt = f"""
You are generating synthetic job interview responses to the following 3 questions:

Q1. Please describe a situation where you were presented with a problem outside of your comfort zone and where you were able to come up with a creative solution.
Q2. Tell us about a time when you have failed or made a mistake. What happened? What did you learn from this experience?
Q3. Describe a situation in which you got a group of people to work together as a team. Did you encounter any issues? What was the end result?

Here are two examples:

Example 1:
Q1,"{example1['Q1']}"
Q2,"{example1['Q2']}"
Q3,"{example1['Q3']}"

Example 2:
Q1,"{example2['Q1']}"
Q2,"{example2['Q2']}"
Q3,"{example2['Q3']}"

Now generate a new set of answers in the same CSV format:
Q1,"..."
Q2,"..."
Q3,"..."
"""

# Generate synthetic data
NUM_SYNTHETIC_SAMPLES = 5
synthetic_rows = []


def _parse_answers(content):
    """Extract the Q1/Q2/Q3 answers from one model reply.

    The reply is expected to contain lines of the form ``Q1,"answer"``.
    Answers are located by their label rather than by line position, so blank
    lines, Markdown code fences and text before the first label are ignored,
    and an answer that wraps onto following lines is joined back together
    with single spaces.

    Args:
        content: Raw text of the model reply (may be ``None`` or empty).

    Returns:
        A dict with keys ``"Q1"``, ``"Q2"`` and ``"Q3"`` mapping to the answer
        text with surrounding whitespace and double quotes removed.

    Raises:
        ValueError: If the reply is empty or any of the three labels is
            missing.
    """
    if not content:
        raise ValueError("model returned an empty response")

    labels = ("Q1", "Q2", "Q3")
    answers = {}
    current = None
    for raw_line in content.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("```"):
            continue
        label, sep, rest = line.partition(",")
        label = label.strip()
        if sep and label in labels:
            current = label
            answers[current] = rest.strip()
        elif current is not None:
            # Continuation of a multi-line answer.
            answers[current] = f"{answers[current]} {line}".strip()

    missing = [label for label in labels if label not in answers]
    if missing:
        raise ValueError(f"response is missing answers for: {', '.join(missing)}")
    return {label: answers[label].strip('"') for label in labels}


for i in range(NUM_SYNTHETIC_SAMPLES):
    print(f"🔄 Generating sample {i+1}/{NUM_SYNTHETIC_SAMPLES}...")
    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.8,
        )
        synthetic_rows.append(_parse_answers(response.choices[0].message.content))
    except Exception as e:
        # Best effort: a failed request or an unparseable reply skips this
        # sample instead of aborting the whole run.
        print(f"Error: {e}")
    # Pause between requests; there is nothing to wait for after the last one.
    if i < NUM_SYNTHETIC_SAMPLES - 1:
        time.sleep(1.2)

if len(synthetic_rows) < NUM_SYNTHETIC_SAMPLES:
    print(f"Warning: only {len(synthetic_rows)}/{NUM_SYNTHETIC_SAMPLES} samples were generated")

# Explicit columns keep the Q1/Q2/Q3 header even when every sample failed.
output_df = pd.DataFrame(synthetic_rows, columns=["Q1", "Q2", "Q3"])
output_df.to_csv("synthetic_val_data.csv", index=True)
print("Saved to synthetic_val_data.csv")

🔄 Generating sample 1/5...
🔄 Generating sample 2/5...
🔄 Generating sample 3/5...
🔄 Generating sample 4/5...
🔄 Generating sample 5/5...
✅ Saved to synthetic_val_data.csv


In [None]:
import openai
import pandas as pd
import time
import os
import tiktoken
from datetime import datetime

# Read the API key from the environment instead of embedding it in the
# notebook, so the secret is never stored in saved or shared copies of this
# file. If OPENAI_API_KEY is unset this passes None, and the OpenAI client is
# expected to report the missing credential itself.
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Tokenizer matching the model used below; used for token-count estimates.
enc = tiktoken.encoding_for_model("gpt-4")

# Load the first two rows of the validation set; they are embedded in the
# prompt below as worked examples (columns Q1, Q2 and Q3 are read).
val_df = pd.read_csv("/Users/arashalborz/Desktop/Data/val_data.csv")
example1 = val_df.iloc[0]
example2 = val_df.iloc[1]

# Interview questions followed by the two real examples and the output format.
prompt = f"""
You are generating synthetic job interview responses to the following 3 questions:

Q1. Please describe a situation where you were presented with a problem outside of your comfort zone and where you were able to come up with a creative solution.
Q2. Tell us about a time when you have failed or made a mistake. What happened? What did you learn from this experience?
Q3. Describe a situation in which you got a group of people to work together as a team. Did you encounter any issues? What was the end result?

Here are two examples of answers to these questions:

Example 1:
Q1,"{example1['Q1']}"
Q2,"{example1['Q2']}"
Q3,"{example1['Q3']}"

Example 2:
Q1,"{example2['Q1']}"
Q2,"{example2['Q2']}"
Q3,"{example2['Q3']}"

Now generate a new set of answers in the same CSV format:
Q1,"..."
Q2,"..."
Q3,"..."
"""

NUM_SYNTHETIC_SAMPLES = 35
synthetic_rows = []
token_stats = []


def _parse_answers(content):
    """Extract the Q1/Q2/Q3 answers from one model reply.

    The reply is expected to contain lines of the form ``Q1,"answer"``.
    Answers are located by their label rather than by line position, so blank
    lines, Markdown code fences and text before the first label are ignored,
    and an answer that wraps onto following lines is joined back together
    with single spaces.

    Args:
        content: Raw text of the model reply (may be ``None`` or empty).

    Returns:
        A dict with keys ``"Q1"``, ``"Q2"`` and ``"Q3"`` mapping to the answer
        text with surrounding whitespace and double quotes removed.

    Raises:
        ValueError: If the reply is empty or any of the three labels is
            missing.
    """
    if not content:
        raise ValueError("model returned an empty response")

    labels = ("Q1", "Q2", "Q3")
    answers = {}
    current = None
    for raw_line in content.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("```"):
            continue
        label, sep, rest = line.partition(",")
        label = label.strip()
        if sep and label in labels:
            current = label
            answers[current] = rest.strip()
        elif current is not None:
            # Continuation of a multi-line answer.
            answers[current] = f"{answers[current]} {line}".strip()

    missing = [label for label in labels if label not in answers]
    if missing:
        raise ValueError(f"response is missing answers for: {', '.join(missing)}")
    return {label: answers[label].strip('"') for label in labels}


# The prompt never changes, so its tiktoken estimate is computed once. It is
# only used when the API response carries no usage information.
estimated_prompt_tokens = len(enc.encode(prompt))

start_time = time.time()

for i in range(NUM_SYNTHETIC_SAMPLES):
    print(f"Generating sample {i+1}/{NUM_SYNTHETIC_SAMPLES}...")
    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.8,
        )
        content = response.choices[0].message.content

        # Prefer the token counts reported by the API: encoding the raw
        # prompt text locally leaves out the chat-message framing tokens.
        usage = response.usage
        if usage is not None:
            input_tokens = usage.prompt_tokens
            output_tokens = usage.completion_tokens
        else:
            input_tokens = estimated_prompt_tokens
            output_tokens = len(enc.encode(content or ""))
        # Recorded before parsing so tokens spent on unparseable replies are
        # still counted.
        token_stats.append({"input_tokens": input_tokens, "output_tokens": output_tokens})

        synthetic_rows.append(_parse_answers(content))

    except Exception as e:
        # Best effort: a failed request or an unparseable reply skips this
        # sample instead of aborting the whole run.
        print(f"Error: {e}")
    # Pause between requests; there is nothing to wait for after the last one.
    if i < NUM_SYNTHETIC_SAMPLES - 1:
        time.sleep(1.2)

if len(synthetic_rows) < NUM_SYNTHETIC_SAMPLES:
    print(f"Warning: only {len(synthetic_rows)}/{NUM_SYNTHETIC_SAMPLES} samples were generated")

# Explicit columns keep the headers even when every sample failed.
output_df = pd.DataFrame(synthetic_rows, columns=["Q1", "Q2", "Q3"])
output_df.to_csv("synthetic_val_data.csv", index=True)

token_df = pd.DataFrame(token_stats, columns=["input_tokens", "output_tokens"])
token_df.to_csv("synthetic_val_token_usage.csv", index=True)

end_time = time.time()
elapsed = end_time - start_time
total_input = sum(row["input_tokens"] for row in token_stats)
total_output = sum(row["output_tokens"] for row in token_stats)

print("Saved to synthetic_val_data.csv")
print("Token usage saved to synthetic_val_token_usage.csv")
print(f"Total time: {elapsed:.2f} seconds")
print(f"Total input tokens: {total_input}")
print(f"Total output tokens: {total_output}")

Generating sample 1/35...
Generating sample 2/35...
Generating sample 3/35...
Generating sample 4/35...
Generating sample 5/35...
Generating sample 6/35...
Generating sample 7/35...
Generating sample 8/35...
Generating sample 9/35...
Generating sample 10/35...
Generating sample 11/35...
Generating sample 12/35...
Generating sample 13/35...
Generating sample 14/35...
Generating sample 15/35...
Generating sample 16/35...
Generating sample 17/35...
Generating sample 18/35...
Generating sample 19/35...
Generating sample 20/35...
Generating sample 21/35...
Generating sample 22/35...
Generating sample 23/35...
Generating sample 24/35...
Generating sample 25/35...
Generating sample 26/35...
Generating sample 27/35...
Generating sample 28/35...
Generating sample 29/35...
Generating sample 30/35...
Generating sample 31/35...
Generating sample 32/35...
Generating sample 33/35...
Generating sample 34/35...
Generating sample 35/35...
Saved to synthetic_val_data.csv
Token usage saved to synthetic_v