## Sampling the resume data

In [3]:
# File paths for the sampling step: the raw resume dump that is read and the
# smaller sample file that is written.
input_file = "resume_samples/resume_samples.txt"  # Replace with the path to your input file
output_file = "first_600_resumes.txt"  # Path for the output file

def extract_first_600_lines(input_file, output_file, n_lines=600):
    """Copy the first ``n_lines`` lines of ``input_file`` into ``output_file``.

    Both files are opened as latin-1 text. Only the requested number of lines
    is read, so the size of the input file does not affect memory use.

    Args:
        input_file: Path of the text file to sample from.
        output_file: Path of the file to (over)write with the sampled lines.
        n_lines: Maximum number of leading lines to copy (default 600).

    Returns:
        None. Success and failure are both reported with ``print``; errors are
        not re-raised.
    """
    from itertools import islice  # local import keeps this cell self-contained

    try:
        # Stop reading as soon as n_lines lines have been collected
        with open(input_file, 'r', encoding='latin-1') as infile:
            lines = list(islice(infile, n_lines))

        with open(output_file, 'w', encoding='latin-1') as outfile:
            outfile.writelines(lines)

        print(f"Successfully extracted the first {n_lines} lines to {output_file}")
    except FileNotFoundError as e:
        # Either path can be the missing one (e.g. the output directory does
        # not exist), so report the path the OS actually complained about.
        missing_path = e.filename if e.filename is not None else input_file
        print(f"Error: File {missing_path} not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Run the sampling step with the input_file/output_file paths defined above
extract_first_600_lines(input_file, output_file)


Successfully extracted the first 600 lines to first_600_resumes.txt


## Resume data storage processing

In [4]:
import pandas as pd

# File paths for the CSV conversion step.
# NOTE: this rebinds the input_file/output_file names used by the sampling cell.
input_file = "first_600_resumes.txt"  # Path to your input text file
output_file = "resumes_output.csv"  # Path to save the output CSV

def process_resumes(input_file, output_file):
    """Convert a ':::'-delimited resume text file into a two-column CSV.

    Every line of ``input_file`` (read as latin-1) is stripped and split on
    ":::". Lines with fewer than three fields are skipped; for the rest, the
    third field is kept together with the 1-based line number as ``id``. The
    result is written to ``output_file`` as latin-1 CSV without the index.

    Outcome is reported via ``print``; exceptions are caught, not re-raised.
    """
    try:
        with open(input_file, 'r', encoding='latin-1') as handle:
            # Pair each line's 1-based number with its ':::'-separated fields
            numbered_fields = (
                (line_no, raw_line.strip().split(":::"))
                for line_no, raw_line in enumerate(handle, start=1)
            )
            # Keep only well-formed lines; the third field is the resume text
            records = [
                (line_no, fields[2])
                for line_no, fields in numbered_fields
                if len(fields) >= 3
            ]

        resumes = pd.DataFrame(records, columns=["id", "resume_content"])
        resumes.to_csv(output_file, index=False, encoding='latin-1')
        print(f"Successfully processed and saved to {output_file}")

    except FileNotFoundError:
        print(f"Error: File {input_file} not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Convert the sampled text file into resumes_output.csv using the paths defined above
process_resumes(input_file, output_file)


Successfully processed and saved to resumes_output.csv


In [1]:
import pandas as pd
# Reload the CSV written by process_resumes and show the first resume's raw text
df = pd.read_csv("resumes_output.csv", encoding='latin-1')
df.loc[0, "resume_content"]

'Database Administrator <span class="hl">Database</span> <span class="hl">Administrator</span> Database Administrator - Family Private Care LLC Lawrenceville, GA A self-motivated Production SQL Server Database Administrator who possesses\xa0 strong analytical and problem solving skills. My experience includes SQL Server\xa0 2005, 2008 and 2012, 2014, SSIS, as well as clustering, mirroring, and high\xa0 availability solutions in OLTP environments. I am proficient in database backup,\xa0 recovery, performance tuning, maintenance tasks, security, and consolidation.\xa0 I am confident that I would make a beneficial addition to any company. Over the\xa0 course of my career thus far, I have designed databases to fit a variety of needs,\xa0 successfully ensured the security of those databases, problem-solved in order to meet\xa0 both back-end and front-end needs, installed and tested new versions database\xa0 management systems, customized and installed applications and meticulously\xa0 monit

In [3]:
import pandas as pd
import re

# Define a function to clean the content
def clean_resume(content):
    """Strip the highlighted title prefix from one resume and normalise spacing.

    Everything up to and including the last ``</span>`` is dropped (the text
    is left untouched when no such tag exists), every non-breaking space
    (``\\xa0``) is turned into a newline, and surrounding whitespace is removed.

    Args:
        content: Resume text. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell) are returned unchanged
            instead of raising AttributeError.

    Returns:
        The cleaned string, or ``content`` itself when it is not a string.
    """
    if not isinstance(content, str):
        return content

    # rpartition yields the text after the LAST '</span>', or the whole
    # string when the tag is absent.
    content = content.rpartition('</span>')[2]
    # Replace all occurrences of '\xa0' with '\n'
    content = content.replace('\xa0', '\n')
    return content.strip()

# Clean every entry of the 'resume_content' column in place, one value at a time
df["resume_content"] = df["resume_content"].map(clean_resume)


In [1]:
# Save first, then preview: only the LAST expression of a cell is displayed,
# so the preview must be the final line to render.
# This cell needs `df` from the cleaning cells above; run them first in a fresh kernel.
df.to_csv("cleaned_resumes.csv", index=False, encoding='latin-1')
df["resume_content"][0]

NameError: name 'df' is not defined

### Data Preprocessing

1. Randomly generate personal information, including name, education, etc.
2. Break down the resume data into different sections.

In [1]:
import pandas as pd
import openai

# Secrets must not live in the notebook: read the API key from the environment.
# Any key that was ever hard-coded in this file is exposed and should be revoked.
import os

openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this notebook.")



In [None]:
# Load the cleaned resumes (latin-1, matching the encoding used when the file was written)
df = pd.read_csv("cleaned_resumes.csv", encoding='latin-1')

In [49]:
# A list of top universities can be embedded in the prompt or provided as context.
# For simplicity, we can just instruct the model to pick from top universities 
# (the model knows many top universities: Harvard, Stanford, MIT, Oxford, Cambridge, etc.)
# If you want more control, you can embed a list of top universities in the prompt.

def generate_personal_info_only(resume_text):
    """Ask the chat model to invent personal details for one resume.

    Prints the first 50 characters of ``resume_text`` as a progress marker,
    embeds the full text in a fixed prompt, sends it through
    ``openai.ChatCompletion.create`` (model ``gpt-4o-mini``, temperature 0.7,
    ``max_tokens=1000``) and returns the first choice's message content with
    surrounding whitespace stripped.

    Args:
        resume_text: Resume text used as context. It is sliced and
            interpolated into the prompt, so presumably a ``str`` — TODO
            confirm that no NaN rows reach this function.

    Returns:
        The model's reply text. The prompt asks for Name / Email / Location /
        Education lines, but the reply is returned without validation.

    NOTE(review): ``openai.ChatCompletion`` looks like the pre-1.0
    openai-python interface — confirm the installed client version.
    """
    # Progress marker: show which resume is being processed
    print(resume_text[:50])
    # The prompt text is sent to the model verbatim; edit it with care
    prompt = f"""
You are an assistant who is helping to add realistic personal details to a professional resume that currently lacks personal information. The resume content below describes a professional (e.g., a database administrator). You need to randomly generate:

1. A plausible full name, which can come from any race, ethnic group, or nationality. The name should be realistic and not identical to any real famous figure. For example, something like "Arjun Patel" or "Maria Nguyen" or "Kai Matsumoto".

2. A plausible educational background that matches the candidate's experience timeline and professional level. The background should make sense. For example, if the resume suggests about 5 years of working experience up until 2018 as a database administrator, the candidate likely completed their undergraduate degree at least 7-10 years prior (e.g., completed a Bachelor's degree around 2010-2012 if that fits). If they are highly experienced (10+ years), they might have completed education even earlier. The degree should align with their field (Computer Science, Information Technology, Software Engineering, etc.). Include:
   - The name of a top 200 globally ranked university (e.g., Stanford, MIT, Oxford, Cambridge, University of Toronto, ETH Zurich, National University of Singapore, etc.).
   - The degree (Bachelor's, Master's, or PhD) and the major.
   - Attendance years that align reasonably with their career timeline. For instance, if the resume indicates they've been working as a database admin since 2010, maybe they graduated with a Bachelor's around 2007-2009.
   - If they have extensive experience and advanced roles, consider they might have a Master's degree as well. But do not overcomplicate. If the candidate seems mid-level, a Bachelor's might suffice.

3. A realistic and professional-looking email address (e.g., Gmail, Outlook, or Yahoo), ensuring the email address is some randomly combinations of numbers and letters

4. A random city in the United States as their current location (e.g., Seattle, Chicago, Boston).

Make sure the personal info does not conflict with the resume content. Do not invent employers or personal addresses, just education background, email address and location.

Finally, output these personal details at the beginning, in a professional format, followed by a newline. 

Include a randomly generated full name. Do not include the resume content itself in your answer, only produce the personal details, formatted as follows: Name: [Full Name] Email: [Email Address] Location: [City, USA] Education: [Degree, Major, University, Attendance Years]

Use the resume text below as context.

Resume Content:
\"\"\"{resume_text}\"\"\"
"""

    # One chat-completion request per resume; any API error propagates to the caller
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7,
        max_tokens=1000
    )

    # Only the first choice is used
    return response.choices[0].message.content.strip()

def prepend_personal_info(row):
    """Return generated personal details followed by the row's resume text.

    The details come from ``generate_personal_info_only``; the two parts are
    joined locally with a blank line, so no second API request is made.

    Args:
        row: Mapping-like row exposing a 'resume_content' entry.
    """
    resume_body = row['resume_content']
    generated_header = generate_personal_info_only(resume_body)
    return "{}\n\n{}".format(generated_header, resume_body)

# Build 'resume_personal' row by row (prepend_personal_info makes one API request per row)
df['resume_personal'] = df.apply(prepend_personal_info, axis=1)

# Print the first ten generated entries as a sanity check
print(df['resume_personal'].head(10))

Database Administrator - Family Private Care LLC L
sql server database administrator Houston, TX DATA
Oracle Database Administrator - Cognizant hyderaba
and ETL Developer, Business Intelligence ETL Devel
Scrum Master Scrum Master Scrum Master Richmond, V
Oracle Database Administrator - SUNTRUST BANK Fort
Oracle Database Administrator - Caresource Cincinn
/Developer Lead Database Administrator/Developer -
Developer Database Administrator / Database Develo
Oracle Database Administrator - Uber Technologies 
Database Administrator - Booz Allen Hamilton Los A
Senior Oracle Database Administrator - United Heal
Database Administrator - First Tennessee Bank Work
Database Administrator Redmond, WA Work Experience
/SQL DBA SQL Database Administrator/SQL DBA A cert
DBA DBA DBA  Over 6 (Six) years of professional e
/Developer SQL Server Database Administrator/Devel
SQL Server Database Administrator - FEDERAL RESERV
SQL Developer / DBA SQL Developer / DBA SQL Develo
Systems Administrator - Automan

In [52]:
# Display the DataFrame to spot-check the generated 'resume_personal' column
df

Unnamed: 0,id,resume_content,resume_personal
0,1,Database Administrator - Family Private Care L...,Name: Priya Desai \nEmail: priya.desai1990@gm...
1,2,"sql server database administrator Houston, TX ...",Name: Amina Chen \nEmail: amina.chen1985@gmai...
2,3,Oracle Database Administrator - Cognizant hyde...,Name: Amir Khan \nEmail: amirk1234@gmail.com ...
3,4,"and ETL Developer, Business Intelligence ETL D...",Name: Daniel Kim \nEmail: daniel.kim1985@gmai...
4,5,Scrum Master Scrum Master Scrum Master Richmon...,Name: Amina Khan \nEmail: ak12345@gmail.com ...
...,...,...,...
595,596,Consultant Consultant Consultant - Infosys Tec...,Name: Priya Desai \nEmail: priya.desai1985@gm...
596,597,/Developer Database Administrator / Business I...,Name: Amina Torres \nEmail: amina.torres1987@...
597,598,Yoga Instructor Yoga Instructor Foreign Teache...,Name: Mia Chen \nEmail: miachen1987@gmail.com...
598,599,Administrator; Developer; System Analyst Datab...,Name: David Kim \nEmail: dkim1987@gmail.com ...


In [54]:
import pandas as pd

# prepend_personal_info already returns the personal details followed by the
# resume text, so 'resume_personal' can already end with 'resume_content'.
# Appending the content again would store the resume body twice, so it is
# only appended to rows that do not already end with it.
already_combined = pd.Series(
    [
        str(personal).endswith(str(content))
        for personal, content in zip(df['resume_personal'], df['resume_content'])
    ],
    index=df.index,
    dtype=bool,
)
df['resume_combined'] = (
    df['resume_personal'] + "\n\n" + df['resume_content']
).where(~already_combined, df['resume_personal'])

# Save the updated DataFrame to a new CSV file
df.to_csv('updated_resumes_combined.csv', index=False)

print("New column 'resume_combined' has been added and saved to 'updated_resumes_combined.csv'.")


New column 'resume_combined' has been added and saved to 'updated_resumes_combined.csv'.


### Process the job description

Collected 558 job descriptions across five roles: data analyst, software engineer, data scientist, machine learning engineer, and data engineer.

In [55]:
import pandas as pd
import os

# List of input CSV files (one per job family)
file_list = ['jobs_da.csv', 'jobs_de.csv', 'jobs_ds.csv', 'jobs_mle.csv', 'jobs_sde.csv']

# Accumulates the "description" values from every file, in file_list order
descriptions = []

# NOTE: `df` is rebound on every iteration, replacing any DataFrame that was
# previously stored under that name.
for file in file_list:
    df = pd.read_csv(file)
    if 'description' not in df.columns:
        # Report the skip instead of silently dropping a whole file's rows
        print(f"Warning: '{file}' has no 'description' column; skipping it.")
        continue
    descriptions.extend(df['description'].tolist())

# Create a new DataFrame with the collected descriptions
output_df = pd.DataFrame({'description': descriptions})




The descriptions have been extracted and saved to 'jd_description.csv'.


In [62]:
# Save the collected descriptions (output_df from the previous cell) to a new CSV file.
# NOTE: this rebinds the output_file name used by earlier cells.
output_file = 'jd_description.csv'
output_df.to_csv(output_file, index=False)

print(f"The descriptions have been extracted and saved to '{output_file}'.")

The descriptions have been extracted and saved to 'jd_description.csv'.


### Extract job details for resume generation

In [8]:
import os
import openai
import pandas as pd
import json

# NOTE(review): no API key is assigned in this cell; openai.api_key is set in an earlier cell.

# Prompt template: the {job_description} placeholder is filled in with str.format
# by extract_job_details. The text is sent to the model verbatim.
JOB_DETAILS_EXTRACTOR = """ 
<task>
Identify the key details from a job description and company overview to create a structured JSON output. Focus on extracting the most crucial and concise information that would be most relevant for tailoring a resume to this specific job.
</task>

<job_description>
{job_description}
</job_description>

Note: The "keywords", "job_duties_and_responsibilities", and "required_qualifications" sections are particularly important for resume tailoring. Ensure these are as comprehensive and accurate as possible.  
"""

# Read the job descriptions CSV (the extraction cell reads its 'description' column)
df = pd.read_csv("jd_description.csv")

In [5]:
# Number of job descriptions processed so far (used only for progress output)
execution_count = 0


def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if it has one."""
    body = text.strip()
    if not body.startswith("```"):
        return body
    # Drop the opening fence line, including an optional language tag such as ```json
    _, _, body = body.partition("\n")
    body = body.rstrip()
    if body.endswith("```"):
        body = body[:-3]
    return body.strip()


def extract_job_details(description):
    """Extract structured job details from one job description via the chat API.

    Formats ``JOB_DETAILS_EXTRACTOR`` with ``description``, sends it through
    ``openai.ChatCompletion.create`` (model ``gpt-4``, temperature 0,
    ``max_tokens=1500``) and tries to parse the reply as JSON. A reply wrapped
    in a Markdown code fence is unwrapped before parsing, so fenced JSON is
    parsed rather than stored as raw text.

    Args:
        description: Job description text to analyse.

    Returns:
        The parsed JSON value when the reply is valid JSON; otherwise the
        model's reply exactly as received.

    Side effects:
        Increments the module-level ``execution_count`` and prints a progress line.
    """
    global execution_count
    execution_count += 1
    print(f"Processing job description {execution_count}...")

    # Format the prompt with the job description
    prompt = JOB_DETAILS_EXTRACTOR.format(job_description=description)

    # One chat-completion request per description; API errors propagate to the caller
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=1500,
        temperature=0
    )

    content = response.choices[0].message["content"]

    # A missing or non-text reply cannot be parsed; hand it back untouched
    if not isinstance(content, str):
        return content

    try:
        return json.loads(_strip_code_fence(content))
    except json.JSONDecodeError:
        # Not JSON even after unwrapping: keep the raw reply for inspection
        return content

# Run the extractor on every description (one API request per row) and keep the results
df["extracted_job_details"] = df["description"].map(extract_job_details)

# Write the updated DataFrame to a new CSV
#df.to_csv("output_jobs_with_details.csv", index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["extracted_job_details"] = new_df["description"].apply(extract_job_details)


### Merge the job description outputs into one file

In [None]:
import pandas as pd
import glob
import os

# Path where your CSV files are stored ("" means the current working directory)
csv_directory = ""

# All CSV files to merge. glob returns matches in arbitrary order, so sort
# them (lexicographically) to keep the merged row order reproducible.
csv_files = sorted(glob.glob(os.path.join(csv_directory, "output_jobs_with_details*.csv")))

# Fail with a clear message instead of pd.concat's "No objects to concatenate"
if not csv_files:
    raise FileNotFoundError(
        "No files matching 'output_jobs_with_details*.csv' were found to merge."
    )

# List to hold individual DataFrames
dfs = []

for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)

# Concatenate all DataFrames, renumbering the index from 0
merged_df = pd.concat(dfs, ignore_index=True)

# Write out to a single CSV file (its name does not match the pattern above,
# so re-running this cell will not merge the merged file into itself)
merged_df.to_csv("merged_output_jobs_with_details.csv", index=False)

print("Merged CSV saved as merged_output_jobs_with_details.csv")


### Create improved resume using GPT 