In [None]:
import os
import json
import pdfplumber
from dotenv import load_dotenv
from tqdm import tqdm  
import openai 


load_dotenv()



def process_with_chatgpt(text):
    openai.api_key = os.getenv("OPENAI_API_KEY")
    
    schema_example = json.dumps({
        "basics": {
            "name": "John Doe",
            "label": "Programmer",
            "image": "",
            "email": "john@gmail.com",
            "phone": "(912) 555-4321",
            "url": "https://johndoe.com",
            "summary": "A summary of John Doe…",
            "location": {
                "address": "2712 Broadway St",
                "postalCode": "CA 94115",
                "city": "San Francisco",
                "countryCode": "US",
                "region": "California"
            },
            "profiles": [ {
                "network": "Twitter",
                "username": "john",
                "url": "https://twitter.com/john"
            }]
        },
        "work": [ {
            "name": "Company",
            "position": "President",
            "url": "https://company.com",
            "startDate": "2013-01-01",
            "endDate": "2014-01-01",
            "summary": "Description…",
            "highlights": [
                "Started the company"
            ]
        }],
        "volunteer": [ {
            "organization": "Organization",
            "position": "Volunteer",
            "url": "https://organization.com/",
            "startDate": "2012-01-01",
            "endDate": "2013-01-01",
            "summary": "Description…",
            "highlights": [
                "Awarded 'Volunteer of the Month'"
            ]
        }],
        "education": [ {
            "institution": "University",
            "url": "https://institution.com/",
            "area": "Software Development",
            "studyType": "Bachelor",
            "startDate": "2011-01-01",
            "endDate": "2013-01-01",
            "score": "4.0",
            "courses": [
                "DB1101 - Basic SQL"
            ]
        }],
        "awards": [ {
            "title": "Award",
            "date": "2014-11-01",
            "awarder": "Company",
            "summary": "There is no spoon."
        }],
        "certificates": [ {
            "name": "Certificate",
            "date": "2021-11-07",
            "issuer": "Company",
            "url": "https://certificate.com"
        }],
        "publications": [ {
            "name": "Publication",
            "publisher": "Company",
            "releaseDate": "2014-10-01",
            "url": "https://publication.com",
            "summary": "Description…"
        }],
        "skills": [ {
            "name": "Web Development",
            "level": "Master",
            "keywords": [
                "HTML",
                "CSS",
                "JavaScript"
            ]
        }],
        "languages": [ {
            "language": "English",
            "fluency": "Native speaker"
        }],
        "interests": [ {
            "name": "Wildlife",
            "keywords": [
                "Ferrets",
                "Unicorns"
            ]
        }],
        "references": [ {
            "name": "Jane Doe",
            "reference": "Reference…"
        }],
        "projects": [ {
            "name": "Project",
            "startDate": "2019-01-01",
            "endDate": "2021-01-01",
            "description": "Description...",
            "highlights": [
                "Won award at AIHacks 2016"
            ],
            "url": "https://project.com/"
        }]
    }, indent=2)
    
    prompt = f"""
    Convert the following resume text into a structured JSON CV format. The schema should match this format:
    {schema_example}
    
    Resume Text:
    {text}
    """
    client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    response = client.chat.completions.create(
        model="gpt-4-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    result = response.choices[0].message.content
    return result

def extract_and_format_resume(pdf_path):
    text = extract_information(pdf_path)
    structured_data = process_with_chatgpt(text)
    return structured_data

def process_pdfs_in_folder(input_folder, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    subdirs = [f for f in os.listdir(input_folder) if os.path.isdir(os.path.join(input_folder, f))]

    for subdir in tqdm(subdirs, desc="Processing Folders"):
        subdir_path = os.path.join(input_folder, subdir)
        pdf_files = [f for f in os.listdir(subdir_path) if f.endswith(".pdf")]

        for file_name in pdf_files:
            pdf_path = os.path.join(subdir_path, file_name)
            try:
                structured_data = extract_and_format_resume(pdf_path)
                output_file = os.path.join(output_folder, subdir, f"{os.path.splitext(file_name)[0]}.json")
                os.makedirs(os.path.dirname(output_file), exist_ok=True)
                with open(output_file, "w") as f:
                    f.write(structured_data)
            except Exception as e:
                print(f"Error processing {file_name}: {e}")

# Example usage
input_folder = "../data/data/"
output_folder = "../output_json"
process_pdfs_in_folder(input_folder, output_folder)


In [None]:
import os
import json
import pdfplumber
from dotenv import load_dotenv
from tqdm import tqdm  
import openai 


load_dotenv()

def extract_information(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        resume_text = ""
        for page in pdf.pages:
            resume_text = " ".join([resume_text, page.extract_text()])
    resume_text = resume_text.strip()
    return resume_text

def process_with_chatgpt(text):
    openai.api_key = os.getenv("OPENAI_API_KEY")
    
    schema_example = json.dumps({
        "basics": {
            "name": "John Doe",
            "label": "Programmer",
            "image": "",
            "email": "john@gmail.com",
            "phone": "(912) 555-4321",
            "url": "https://johndoe.com",
            "summary": "A summary of John Doe…",
            "location": {
                "address": "2712 Broadway St",
                "postalCode": "CA 94115",
                "city": "San Francisco",
                "countryCode": "US",
                "region": "California"
            },
            "profiles": [ {
                "network": "Twitter",
                "username": "john",
                "url": "https://twitter.com/john"
            }]
        },
        "work": [ {
            "name": "Company",
            "position": "President",
            "url": "https://company.com",
            "startDate": "2013-01-01",
            "endDate": "2014-01-01",
            "summary": "Description…",
            "highlights": [
                "Started the company"
            ]
        }],
        "volunteer": [ {
            "organization": "Organization",
            "position": "Volunteer",
            "url": "https://organization.com/",
            "startDate": "2012-01-01",
            "endDate": "2013-01-01",
            "summary": "Description…",
            "highlights": [
                "Awarded 'Volunteer of the Month'"
            ]
        }],
        "education": [ {
            "institution": "University",
            "url": "https://institution.com/",
            "area": "Software Development",
            "studyType": "Bachelor",
            "startDate": "2011-01-01",
            "endDate": "2013-01-01",
            "score": "4.0",
            "courses": [
                "DB1101 - Basic SQL"
            ]
        }],
        "awards": [ {
            "title": "Award",
            "date": "2014-11-01",
            "awarder": "Company",
            "summary": "There is no spoon."
        }],
        "certificates": [ {
            "name": "Certificate",
            "date": "2021-11-07",
            "issuer": "Company",
            "url": "https://certificate.com"
        }],
        "publications": [ {
            "name": "Publication",
            "publisher": "Company",
            "releaseDate": "2014-10-01",
            "url": "https://publication.com",
            "summary": "Description…"
        }],
        "skills": [ {
            "name": "Web Development",
            "level": "Master",
            "keywords": [
                "HTML",
                "CSS",
                "JavaScript"
            ]
        }],
        "languages": [ {
            "language": "English",
            "fluency": "Native speaker"
        }],
        "interests": [ {
            "name": "Wildlife",
            "keywords": [
                "Ferrets",
                "Unicorns"
            ]
        }],
        "references": [ {
            "name": "Jane Doe",
            "reference": "Reference…"
        }],
        "projects": [ {
            "name": "Project",
            "startDate": "2019-01-01",
            "endDate": "2021-01-01",
            "description": "Description...",
            "highlights": [
                "Won award at AIHacks 2016"
            ],
            "url": "https://project.com/"
        }]
    }, indent=2)
    
    prompt = f"""
    Convert the following resume text into a structured JSON CV format. Leave the feilds that are not given empty and don't add anything not mentioned. Return a json file as specificed below and add nothing on it like comments or any other additional information. The schema should match this format:
    {schema_example}
    
    Resume Text:
    {text}
    """
    client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    result = response.choices[0].message.content
    return result

def extract_and_format_resume(pdf_path):
    text = extract_information(pdf_path)
    structured_data = process_with_chatgpt(text)
    return structured_data

structured_data = extract_and_format_resume("../data/data/ACCOUNTANT/10554236.pdf")
structured_data

In [5]:
import os
import sys
import os

# Add the parent directory to sys.path
notebook_dir = os.path.abspath(os.getcwd())
parent_dir = os.path.abspath(os.path.join(notebook_dir, "../"))
sys.path.append(parent_dir)

In [8]:
import os
import json
import pdfplumber
from dotenv import load_dotenv
from tqdm import tqdm  # Import tqdm for progress tracking
import openai  # Ensure this library is installed and properly configured

# Load environment variables
load_dotenv()

def extract_information(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        resume_text = ""
        for page in pdf.pages:
            resume_text = " ".join([resume_text, page.extract_text()])
    resume_text = resume_text.strip()
    return resume_text

def process_with_chatgpt(text):
    openai.api_key = os.getenv("OPENAI_API_KEY")
    
    schema_example = json.dumps({
        "basics": {
            "name": "John Doe",
            "label": "Programmer",
            "image": "",
            "email": "john@gmail.com",
            "phone": "(912) 555-4321",
            "url": "https://johndoe.com",
            "summary": "A summary of John Doe…",
            "location": {
                "address": "2712 Broadway St",
                "postalCode": "CA 94115",
                "city": "San Francisco",
                "countryCode": "US",
                "region": "California"
            },
            "profiles": [ {
                "network": "Twitter",
                "username": "john",
                "url": "https://twitter.com/john"
            }]
        },
        "work": [ {
            "name": "Company",
            "position": "President",
            "url": "https://company.com",
            "startDate": "2013-01-01",
            "endDate": "2014-01-01",
            "summary": "Description…",
            "highlights": [
                "Started the company"
            ]
        }],
        "volunteer": [ {
            "organization": "Organization",
            "position": "Volunteer",
            "url": "https://organization.com/",
            "startDate": "2012-01-01",
            "endDate": "2013-01-01",
            "summary": "Description…",
            "highlights": [
                "Awarded 'Volunteer of the Month'"
            ]
        }],
        "education": [ {
            "institution": "University",
            "url": "https://institution.com/",
            "area": "Software Development",
            "studyType": "Bachelor",
            "startDate": "2011-01-01",
            "endDate": "2013-01-01",
            "score": "4.0",
            "courses": [
                "DB1101 - Basic SQL"
            ]
        }],
        "awards": [ {
            "title": "Award",
            "date": "2014-11-01",
            "awarder": "Company",
            "summary": "There is no spoon."
        }],
        "certificates": [ {
            "name": "Certificate",
            "date": "2021-11-07",
            "issuer": "Company",
            "url": "https://certificate.com"
        }],
        "publications": [ {
            "name": "Publication",
            "publisher": "Company",
            "releaseDate": "2014-10-01",
            "url": "https://publication.com",
            "summary": "Description…"
        }],
        "skills": [ {
            "name": "Web Development",
            "level": "Master",
            "keywords": [
                "HTML",
                "CSS",
                "JavaScript"
            ]
        }],
        "languages": [ {
            "language": "English",
            "fluency": "Native speaker"
        }],
        "interests": [ {
            "name": "Wildlife",
            "keywords": [
                "Ferrets",
                "Unicorns"
            ]
        }],
        "references": [ {
            "name": "Jane Doe",
            "reference": "Reference…"
        }],
        "projects": [ {
            "name": "Project",
            "startDate": "2019-01-01",
            "endDate": "2021-01-01",
            "description": "Description...",
            "highlights": [
                "Won award at AIHacks 2016"
            ],
            "url": "https://project.com/"
        }]
    }, indent=2)
    
    prompt = f"""
    Convert the following resume text into a structured JSON CV format. Leave the feilds that are not given empty and don't add anything not mentioned. Return a json file as specificed below and add nothing on it. The schema should match this format:
    {schema_example}
    
    Resume Text:
    {text}
    """
    client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    result = response.choices[0].message.content
    return result

def extract_and_format_resume(pdf_path):
    text = extract_information(pdf_path)
    structured_data = process_with_chatgpt(text)
    return structured_data

def process_pdfs_in_folder(input_folder, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    subdirs = [f for f in os.listdir(input_folder) if os.path.isdir(os.path.join(input_folder, f))]

    for subdir in tqdm(subdirs, desc="Processing Folders"):
        subdir_path = os.path.join(input_folder, subdir)
        pdf_files = [f for f in os.listdir(subdir_path) if f.endswith(".pdf")]

        for file_name in pdf_files:
            pdf_path = os.path.join(subdir_path, file_name)
            try:
                structured_data = extract_and_format_resume(pdf_path)
                output_file = os.path.join(output_folder, subdir, f"{os.path.splitext(file_name)[0]}.json")
                os.makedirs(os.path.dirname(output_file), exist_ok=True)
                with open(output_file, "w") as f:
                    f.write(structured_data)
            except Exception as e:
                print(f"Error processing {file_name}: {e}")

# Example usage
input_folder = "../data/data/ACCOUNTANT"
output_folder = "../output_json_new_try"
process_pdfs_in_folder(input_folder, output_folder)


Processing Folders: 0it [00:00, ?it/s]


In [11]:
# import requests

# url = "http://127.0.0.1:8000/extract_top_k_job/"
# file_path = r"C:\\Users\\deepa\\Downloads\\cv matching\\data\\data\\ACCOUNTANT\\10554236.pdf"

# with open(file_path, "rb") as file:
#     response = requests.post(
#         url,
#         files={"file": file},
#         data={"top_k": 2}
#     )

# print(response.json())


{'results': [[419, 'Holman Frenia Allison, PC', 'brdisplaynonecss ul  limarginleftcss lipadding\n\nholman frenia allison pc certified public accountants and consultants hfa is headquartered in central new jersey the firm services a diverse client base within the tristate area providing highly personalized and comprehensive accounting audit tax and business consulting services\n\nwe are looking for a staff accountant i to work in our lakewood nj office this position will work for our audit department\n\nin this role you will\n reconcile client records to trial balance which may include analytical procedures and performs schedule computations of prepaid accruals property equipment and calculates depreciation expense\n complete testing on internal controls single audit and other various funds\n confirm cash legal insurance revenue etc accounts\n use the firms software programs to automate the process from trial balance and financial statements\n perform other accounting auditing tax and c