In [5]:
!pip install groq langchain-community pdfplumber



In [6]:
import json
from groq import Groq
from langchain_community.document_loaders import PDFPlumberLoader
import os

In [7]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("GROQ_API_KEY")


In [18]:
system_prompt = """You are an AI assistant designed to extract structured resume data.
                   Always respond with a strictly valid JSON object. Use `null` for missing values,
                   ensuring compliance with JSON standards. Do not include explanations,
                   comments, or any additional text outside the JSON structure.
                """



human_prompt = """
             **Task:** Extract key information from the following resume text.

            **Resume Text:**
            {context}

            **Instructions:**
            Please extract the following information and format it in a clear structure:

            1. **Contact Information:**
            - Name:
            - Email:
            - Phone Number:
            - Website/Portfolio/LinkedIn:
            - Github Profile:

            2. **Education:**
            - Institution Name:
            - Degree:
            - Graduation Date:

            3. **Experience:**
            - Job Title:
            - Company Name:
            - Location:
            - Dates of Employment:
            - Description:

            5. **Skills:**
            - Skills:

            **Question:**
            Extract this information as a structured and valid JSON object. Use `null` for missing or unavailable valuesDo not include explanations,
                   comments, or any additional text outside the JSON structure.
        """



# Initialize Groq client once
client = Groq(api_key=secret_value_0) 

# Your system and human prompts (define these before the loop)
system_prompt = system_prompt
human_prompt = human_prompt

def clean_resume_data(json_data):
    """Clean and standardize the resume JSON data"""
    # 1. Remove duplicate skills (case insensitive)
    if 'skills' in json_data and isinstance(json_data['skills'], list):
        seen_skills = set()
        unique_skills = []
        for skill in json_data['skills']:
            lower_skill = skill.strip().lower()
            if lower_skill not in seen_skills:
                seen_skills.add(lower_skill)
                unique_skills.append(skill.strip())
        json_data['skills'] = unique_skills
    
    # 2. Standardize website/portfolio fields
    website_aliases = [
        'website', 'portfolio', 'linkedin', 
        'Website', 'Portfolio', 'LinkedIn',
        'personal_website', 'webpage'
    ]
    
    # Find the first existing website-related field
    website_field = None
    website_value = None
    for field in website_aliases:
        if field in json_data:
            website_field = field
            website_value = json_data[field]
            break
    
    # Standardize to 'website' if we found a value
    if website_field and website_value:
        # Remove all website-related fields
        for field in website_aliases:
            if field in json_data:
                del json_data[field]
        # Add the standardized field
        json_data['website'] = website_value.strip()
    
    return json_data

def extract_text_pdf(pdf_path):
    loader = PDFPlumberLoader(pdf_path)
    docs = loader.load()
    text = ''
    for doc in docs:
        text += doc.page_content
    return text

def process_resume(pdf_path, index):
    # Extract text from PDF
    context = extract_text_pdf(pdf_path)
    
    # Call LLM API
    completion = client.chat.completions.create(
        model="meta-llama/llama-4-scout-17b-16e-instruct",
        messages=[
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": human_prompt.format(context=context)
            }
        ],
        temperature=1,
        max_completion_tokens=1024,
        top_p=1,
        stream=True,
        stop=None,
    )
    
    # Process the streamed output
    output = ""
    for chunk in completion:
        content = chunk.choices[0].delta.content or ""
        print(content, end="")
        output += content
    
    # Extract JSON content
    json_start = output.find('{')
    json_end = output.rfind('}') + 1
    json_content = output[json_start:json_end]
    
    # Save JSON to file
    try:
        json_data = json.loads(json_content)
        
        # Clean and standardize the data
        cleaned_data = clean_resume_data(json_data)
        
        output_filename = f"Resume_data_pdf_{index}.json"
        with open(output_filename, 'w') as f:
            json.dump(cleaned_data, f, indent=2)
        print(f"\nJSON saved successfully to {output_filename}")
        return True
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON for {pdf_path}: {e}")
        # Save raw output for debugging
        with open(f"Raw_output_pdf_{index}.json", 'w') as f:
            f.write(output)
        return False



# # # Correct paths (/) # working
# pdf_path1 = "/kaggle/input/resume-17b/BHARATH_RESUME-complete.pdf"
# pdf_path2 = "/kaggle/input/resume-17b/Bharathkumar-Parunandula-Resume.pdf"   
# pdf_path3 = "/kaggle/input/retsdts/BHARATH_RESUME-DS-FINAL.pdf"   
# # pdf_path4 = "PARSER-APP INPUT/50328713.rank4.pdf"
# # pdf_path5 = "PARSER-APP INPUT/17823436.rank5.pdf"


# import os

pdf_paths = []
for dirname, _, filenames in os.walk('/kaggle/input/testlres'):
    for filename in filenames:
        if filename.lower().endswith('.pdf'):
            pdf_paths.append(os.path.join(dirname, filename))

print("Found PDF files:")
for path in pdf_paths:
    print(path)

# Process each PDF separately
# pdf_paths = [pdf_path1,pdf_path2,pdf_path3]
for i, pdf_path in enumerate(pdf_paths, start=1):
    print(f"\nProcessing resume {i}: {pdf_path}")
    process_resume(pdf_path, i)
    print(f"Completed processing {pdf_path}")

Found PDF files:
/kaggle/input/testlres/cover letter.pdf
/kaggle/input/testlres/BHARATH_RESUME-DS_final.pdf
/kaggle/input/testlres/BHARATH-RESUME V3.pdf
/kaggle/input/testlres/BHARATH_RESUME-DS-FINAL.pdf
/kaggle/input/testlres/Bharathkumar-Parunandula-Resume.pdf
/kaggle/input/testlres/Bharathkumar-Parunandula-FlowCV-Resume-20250704.pdf
/kaggle/input/testlres/BHARATH_RESUME-DS-3.pdf
/kaggle/input/testlres/Bharathkumar-Parunandula-resume-final.pdf
/kaggle/input/testlres/Bharathkumar-Parunandula-FlowCV-Resume-20250619.pdf
/kaggle/input/testlres/BHARATH-RESUME-DS.pdf
/kaggle/input/testlres/BHARATH_RESUME-DS2.pdf
/kaggle/input/testlres/BHARATH_RESUME-complete.pdf
/kaggle/input/testlres/BHARATH-RESUME V2.pdf
/kaggle/input/testlres/cover1.pdf
/kaggle/input/testlres/Bharathkumar Original Resume 2.pdf
/kaggle/input/testlres/BHARATH-RESUME.pdf
/kaggle/input/testlres/BHARATH_RESUMEDSv4.pdf
/kaggle/input/testlres/BHARATH_RESUMEDS.pdf

Processing resume 1: /kaggle/input/testlres/cover letter.pdf
``

PdfminerException: No /Root object! - Is this really a PDF?

In [14]:
# isuue 1 - can't parse resume that stetches into two pages properly