In [1]:
import os
import re
import warnings
from typing import List

import langchain
import PyPDF2
import streamlit as st
from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain_core.caches import InMemoryCache
from langchain_core.globals import set_llm_cache
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAI


# The OpenAI client picks OPENAI_API_KEY up from the process environment on its
# own, so nothing has to be copied. Copying os.getenv(...) back into os.environ
# was a no-op when the key existed and a TypeError ("str expected, not
# NoneType") when it did not, so only warn early when the key is missing.
if not os.environ.get("OPENAI_API_KEY"):
    warnings.warn(
        "OPENAI_API_KEY is not set; OpenAI calls will fail until it is exported.",
        RuntimeWarning,
    )

In [5]:
# Technologies recognised inside the "Skills" section (matched case-insensitively).
_KNOWN_SKILLS = (
    "C++",
    "JavaScript",
    "Java",
    "Python",
    "C",
    "MySQL",
    "PostgreSQL",
    "HTML",
    "CSS",
    "AWS",
    "Django",
    "React",
    "NodeJS",
    "ExpressJS",
    "Docker",
    "LangChain",
    "FastAPI",
    "Flask",
    "MongoDB",
    "Machine Learning",
)

# Look-arounds are used instead of \b: \b can never match after the final "+"
# of "C++" (both neighbours are non-word characters), which made the old
# pattern report "C" for every "C++". They also keep "C" from matching in "C#".
_SKILL_PATTERN = re.compile(
    r"(?<![\w+#])(?:"
    + "|".join(re.escape(skill) for skill in _KNOWN_SKILLS)
    + r")(?![\w+#])",
    re.IGNORECASE,
)

# A header line such as "Skills:", "**Experience:**" or "## Projects: ...".
# Group 1 is the section name, group 2 any content that follows on the line.
_SECTION_HEADER = re.compile(
    r"^[#*\s]*(skills|experience|projects)\s*\**\s*:\s*\**\s*(.*)$",
    re.IGNORECASE,
)


def _split_sections(reply_text):
    """Group the lines of the LLM reply under the section header they follow."""
    collected = {"skills": [], "experience": [], "projects": []}
    current = None
    for line in reply_text.splitlines():
        header = _SECTION_HEADER.match(line)
        if header:
            current = header.group(1).lower()
            line = header.group(2)
        # Lines that appear before the first header belong to no section.
        if current is not None:
            collected[current].append(line.rstrip())
    return {name: "\n".join(lines).strip() for name, lines in collected.items()}


def _unique_casefold(items):
    """Drop case-insensitive duplicates, keeping first-seen spelling and order."""
    seen = set()
    unique = []
    for item in items:
        key = item.casefold()
        if key not in seen:
            seen.add(key)
            unique.append(item)
    return unique


def _listed_entries(skills_text):
    """Split a free-form skills section into its comma/line separated entries."""
    entries = []
    for raw in re.split(r"[,;|\n]+", skills_text):
        # Keep only the value of "Category: value" items and drop bullet markers.
        entry = raw.rpartition(":")[2].strip(" \t-*\u2022\u00b7")
        if entry:
            entries.append(entry)
    return entries


def extract_info_from_pdf_new(pdf_file):
    """Extract skills, experience and projects from a resume PDF using an LLM.

    The PDF text is sent to ``gpt-4o-mini`` with a prompt asking for
    ``Skills:``, ``Experience:`` and ``Projects:`` sections; the reply is then
    split on those headers.

    Args:
        pdf_file: Anything ``PyPDF2.PdfReader`` accepts (the notebook passes a
            file path).

    Returns:
        A ``(skills, experience, projects)`` tuple:

        * ``skills`` - list of strings. Known technologies found in the skills
          section, in order of appearance and de-duplicated case-insensitively.
          If none of the known technologies is present, the entries listed in
          the skills section are returned instead; the list is empty when the
          reply contains no skills at all.
        * ``experience`` - text of the experience section ("" if absent).
        * ``projects`` - text of the projects section ("" if absent).

        ``([], "", "")`` is returned without calling the LLM when the PDF
        yields no text (e.g. a scanned, image-only document).
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # extract_text() may return None for pages without a text layer; the "\n"
    # separator keeps the last word of a page from fusing with the next page.
    resume_text = "\n".join(page.extract_text() or "" for page in reader.pages)
    if not resume_text.strip():
        return [], "", ""

    # temperature=0: extraction should be repeatable, not creative.
    openai_llm = ChatOpenAI(model="gpt-4o-mini", max_tokens=1000, temperature=0)

    prompt_template_resume = PromptTemplate(
        input_variables=["resume_text"],
        template="""
        Extract the following sections from the given resume: skills, experience, and projects. If any section is not clearly defined, extract the most relevant information related to that section.

        Resume:
        {resume_text}

        Please provide the extracted information in the following format:

        Skills:
        [Extracted skills information]

        Experience:
        [Extracted experience information]

        Projects:
        [Extracted projects information]
        """,
    )

    # prompt | llm replaces the deprecated LLMChain(...).run(...); a chat model
    # returns a message object whose .content is the reply text.
    extraction_chain = prompt_template_resume | openai_llm
    extracted_info = extraction_chain.invoke({"resume_text": resume_text}).content

    sections = _split_sections(extracted_info)

    skills_list = _SKILL_PATTERN.findall(sections["skills"])
    if not skills_list:
        # Nothing from the known vocabulary: fall back to what the resume
        # actually lists rather than inventing skills.
        skills_list = _listed_entries(sections["skills"])

    return (
        _unique_casefold(skills_list),
        sections["experience"],
        sections["projects"],
    )


In [8]:
# Parse the sample resume: `b` receives the experience text, `c` the projects text.
skill_list, b, c = extract_info_from_pdf_new("Ashish_Resume_ATS.pdf")

In [9]:
# Display the skills returned by the extraction call (rendered as the cell output).
skill_list

['Python',
 'C',
 'ExpressJS',
 'MongoDB',
 'Javascript',
 'FastAPI',
 'HTML',
 'Docker',
 'MySQL',
 'PostgreSQL',
 'CSS',
 'Flask']