In [30]:
from typing import Optional

from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_together import ChatTogether
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# environment variables
import os 
from dotenv import load_dotenv

import fitz  # PyMuPDF
import re

In [31]:
# Load variables from a .env file into the process environment, then read the
# key that is later handed to the chat model client. The value is None when
# the variable is not set.
load_dotenv()
API_KEY = os.environ.get("API_KEY")

In [7]:
# Chat model client used by the extraction chain; authenticated with API_KEY.
llm = ChatTogether(
    model="mistralai/Mixtral-8x7B-Instruct-v0.1",
    together_api_key=API_KEY,
)

In [71]:
class Person(BaseModel):
    """Information about a candidate extracted from their CV."""

    # ^ Doc-string for the entity Person.
    # This doc-string is sent to the LLM as the description of the schema Person,
    # and it can help to improve extraction results. It therefore has to describe
    # the whole entity (name, skills, experiences, locations), not a single field.

    # Note that:
    # 1. Each field is `Optional` with a `None` default -- this allows the model
    #    to decline to extract it!
    # 2. Each field has a `description` -- this description is used by the LLM.
    #    Having a good description can help improve extraction results, so typos
    #    in these strings matter.
    name: Optional[str] = Field(default=None, description="The name of the person")
    skills: Optional[list] = Field(
        default=None,
        description="The hard/soft skills acquired by the candidate",
    )
    experiences: Optional[list] = Field(
        default=None,
        description="level of experience in a particular field",
    )
    locations: Optional[list] = Field(
        default=None,
        description="The cities, countries or companies where the person has worked or studied",
    )

In [72]:
# Prompt for the extraction chain: a fixed system instruction followed by the
# document text supplied through the `text` template variable.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            # Adjacent string literals are concatenated into one message, so
            # each fragment ends with a space and every sentence is kept whole.
            "You are an expert extraction algorithm. "
            "Only extract relevant information from the text. "
            "Never return the examples placeholder, try to use it as a reference only. "
            "If you do not know the value of an attribute asked to extract, "
            "return null for the attribute's value.",
        ),
        # Please see the how-to about improving performance with
        # reference examples.
        # MessagesPlaceholder('examples'),
        ("human", "{text}"),
    ]
)

In [73]:
# Extraction chain: the formatted prompt is piped into the chat model, which is
# asked to return output structured according to the Person schema.
runnable = (
    prompt
    | llm.with_structured_output(schema=Person)
)

In [80]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    The document at ``pdf_path`` is opened with PyMuPDF (``fitz``) and the
    result of ``get_text()`` for each page is concatenated in page order,
    with no separator added between pages.
    """
    with fitz.open(pdf_path) as document:
        return "".join(page.get_text() for page in document)

def clean_text(text):
    """Strip noise from raw CV text and normalise its whitespace.

    The substitutions below run in the listed order: newlines become spaces,
    then URLs, e-mail addresses, numeric dates and any remaining digit runs
    are removed, and finally every run of whitespace collapses to one space.
    Leading and trailing whitespace is trimmed from the result.
    """
    # (pattern, replacement) pairs; order matters, e.g. dates must be removed
    # before bare digits so their separators do not get left behind.
    substitutions = (
        (r'\n', ' '),                                   # newline characters
        (r'http\S+|www\.\S+', ''),                      # URLs
        (r'\S+@\S+', ''),                               # emails
        (r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b', ''),     # dates like MM/DD/YYYY
        (r'\b\d{4}[-/]\d{1,2}[-/]\d{1,2}\b', ''),       # dates like YYYY-MM-DD
        (r'\d+', ''),                                   # remaining numbers
        (r'\s+', ' '),                                  # repeated whitespace
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()


def getAnswersFromCV(pdf_path):
    """Return the raw text of the CV stored at ``pdf_path``.

    The text comes straight from ``extract_text_from_pdf``; it is neither
    cleaned nor sent to the LLM here.

    NOTE(review): the structured-extraction step
    (``runnable.invoke({"text": cv_text})``) is currently disabled, so callers
    receive a plain ``str`` rather than a ``Person`` -- confirm whether that
    is intended.
    """
    return extract_text_from_pdf(pdf_path)


# Example usage
pdf_path = 'cvs/CV_JoseFonte.pdf'
result = getAnswersFromCV(pdf_path)

In [81]:
# Show the raw text extracted from the CV.
# NOTE(review): the saved output of this cell contains the candidate's phone
# number and e-mail address -- clear or redact it before sharing the notebook.
result

'José Pedro Fonte\nBraga, Portugal\nĦ +351 931426314\n|\nć josebfonte@gmail.com\n|\nu www.josefonte.pt\n|\n^ github.com/josefonte\n|\n]\nlinkedin.com/in/jose-pedro-fonte/\nPersonal Profile\nRecently graduated with a BSc degree in Software Engineering and now pursing a Masters. Always seeking new challenges and innovative ways\nto apply technical expertise. Beyond coding, I enjoy sports, music and podcast, cinema and reading up on cutting‑edge tech trends.\nEducation\nUniversidade do Minho\nBraga, Portugal\nMSc in Software Engineering\nSept 2023 ‑ Present\n• At the moment, I’m attending my first year in Software Engineering, with my main interests being Distributed Systems, Software Develop‑\nment(Mobile & Web), AI and Product Design.\n• Courses: High Performance Computing, AI & ML, Cloud Applications and Services, Formal Methods, Software Development, Network Services\nEngineering\n• Distributed Systems Profile : Paradigms of Distributed Systems | Large Scale Distributed Systems | Faul

In [13]:
# NOTE(review): `skills` is a field of the Person schema, but getAnswersFromCV
# as defined above returns a plain string, which has no `skills` attribute.
# The saved output presumably comes from an earlier run in which `result`
# held a Person -- confirm and re-run from a fresh kernel.
result.skills

['python',
 'java',
 'c',
 'haskell',
 'prolog',
 'html',
 'css',
 'c',
 'matlab',
 'sql',
 'net',
 'javascript',
 'mongodb',
 'nextjs',
 'postgres',
 'illustrator',
 'photoshop']