In [60]:
from dotenv import load_dotenv
import os

# Load the environment variables from the .env file
load_dotenv()

# Access the OPENAI_TEST_KEY environment variable
api_key = os.getenv('OPENAI_TEST_KEY')

In [7]:
# Read the pdf with the CV information
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("./data/CV_AntonioOchotorena_092024.pdf")
pages = loader.load()
pages

[Document(metadata={'source': './data/CV_AntonioOchotorena_092024.pdf', 'page': 0}, page_content='ANTONIO OCHOTORENA  LAYNEZ  \n+34 636 426 538  • antonioochotorena@gmail.com  • LinkedIn  • GitHub  \n \nSummary   \n \nActively seeking a full -time opportunity in data and AI in Amsterdam.  My background in consulting \nand data science has equipped me with the skills to excel in a position where effective \ncommunication and technical expertise are essential . What sets me apart is my strong drive to learn, \nproactive approach to solving complex problems, and commitment to fostering a collaborative team \nenvironment where we support and help each other to achieve shared goals.  \n \nExperience   \n \nInfosys  Instep – Global Internship Program  Bangalore, India  \nAI Software Engineer  06/2024 – Now  \n \n• Developed a proprietary information retrieval module utilizing Large Language Models \n(LLMs) and OCR technologies to extract healthcare forms, achieving substantial cost savings. 

In [8]:
# Split the text 
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=500,
    chunk_overlap=150,
    length_function=len
)

In [9]:
docs = text_splitter.split_documents(pages)

In [10]:
docs

[Document(metadata={'source': './data/CV_AntonioOchotorena_092024.pdf', 'page': 0}, page_content='ANTONIO OCHOTORENA  LAYNEZ  \n+34 636 426 538  • antonioochotorena@gmail.com  • LinkedIn  • GitHub  \n \nSummary   \n \nActively seeking a full -time opportunity in data and AI in Amsterdam.  My background in consulting \nand data science has equipped me with the skills to excel in a position where effective \ncommunication and technical expertise are essential . What sets me apart is my strong drive to learn,'),
 Document(metadata={'source': './data/CV_AntonioOchotorena_092024.pdf', 'page': 0}, page_content='communication and technical expertise are essential . What sets me apart is my strong drive to learn, \nproactive approach to solving complex problems, and commitment to fostering a collaborative team \nenvironment where we support and help each other to achieve shared goals.  \n \nExperience   \n \nInfosys  Instep – Global Internship Program  Bangalore, India  \nAI Software Engineer 

# store it to Chroma DB

In [11]:
# Store it into a ChromaDB database
from langchain_openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings(api_key=api_key)

In [12]:
from langchain.vectorstores import Chroma

PERSIST_DIR = './chroma/'

In [14]:
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embedding,
    persist_directory=PERSIST_DIR
)

# Prompt

### TODO create a Summariser of job requirements

In [67]:
job_description = """
ABN AMRO Traineeship - Data Scientist
At a glance
As a data scientist trainee you will primarily learn how to translate business challenges into technical predictive models that support a better decision making. As data scientist you will be well-equipped to store and clean large amounts of data, explore data sets to identify valuable insights, build predictive models, and run data science projects from end to end. With this Traineeship you will take the first step in your career as a data scientist to develop yourself personally and professionally!

Your job
 Research and develop statistical models for analysis;
Understand company needs and devise possible solutions by collaborating with product management/owners and the DevOps team;
Communicate results and statistical concepts to key business stakeholders;
Use appropriate databases and project designs to optimize joint development efforts;
Develop custom data models and algorithms;
Build processes and tools to help monitor and analyse performance and data accuracy;
Use predictive modelling to enhance and optimize customer experiences, revenue generation, ad targeting, and more;
Develop company A/B testing framework and test model quality
Your profile
As a data scientist trainee you should have a degree in mathematics, statistics, physics, econometrics or business, with an analytics focus. You must have strong skills in math, science, programming, databases, modelling, and predictive analytics.

Who are you?
Entrepreneurial spirit, creative mind-set and capable of assessing and evaluating risks;
Knowledge of or at least interested in technology and how this supports business improvements;
Thrives in an international context;
Thinks in terms of what is possible, sees opportunities and takes a solutions-oriented approach;
Self-starter who readily assumes responsibility;
You have a valid EU work permit. 

What’s your experience?
•   Experience working with R or Python;
•   Experience in statistical, data mining and machine learning techniques (like boosting, generalized linear models/regression, and social network analysis);
•   Experience visualizing and presenting data;
•   Strong written and verbal communication skills;
•   A proactive approach, with an ability to manage multiple priorities simultaneously;
•   No more than 3 years of work experience (internships do not count as work experience), this is a graduate programme. 

Nice to have:
•   Experience working with SQL or Java;
•   Experience with cloud services, preferably Azure Cloud Services;
•   Experience analyzing data from third-party providers like Adobe Analytics, Google Analytics etc.;
•   Experience working with distributed data and computing tools like Hadoop, Hive, Map/Reduce, MySQL, and Spark;
•   Familiarity with the Agile methodology.
"""

In [69]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
import os
import json

job_offer_prompt = PromptTemplate(
    input_variables=["job_description"],
    template="""
    You are a professional job offer analyzer. Your task is to extract every tool, technology, and skill mentioned in the job description, grouping them into tech stack and soft skills categories. Follow these instructions carefully:

    1. Use ONLY the EXACT WORDS and phrases from the job description.
    2. Group related technologies or skills, separating them with commas if they appear separately in the text.
    3. Assign relevancy points on a scale of 1-100 for each extracted item.
    4. Ensure the total points for all items add up to 100.
    5. Do not invent or infer any skills or technologies not explicitly mentioned.
    6. Return the results in the following JSON format:

    {{
      "tech_stack": {{
        "Technology/Tool 1": Points (1-100),
        "Technology/Tool 2, Related Technology": Points (1-100),
        ...
      }},
      "soft_skills": {{
        "Soft Skill 1": Points (1-100),
        "Soft Skill 2": Points (1-100),
        ...
      }}
    }}

    Job Description: {job_description}
    """
)

def extract_job_requirements(job_description, api_key):
    # Use ChatOpenAI as the LLM for Langchain
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", openai_api_key=api_key)
    
    # Create an LLMChain
    chain = LLMChain(llm=llm, prompt=job_offer_prompt)
    
    # Run the chain with the job description
    result = chain.run(job_description)
    
    # Parse the result as JSON
    try:
        parsed_result = json.loads(result)
        return parsed_result
    except json.JSONDecodeError:
        return {"error": "Failed to parse the result as JSON"}

# Ensure API key is set
api_key = os.getenv("OPENAI_TEST_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY environment variable is not set")

result = extract_job_requirements(job_description=job_description, api_key=api_key)

print(json.dumps(result, indent=2))

{
  "tech_stack": {
    "R or Python": 25,
    "SQL, Java": 15,
    "Hadoop, Hive, Map/Reduce, MySQL, Spark, distributed data and computing tools": 10,
    "Azure Cloud Services": 10,
    "Adobe Analytics, Google Analytics": 10
  },
  "soft_skills": {
    "Entrepreneurial spirit, risk assessment": 15,
    "Creative mindset": 10,
    "International context": 10,
    "Problem-solving, solutions-oriented approach": 15,
    "Self-starter, responsibility": 15,
    "Effective communication": 15,
    "Proactive, multitasking": 10
  }
}


# Question

In [23]:
question = f"""
Use the following pieces of context to create a cover letter for the following job offer. 
{offer_requirements}
If the applicant has knowledge gaps from the job offer: 
    1. Compare it to similar skills he has.
    2. If there are no similar exclude them from the cover letter and metion them at the end.

Don't try to make up an answer. 
Use a letter format with three paragraphs maximum. 
Keep the answer clear, concise and semi-formal as possible.
Do not over extend with adjectives.
"""

In [24]:
from langchain.prompts import PromptTemplate

# Build prompt
template = """Use the following pieces of context to answer the question
{context}
Question: {question}
Cover letter: Fill in the letter here
Knowledge Gaps: Add Knowledge Gaps if Any
"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [26]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(model_name="gpt-3.5-turbo-0125", temperature=0, api_key = os.getenv("OPENAI_TEST_KEY"))
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [27]:
result = qa_chain({"query": question})

  result = qa_chain({"query": question})


In [29]:
print(result["result"])

Dear Hiring Manager,

I am writing to express my interest in the AI, MLOps, and Backend Developer position at your company. With my experience as an AI Software Engineer at Infosys, where I developed proprietary information retrieval modules using Large Language Models, I believe I have the technical expertise required for this role. My proactive approach to problem-solving and commitment to fostering a collaborative team environment align well with the values of your organization.

In my role at Infosys, I successfully implemented a RAG model for HR platforms during a hackathon, showcasing my ability to work on complex projects and deliver innovative solutions. Additionally, my certification as a Professional SCRUM master and completion of Stanford Machine Learning and Deeplearning.ai Deep Learning Specialization courses demonstrate my dedication to continuous learning and growth in the field of AI.

While I do not have direct experience in MLOps, I am confident in my ability to quick

In [30]:
result

{'query': "\nUse the following pieces of context to create a cover letter for the following job offer. \n\n<req1> AI experience </req1>\n<req2> MLOps </req2>\n<req3> Backend developer </req3>\n\nIf the applicant has knowledge gaps from the job offer: \n    1. Compare it to similar skills he has.\n    2. If there are no similar exclude them from the cover letter and metion them at the end.\n\nDon't try to make up an answer. \nUse a letter format with three paragraphs maximum. \nKeep the answer clear, concise and semi-formal as possible.\nDo not over extend with adjectives.\n",
 'result': 'Dear Hiring Manager,\n\nI am writing to express my interest in the AI, MLOps, and Backend Developer position at your company. With my experience as an AI Software Engineer at Infosys, where I developed proprietary information retrieval modules using Large Language Models, I believe I have the technical expertise required for this role. My proactive approach to problem-solving and commitment to fosterin

# Query

In [12]:
# def get_completion(prompt, client):
    
#     completion = client.chat.completions.create(
#                     model="gpt-3.5-turbo-0125",
#                     messages=[
#                         {"role": "system", "content": "You are a helpful assistant."},
#                         {"role": "user", "content": prompt}
#                     ],
#                     max_tokens=1000,
#                     temperature=0,
#                 )
#     return completion

In [13]:
text = pages[0].page_content

prompt = f"""
Please follow the following format:
<role 1> <reasons based on experience> <areas of improvement towards role 1>
<role 2> <reasons based on experience> <areas of improvement towards role 2>
<role 3> <reasons based on experience> <areas of improvement towards role 3>

Do not do more than 3 roles
```{text}```
"""
result = get_completion(prompt, client)


In [14]:
print(result.choices[0].message.content)

Data Engineer at Infosys Instep in Amsterdam
Antonio has a strong background in data engineering, as evidenced by his successful DWH platform migration for Beam Suntory Spain and the development of a Support Decision System using Deep Learning at Ramón Y Cajal Hospital. He has experience in leading projects, implementing reporting tools, and designing NLP pipelines for biomedical document classification. Antonio's technical skills in Python, R, SQL, and various data tools make him well-suited for a Data Engineer role.

Areas of improvement:
- Antonio could focus on enhancing his knowledge of cloud platforms like AWS and Azure to stay updated with the latest technologies in data engineering.
- Developing expertise in big data technologies like Spark and Hadoop would further strengthen his profile for data engineering roles.

AI Software Engineer in Amsterdam
Antonio's experience in developing an information retrieval module using Large Language Models and OCR technologies at Infosys Ins