# Analyse the data

In [1]:
from ollama import chat
from ollama import ChatResponse
from pydantic import BaseModel
import pandas as pd
from os import listdir

import sys
sys.path.append('..')
from src.database.csv import save_to_csv

### Classes and Functions

In [2]:
# Folder with the raw job-posting CSV files; its contents are listed with listdir further down.
RAW_FOLDER = '../data/raw'
# Folder passed to save_to_csv as the destination for the processed data.
FINAL_FOLDER = '../data/final'

In [None]:
# Schema of the structured data extracted from a single job description.
# Its JSON schema (JobInfo.model_json_schema()) is passed to the chat call as
# `format`, and the model's reply is parsed back with JobInfo.model_validate_json.
# Every list[str] field is later spread over numbered columns by flatten_jobinfo.
#
# Documented with comments rather than a class docstring on purpose: pydantic
# emits a class docstring as "description" in the generated JSON schema, which
# would change what is sent to the model.
#
# NOTE(review): the saved outputs further down still show a `company` field
# that is not declared here — presumably they come from an earlier version of
# this class; re-run the notebook to refresh them.
class JobInfo(BaseModel):
    title: str
    city: str
    work_mode: str  # presumably mirrors the raw "Work Mode" column (Remote, Hybrid, ...) — confirm
    education_level: list[str]
    min_experience_years: int
    skills: list[str]
    nice_to_have: list[str]
    specializations: list[str]
    tools_and_frameworks: list[str]
    responsibilities_summary: list[str]
    description: str

In [4]:
def generate_responses(system_prompt, user_prompt, model="llama3.2"):
    """
    Ask an Ollama chat model for a reply shaped like JobInfo.

    Args:
        system_prompt: Text sent as the "system" message.
        user_prompt: Text sent as the "user" message.
        model: Name of the Ollama model to query.

    Returns:
        The content of the reply message, as returned by the chat call.
        The JobInfo JSON schema is passed as `format`, so the content is
        expected to be a JSON string that JobInfo can parse.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    output_schema = JobInfo.model_json_schema()

    reply: ChatResponse = chat(
        model=model,
        messages=conversation,
        format=output_schema,
    )
    return reply.message.content

### Test the format

In [5]:
# Load a single raw file to try the extraction on before processing everything.
df = pd.read_csv('../data/raw/ai developer.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Position,Company,Location,Work Mode,Description
0,0,Gen AI Engineer,Rivago Infotech Inc,Canada,Remote,About the job\r\nJob Description:\r\nWe are se...
1,1,NLP Engineer,Insilico Medicine,,,About the job\r\nAbout Insilico \r\nInsilico M...
2,2,AI Engineer,Ares,,,About the job\r\nAbout Ares\r\nThe Problem\r\n...
3,3,Artificial Intelligence Software Developer,MaintainX,,,About the job\r\nMaintainX is the world's lead...
4,4,AI Adoption Specialist / AI Enablement Consultant,Confidential,"Montreal, QC",Hybrid,About the job\r\nAbout Us\r\nWe’re a fast-grow...


In [6]:
model = "llama3.2" # "deepseek-r1:1.5b"
# NOTE(review): the prompt text contains typos ("scient's", "assitant",
# "cafrefully"); it is left untouched so results stay comparable with earlier runs.
system_prompt = "You are a data scient's assitant. Follow the instruction cafrefully"
user_prompt = "Extract structured data from the following job description: "

# Smoke-test the structured output on the first posting only.
# The instruction is combined with the description without mutating
# `user_prompt`, so the template stays reusable and can never pile up
# several descriptions in one request.
description = df['Description'].iloc[0]
response = generate_responses(system_prompt, user_prompt + description, model)
print(response)

{
  "title": "Gen AI Engineer",
  "company": "",
  "city": "",
  "work_mode": "",
  "education_level": [
    ""
  ],
  "min_experience_years": 4,
  "skills": [
    "LLMs (Large Language Models)",
    "Generative AI models",
    "LangChain",
    "Google Vertex AI",
    "Python programming",
    "Data engineering principles"
  ],
  "nice_to_have": [
    "Agentic AI",
    "GenAI frameworks"
  ],
  "specializations": [
    ""
  ],
  "tools_and_frameworks": [
    "LLMs (GPT, BERT)",
    "LangChain",
    "Google Vertex AI",
    "open-source frameworks"
  ]
  ,
"responsibilities_summary": [
    "Design and develop AI solutions using LLMs and Generative AI models.",
    "Build and manage pipelines for training, fine-tuning, and deploying models.",
    "Integrate AI models into production workflows and enterprise systems."
]
,
"description": ""
}


In [7]:
# Parse the raw JSON reply back into a JobInfo. Validation raises if the reply
# does not match the schema, so a clean print means the format round-trips.
job_info = JobInfo.model_validate_json(response)
print(job_info)

title='Gen AI Engineer' company='' city='' work_mode='' education_level=[''] min_experience_years=4 skills=['LLMs (Large Language Models)', 'Generative AI models', 'LangChain', 'Google Vertex AI', 'Python programming', 'Data engineering principles'] nice_to_have=['Agentic AI', 'GenAI frameworks'] specializations=[''] tools_and_frameworks=['LLMs (GPT, BERT)', 'LangChain', 'Google Vertex AI', 'open-source frameworks'] responsibilities_summary=['Design and develop AI solutions using LLMs and Generative AI models.', 'Build and manage pipelines for training, fine-tuning, and deploying models.', 'Integrate AI models into production workflows and enterprise systems.'] description=''


Now that the format test has passed, we can run the extraction on the whole dataset.

### Run the analysis

In [8]:
def flatten_jobinfo(job: JobInfo) -> dict:
    """
    Turn a JobInfo into a flat, single-level dict that fits one DataFrame row.

    Fields that are not lists are copied unchanged. Each list field is spread
    over numbered keys, one key per element, counted from 1.

    Example: {'title': 'Data Scientist', 'skills': ["Python", "LLMs"]}
    becomes  {'title': 'Data Scientist', 'skill_1': 'Python', 'skill_2': 'LLMs'},
    so that every skill ends up in its own column.
    """
    # Start with the plain (non-list) fields, in the order the model dumps them.
    flat = {}
    for field_name, value in job.model_dump().items():
        if isinstance(value, list):
            continue
        flat[field_name] = value

    # Key prefix -> values to spread; the order of this table is the key order.
    list_fields = (
        ("skill", job.skills),
        ("nice_to_have", job.nice_to_have),
        ("education_level", job.education_level),
        ("specialization", job.specializations),
        ("tool", job.tools_and_frameworks),
        ("responsibility", job.responsibilities_summary),
    )
    for prefix, values in list_fields:
        for position, value in enumerate(values, start=1):
            flat[f"{prefix}_{position}"] = value

    return flat

In [9]:
# os.listdir returns entries in arbitrary order and includes every entry in
# the folder. Keep only CSV files and sort them: the processing loop further
# down slices this list by position, so it needs a stable order.
filenames = sorted(
    name for name in listdir(RAW_FOLDER) if name.lower().endswith('.csv')
)
filenames

['ai developer.csv',
 'ai engineer.csv',
 'data scientist.csv',
 'generative ai engineer.csv',
 'llm engineer.csv',
 'machine learning engineer.csv',
 'mlops engineer.csv']

In [11]:
def extract_job_info_and_save(filename):
    """
    Extract structured job info for every posting in one raw CSV file and save it.

    The file is read from RAW_FOLDER. Each row's 'Description' is sent to the
    model together with the extraction instruction, the reply is validated as a
    JobInfo and flattened, and the resulting columns are placed next to the
    original columns before the table is handed to save_to_csv.

    Args:
        filename: Name of a CSV file inside RAW_FOLDER (e.g. 'ai engineer.csv').
            The result is saved to FINAL_FOLDER under the same name without
            its extension.

    Raises:
        pydantic.ValidationError: If a model reply does not match JobInfo.
    """
    from pathlib import Path

    model = "llama3.2" # "deepseek-r1:1.5b"
    # NOTE(review): the prompt text contains typos ("scient's", "assitant",
    # "cafrefully"); it is left untouched so results stay comparable with earlier runs.
    system_prompt = "You are a data scient's assitant. Follow the instruction cafrefully"
    instruction = "Extract structured data from the following job description: "

    # Read the file that was asked for. Relying on a notebook-level `df` would
    # process the same data for every filename.
    raw_df = pd.read_csv(Path(RAW_FOLDER) / filename)

    job_info_list = []
    for description in raw_df['Description']:
        # Build a fresh prompt per posting; appending to one growing string
        # would send all earlier descriptions along with the current one.
        user_prompt = instruction + description
        response = generate_responses(system_prompt, user_prompt, model)

        job_info = JobInfo.model_validate_json(response)
        job_info_list.append(flatten_jobinfo(job_info))

    # Reuse the raw index so the column-wise concat pairs each extraction with
    # the row it came from.
    job_info_df = pd.DataFrame(job_info_list, index=raw_df.index)
    data = pd.concat([job_info_df, raw_df], axis=1)

    save_to_csv(
        data=data,
        folder=FINAL_FOLDER,
        filename=Path(filename).stem
    )


# NOTE(review): filenames[1:] skips the first listed file — presumably because
# it was already processed in an earlier run; confirm, since a fresh
# Restart & Run All would then never produce its output.
for file in filenames[1:]:
    extract_job_info_and_save(file)
