# Construct pydantic model from text input 

In [1]:
from pydantic_ai import Agent 

agent = Agent(model="google-gla:gemini-2.5-flash")

result = await agent.run("Give me an employee working in sweden, keep it short")
result

AgentRunResult(output='Lars Olsson')

In [2]:
print(result.output)

Lars Olsson


In [3]:
from pydantic import BaseModel, Field

class EmployeeModel(BaseModel):
    name: str
    age: int
    salary: int = Field(gt=30_000, lt=50_000)
    position: str

result = await agent.run("Give me an IT employee working in sweden", output_type=EmployeeModel)

result

AgentRunResult(output=EmployeeModel(name='Bjorn', age=32, salary=45000, position='IT Consultant'))

In [4]:
employee = result.output
employee

EmployeeModel(name='Bjorn', age=32, salary=45000, position='IT Consultant')

In [5]:
employee.name

'Bjorn'

In [6]:
employee.age

32

In [7]:
employee.salary

45000

In [8]:
employee.model_dump()

{'name': 'Bjorn', 'age': 32, 'salary': 45000, 'position': 'IT Consultant'}

In [9]:
print(employee.model_dump_json(indent=2))

{
  "name": "Bjorn",
  "age": 32,
  "salary": 45000,
  "position": "IT Consultant"
}


# Several employees or a list of employees

In [10]:
from typing import List
result = await agent.run("""Give me ten employees in AI and data engineering fields, roles can vary, but salary must be between 30000 and 50000""", output_type=List[EmployeeModel])

employee = result.output
employee

[EmployeeModel(name='Alice Smith', age=28, salary=45000, position='AI Engineer'),
 EmployeeModel(name='Bob Johnson', age=32, salary=48000, position='Data Scientist'),
 EmployeeModel(name='Carol White', age=29, salary=42000, position='Machine Learning Engineer'),
 EmployeeModel(name='David Green', age=35, salary=49999, position='Data Engineer'),
 EmployeeModel(name='Eve Black', age=27, salary=38000, position='AI Research Scientist'),
 EmployeeModel(name='Frank Brown', age=30, salary=41000, position='Business Intelligence Developer'),
 EmployeeModel(name='Grace Lee', age=33, salary=47000, position='Deep Learning Engineer'),
 EmployeeModel(name='Henry Wilson', age=31, salary=39000, position='NLP Engineer'),
 EmployeeModel(name='Ivy Davis', age=26, salary=36000, position='Data Analyst'),
 EmployeeModel(name='Jack Miller', age=34, salary=49000, position='Big Data Engineer')]

In [11]:
len(employee)

10

In [12]:
for employee in employee:
    print(f"{employee.name =} and {employee.salary =}")

employee.name ='Alice Smith' and employee.salary =45000
employee.name ='Bob Johnson' and employee.salary =48000
employee.name ='Carol White' and employee.salary =42000
employee.name ='David Green' and employee.salary =49999
employee.name ='Eve Black' and employee.salary =38000
employee.name ='Frank Brown' and employee.salary =41000
employee.name ='Grace Lee' and employee.salary =47000
employee.name ='Henry Wilson' and employee.salary =39000
employee.name ='Ivy Davis' and employee.salary =36000
employee.name ='Jack Miller' and employee.salary =49000


# CV or resume model

In [13]:

class ExperienceModel(BaseModel):
    title: str
    company: str
    description: str 
    start_year: int
    end_year: int

class EducationModel(BaseModel):
    title: str
    eduction_area: str
    school: str
    description: str 
    start_year: int
    end_year: int


class CvModel(BaseModel):
    name: str 
    age: int
    experiences: list[ExperienceModel]
    educations: list[EducationModel]


result = await agent.run("Create a swedish person applying for a data engineering position", output_type=CvModel
)

resume = result.output
resume


CvModel(name='Björn Borg', age=35, experiences=[ExperienceModel(title='Data Engineer', company='Spotify', description='Developed and maintained data pipelines using Apache Spark and Kafka.', start_year=2018, end_year=2023), ExperienceModel(title='Junior Data Engineer', company='H&M', description='Assisted in building and optimizing ETL processes.', start_year=2015, end_year=2018)], educations=[EducationModel(title='MSc in Computer Science', eduction_area='Data Engineering', school='KTH Royal Institute of Technology', description="Master's thesis on real-time data processing.", start_year=2013, end_year=2015), EducationModel(title='BSc in Software Engineering', eduction_area='Computer Science', school='Uppsala University', description='Focused on algorithms and data structures.', start_year=2010, end_year=2013)])

In [14]:
resume.name

'Björn Borg'

In [15]:
resume.age

35

In [16]:
resume.model_dump()

{'name': 'Björn Borg',
 'age': 35,
 'experiences': [{'title': 'Data Engineer',
   'company': 'Spotify',
   'description': 'Developed and maintained data pipelines using Apache Spark and Kafka.',
   'start_year': 2018,
   'end_year': 2023},
  {'title': 'Junior Data Engineer',
   'company': 'H&M',
   'description': 'Assisted in building and optimizing ETL processes.',
   'start_year': 2015,
   'end_year': 2018}],
 'educations': [{'title': 'MSc in Computer Science',
   'eduction_area': 'Data Engineering',
   'school': 'KTH Royal Institute of Technology',
   'description': "Master's thesis on real-time data processing.",
   'start_year': 2013,
   'end_year': 2015},
  {'title': 'BSc in Software Engineering',
   'eduction_area': 'Computer Science',
   'school': 'Uppsala University',
   'description': 'Focused on algorithms and data structures.',
   'start_year': 2010,
   'end_year': 2013}]}

## Optioal postproccessing -> load into duckdb and unnest

In [17]:
import dlt 

pipeline = dlt.pipeline(
    pipeline_name="resume_json_duckdb",
    destination=dlt.destinations.duckdb("cv.duckdb"),
    dataset_name="staging"
)

info = pipeline.run(data=[resume.model_dump()], loader_file_format="jsonl", table_name="cv_entries")
print(info)


Pipeline resume_json_duckdb load step completed in 0.03 seconds
1 load package(s) were loaded to destination duckdb and into dataset staging
The duckdb destination used duckdb:////Users/abdulrahmanfahmi/Documents/Github/Dataplattformar_maskininl-rning_artificiell-intelligens_DE24_Abdulrahman_fahmi/cv.duckdb location to store data
Load package 1765568481.5671868 is LOADED and contains no failed jobs


In [18]:
import duckdb 

with duckdb.connect("cv.duckdb") as conn:
    desc = conn.sql("desc").df()

desc

Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,cv,staging,_dlt_loads,"[load_id, schema_name, status, inserted_at, sc...","[VARCHAR, VARCHAR, BIGINT, TIMESTAMP WITH TIME...",False
1,cv,staging,_dlt_pipeline_state,"[version, engine_version, pipeline_name, state...","[BIGINT, BIGINT, VARCHAR, VARCHAR, TIMESTAMP W...",False
2,cv,staging,_dlt_version,"[version, engine_version, inserted_at, schema_...","[BIGINT, BIGINT, TIMESTAMP WITH TIME ZONE, VAR...",False
3,cv,staging,cv_entries,"[name, age, _dlt_load_id, _dlt_id]","[VARCHAR, BIGINT, VARCHAR, VARCHAR]",False
4,cv,staging,cv_entries__educations,"[title, eduction_area, school, description, st...","[VARCHAR, VARCHAR, VARCHAR, VARCHAR, BIGINT, B...",False
5,cv,staging,cv_entries__experiences,"[title, company, description, start_year, end_...","[VARCHAR, VARCHAR, VARCHAR, BIGINT, BIGINT, VA...",False


In [19]:
import duckdb 

with duckdb.connect("cv.duckdb") as conn:
    desc = conn.sql("desc").df()
    cv_entries = conn.sql("from staging.cv_entries").df()
    educations = conn.sql("from staging.cv_entries__educations").df()
    experiences = conn.sql("from staging.cv_entries__experiences").df()

desc

Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,cv,staging,_dlt_loads,"[load_id, schema_name, status, inserted_at, sc...","[VARCHAR, VARCHAR, BIGINT, TIMESTAMP WITH TIME...",False
1,cv,staging,_dlt_pipeline_state,"[version, engine_version, pipeline_name, state...","[BIGINT, BIGINT, VARCHAR, VARCHAR, TIMESTAMP W...",False
2,cv,staging,_dlt_version,"[version, engine_version, inserted_at, schema_...","[BIGINT, BIGINT, TIMESTAMP WITH TIME ZONE, VAR...",False
3,cv,staging,cv_entries,"[name, age, _dlt_load_id, _dlt_id]","[VARCHAR, BIGINT, VARCHAR, VARCHAR]",False
4,cv,staging,cv_entries__educations,"[title, eduction_area, school, description, st...","[VARCHAR, VARCHAR, VARCHAR, VARCHAR, BIGINT, B...",False
5,cv,staging,cv_entries__experiences,"[title, company, description, start_year, end_...","[VARCHAR, VARCHAR, VARCHAR, BIGINT, BIGINT, VA...",False


In [20]:
cv_entries

Unnamed: 0,name,age,_dlt_load_id,_dlt_id
0,Linnea Karlsson,30,1765566951.477592,T1PLcON9NEp6fQ
1,Björn Borg,35,1765568481.5671868,fGkLw9QkogRMJA


In [21]:
educations

Unnamed: 0,title,eduction_area,school,description,start_year,end_year,_dlt_parent_id,_dlt_list_idx,_dlt_id
0,Master of Science in Data Science,Data Science,KTH Royal Institute of Technology,"Specialized in big data technologies, machine ...",2016,2018,T1PLcON9NEp6fQ,0,ieVi67pgRpDqTg
1,MSc in Computer Science,Data Engineering,KTH Royal Institute of Technology,Master's thesis on real-time data processing.,2013,2015,fGkLw9QkogRMJA,0,1OemiwBW5STIbA
2,BSc in Software Engineering,Computer Science,Uppsala University,Focused on algorithms and data structures.,2010,2013,fGkLw9QkogRMJA,1,RVLqg9apHN5EzQ


In [22]:
experiences

Unnamed: 0,title,company,description,start_year,end_year,_dlt_parent_id,_dlt_list_idx,_dlt_id
0,Data Engineer,Tech Solutions AB,"Designed, built, and maintained scalable data ...",2020,2023,T1PLcON9NEp6fQ,0,yR6AYRxjrd490Q
1,Junior Data Developer,Nordic Innovations,Developed SQL scripts for data extraction and ...,2018,2020,T1PLcON9NEp6fQ,1,Qk6YpSif+m7TqA
2,Data Engineer,Spotify,Developed and maintained data pipelines using ...,2018,2023,fGkLw9QkogRMJA,0,KVi20J0INgtkXQ
3,Junior Data Engineer,H&M,Assisted in building and optimizing ETL proces...,2015,2018,fGkLw9QkogRMJA,1,1L7VOvn1wvKPzA


In [23]:
duckdb.sql("SHOW TABLES").df()

Unnamed: 0,name


In [24]:
duckdb.sql("SELECT * FROM information_schema.tables").df()

Unnamed: 0,table_catalog,table_schema,table_name,table_type,self_referencing_column_name,reference_generation,user_defined_type_catalog,user_defined_type_schema,user_defined_type_name,is_insertable_into,is_typed,commit_action,TABLE_COMMENT


In [31]:
duckdb.sql("""
    SELECT
        cv.name,
        cv.age,
        ex.company,
        ex.description AS experience_description,
        ex.start_year AS experience_start_year,
        ex.end_year AS experience_end_year,
        e.title,
        e.eduction_area,
        e.school,
        e.start_year AS education_start_year,
        e.end_year AS education_end_year
    FROM cv_entries cv
    LEFT JOIN educations e ON cv._dlt_id = e._dlt_parent_id
    LEFT JOIN experiences ex ON cv._dlt_id = ex._dlt_parent_id
""").df()

Unnamed: 0,name,age,company,experience_description,experience_start_year,experience_end_year,title,eduction_area,school,education_start_year,education_end_year
0,Linnea Karlsson,30,Tech Solutions AB,"Designed, built, and maintained scalable data ...",2020,2023,Master of Science in Data Science,Data Science,KTH Royal Institute of Technology,2016,2018
1,Linnea Karlsson,30,Nordic Innovations,Developed SQL scripts for data extraction and ...,2018,2020,Master of Science in Data Science,Data Science,KTH Royal Institute of Technology,2016,2018
2,Björn Borg,35,Spotify,Developed and maintained data pipelines using ...,2018,2023,BSc in Software Engineering,Computer Science,Uppsala University,2010,2013
3,Björn Borg,35,H&M,Assisted in building and optimizing ETL proces...,2015,2018,BSc in Software Engineering,Computer Science,Uppsala University,2010,2013
4,Björn Borg,35,Spotify,Developed and maintained data pipelines using ...,2018,2023,MSc in Computer Science,Data Engineering,KTH Royal Institute of Technology,2013,2015
5,Björn Borg,35,H&M,Assisted in building and optimizing ETL proces...,2015,2018,MSc in Computer Science,Data Engineering,KTH Royal Institute of Technology,2013,2015
