In [1]:
import os

from sqlalchemy import Table, Column, Integer, String, ForeignKey, MetaData, create_engine, text, inspect
from IPython.display import Markdown, display
from dotenv import load_dotenv
import pandas as pd
import tqdm
import numpy as np

In [2]:
# Load connection settings from the local .env file into the process environment.
load_dotenv()
host = "localhost"
database = os.getenv("POSTGRES_DB")
user = os.getenv("POSTGRES_USER")
password = os.getenv("POSTGRES_PASSWORD")
# Honour POSTGRES_PORT when it is set; otherwise fall back to 5432,
# the value that was previously hard-coded in the URL.
port = os.getenv("POSTGRES_PORT") or 5432

# SQLAlchemy connection URL for the psycopg2 driver.
db_url = "postgresql+psycopg2://{user}:{password}@{hostname}:{port}/{database_name}".format(
    hostname=host,
    user=user,
    password=password,
    database_name=database,
    port=port,
)

In [5]:
from pathlib import Path

# Directory that holds the raw CSV exports (relative to the working directory).
path_raw_data_dir = Path("../../Volumes/data/raw")

# Collect every CSV sitting directly in that directory (non-recursive).
files = [csv_path for csv_path in path_raw_data_dir.glob("*.csv")]

In [6]:
# Verify that each of the four expected CSV files is present.
# A file counts as present when the keyword occurs anywhere in its name.
expected_files = {"caracteristiques", "lieux", "usagers", "vehicules"}
for exp_file in expected_files:
    is_present = any(exp_file in f.name for f in files)
    if not is_present:
        print(f"Error: could not find file '{exp_file}' in files.")

In [7]:
# Check that all raw files belong to the same year.
# The year is taken as the last "-"-separated token of each file name
# once the ".csv" extension is stripped (e.g. "usagers-2021.csv" -> "2021").
fs = [f.name.split(".csv")[0] for f in files]
years = {f.split("-")[-1] for f in fs}
if len(years) > 1:
    # f-string so the offending years are interpolated into the message
    # (the previous plain string printed the literal text "{years}").
    print(f"Error: More than 1 year found in the filenames: {years}")
# Note: pop() removes the element from `years` and raises KeyError if no file was found.
year = years.pop()
year

'2021'

In [8]:
# Resolve the path of each expected CSV from the keyword contained in its name.
# When several files match the same keyword, the last one iterated wins.
caracteristiques_file = lieux_file = usagers_file = vehicules_file = None

for file in files:
    csv_name = file.name
    if "caracteristiques" in csv_name:
        caracteristiques_file = file
    if "lieux" in csv_name:
        lieux_file = file
    if "usagers" in csv_name:
        usagers_file = file
    if "vehicules" in csv_name:
        vehicules_file = file

print(caracteristiques_file, lieux_file, usagers_file, vehicules_file)

../../Volumes/data/raw/caracteristiques-2021.csv ../../Volumes/data/raw/lieux-2021.csv ../../Volumes/data/raw/usagers-2021.csv ../../Volumes/data/raw/vehicules-2021.csv


In [9]:
YEAR = year

# -- Importing dataset
# All four raw files are ";"-separated.
# `np.nan` is used instead of the `np.NaN` alias, which was removed in NumPy 2.0.
# Missing values are mapped to None for the users and caracteristiques frames only.
df_users = pd.read_csv(usagers_file, sep=";").replace({np.nan: None})
df_caract = pd.read_csv(caracteristiques_file, sep=";", header=0, low_memory=False).replace({np.nan: None})
df_places = pd.read_csv(lieux_file, sep=";", encoding="utf-8")
df_veh = pd.read_csv(vehicules_file, sep=";")

In [11]:
# Preview the first five rows of the users table.
df_users.head(n=5)

Unnamed: 0,Num_Acc,id_vehicule,num_veh,place,catu,grav,sexe,an_nais,trajet,secu1,secu2,secu3,locp,actp,etatp
0,202100000001,201Â 764,B01,1,1,3,1,2000.0,1,0,9,-1,0,0,-1
1,202100000001,201Â 765,A01,1,1,1,1,1978.0,1,1,-1,-1,0,0,-1
2,202100000002,201Â 762,A01,1,1,4,1,1983.0,0,1,-1,-1,0,0,-1
3,202100000002,201Â 763,B01,1,1,3,1,1993.0,0,1,-1,-1,0,0,-1
4,202100000003,201Â 761,A01,1,1,1,1,1995.0,1,1,0,-1,0,0,-1


In [12]:
from src.data.db.models import Caracteristiques, Lieux, Vehicules, Users

In [13]:
# compare table data with dataframes
from sqlmodel import select, Session

# Engine shared by the ORM sessions opened in this notebook.
engine = create_engine(db_url)

# Load the "caracteristiques" rows stored for YEAR into a DataFrame.
with Session(engine) as session:
    statement = select(Caracteristiques).where(Caracteristiques.year == YEAR)
    caracteristiques_from_db = session.exec(statement)

    # `exclude` is documented as a set (or dict) of field names; a bare string
    # is not part of that contract, so pass the field name inside a set.
    df = pd.DataFrame(
        [r.model_dump(exclude={"year"}) for r in caracteristiques_from_db.fetchall()]
    )

df.head()


Unnamed: 0,jour,an,mois,hrmn,lum,com,int,col,lat,Num_Acc,dep,agg,atm,adr,long
0,30,2021,11,07:32,2,30319,1,1,440389580000,202100000001,30,1,1,CD 981,43480220000
1,25,2021,9,14:20,1,51544,3,3,492421290000,202100000002,51,1,1,Aire de repos croisement D20E9 aprÃ¨s sortie D977,45545460000
2,15,2021,7,07:55,1,85048,1,6,469219500000,202100000003,85,2,7,15 rue FranÃ§ois Nicolas,-9644600000
3,27,2021,3,19:45,5,93005,2,6,489493634583,202100000004,93,2,3,Route de Mitry,25196639908
4,25,2021,2,07:20,5,76429,1,2,494083800000,202100000005,76,2,1,PARIS. ROUTE DE,11458100000


In [14]:
from sqlalchemy import create_engine

# Template for reading one year's rows from a table.
raw_sql_query = "SELECT * FROM {table} WHERE year = {year}"

# Reuse the engine created earlier in the notebook instead of building a second
# one, and open the connection in a `with` block so it is closed once the four
# tables have been read (previously the connection was never closed).
with engine.connect() as cnx:
    df_caract = pd.read_sql_query(raw_sql_query.format(table=Caracteristiques.__tablename__, year=YEAR), con=cnx)
    df_places = pd.read_sql_query(raw_sql_query.format(table=Lieux.__tablename__, year=YEAR), con=cnx)
    df_users = pd.read_sql_query(raw_sql_query.format(table=Users.__tablename__, year=YEAR), con=cnx)
    df_veh = pd.read_sql_query(raw_sql_query.format(table=Vehicules.__tablename__, year=YEAR), con=cnx)

In [15]:
from sqlmodel import select

# Rebuild df_caract from the ORM models for YEAR, dropping the "year" field.
with Session(engine) as session:
    statement = select(Caracteristiques).where(Caracteristiques.year == YEAR)
    caracteristiques_from_db = session.exec(statement)
    # `exclude` is documented as a set (or dict) of field names, not a bare string.
    df_caract = pd.DataFrame(
        [r.model_dump(exclude={"year"}) for r in caracteristiques_from_db.fetchall()]
    )


In [16]:
from sqlmodel import select

# Print the first 10 (user, caracteristiques) pairs that share an accident number.
with Session(engine) as session:
    same_accident = Users.Num_Acc == Caracteristiques.Num_Acc
    statement = select(Users, Caracteristiques)
    statement = statement.where(same_accident)
    statement = statement.limit(10)
    results = session.exec(statement)
    for user, carac in results:
        print("User:", user)
        print("Caracteristiques:", carac)

User: num_veh='B01' id=1 year=2021 catu=1 sexe=1 trajet=1 secu2=9 locp=0 actp='0' Num_Acc=202100000001 id_vehicule='201Â\xa0764' place=1 grav=3 an_nais=2000.0 secu1=0 secu3=-1 etatp='-1'
Caracteristiques: jour=30 an=2021 mois=11 hrmn='07:32' lum=2 com='30319' int=1 col=1 lat='44,0389580000' year=2021 Num_Acc=202100000001 dep='30' agg=1 atm=1 adr='CD 981' long='4,3480220000'
User: num_veh='A01' id=2 year=2021 catu=1 sexe=1 trajet=1 secu2=-1 locp=0 actp='0' Num_Acc=202100000001 id_vehicule='201Â\xa0765' place=1 grav=1 an_nais=1978.0 secu1=1 secu3=-1 etatp='-1'
Caracteristiques: jour=30 an=2021 mois=11 hrmn='07:32' lum=2 com='30319' int=1 col=1 lat='44,0389580000' year=2021 Num_Acc=202100000001 dep='30' agg=1 atm=1 adr='CD 981' long='4,3480220000'
User: num_veh='A01' id=3 year=2021 catu=1 sexe=1 trajet=0 secu2=-1 locp=0 actp='0' Num_Acc=202100000002 id_vehicule='201Â\xa0762' place=1 grav=4 an_nais=1983.0 secu1=1 secu3=-1 etatp='-1'
Caracteristiques: jour=25 an=2021 mois=9 hrmn='14:20' lum