In [1]:
#*** INITIALISATION ***#

import datetime
import os
import shutil
from pathlib import Path

from dotenv import load_dotenv

load_dotenv()


# Create an empty temporary directory.
# TMP_DIR is read from the environment (populated by load_dotenv() above).
# An unset or empty value is rejected explicitly: Path(None) would fail with
# an opaque TypeError, and Path("") resolves to the current directory, which
# the cleanup below must never delete.
tmp_dir_setting = os.getenv('TMP_DIR')
if not tmp_dir_setting:
    raise RuntimeError("TMP_DIR is not set: define it in the environment or in the .env file")

tmp_dir: Path = Path(tmp_dir_setting)

# shutil.rmtree deletes the directory together with its contents.
# os.removedirs only works on an empty directory and also prunes empty parent
# directories, which could make the re-creation below fail.
if tmp_dir.is_dir():
    shutil.rmtree(tmp_dir)

# parents=True re-creates any missing intermediate directory.
tmp_dir.mkdir(parents=True)


# Timestamps
# The clock is read once so that date_now and datetime_now always describe the
# same instant (two separate reads could disagree if the cell runs at midnight).
_now = datetime.datetime.now()
date_now = _now.date().isoformat()   # e.g. "2024-04-08"
datetime_now = _now.isoformat()      # e.g. "2024-04-08T14:03:27.123456"

In [2]:
#*** TABLEAU DE BORD DASK ***#

# from dask.distributed import Client

# client = Client(n_workers=4, threads_per_worker=2, memory_limit="5GB")
# client
#
# Dask reports "Dask needs bokeh!=3.0.*,>=2.4.2 for the dashboard." although bokeh 3.4.0 is installed; the cause is not understood yet.

In [3]:
#*** TÉLÉCHARGEMENT ***#

import numpy
import dask.dataframe as dd
import pandas as pd


# Daily cache: the file name embeds today's date (date_now), so the CSV is
# downloaded at most once per day and re-read from parquet afterwards.
decp_enrichies_file: Path = Path(f'data/decp_enrichies_{date_now}.parquet')

df_decp: pd.DataFrame

if decp_enrichies_file.exists():
    df_decp = pd.read_parquet(decp_enrichies_file)
    print("DECP d'aujourd'hui déjà téléchargées")
else:
    decp_url = os.getenv('DECP_ENRICHIES_URL')
    if not decp_url:
        raise RuntimeError("DECP_ENRICHIES_URL is not set: define it in the environment or in the .env file")

    # dtype='object' loads every column without type inference; the typed
    # conversions are done explicitly later in the notebook.
    df_decp = pd.read_csv(decp_url, sep=";", dtype='object')

    # Missing values become empty strings. The previous code referenced
    # numpy.Nan, which does not exist (AttributeError), so this branch could
    # never complete.
    df_decp = df_decp.fillna("")

    # Make sure the cache directory exists before writing into it.
    decp_enrichies_file.parent.mkdir(parents=True, exist_ok=True)
    df_decp.to_parquet(decp_enrichies_file)


DECP d'aujourd'hui déjà téléchargées


In [4]:
#*** ANALYSE DE BASE ***#

# df_decp.info(verbose=True)

In [5]:
#*** REDRESSEMENT ***#

# Columns holding dates as raw strings that need manual repair before parsing.
columns_date = ["datePublicationDonnees", "dateNotification"]

# Pattern -> replacement. Because replace() is called with regex=True, every
# key is a regular expression applied as a substring substitution (this is what
# lets "/" -> "-" normalise the separators inside a value).
# NOTE(review): the generic "2921" -> "2021" rule is listed before the
# "2921-11-19" entry; depending on how pandas sequences the substitutions, the
# later entry may never take effect. Confirm which outcome is intended.
date_replacements = {
    "2921": "2021",
    "0002-11-30": "",
    "September,": "",
    "2980-11-22": "",
    # Raw string: "\-" is not a valid escape in a normal string literal
    # (SyntaxWarning on Python 3.12+). The resulting pattern is unchanged.
    r"29\-02\-2021": "",
    "0222-04-29": "",
    "2921-11-19": "",
    "2920-06-01": "", # 2021.01LOT600 21590309700016
    "0021-12-05": "",
    "0001-06-21": "",
    "5021-02-18": "",
    "0019-10-18": "",
    "/": "-"
}


# Apply the same substitutions to each date column.
for col in columns_date:
    df_decp[col] = df_decp[col].replace(date_replacements, regex=True)

In [9]:

# df_decp["datePublicationDonnees"].loc[df_decp["datePublicationDonnees"] == '2921-11-19']
# df_decp.loc[301707]


id                                                                      20222022/1400
source                                                                    PES Marchés
natureObjetMarche                                                             Travaux
objetMarche                         Electricite courants forts et faibles alarme i...
codeCPV                                                                    45454000-4
codeCPV_division                                                                   45
referenceCPV                                               Travaux de restructuration
dateNotification                                                  2022-04-08 00:00:00
anneeNotification                                                                2022
datePublicationDonnees                                                            NaT
dureeMois                                                                          16
montant                                               

In [7]:
#***  TYPES DE DONNÉES ***#

# Column -> intended dtype. Only the column names are used by the loops below;
# the dtype labels are informative and are not passed to pandas.
numeric_dtypes = dict(dureeMois="int64", montant="float64")
date_dtypes = dict.fromkeys(("datePublicationDonnees", "dateNotification"), "date")

# Numeric columns: let pandas pick the numeric dtype from the values.
for column in numeric_dtypes:
    raw_numbers = df_decp[column]
    df_decp[column] = pd.to_numeric(raw_numbers)

# Date columns: format='mixed' infers the format value by value, and
# dayfirst=True is passed for ambiguous day/month orderings.
datetime_options = {"format": "mixed", "dayfirst": True}
for column in date_dtypes:
    raw_dates = df_decp[column]
    df_decp[column] = pd.to_datetime(raw_dates, **datetime_options)


# Show the resulting schema to check the conversions.
df_decp.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 994123 entries, 0 to 994122
Data columns (total 40 columns):
 #   Column                            Non-Null Count   Dtype         
---  ------                            --------------   -----         
 0   id                                994123 non-null  object        
 1   source                            993724 non-null  object        
 2   natureObjetMarche                 994123 non-null  object        
 3   objetMarche                       994123 non-null  object        
 4   codeCPV                           977345 non-null  object        
 5   codeCPV_division                  994123 non-null  object        
 6   referenceCPV                      977345 non-null  object        
 7   dateNotification                  989346 non-null  datetime64[ns]
 8   anneeNotification                 989346 non-null  object        
 9   datePublicationDonnees            987186 non-null  datetime64[ns]
 10  dureeMois                       