# Building a Catalog

In [None]:
from classes.source_catalog import GetCnilCatalog

url = 'https://www.data.gouv.fr/api/1/organizations/534fff61a3a7292c64a77d59/catalog'
headers = {'accept': 'application/json'}
url_add = 'https://www.data.gouv.fr/fr/organizations/cnil/datasets.csv'
instance1 = GetCnilCatalog(url, headers, url_add)
data = instance1.fetch_data_from_api()
data = data['@graph']
table_name = 'title'
download_url = 'downloadURL'
table_id = 'identifier' 
file_format= 'format'
last_update= 'modified' 
accessURL = '@id'
df_catalog = instance1.response_to_dataframe(data=data, table_name=table_name, download_url=download_url, table_id=table_id, file_format=file_format, last_update=last_update, accessURL=accessURL)
df_dataset = instance1.load_additional_info()
df_catalog = instance1.identify_datasets_info()
df_catalog = instance1.merge_additional_info()
instance1.save_to_csv('source_cnil_catalog')

# Uploading Files to GCS

In [None]:
from classes.file_to_gcs import FromFileToGCS
import os

bucket_name = 'cnil_csv'
cred_path = 'cred/service_account_local_py.json'
init2 = FromFileToGCS(bucket_name, cred_path)
init2.create_bucket()
file_paths = ['data/catalog/source_cnil_catalog_2024-02-10.csv','data/catalog/opencnil-violationsdcpnotifiees-20230930.csv']
dest_folder = 'raw'
dest_blob = ['source_cnil_catalog_2024-02-10.csv', 'opencnil-violationsdcpnotifiees-20230930.csv']
init2.local_to_gcs(file_paths, dest_folder, dest_blob)

In [None]:
bucket_name = 'cnil_csv'
cred_path = 'cred/service_account_local_py.json'
init2 = FromFileToGCS(bucket_name, cred_path)
init2.create_bucket()
url = ['https://www.data.gouv.fr/fr/organizations/cnil/datasets.csv', 'https://www.data.gouv.fr/fr/datasets/r/0f678674-4327-4c4d-8819-b6f508b41d0e']
dest_folder = 'raw'
dest_blob = ['datasets.csv', 'plaintes.csv']
init2.download_and_upload_from_URLs(url, dest_folder, dest_blob)

# Downloading from Catalog

In [None]:
from classes.download_catalog_content import DlCatalogContent

instance3 = DlCatalogContent('data/catalog/source_cnil_catalog_2024-02-10.csv')
instance3.get_tables()
instance3.zip_files()

# Prep data to upload to BQ

In [None]:
from classes.file_to_gcs import FromFileToGCS
import os

bucket_name = 'cnil_csv'
cred_path = 'cred/service_account_local_py.json'
init2 = FromFileToGCS(bucket_name, cred_path)
init2.create_bucket()
file_paths = ['data/catalog/source_cnil_catalog_2024-02-10.csv','data/raw_datasets.zip']
dest_folder = 'raw'
init2.local_to_gcs(file_paths, dest_folder)

In [None]:
from classes.prep_data import ZipFileProcessor

gcs_bucket_name = 'cnil_csv'
credential_path = 'cred/service_account_local_py.json'
zip_blob_name = '2024-02-12/raw/raw_datasets.zip'
output_folder_name = '2024-02-12/'+ 'prep'
instance4 = ZipFileProcessor(gcs_bucket_name, credential_path, zip_blob_name, output_folder_name)
instance4.process_zip_file()

In [None]:
import pandas as pd

df = pd.read_excel('data/raw_datasets/controles-realises-par-la-cnil/Liste des contrôles réalisés par la CNIL en 2014_v2015-06-15')
df

In [None]:
from classes.list_files import FolderLister

instance6 = FolderLister('data/raw_datasets')
# instance6.list_folders()
paths = instance6.list_rel_paths()
print(paths)

In [6]:
import pandas as pd
from colorama import Fore, Style
from classes.list_files import FolderLister
from pandas.errors import *

instance6 = FolderLister('data/raw_datasets')
paths = instance6.list_rel_paths()
print(len(paths))
print(paths)


def verify_error_onbadlines(path):
    with open(path) as f:
        len_csv = sum(1 for line in f)

    print(len_csv)
    print(df.shape)
    number_of_skipped_rows = len_csv - df.shape[0]
    print('number_of_skipped_rows:' , number_of_skipped_rows)
    errors_imp = number_of_skipped_rows/len_csv * 100
    if errors_imp > 1:
        print('More than 1 percent of rows skipped, file is not good')
        return None
    else:
        print('More less 1 percent of rows skipped, file is okay')
        return df

def correct_shape(df):
    try:
        if df.shape[1] == 1:
            print('columns shape is 1, csv read with ;')
            df = pd.read_csv(path, sep=';')
            if df.shape[1] == 1:
                print('try to find headers in 2nd row')
                df = pd.read_csv(path, sep=';', skiprows=1)
                return df
        elif 'unnamed' in str(df.columns[1]).lower():
            print('try to find headers in 2nd row')
            df.columns = df.iloc[0]
            df = df[1:]
            return df
        else:
            return df
        
    except ParserError as e:
        print(f"{Fore.RED}Exception type: {type(e).__name__}{Style.RESET_ALL}")
        print(f"{Fore.RED}Exception: {e}{Style.RESET_ALL}, return None")
        print('cant correct shape')
        df = None
        return df
    except Exception as e:
        print(f"{Fore.RED}Exception type: {type(e).__name__}{Style.RESET_ALL}")
        print(f"{Fore.RED}Exception: {e}{Style.RESET_ALL}, return None")

def transpose(df):
    if df.shape[1] > df.shape[0]:
        df = df.transpose()
        df.columns = df.iloc[0]
        df = df[1:]  
        return df
    else:
        print('More rows than columns, no need to transpose')
        return df


path = paths[0]
print(path)
if path.endswith('.csv'):
    print('.csv found')
    try:
        df = pd.read_csv(path)
    except ParserError as e:
        print(f"{Fore.RED}Exception type: {type(e).__name__}{Style.RESET_ALL}")
        print(e)
        print('trying to open csv with sep = ";"')
        df = pd.read_csv(path, sep=';')
    print(df.shape)
    df = correct_shape(df)
elif path.endswith('.xlsx'):
    print('.xsxl found')
    df = pd.read_excel(path)
    print(df.shape)
    df = correct_shape(df)
elif "." not in path:
    try:
        try: 
            print('try to read as csv')
            df = pd.read_csv(path)
            print(df.shape)
            df = correct_shape(df)
        except ParserError as e:
            print(f"{Fore.RED}Exception type: {type(e).__name__}{Style.RESET_ALL}")
            print(f"{Fore.RED}Exception: {e}{Style.RESET_ALL}")
            print('try to read as excel')
            df = pd.read_excel(path)
            print(df.shape)
            df = correct_shape(df)
        except UnicodeError as e:
            print(f"{Fore.RED}Exception type: {type(e).__name__}{Style.RESET_ALL}")
            print(f"{Fore.RED}Exception: {e}{Style.RESET_ALL}")
            print('try to read as excel')
            df = pd.read_excel(path)
            print(df.shape)
            df = correct_shape(df)
        except Exception as e:
            print(f"{Fore.RED}Exception type: {type(e).__name__}{Style.RESET_ALL}")
            print(f"{Fore.RED}Exception: {e}{Style.RESET_ALL}")
            print('try to read as excel')
            df = pd.read_csv(path, sep=';')
            print(df.shape)
            df = correct_shape(df)
    except Exception as e:
        print(f"{Fore.RED}Exception type: {type(e).__name__}{Style.RESET_ALL}")
        print(f"{Fore.RED}Exception: {e}{Style.RESET_ALL}")
        print('cant read as df')
        df = None

if df is not None:
    print(df.shape)
    if df.shape[1] > df.shape[0]:
        print(' More columns than rows, need to transpose')
        df = df.transpose()
        df.columns = df.iloc[0]
        df = df[1:] 
        print(df.shape)
        print('Transposed')
    elif df.shape[1] <= df.shape[0] and df.columns[0].lower() == 'année' and str(df.columns[1]).isdigit() and len(str(df.columns[1])) == 4:
        print('Values for years in first column, need to transpose')
        df = df.transpose()
        df.columns = df.iloc[0]
        df = df[1:] 
        print(df.shape)
        print('Transposed')

    


file_name = path.split('/')[-1]
if df is not None:
    df= df.dropna(axis=1, how='all')
    df.head()

df
# df.to_csv(f'data/test/{file_name}.csv', index=False, sep=';')
# pd.read_csv(f'data/test/{file_name}.csv', sep=';')
# df.head()

# df.head()

45
['data/raw_datasets/budget-de-la-cnil-1/opencnil-budget-depuis-2000-maj-juin-2023_v2023-06-28.csv', 'data/raw_datasets/budget-de-la-cnil-1/opencnil-budget-depuis-2000-maj-juin-2023_v2023-06-28.xlsx', 'data/raw_datasets/traitements-de-donnees-personnelles-declares-a-la-cnil-depuis-le-25-mai-2018/Formalités préalables reçues par la CNIL depuis le 25 mai 2018_v2024-02-05', 'data/raw_datasets/marches-publics-de-la-cnil/opencnil-marches-publics-2014-2020_v2021-06-02.xlsx', 'data/raw_datasets/mises-en-demeure-prononcees-par-la-cnil/open-cnil-volumes-med-depuis-2014-maj-aout-2023_v2023-08-25.csv', 'data/raw_datasets/mises-en-demeure-prononcees-par-la-cnil/open-cnil-volumes-med-depuis-2014-maj-aout-2023_v2023-08-25.xlsx', 'data/raw_datasets/effectifs-de-la-cnil/opencnil-effectifs-depuis-1980-maj-juin-2023_v2023-06-28.csv', 'data/raw_datasets/effectifs-de-la-cnil/opencnil-effectifs-depuis-1980-maj-juin-2023_v2023-06-28.xlsx', 'data/raw_datasets/les-deliberations-de-la-cnil/DTD CNIL _v2023-11

In [1]:
path = 'data/raw_datasets/sanctions-prononcees-par-la-cnil/open-cnil-ventilation-sanctions-depuis-2014-vd_v2019-05-14.xlsx'
df = pd.read_excel(path)
print(df.shape)
df

NameError: name 'pd' is not defined

In [3]:
from classes.list_files import FolderLister
from pandas.errors import *
instance6 = FolderLister('data/raw_datasets')
paths = instance6.list_rel_paths()
print(len(paths))
print(paths)

45
['data/raw_datasets/budget-de-la-cnil-1/opencnil-budget-depuis-2000-maj-juin-2023_v2023-06-28.csv', 'data/raw_datasets/budget-de-la-cnil-1/opencnil-budget-depuis-2000-maj-juin-2023_v2023-06-28.xlsx', 'data/raw_datasets/traitements-de-donnees-personnelles-declares-a-la-cnil-depuis-le-25-mai-2018/Formalités préalables reçues par la CNIL depuis le 25 mai 2018_v2024-02-05', 'data/raw_datasets/marches-publics-de-la-cnil/opencnil-marches-publics-2014-2020_v2021-06-02.xlsx', 'data/raw_datasets/mises-en-demeure-prononcees-par-la-cnil/open-cnil-volumes-med-depuis-2014-maj-aout-2023_v2023-08-25.csv', 'data/raw_datasets/mises-en-demeure-prononcees-par-la-cnil/open-cnil-volumes-med-depuis-2014-maj-aout-2023_v2023-08-25.xlsx', 'data/raw_datasets/effectifs-de-la-cnil/opencnil-effectifs-depuis-1980-maj-juin-2023_v2023-06-28.csv', 'data/raw_datasets/effectifs-de-la-cnil/opencnil-effectifs-depuis-1980-maj-juin-2023_v2023-06-28.xlsx', 'data/raw_datasets/les-deliberations-de-la-cnil/DTD CNIL _v2023-11

In [4]:
from classes.prep_data import PrepFilesBQ

paths2 = paths

print(paths2)
instance5 = PrepFilesBQ(paths2)
instance5.process_all_files()

['data/raw_datasets/budget-de-la-cnil-1/opencnil-budget-depuis-2000-maj-juin-2023_v2023-06-28.csv', 'data/raw_datasets/budget-de-la-cnil-1/opencnil-budget-depuis-2000-maj-juin-2023_v2023-06-28.xlsx', 'data/raw_datasets/traitements-de-donnees-personnelles-declares-a-la-cnil-depuis-le-25-mai-2018/Formalités préalables reçues par la CNIL depuis le 25 mai 2018_v2024-02-05', 'data/raw_datasets/marches-publics-de-la-cnil/opencnil-marches-publics-2014-2020_v2021-06-02.xlsx', 'data/raw_datasets/mises-en-demeure-prononcees-par-la-cnil/open-cnil-volumes-med-depuis-2014-maj-aout-2023_v2023-08-25.csv', 'data/raw_datasets/mises-en-demeure-prononcees-par-la-cnil/open-cnil-volumes-med-depuis-2014-maj-aout-2023_v2023-08-25.xlsx', 'data/raw_datasets/effectifs-de-la-cnil/opencnil-effectifs-depuis-1980-maj-juin-2023_v2023-06-28.csv', 'data/raw_datasets/effectifs-de-la-cnil/opencnil-effectifs-depuis-1980-maj-juin-2023_v2023-06-28.xlsx', 'data/raw_datasets/les-deliberations-de-la-cnil/DTD CNIL _v2023-11-17

  warn("""Cannot parse header or footer so it will be ignored""")


(496, 6)
(496, 6)
opened df, return from open_df
this is df
More rows than columns, no need to transpose
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
The column_formatter method worked perfectly.
The column_formatter method worked perfectly.
The column_formatter method worked perfectly.
The column_formatter method worked perfectly.
The column_formatter method worked perfectly.
The column_formatter method worked perfectly.
Re-exécution terminée.
[32mdata/raw_datasets/controles-realises-par-la-cnil/Liste des contrôles réalisés par la CNIL en 2015_v2016-05-03 processed successfully![0m
---------------------------------------------------
---------------------------------------------------
[32mdata/raw_datasets/controles-realises-par-la-cnil/opencnil-liste-controles-2019_v2020-11-13.xlsx[0m
.xsxl found
(303, 6)
opened df, return from open_df
this is df
More rows than columns, no need to transpose
<class 'str'>
<class 'str'>
<class 'str'>
<class 'st

  warn("""Cannot parse header or footer so it will be ignored""")


[32mdata/raw_datasets/controles-realises-par-la-cnil/Liste des contrôles réalisés par la CNIL en 2016_v2017-03-30 processed successfully![0m
---------------------------------------------------
---------------------------------------------------
[32mdata/raw_datasets/controles-realises-par-la-cnil/open-data-controles-2020-vd-20210603_v2021-06-03.xlsx[0m
.xsxl found
(247, 6)
opened df, return from open_df
this is df
More rows than columns, no need to transpose
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
The column_formatter method worked perfectly.
The column_formatter method worked perfectly.
The column_formatter method worked perfectly.
The column_formatter method worked perfectly.
The column_formatter method worked perfectly.
The column_formatter method worked perfectly.
Re-exécution terminée.
[32mdata/raw_datasets/controles-realises-par-la-cnil/open-data-controles-2020-vd-20210603_v2021-06-03.xlsx processed successfully![0m
--------------

  warn("Workbook contains no default style, apply openpyxl's default")


(96524, 26)
try to find headers in 2nd row
opened df, return from open_df
this is df
More rows than columns, no need to transpose
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
The column_formatter method worked perfectly.
The column_formatter method worked perfectly.
The column_formatter method worked perfectly.
The column_formatter method worked perfectly.
The column_formatter method worked perfectly.
The column_formatter method worked perfectly.
The column_formatter method worked perfectly.
The column_formatter method worked perfectly.
The column_formatter method worked perfectly.
The column_formatter method worked perfectly.
The column_formatter method worked perfectly.


In [1]:
class ClasseDeBase:
    def ma_fonction(self):
        print("Fonction de la classe de base")

class ClasseDerivee(ClasseDeBase):
    def ma_fonction(self):
        super().ma_fonction()
        print("Nouvelle implémentation dans la classe dérivée")

# Utilisation de la classe dérivée
objet_derive = ClasseDerivee()
objet_derive.ma_fonction()

Nouvelle implémentation dans la classe dérivée
