In [1]:
import psycopg2
import os 
import pandas as pd

In [2]:
# Directory that the CSV-loading cells join file names onto. It is a relative
# path, so it is resolved against the kernel's current working directory.
data_path = "../../data"

In [3]:
from itertools import combinations


def find_primary_key_columns(df, max_size=None):
    """
    Find the smallest sets of columns that can serve as a primary key.

    Column subsets are tried in order of increasing size. As soon as one size
    yields at least one subset whose combined values are unique across all
    rows, every unique subset of that size is returned and larger
    (non-minimal) subsets are not explored.

    Parameters:
    - df: The DataFrame to process.
    - max_size: Optional upper bound on the number of columns in a candidate
      key. None (the default) allows subsets of up to all columns. The search
      visits up to 2**n subsets for n columns, so a bound keeps it tractable
      on wide frames.

    Returns:
    - A list of lists, where each inner list is a set of columns that can
      serve as a primary key. All inner lists have the same (minimal) length.
      The list is empty when no subset within the size bound is unique.
    """
    columns = df.columns.tolist()
    if not columns:
        return []

    # Rows that are duplicated across *all* columns are also duplicated for
    # every subset of columns, so no key can exist. Bail out before walking
    # the exponential number of subsets only to return an empty list.
    if df.duplicated().any():
        return []

    largest = len(columns) if max_size is None else min(max_size, len(columns))

    for size in range(1, largest + 1):
        # .any() can stop at the first duplicate instead of summing the mask.
        keys_of_size = [
            list(subset)
            for subset in combinations(columns, size)
            if not df.duplicated(list(subset)).any()
        ]
        if keys_of_size:
            return keys_of_size

    return []

In [7]:
# Load the "autres conditions de travail" CSV (semicolon-separated) and print
# each candidate primary key returned by find_primary_key_columns, one per line.
# NOTE(review): `df` is also assigned by the cell that loads the
# "salaries-en-situation-de-handicap" file, so its content depends on which
# cell ran last.
file = os.path.join(data_path, "bilan-social-d-edf-sa-autres-conditions-de-travail.csv")
df = pd.read_csv(file, sep=";")
A = find_primary_key_columns(df)
for a in A:
    print(a)

['Année', 'Indicateur', 'Type de contrat', 'Tranche horaire', 'Collège', 'Genre']
['Année', 'Indicateur', 'Type de contrat', 'Tranche horaire', 'Collège', 'Gender']
['Année', 'Indicateur', 'Type de contrat', 'Tranche horaire', 'Employee category', 'Genre']
['Année', 'Indicateur', 'Type de contrat', 'Tranche horaire', 'Employee category', 'Gender']
['Année', 'Indicateur', 'Type de contrat', 'Time range', 'Collège', 'Genre']
['Année', 'Indicateur', 'Type de contrat', 'Time range', 'Collège', 'Gender']
['Année', 'Indicateur', 'Type de contrat', 'Time range', 'Employee category', 'Genre']
['Année', 'Indicateur', 'Type de contrat', 'Time range', 'Employee category', 'Gender']
['Année', 'Indicateur', 'Type of contract', 'Tranche horaire', 'Collège', 'Genre']
['Année', 'Indicateur', 'Type of contract', 'Tranche horaire', 'Collège', 'Gender']
['Année', 'Indicateur', 'Type of contract', 'Tranche horaire', 'Employee category', 'Genre']
['Année', 'Indicateur', 'Type of contract', 'Tranche horaire

In [4]:
# Load the "salaries en situation de handicap" CSV (semicolon-separated) and
# print each candidate primary key returned by find_primary_key_columns.
# NOTE(review): `df` is also assigned by the cell that loads the
# "autres-conditions-de-travail" file, so its content depends on which cell
# ran last.
file = os.path.join(data_path, "bilan-social-d-edf-sa-salaries-en-situation-de-handicap.csv")
df = pd.read_csv(file, sep=";")
A = find_primary_key_columns(df)
for a in A:
    print(a)

['Année', 'Type de contrat', 'Collège', 'Genre']
['Année', 'Type de contrat', 'Collège', 'Gender']
['Année', 'Type de contrat', 'Employee category', 'Genre']
['Année', 'Type de contrat', 'Employee category', 'Gender']
['Année', 'Type of contract', 'Collège', 'Genre']
['Année', 'Type of contract', 'Collège', 'Gender']
['Année', 'Type of contract', 'Employee category', 'Genre']
['Année', 'Type of contract', 'Employee category', 'Gender']


In [5]:
# Load the "absenteisme" CSV (semicolon-separated) and look for candidate
# primary keys among every column except the indicator label columns.
file = os.path.join(data_path, "bilan-social-d-edf-sa-absenteisme.csv")
df2 = pd.read_csv(file, sep=";")
# errors="ignore" keeps the original semantics: a label that is absent from
# the frame is simply skipped instead of raising KeyError.
A = find_primary_key_columns(df2.drop(columns=["Indicateur", "Indicator"], errors="ignore"))
for a in A:
    print(a)

In [None]:
# Load the "effectifs et repartition par age, statut et sexe" CSV
# (semicolon-separated) and look for candidate primary keys among every column
# except the indicator label columns.
# NOTE(review): the saved run of this cell ended in KeyboardInterrupt; the
# key search tries column subsets of every size, which grows exponentially
# with the number of columns.
file = os.path.join(data_path, "bilan-social-d-edf-sa-effectifs-et-repartition-par-age-statut-et-sexe.csv")
df2 = pd.read_csv(file, sep=";")
# errors="ignore" keeps the original semantics: a label that is absent from
# the frame is simply skipped instead of raising KeyError.
A = find_primary_key_columns(df2.drop(columns=["Indicateur", "Indicator"], errors="ignore"))
for a in A:
    print(a)

KeyboardInterrupt: 

In [None]:
# Display whatever `df2` currently holds (it is assigned by two loading cells).
# NOTE(review): the saved output under this cell is a list of file paths, not
# a DataFrame, so it looks stale -- re-run the cell to confirm.
df2

['../../data/bilan-social-d-edf-sa-salaries-en-situation-de-handicap.csv',
 '../../data/bilan-social-d-edf-sa-autres-conditions-de-travail.csv',
 '../../data/bilan-social-d-edf-sa-droit-du-travail.csv',
 '../../data/bilan-social-d-edf-sa-formation.csv',
 '../../data/bilan-social-d-edf-sa-travailleurs-exterieurs.csv',
 '../../data/bilan-social-d-edf-sa-remuneration-et-promotions.csv',
 '../../data/README.md',
 '../../data/bilan-social-d-edf-sa-absenteisme.csv',
 '../../data/bilan-social-d-edf-sa-effectifs-et-repartition-par-age-statut-et-sexe.csv']

In [18]:
# Display `df`. Two different cells assign `df` from different CSV files, so
# what is shown here depends on which of them ran last.
df

Unnamed: 0,Année,Perimètre juridique,Perimètre spatial,Spatial perimeter,Indicateur,Indicator,Type de contrat,Type of contract,Collège,Employee category,Genre,Gender,Valeur,Unité,Unit,Chapitre du bilan social
0,2023,EDF SA,France,France,Salariés en situation de handicap,Employees with Disabilities,Statutaires,Statutory,Cadre,Managers,Femme,Female,343.0,nombre,number,§1.7.1
1,2023,EDF SA,France,France,Salariés en situation de handicap,Employees with Disabilities,Non Statutaires CDI,Permanent non-staturory,Maîtrise,Supervisors,Homme,Male,0.0,nombre,number,§1.7.1
2,2023,EDF SA,France,France,Salariés en situation de handicap,Employees with Disabilities,Non Statutaires CDD,Fixed-term non-statutory,Cadre,Managers,Homme,Male,0.0,nombre,number,§1.7.1
3,2023,EDF SA,France,France,Salariés reconnus travailleurs handicapés suit...,Employees Recognized as Disabled Workers Due t...,Statutaires,Statutory,Maîtrise,Supervisors,,,49.0,nombre,number,§1.7.2
4,2023,EDF SA,France,France,Salariés reconnus travailleurs handicapés suit...,Employees Recognized as Disabled Workers Due t...,Non Statutaires CDD,Fixed-term non-statutory,Cadre,Managers,,,0.0,nombre,number,§1.7.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,2017,EDF SA,France,France,Salariés en situation de handicap,Employees with Disabilities,Non Statutaires CDI,Permanent non-staturory,Exécution,Operatives,Femme,Female,1.0,nombre,number,§1.7.1
185,2017,EDF SA,France,France,Salariés en situation de handicap,Employees with Disabilities,Non Statutaires CDD,Fixed-term non-statutory,Cadre,Managers,Homme,Male,0.0,nombre,number,§1.7.1
186,2017,EDF SA,France,France,Salariés en situation de handicap,Employees with Disabilities,Non Statutaires CDD,Fixed-term non-statutory,Cadre,Managers,Femme,Female,0.0,nombre,number,§1.7.1
187,2017,EDF SA,France,France,Salariés en situation de handicap,Employees with Disabilities,Non Statutaires CDD,Fixed-term non-statutory,Maîtrise,Supervisors,Femme,Female,10.0,nombre,number,§1.7.1


In [None]:
# PDF handed to the table-extraction cell. Built from data_path, like the CSV
# cells, so every input follows the same data directory setting.
pdf_path = os.path.join(data_path, "bilan-social.pdf")

In [None]:
# Try to extract tables from the PDF at `pdf_path`, then export and inspect them.
# NOTE(review): the saved run of this cell failed with
# "ModuleNotFoundError: No module named 'cv2'" -- presumably the extraction
# library needs OpenCV installed in the kernel environment; confirm.
import pypdf_table_extraction
tables = pypdf_table_extraction.read_pdf(pdf_path)
# Bare expression in the middle of a cell: it is evaluated, but only the last
# expression of a cell is displayed.
tables

# NOTE(review): presumably writes all extracted tables as CSV, compressed into
# an archive -- confirm against the library documentation.
tables.export('foo.csv', f='csv', compress=True) # json, excel, html, markdown, sqlite

# Another mid-cell bare expression. The dict literal that follows looks like
# sample output pasted from documentation; as code it is a no-op.
# TODO confirm and remove.
tables[0].parsing_report
{
    'accuracy': 99.02,
    'whitespace': 12.24,
    'order': 1,
    'page': 1
}
# NOTE(review): uses the same 'foo.csv' name as the export call above.
tables[0].to_csv('foo.csv') # to_json, to_excel, to_html, to_markdown, to_sqlite
tables[0].df # get a pandas DataFrame!


ModuleNotFoundError: No module named 'cv2'