In [None]:

import pandas as pd
from sqlalchemy import create_engine, text

In [None]:

def get_dataCR():
    """
    Load compte-rendu rows joined with their date, dossier and enseigne data.

    Runs a fixed query against PostgreSQL: ``t_compte_rendu`` left-joined with
    ``t_dates``, ``t_dossier`` and ``t_enseigne``, excluding clients
    15, 31, 100 and 101, limited to 500 rows.

    Connection settings are read from environment variables so that no
    password is stored in the notebook:

    - ``DB_PASS`` (required): database password.
    - ``DB_HOST``, ``DB_PORT``, ``DB_NAME``, ``DB_USER`` (optional): override
      the defaults defined below.

    Returns:
        pandas.DataFrame: one row per query result row. Because the query is
        ``select *`` over several joined tables, column names may repeat.

    Raises:
        RuntimeError: if ``DB_PASS`` is not set or is empty.
    """
    import os
    from urllib.parse import quote_plus

    # Fail early, before any connection attempt, when the secret is missing.
    db_pass = os.environ.get("DB_PASS")
    if not db_pass:
        raise RuntimeError(
            "DB_PASS environment variable must be set to the database password"
        )

    db_host = os.environ.get("DB_HOST", "37.61.241.45")
    db_port = os.environ.get("DB_PORT", "5432")
    db_name = os.environ.get("DB_NAME", "Khubeo_IA")
    db_user = os.environ.get("DB_USER", "postgres")

    # User and password are URL-encoded so special characters (e.g. '@', '/',
    # ':') cannot corrupt the connection URL.
    engine = create_engine(
        f"postgresql://{quote_plus(db_user)}:{quote_plus(db_pass)}"
        f"@{db_host}:{db_port}/{db_name}"
    )

    try:
        with engine.connect() as conn:
            result = conn.execute(text("""select * from t_compte_rendu cr
        left join t_dates da
            on da.fulldatealternatekey = cr.cr_date
        left join t_dossier dos
            on dos.id_dossier = cr.id_dossier
        left join t_enseigne en
            on en.id_enseigne = dos.do_id_enseigne
    where cr.id_client not in  (15,31,100, 101) limit 500;"""))
            # Convert the result set to a DataFrame, keeping the query's column names
            return pd.DataFrame(result.fetchall(), columns=list(result.keys()))
    finally:
        # Release pooled connections; the engine is not reused after this call.
        engine.dispose()
# Load the raw compte-rendu data once for the rest of the notebook
df = get_dataCR()

# f-string so the row/column counts are actually interpolated
print(f"Loaded dataframe with {df.shape[0]} rows and {df.shape[1]} columns")

# Print column names as a list
print(df.columns.tolist())

# Preview the first rows; kept as the cell's last expression so the notebook renders it
df.head()


In [None]:
def compute_hebergement_service(row):
    """
    Process financial data for accommodation and services, calculating TTC, HT, and TVA values.
    Also calculates total values across accommodation and services.
    Returns a Series with all original columns and updated calculated values.
    """
    # Start with the original row (this avoids using copy)
    result = pd.Series(index=row.index, data=row.values)
    
    # Helper function to safely convert to float (returns original value if conversion fails)
    def safe_float(value):
        try:
            return float(value)
        except (TypeError, ValueError):
            return value
    
    # Helper function to calculate the three financial values (TTC, HT, TVA)
    def calculate_financial_values(ttc, ht, tva):
        if pd.isnull(ttc) or ttc == 0:
            ttc = ht * 1.10 if pd.notnull(ht) and ht != 0 else ttc
        
        if pd.isnull(ht) or ht == 0:
            ht = ttc / 1.10 if pd.notnull(ttc) and ttc != 0 else ht
        
        if pd.isnull(tva) or tva == 0:
            if pd.notnull(ht) and ht > 0:
                tva = ht * 0.10
            elif pd.notnull(ttc) and ttc != 0:
                tva = ttc - (ttc / 1.10)
        
        return ttc, ht, tva
    
    # Extract and convert accommodation values from the input row
    hebergement_ttc = safe_float(row['cr_hebergement_ttc'])
    hebergement_ht = safe_float(row['cr_hebergement_ht'])
    hebergement_tva = safe_float(row['cr_hebergement_tva'])
    
    # Calculate accommodation financial values
    ttc_hebergement, ht_hebergement, tva_hebergement = calculate_financial_values(
        hebergement_ttc, hebergement_ht, hebergement_tva
    )
    
    # Extract and convert service values
    service_ttc = safe_float(row['cr_service_ttc'])
    service_ht = safe_float(row['cr_service_ht'])
    service_tva = safe_float(row['cr_service_tva'])
    
    # Calculate service financial values
    ttc_service, ht_service, tva_service = calculate_financial_values(
        service_ttc, service_ht, service_tva
    )
    
    # Calculate totals
    total_ht = None
    if pd.notnull(ht_hebergement) or pd.notnull(ht_service):
        total_ht = (ht_hebergement if pd.notnull(ht_hebergement) else 0) + (ht_service if pd.notnull(ht_service) else 0)
    
    total_ttc = None
    if pd.notnull(ttc_hebergement) or pd.notnull(ttc_service):
        total_ttc = (ttc_hebergement if pd.notnull(ttc_hebergement) else 0) + (ttc_service if pd.notnull(ttc_service) else 0)
    
    total_tva = None
    if pd.notnull(tva_hebergement) or pd.notnull(tva_service):
        total_tva = (tva_hebergement if pd.notnull(tva_hebergement) else 0) + (tva_service if pd.notnull(tva_service) else 0)
    
    # Update the calculated columns in the result Series
    result['cr_hebergement_ttc'] = ttc_hebergement
    result['cr_hebergement_ht'] = ht_hebergement
    result['cr_hebergement_tva'] = tva_hebergement
    result['cr_service_ttc'] = ttc_service
    result['cr_service_ht'] = ht_service
    result['cr_service_tva'] = tva_service
    result['cr_total_ttc'] = total_ttc
    result['cr_total_ht'] = total_ht
    result['cr_total_tva'] = total_tva
    
    return result



In [None]:

# Reuse the dataframe loaded by an earlier cell when it exists; otherwise load it now
if not ('df' in locals()):
    df = get_dataCR()

# Run compute_hebergement_service once per row. axis="columns" is the named
# form of axis=1: the function receives each row as a Series, and the Series
# it returns are assembled into a new DataFrame (df itself is not modified).
df_cr_verified = df.apply(compute_hebergement_service, axis="columns")

# Preview the processed rows
df_cr_verified.head()

In [None]:

# Check if the column names are identical in the original and processed dataframes.
# Sets compare the distinct labels only, so repeated column names are not detected here.
original_columns = set(df.columns)
# df_cr_verified is the processed frame produced by df.apply(compute_hebergement_service, axis=1)
processed_columns = set(df_cr_verified.columns)

# Check if the sets of columns are identical
columns_are_same = original_columns == processed_columns

# Display the result
print(f"Column names are identical: {columns_are_same}")

# If they're not the same, show the differences
if not columns_are_same:
    print("Columns only in original df:", original_columns - processed_columns)
    print("Columns only in processed df:", processed_columns - original_columns)
else:
    print("Both dataframes have exactly the same columns.")




In [None]:
# Report every column label that repeats an earlier label
duplicate_cols = df_cr_verified.columns[df_cr_verified.columns.duplicated()]
print(f"Duplicate columns: {duplicate_cols.tolist()}")

# Boolean mask that is True only for the first occurrence of each label;
# selecting with it leaves one column per name
unique_columns = ~df_cr_verified.columns.duplicated(keep='first')
df_unique_cols = df_cr_verified.loc[:, unique_columns]

# Keep the rows belonging to client 5 and dossier 125
filtered_df = df_unique_cols.loc[
    df_unique_cols['id_client'].eq(5) & df_unique_cols['id_dossier'].eq(125)
]

# Report how many rows matched, then preview them
print(f"Found {len(filtered_df)} rows with id_client=5 and id_dossier=125")
filtered_df.head()