In [None]:
import pandas as pd
import numpy as np

def _prepare_health(df_health):
    """Return a copy of the health data with numeric sleep hours and datetime dates."""
    df_health = df_health.copy()

    if 'sleep_hours' in df_health.columns:
        # Values may carry a unit suffix such as '8.8h' or '8.0H'; drop the
        # letter, then coerce anything still unparseable to NaN.
        stripped = df_health['sleep_hours'].astype(str).str.replace(r'[hH]', '', regex=True)
        df_health['sleep_hours'] = pd.to_numeric(stripped, errors='coerce')
    else:
        # Keep the output schema stable when the optional column is absent.
        df_health['sleep_hours'] = np.nan

    # Must run regardless of whether 'sleep_hours' exists: the merge key has
    # to be datetime on both sides of the (user_id, date) join.
    df_health['date'] = pd.to_datetime(df_health['date'], errors='coerce')
    return df_health


def _prepare_profiles(df_profiles):
    """Return user_id, email and a categorical 'user_age_group' derived from 'age'."""
    # Right-closed bins: (0, 17], (17, 25], ..., (65, inf). Ages that are
    # missing or fall outside every bin (e.g. 0 or negative) come back as NaN
    # from pd.cut and are labelled 'Unknown' below.
    bins = [0, 17, 25, 35, 45, 55, 65, np.inf]
    labels = ['Under 18', '18-25', '26-35', '36-45', '46-55', '56-65', 'Over 65']

    age_group = pd.cut(df_profiles['age'], bins=bins, labels=labels, right=True)
    # 'Unknown' has to be registered as a category before it can be used as a fill value.
    age_group = age_group.cat.add_categories('Unknown').fillna('Unknown')

    prepared = df_profiles[['user_id', 'email']].copy()
    prepared['user_age_group'] = age_group
    return prepared


def _prepare_supplements(df_supp, df_exp):
    """Return supplement intake rows with dosage in grams and the experiment name attached."""
    df_supp = df_supp.copy()

    # Dosages flagged as 'mg' are converted to grams; all others pass through unchanged.
    df_supp['dosage_grams'] = np.where(df_supp['dosage_unit'] == 'mg',
                                       df_supp['dosage'] / 1000,
                                       df_supp['dosage'])

    # Only the experiment *name* becomes categorical. The join key keeps its
    # original dtype so the merge compares like with like.
    experiments = df_exp.rename(columns={'name': 'experiment_name'})
    experiments = experiments[['experiment_id', 'experiment_name']].copy()
    experiments['experiment_name'] = experiments['experiment_name'].astype('category')

    prepared = pd.merge(df_supp, experiments, on='experiment_id', how='left')
    prepared['date'] = pd.to_datetime(prepared['date'], errors='coerce')

    return prepared[['user_id', 'date', 'supplement_name',
                     'dosage_grams', 'is_placebo', 'experiment_name']]


def merge_all_data(user_health_data_path: str,
                   supplement_usage_path: str,
                   experiments_path: str,
                   user_profiles_path: str) -> pd.DataFrame:
    """
    Cleans and merges user health, supplement, experiment, and profile data
    into a single comprehensive DataFrame.

    Health and supplement records are combined with a full outer join on
    (user_id, date), so a row exists whenever either source has an entry;
    profile information is then attached per user with a left join.

    Args:
        user_health_data_path (str): File path to user_health_data.csv.
        supplement_usage_path (str): File path to supplement_usage.csv.
        experiments_path (str): File path to experiments.csv.
        user_profiles_path (str): File path to user_profiles.csv.

    Returns:
        pandas.DataFrame: The merged data with columns user_id, date, email,
            user_age_group, experiment_name, supplement_name, dosage_grams,
            is_placebo, average_heart_rate, average_glucose, sleep_hours and
            activity_level, in that order. 'sleep_hours' is all-NaN when the
            health file has no such column.

    Side effects:
        Prints the ``DataFrame.info()`` summary of the result to stdout.
    """
    # 1. --- Load all datasets ---
    df_health = pd.read_csv(user_health_data_path)
    df_supp = pd.read_csv(supplement_usage_path)
    df_exp = pd.read_csv(experiments_path)
    df_profiles = pd.read_csv(user_profiles_path)

    # 2. --- Clean each source independently ---
    df_health = _prepare_health(df_health)
    df_profiles_prepared = _prepare_profiles(df_profiles)
    df_supp_prepared = _prepare_supplements(df_supp, df_exp)

    # 3. --- Merge health and supplement data ---
    # Outer join keeps days with health readings but no intake, and vice versa.
    df_merged = pd.merge(df_health,
                         df_supp_prepared,
                         on=['user_id', 'date'],
                         how='outer')

    # 4. --- Attach the static per-user profile info ---
    df_final = pd.merge(df_merged,
                        df_profiles_prepared,
                        on='user_id',
                        how='left')

    # 5. --- Final cleaning & formatting ---
    # Rows that came only from the health data have no supplement entry.
    df_final['supplement_name'] = df_final['supplement_name'].fillna('No intake')

    # Nullable boolean keeps "no intake" rows as <NA> instead of forcing object dtype.
    df_final['is_placebo'] = df_final['is_placebo'].astype('boolean')

    # NOTE ON user_id TYPE:
    # 'user_id' is left as 'object' (string) type because the source
    # data contains non-integer values, which contradicts the schema.

    final_column_order = [
        'user_id', 'date', 'email', 'user_age_group', 'experiment_name',
        'supplement_name', 'dosage_grams', 'is_placebo', 'average_heart_rate',
        'average_glucose', 'sleep_hours', 'activity_level'
    ]
    df_final = df_final[final_column_order]
    df_final.info()

    return df_final

# Run the full pipeline on the four CSV files expected in the working
# directory. The call is left as a bare expression so its return value (the
# merged DataFrame) remains the result of this cell.
merge_all_data(
    user_health_data_path='user_health_data.csv',
    supplement_usage_path='supplement_usage.csv',
    experiments_path='experiments.csv',
    user_profiles_path='user_profiles.csv',
)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2721 entries, 0 to 2720
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   user_id             2721 non-null   object        
 1   date                2721 non-null   datetime64[ns]
 2   email               2721 non-null   object        
 3   user_age_group      2721 non-null   category      
 4   experiment_name     2000 non-null   category      
 5   supplement_name     2721 non-null   object        
 6   dosage_grams        2000 non-null   float64       
 7   is_placebo          2000 non-null   boolean       
 8   average_heart_rate  2721 non-null   float64       
 9   average_glucose     2721 non-null   float64       
 10  sleep_hours         2721 non-null   float64       
 11  activity_level      2721 non-null   int64         
dtypes: boolean(1), category(2), datetime64[ns](1), float64(4), int64(1), object(3)
memory usage: 202.6+ KB


Unnamed: 0,user_id,date,email,user_age_group,experiment_name,supplement_name,dosage_grams,is_placebo,average_heart_rate,average_glucose,sleep_hours,activity_level
0,005dd64e-3863-49f3-93b0-1aea9a84d526,2018-01-31,user_65@myemail.com,26-35,Memory,Placebo,0.170989,False,84.172177,96.814856,11.4,2
1,005dd64e-3863-49f3-93b0-1aea9a84d526,2018-02-28,user_65@myemail.com,26-35,Sleep Quality,Magnesium,0.494938,False,85.200747,130.836935,5.8,2
2,005dd64e-3863-49f3-93b0-1aea9a84d526,2018-03-31,user_65@myemail.com,26-35,Sleep Quality,Placebo,0.184596,False,76.695318,109.782341,7.0,1
3,005dd64e-3863-49f3-93b0-1aea9a84d526,2018-04-30,user_65@myemail.com,26-35,Endurance,Omega-3,0.313256,True,65.772482,100.765545,7.5,1
4,016e4095-8b5d-43d9-83fd-e28b38be2f7d,2018-01-31,contact_390@email.com,36-45,,No intake,,,86.762141,103.689177,8.2,2
...,...,...,...,...,...,...,...,...,...,...,...,...
2716,ff0e1eb5-1deb-4ddc-9ce9-548706be6bef,2018-04-30,contact_299@myemail.com,26-35,Endurance,Magnesium,0.163610,False,78.136649,86.144705,5.3,2
2717,ffaccfef-220a-4976-96f7-22e4f2fc3ecc,2018-01-31,user323@email.com,46-55,Strength,Vitamin C,0.407786,False,80.122904,134.196895,11.2,1
2718,ffaccfef-220a-4976-96f7-22e4f2fc3ecc,2018-02-28,user323@email.com,46-55,Recovery,Omega-3,0.350319,False,63.316583,79.378241,11.6,1
2719,ffaccfef-220a-4976-96f7-22e4f2fc3ecc,2018-03-31,user323@email.com,46-55,Recovery,Zinc,0.111641,False,84.800053,117.005827,11.1,3
