# In [1]:
import pandas as pd
import numpy as np
import datetime

# Inclusive upper bound (in whole years) of each adult age band, ascending.
_AGE_BANDS = (
  (25, '18-25'),
  (35, '26-35'),
  (45, '36-45'),
  (55, '46-55'),
  (65, '56-65'),
)


def _load_frame(source):
  """Return a DataFrame that is safe to modify for *source*.

  A DataFrame argument is copied so the caller's object is never renamed or
  otherwise mutated; anything else (path, URL, open file or buffer) is passed
  straight to ``pd.read_csv``.
  """
  if isinstance(source, pd.DataFrame):
    return source.copy()
  return pd.read_csv(source)


def _clean_frame(source):
  """Load *source*, snake-case its column names and drop duplicate rows."""
  frame = _load_frame(source)
  # Spaces become underscores first, then everything is lower-cased.
  frame.columns = frame.columns.str.replace(' ', '_').str.lower()
  return frame.drop_duplicates()


def _age_group(age):
  """Map a numeric age to its age-band label ('Unknown' for missing values)."""
  if pd.isna(age):
    return 'Unknown'
  years = int(age)  # fractional ages are truncated, e.g. 25.9 -> 25
  if years < 18:
    return 'Under 18'
  for upper_bound, label in _AGE_BANDS:
    if years <= upper_bound:
      return label
  return 'Over 65'


def _with_age_group(profiles):
  """Return *profiles* with a ``user_age_group`` column appended.

  The age is taken from ``age`` when that column exists, otherwise derived
  from ``birth_year`` and the current calendar year; with neither column every
  row is labelled 'Unknown'. Non-numeric values are coerced to missing.
  """
  if 'age' in profiles.columns:
    ages = pd.to_numeric(profiles['age'], errors='coerce')
  elif 'birth_year' in profiles.columns:
    current_year = datetime.date.today().year
    ages = current_year - pd.to_numeric(profiles['birth_year'], errors='coerce')
  else:
    ages = pd.Series(np.nan, index=profiles.index, dtype='float64')
  return profiles.assign(user_age_group=ages.apply(_age_group))


def merge_all_data(user_health_data, supplement_usage, experiments, user_profiles):
  """Clean the four input tables and left-join them into one DataFrame.

  Each argument may be anything ``pd.read_csv`` accepts (file path, URL,
  file-like object) or an already loaded DataFrame, which is copied rather
  than modified.

  Processing steps:
    * column names are lower-cased with spaces replaced by underscores;
    * fully duplicated rows are dropped from every table;
    * ``user_age_group`` is added to the profiles;
    * ``date`` columns of the health and supplement tables are parsed to
      datetimes (unparseable values become NaT);
    * tables are joined with left merges in the order
      profiles -> health -> supplement usage -> experiments;
    * missing ``supplement_name`` values become 'No intake';
    * ``dosage_grams`` is derived from ``dosage_mg`` when only the latter
      exists.

  Returns:
    The merged DataFrame, one row per profile/health/supplement/experiment
    match.

  Raises:
    KeyError: if a table lacks the column needed for its join (pandas raises
      it from ``merge``).
  """
  user_health_data = _clean_frame(user_health_data)
  supplement_usage = _clean_frame(supplement_usage)
  experiments = _clean_frame(experiments)
  user_profiles = _with_age_group(_clean_frame(user_profiles))

  # Parse dates on both sides so the join below compares datetimes, not text.
  for frame in (user_health_data, supplement_usage):
    if 'date' in frame.columns:
      frame['date'] = pd.to_datetime(frame['date'], errors='coerce')

  merged = pd.merge(user_profiles, user_health_data, on='user_id', how='left')

  # 'date' is optional above, so only join on it when both sides carry it.
  usage_keys = ['user_id']
  if 'date' in merged.columns and 'date' in supplement_usage.columns:
    usage_keys.append('date')
  merged = pd.merge(merged, supplement_usage, on=usage_keys, how='left')

  # Prefer the user_id join; fall back to experiment_id when that is the only
  # shared key.
  # NOTE(review): the fallback assumes experiments may be keyed by
  # experiment_id -- confirm against the real experiments file.
  experiment_key = 'user_id'
  if ('user_id' not in experiments.columns
      and 'experiment_id' in experiments.columns
      and 'experiment_id' in merged.columns):
    experiment_key = 'experiment_id'
  merged = pd.merge(merged, experiments, on=experiment_key, how='left')

  # Rows without a matching supplement record are encoded as 'No intake'.
  if 'supplement_name' in merged.columns:
    merged['supplement_name'] = merged['supplement_name'].fillna('No intake')

  # Derive grams from milligrams unless a grams column is already present.
  if 'dosage_mg' in merged.columns and 'dosage_grams' not in merged.columns:
    merged['dosage_grams'] = merged['dosage_mg'] / 1000

  return merged


