In [None]:
# Function to calculate completion time
def calculate_completion_times(df, max_time_hours=1):
    """Return per-visit completion times, in minutes.

    For every (client_id, visit_id) pair that has at least one 'start' and
    at least one 'confirm' event, the completion time is the gap between the
    latest 'start' timestamp and the latest 'confirm' timestamp.

    Args:
        df: DataFrame with 'client_id', 'visit_id', 'process_step' and
            'date_time' columns. It is not modified.
        max_time_hours: Visits whose completion time exceeds this many hours
            are discarded.

    Returns:
        A pandas Series named 'completion_time_seconds' holding the
        completion times converted to minutes, one entry per retained visit.
    """
    keys = ['client_id', 'visit_id']

    # Work on a private copy: assigning the parsed timestamps straight onto
    # `df` would rewrite the caller's frame, and can trigger
    # SettingWithCopyWarning when `df` is a slice (e.g. a groupby group).
    events = df[keys + ['process_step', 'date_time']].copy()

    # Ensure date_time is in datetime format; unparseable values become NaT
    events['date_time'] = pd.to_datetime(events['date_time'], errors='coerce')

    # Drop rows where date_time could not be converted
    events = events.dropna(subset=['date_time'])

    # Every event of a visit is kept. Discarding the first event of each
    # visit (as a per-visit diff + dropna does) removes the only 'start' of
    # a visit that never restarted, which would exclude that visit entirely.
    def latest_time_of(step):
        # Latest timestamp of `step` for each (client_id, visit_id)
        step_events = events[events['process_step'] == step]
        return step_events.groupby(keys)['date_time'].max().reset_index()

    # Inner merge keeps only visits that have both a 'start' and a 'confirm'
    visits = pd.merge(latest_time_of('start'), latest_time_of('confirm'),
                      on=keys, suffixes=('_start', '_confirm'))

    # Calculate the time to conversion in seconds
    elapsed = visits['date_time_confirm'] - visits['date_time_start']
    visits['completion_time_seconds'] = elapsed.dt.total_seconds()

    # Filter out times exceeding max_time_hours.
    # NOTE: a visit whose latest 'start' comes after its latest 'confirm'
    # yields a negative duration, which passes this upper-bound filter.
    within_limit = visits['completion_time_seconds'] <= max_time_hours * 3600

    # Return completion times in minutes
    return visits.loc[within_limit, 'completion_time_seconds'] / 60

In [None]:
# Function to calculate error data (clients who repeated a step, and total clients)
def calculate_error_data(df):
    """Count clients that went through the same process step more than once.

    A client counts as having an error when any 'process_step' value occurs
    on more than one of that client's rows. Rows with a missing client_id or
    process_step never count as a repeat.

    Args:
        df: DataFrame with 'client_id' and 'process_step' columns.

    Returns:
        A tuple ``(clients_with_repeats, total_clients)``: the number of
        distinct clients with at least one repeated step, and the number of
        distinct clients in ``df``. The caller divides the two to get a rate.
    """
    # A repeated (client_id, process_step) pair is exactly what a self-join
    # on client_id followed by "same step, earlier position" would find, but
    # duplicated() finds it in one linear pass instead of building a
    # quadratic number of row pairs per client.
    steps = df[['client_id', 'process_step']].dropna()
    is_repeat = steps.duplicated(keep='first')

    # Count unique clients with any repeat
    clients_with_revisits = int(steps.loc[is_repeat, 'client_id'].nunique())
    total_clients = df['client_id'].nunique()

    # Return the error count and total client count
    return clients_with_revisits, total_clients

In [None]:
# Function to calculate completion times grouped by age group
def group_completion_times_by_age(df):
    """Map each observed age group to an array of its completion times.

    Args:
        df: DataFrame with an 'age_group' column plus the columns that
            calculate_completion_times reads.

    Returns:
        A dict keyed by age-group label; each value is the array obtained
        from ``calculate_completion_times(...).values`` for that group's rows.
    """
    # observed=True skips categorical levels that have no rows
    return {
        age_group: calculate_completion_times(members).values
        for age_group, members in df.groupby('age_group', observed=True)
    }

# Completion times per age group, computed for the test frame first and the
# control frame second
test_age_groups, control_age_groups = map(
    group_completion_times_by_age, (test_df, control_df)
)

In [None]:
# Function for ANOVA testing
def perform_anova(data):
    """Run a one-way ANOVA across the samples stored in ``data``.

    Args:
        data: Mapping from group label to a sequence of observations
            (e.g. the dict built by group_completion_times_by_age).

    Returns:
        The result object of ``f_oneway`` (exposes ``statistic`` and
        ``pvalue``).
    """
    samples = list(data.values())

    # A group without observations makes f_oneway return NaN for the whole
    # test, so skip empty groups -- but only when a comparison between at
    # least two groups is still possible. Otherwise the input is passed
    # through untouched and f_oneway behaves as it would have before.
    non_empty = [sample for sample in samples if len(sample) > 0]
    if len(non_empty) >= 2:
        samples = non_empty

    return f_oneway(*samples)

# Run the ANOVA for each experiment arm
anova_test_results = perform_anova(test_age_groups)
anova_control_results = perform_anova(control_age_groups)

# Report the Test-group result
print("ANOVA Results for Test Group:")
print(f"F-statistic: {anova_test_results.statistic:.4f}, P-value: {anova_test_results.pvalue:.4f}")

# Interpret the p-value against a 0.05 threshold
print(
    "There is a significant difference in completion times across age groups in the Test group."
    if anova_test_results.pvalue < 0.05
    else "There is no significant difference in completion times across age groups in the Test group."
)