# A/B Testing

In [1]:
# Data processing
# -----------------------------------------------------------------------------
import pandas as pd
import numpy as np

# Visualization
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical tests (normality check and non-parametric group comparison)
# ------------------------------------------------------------------------------
import scipy.stats as stats
from scipy.stats import mannwhitneyu

# Configuration
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames

# Management of warnings
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the cleaned customer data and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="../data/clean_customer_data.csv")
df.head(n=5)

Unnamed: 0,loyalty_number,year,month,flights_booked,flights_with_companions,total_flights,distance,points_accumulated,points_redeemed,dollar_cost_points_redeemed,province,city,postal_code,gender,education,salary,marital_status,loyalty_card,clv,enrollment_type,enrollment_year,enrollment_month,cancellation_year,cancellation_month
0,100018,2017,1,3,0,3,1521,152.0,0,0,Alberta,Edmonton,T9G 1W3,Female,Bachelor,92552.0,Married,Aurora,7919.2,Standard,2016,8,0.0,0.0
1,100018,2017,2,2,2,4,1320,132.0,0,0,Alberta,Edmonton,T9G 1W3,Female,Bachelor,92552.0,Married,Aurora,7919.2,Standard,2016,8,0.0,0.0
2,100018,2018,10,6,4,10,3110,311.0,385,31,Alberta,Edmonton,T9G 1W3,Female,Bachelor,92552.0,Married,Aurora,7919.2,Standard,2016,8,0.0,0.0
3,100018,2017,4,4,0,4,924,92.0,0,0,Alberta,Edmonton,T9G 1W3,Female,Bachelor,92552.0,Married,Aurora,7919.2,Standard,2016,8,0.0,0.0
4,100018,2017,5,0,0,0,0,0.0,0,0,Alberta,Edmonton,T9G 1W3,Female,Bachelor,92552.0,Married,Aurora,7919.2,Standard,2016,8,0.0,0.0


In [3]:
# Keep only the two columns needed for the comparison: 'flights_booked' and 'education'.

df_filtered = df[['flights_booked', 'education']]
# A fixed seed makes the previewed rows identical on every Restart & Run All.
df_filtered.sample(5, random_state=42)

Unnamed: 0,flights_booked,education
342661,0,College
232804,3,Bachelor
318553,0,Bachelor
270754,0,Bachelor
144636,0,Bachelor


In [4]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the number of
# flights booked, computed separately for each education level.

education_data = (
    df_filtered
    .groupby('education')['flights_booked']
    .describe()
)
education_data

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Bachelor,252567.0,4.110288,5.221671,0.0,0.0,1.0,8.0,21.0
College,102260.0,4.169744,5.24604,0.0,0.0,1.0,8.0,21.0
Doctor,17731.0,4.175512,5.256971,0.0,0.0,1.0,8.0,21.0
High School or Below,18915.0,4.176209,5.239267,0.0,0.0,1.0,8.0,21.0
Master,12287.0,4.2007,5.213956,0.0,0.0,1.0,8.0,21.0


In [5]:
# Collapse the education levels into two groups:
# Bachelor / Master / Doctor -> 'University', every other value -> 'Non university'.
df['education_group'] = (
    df['education']
    .isin(['Bachelor', 'Master', 'Doctor'])
    .map({True: 'University', False: 'Non university'})
)

# Flights booked, split by group
university_flights = df.loc[df['education_group'] == 'University', 'flights_booked']
non_university_flights = df.loc[df['education_group'] == 'Non university', 'flights_booked']

In [6]:
def normality(group, group_name, alpha=0.05):
    """
    Checks whether a sample is compatible with a normal distribution using the Shapiro-Wilk test.

    Missing values are removed before testing; otherwise the test yields a NaN p-value,
    which would silently be reported as "does not follow a normal distribution".

    Note: SciPy's documentation states that for samples larger than 5000 observations the
    W statistic is accurate but the p-value may not be, so treat the verdict on very
    large groups with care.

    Parameters:
        group (Series or array-like): The numeric values to evaluate for normality.
        group_name (str): The name of the group being evaluated (used in the printed message).
        alpha (float): Significance level; the data is reported as normal when p-value > alpha.
            Default is 0.05.

    Returns:
        None: Prints a message indicating whether the data follows a normal distribution.

    Raises:
        ValueError: If fewer than 3 non-missing observations are available, since the
            Shapiro-Wilk test needs at least 3 data points.
    """

    # Drop missing values so they cannot turn the p-value into NaN.
    values = pd.Series(group).dropna().to_numpy(dtype=float)

    if values.size < 3:
        raise ValueError(
            f"The {group_name} group has {values.size} non-missing observations; "
            "the Shapiro-Wilk test needs at least 3."
        )

    _, p_value = stats.shapiro(values)
    if p_value > alpha:
        print(f"For the {group_name} group, the data follows a normal distribution.")
    else:
        print(f"For the {group_name} group, the data does not follow a normal distribution.")
    print("\n")

In [7]:
# Run the normality check on the flight counts of each education group.
for label, flights in (
    ('University', university_flights),
    ('Non university', non_university_flights),
):
    normality(flights, label)

For the University group, the data does not follow a normal distribution.


For the Non university group, the data does not follow a normal distribution.




In [8]:
def mann_whitney_test(dataframe, metric_column, group_control, group_test, group_column='education_group', alpha=0.05):
    """
    Performs a two-sided Mann-Whitney U test to compare a metric between two groups in a given DataFrame.

    Missing metric values are dropped before testing (a NaN would otherwise yield a NaN
    p-value and be reported as "the same"). The alternative hypothesis is set explicitly
    to 'two-sided' so the result does not depend on the SciPy version's default.

    NOTE(review): the Mann-Whitney U test is rank-based and compares the two
    distributions; reading the result as a difference in medians presumably assumes
    both groups have similarly shaped distributions - confirm before relying on the
    wording of the printed message.

    Parameters:
    - dataframe (DataFrame): The DataFrame containing the data.
    - metric_column (str or list): The name of the column or list of columns representing the metrics to compare between the groups.
    - group_control (str): The name of the control group in the column specified by group_column.
    - group_test (str): The name of the test group in the column specified by group_column.
    - group_column (str): The name of the column containing the group information. Default is "education_group".
    - alpha (float): Significance level; a p-value below it is reported as a difference. Default is 0.05.

    Returns:
    None: Prints to the console whether the groups are reported as different or the same for each metric.
    When more than one metric is given, each result is preceded by a line naming the metric.

    Raises:
    ValueError: If either group has no rows, or no non-missing values for a metric.
    """

    # Filter the DataFrame to include only the data for the two groups
    control = dataframe[dataframe[group_column] == group_control]
    test = dataframe[dataframe[group_column] == group_test]

    # A misspelled group name would otherwise surface as an obscure SciPy error or a NaN result.
    if control.empty or test.empty:
        raise ValueError(
            f"No rows found for group(s) in column '{group_column}': "
            f"{group_control!r} has {len(control)} rows, {group_test!r} has {len(test)} rows."
        )

    # Ensure the metrics are held in a list
    if isinstance(metric_column, str):
        metrics = [metric_column]
    else:
        metrics = list(metric_column)

    # Only label the output when there is more than one metric to tell apart.
    label_metrics = len(metrics) > 1

    for metric in metrics:
        metric_control = control[metric].dropna()
        metric_test = test[metric].dropna()

        if metric_control.empty or metric_test.empty:
            raise ValueError(f"Metric '{metric}' has no non-missing values in one of the groups.")

        # Apply the Mann-Whitney U test (explicitly two-sided)
        _, p_value = stats.mannwhitneyu(metric_control, metric_test, alternative='two-sided')

        if label_metrics:
            print(f"Metric: {metric}")
        if p_value < alpha:
            print(f"  The medians are different between {group_control} and {group_test}.")
        else:
            print(f"  The medians are the same between {group_control} and {group_test}.")
        print("\n")

# Compare flights booked between the two education groups.
mann_whitney_test(
    dataframe=df,
    metric_column='flights_booked',
    group_control='University',
    group_test='Non university',
)

  The medians are different between University and Non university.




# Interpretation of results

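- The Mann-Whitney U test rejects the null hypothesis at the 5% significance level: the distribution of `flights_booked` is significantly different between the "University" and "Non university" groups.
- This suggests that the number of flights booked tends to be different between these two educational groups. Note, however, that the descriptive statistics above show a median of 1 flight for every education level and means that differ by less than 0.1 flights, so with samples this large the difference is statistically detectable but small in practice.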

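If the number of flights booked by university-educated customers tends to be higher, this could imply that university-educated people travel more. The test as run here only reports whether the groups differ, not in which direction, so compare the group summaries before drawing that conclusion.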