# 1. Import Libraries

In [35]:
import pandas as pd
from pandas import read_csv
from statsmodels.stats.proportion import proportions_ztest



df=pd.read_csv("Quant_Research/data/model_data.csv")

# 2. EDA

In [36]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Age,Salary,unique_id,Supermarket,Online Grocery User,Convenience Shops,App_Excitement,Average Weekly Spending,Free Delivery Gambit,Tried Online Grocery,Cheapest prices Rank,Convenience Rank,Catalogue Rank,App Features Rank,Private Label,Expected Standard Delivery,Considered Fast Delivery
count,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0
mean,184.844828,34.5,4009.47969,184.844828,0.948276,0.20977,0.025862,8.238772,111.296771,0.813218,0.479885,1.37069,2.031609,2.873563,3.724138,0.647074,28.878096,20.732184
std,108.162004,10.908712,7007.627575,108.162004,0.221788,0.407731,0.158952,1.454171,53.889352,0.390297,0.500315,0.794761,0.636662,0.658434,0.634088,0.408666,13.186491,11.532178
min,0.0,0.0,800.0,0.0,0.0,0.0,0.0,0.0,25.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,2.0
25%,87.75,27.0,1075.0,87.75,1.0,0.0,0.0,8.247423,75.0,1.0,0.0,1.0,2.0,3.0,4.0,0.0,20.0,15.0
50%,187.5,33.0,2300.0,187.5,1.0,0.0,0.0,8.247423,86.65724,1.0,0.0,1.0,2.0,3.0,4.0,0.636364,28.909434,20.0
75%,277.25,40.25,4042.248418,277.25,1.0,0.0,0.0,8.247423,146.835443,1.0,1.0,1.0,2.0,3.0,4.0,1.0,30.0,20.6
max,374.0,74.0,80834.0,374.0,1.0,1.0,1.0,10.0,200.0,1.0,1.0,4.0,4.0,4.0,4.0,1.0,60.0,60.0


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 348 entries, 0 to 347
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  348 non-null    int64  
 1   Gender                      338 non-null    object 
 2   Nationality                 348 non-null    object 
 3   Age                         348 non-null    int64  
 4   Salary                      348 non-null    float64
 5   unique_id                   348 non-null    int64  
 6   Address                     348 non-null    object 
 7   Supermarket                 348 non-null    float64
 8   Online Grocery User         348 non-null    float64
 9   Convenience Shops           348 non-null    float64
 10  App_Excitement              348 non-null    float64
 11  Average Weekly Spending     348 non-null    float64
 12  Free Delivery Gambit        348 non-null    float64
 13  Customer Segment            348 non

### Consumer Weekly Spend

In [38]:
customer_weekly_spend = round(df["Average Weekly Spending"].mean(),2)
print(f"Consumer weekly spend is {customer_weekly_spend} AED")

Consumer weekly spend is 111.3 AED


### Features Rankings

In [39]:
#lower rank = higher priority
print(f"Average rank of cheapest prices is {round(df['Cheapest prices Rank'].mean(),2)}")
print(f"Average rank of convenience is {round(df['Convenience Rank'].mean(),2)}")
print(f"Average rank of catalogue is {round(df['Catalogue Rank'].mean(),2)}")
print(f"Average rank of app features is {round(df['App Features Rank'].mean(),2)}")

Average rank of cheapest prices is 1.37
Average rank of convenience is 2.03
Average rank of catalogue is 2.87
Average rank of app features is 3.72


# 3. Hypothesis Testing

## Hypothesis 1

In [40]:
#Filter rows that are bachelors and those that are families
# First, let's check what columns are actually in the dataframe
print("Columns in dataframe:", df.columns.tolist())

# Then, let's check if there's a similar column to "Customer Segment"
# Look for columns that might contain customer segmentation information
segment_columns = [col for col in df.columns if 'segment' in col.lower() or 'customer' in col.lower()]
print("Potential segment columns:", segment_columns)

# If we find a matching column, let's see what values it contains
if segment_columns:
    for col in segment_columns:
        print(f"\nUnique values in {col}:")
        print(df[col].unique())

# Now let's try with the correct column name (assuming we found it above)
# Replace 'Customer Segment' with the actual column name if different
segment_column = 'Customer Segment'  # Update this based on the output above

# Check if the column exists before filtering
if segment_column in df.columns:
    bachelor_values = ["Bachelor living with others", "Bachelor living alone"]
    bachelors = df[df[segment_column].isin(bachelor_values)]
    
    family_values = ["Family Living Together"]
    families = df[df[segment_column].isin(family_values)]
    
    print(f"\nNumber of bachelors: {len(bachelors)}")
    print(f"Number of families: {len(families)}")
else:
    print(f"\nColumn '{segment_column}' not found in the dataframe")
print("--------------------------------------------------------")
df["Tried Online Grocery"] = df["Tried Online Grocery"].astype(int)
df["Free Delivery Gambit"] = df["Free Delivery Gambit"].astype(int)

# 3) Filter to the non–online-grocery users
non_users = df[df["Tried Online Grocery"] == 0]
print("Number of non–online-grocery users:", len(non_users))

# 4) Among these non-users, how many said they WOULD try if free?
count_would_try = non_users["Free Delivery Gambit"].sum()  # sum of 1s
n_non_users = len(non_users)
observed_proportion = count_would_try / n_non_users if n_non_users > 0 else 0

print(f"Out of {n_non_users} non-users, {count_would_try} said they'd try if free.")
print(f"Observed proportion: {observed_proportion:.3f}")

# Hypothesized proportion
p0 = 0.5

# Perform the 1-sample proportion z-test
#   alternative='larger' checks if observed proportion > p0


stat, p_value = proportions_ztest(count_would_try, n_non_users, value=p0, alternative='larger')
print("")
print("1-Sample Proportion Test (Non-Online-Users Will Try if Free)")
print(f"  Hypothesized proportion = {p0}")
print(f"  Observed proportion     = {observed_proportion:.3f}")
print(f"  z-statistic             = {stat:.3f}")
print(f"  p-value                 = {p_value:.5f}")

# Conclusion
alpha = 0.05
if p_value < alpha:
    print(f"p-value < {alpha}; we reject the null hypothesis and conclude",
          "the proportion is significantly greater than 0.50.")
else:
    print(f"p-value >= {alpha}; we do NOT reject the null hypothesis,",
          "insufficient evidence that proportion > 0.50.")

Columns in dataframe: ['Unnamed: 0', 'Gender', 'Nationality', 'Age', 'Salary', 'unique_id', 'Address', 'Supermarket', 'Online Grocery User', 'Convenience Shops', 'App_Excitement', 'Average Weekly Spending', 'Free Delivery Gambit', 'Customer Segment', 'Tried Online Grocery', 'Cheapest prices Rank', 'Convenience Rank', 'Catalogue Rank', 'App Features Rank', 'Private Label', 'Expected Standard Delivery', 'Considered Fast Delivery', 'Emirate']
Potential segment columns: ['Customer Segment']

Unique values in Customer Segment:
['Family Living Together' 'Bachelor living with others'
 'Bachelor living alone']

Number of bachelors: 173
Number of families: 175
--------------------------------------------------------
Number of non–online-grocery users: 181
Out of 181 non-users, 136 said they'd try if free.
Observed proportion: 0.751

1-Sample Proportion Test (Non-Online-Users Will Try if Free)
  Hypothesized proportion = 0.5
  Observed proportion     = 0.751
  z-statistic             = 7.825
  p

## Hypothesis 2

In [41]:
# For bachelors
bachelor_price_avg = bachelors["Cheapest prices Rank"].mean()
bachelor_convenience_avg = bachelors["Convenience Rank"].mean()

# For families
family_price_avg = families["Cheapest prices Rank"].mean()
family_convenience_avg = families["Convenience Rank"].mean()

# Print results
print("Average Ranks (lower is higher priority):")
print(f"Bachelors - Price: {bachelor_price_avg:.2f}, Convenience: {bachelor_convenience_avg:.2f}")
print(f"Families - Price: {family_price_avg:.2f}, Convenience: {family_convenience_avg:.2f}")

# Check if bachelors prioritize price more than convenience
bachelor_prioritize_price = bachelor_price_avg < bachelor_convenience_avg
print(f"\nDo bachelors prioritize price over convenience? {bachelor_prioritize_price}")

# Check if families prioritize convenience more than price
family_prioritize_convenience = family_convenience_avg < family_price_avg
print(f"Do families prioritize convenience over price? {family_prioritize_convenience}")

# Statistical testing - compare price ranks between bachelors and families
from scipy import stats

# Test if bachelors rank price significantly lower (higher priority) than families
price_ttest = stats.ttest_ind(bachelors["Cheapest prices Rank"], 
                             families["Cheapest prices Rank"], 
                             equal_var=False)
print("\nT-test for price ranks (bachelors vs families):")
print(f"t-statistic: {price_ttest.statistic:.3f}")
print(f"p-value: {price_ttest.pvalue:.5f}")

# Test if families rank convenience significantly lower (higher priority) than bachelors
convenience_ttest = stats.ttest_ind(families["Convenience Rank"], 
                                   bachelors["Convenience Rank"], 
                                   equal_var=False)
print("\nT-test for convenience ranks (families vs bachelors):")
print(f"t-statistic: {convenience_ttest.statistic:.3f}")
print(f"p-value: {convenience_ttest.pvalue:.5f}")

# Conclusion
alpha = 0.05
print("\nConclusion:")
if bachelor_prioritize_price and price_ttest.pvalue < alpha:
    print("- Bachelors prioritize price significantly more than families")
else:
    print("- No significant evidence that bachelors prioritize price more than families")

if family_prioritize_convenience and convenience_ttest.pvalue < alpha:
    print("- Families prioritize convenience significantly more than bachelors")
else:
    print("- No significant evidence that families prioritize convenience more than bachelors")

Average Ranks (lower is higher priority):
Bachelors - Price: 1.26, Convenience: 1.98
Families - Price: 1.48, Convenience: 2.09

Do bachelors prioritize price over convenience? True
Do families prioritize convenience over price? False

T-test for price ranks (bachelors vs families):
t-statistic: -2.605
p-value: 0.00959

T-test for convenience ranks (families vs bachelors):
t-statistic: 1.602
p-value: 0.11020

Conclusion:
- Bachelors prioritize price significantly more than families
- No significant evidence that families prioritize convenience more than bachelors


## Hypothesis 3

## Hypothesis 4

In [42]:
# Overall Mean
df["Considered Fast Delivery"].mean()

#Non-Online-Grocery Users
non_users = df[df["Tried Online Grocery"] == 0]
non_users["Considered Fast Delivery"].mean()

#Online-Grocery Users
online_users = df[df["Tried Online Grocery"] == 1]
online_users["Considered Fast Delivery"].mean()

print(f"Overall Mean: {df['Considered Fast Delivery'].mean():.2f}")
print(f"Non-Online-Grocery Users Mean: {non_users['Considered Fast Delivery'].mean():.2f}")
print(f"Online-Grocery Users Mean: {online_users['Considered Fast Delivery'].mean():.2f}")

fast_delivery = df["Considered Fast Delivery"]

# One-sample t-test against a population mean of 30 minutes
t_stat, p_value = stats.ttest_1samp(fast_delivery, popmean=30)

print("One-sample t-test for 'Considered Fast Delivery':")
print(f"t-statistic: {t_stat:.3f}")
print(f"p-value: {p_value:.5f}")

# If you expect consumers to consider fast delivery as lower than 30,
# you might do a one-tailed test (if t_stat is negative)
if t_stat < 0:
    p_value_one_tailed = p_value / 2
    print(f"One-tailed p-value: {p_value_one_tailed:.5f}")

Overall Mean: 20.73
Non-Online-Grocery Users Mean: 22.73
Online-Grocery Users Mean: 18.57
One-sample t-test for 'Considered Fast Delivery':
t-statistic: -14.992
p-value: 0.00000
One-tailed p-value: 0.00000


## Hypothesis 5

In [43]:
# Overall Mean
df["Expected Standard Delivery"].mean()

#Non-Online-Grocery Users
non_users = df[df["Tried Online Grocery"] == 0]
non_users["Expected Standard Delivery"].mean()

#Online-Grocery Users
online_users = df[df["Tried Online Grocery"] == 1]
online_users["Expected Standard Delivery"].mean()

print(f"Overall Mean: {df['Expected Standard Delivery'].mean():.2f}")
print(f"Non-Online-Grocery Users Mean: {non_users['Expected Standard Delivery'].mean():.2f}")
print(f"Online-Grocery Users Mean: {online_users['Expected Standard Delivery'].mean():.2f}")

Overall Mean: 28.88
Non-Online-Grocery Users Mean: 27.80
Online-Grocery Users Mean: 30.05
