### Define hypothesis test types

Independent samples t-test (`ttest_ind`):
compares the **means** of two different groups. The groups should not be related.
If p < 0.05, the means are significantly different.

### Paired t-test: two related groups — like before and after (`ttest_rel`) within the same population
For example, test scores before and after an experiment.
If p < 0.05, the change is statistically significant.



In [58]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import scipy.stats as st
import numpy as np


In [59]:
def read_jsonl_to_dataframe(filepath):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    The file is read with ``convert_dates=False``, so date-like columns are
    not turned into datetimes here. If reading raises any exception, the
    error is printed and ``None`` is returned instead of a DataFrame.
    """
    try:
        return pd.read_json(filepath, lines=True, convert_dates=False)
    except Exception as err:
        print(f"Error reading jsonl to dataframe: {err}")
    return None

In [60]:
food_data = "llm_output.json"

df = read_jsonl_to_dataframe(food_data)
# Parse the YYYYMMDD date values, then derive a 'YYYY-MM' month label per row.
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
df['image_month'] = df['date'].dt.strftime('%Y-%m')

# Numeric codes for the 'Healthy' labels, grouped by code:
# 0 = unhealthy, 1 = balanced (also used for 'Unknown'), 2 = healthy.
health_value_map = {
    'Unhealthy': 0, 'unhealthy': 0, 'No': 0,
    'Balanced': 1, 'balanced': 1, 'Unknown': 1,
    'Healthy': 2, 'healthy': 2, 'Yes': 2,
}

# Shared 0-3 scale applied to Salt, Sugar and Food_Diversity.
gen_map = {'No': 0, 'Low': 1, 'Medium': 2, 'High': 3}

# Processing level codes, from 'Unprocessed' (0) to 'Processed' (2).
processing_map = {'Unprocessed': 0, 'Minimally Processed': 1, 'Processed': 2}

# Copy taken before the label columns are replaced by their numeric codes.
df_orig = df.copy()

label_encodings = {
    'Healthy': health_value_map,
    'Salt': gen_map,
    'Sugar': gen_map,
    'Food_Diversity': gen_map,
    'Processing_level': processing_map,
}
for label_column, code_map in label_encodings.items():
    # Labels missing from the map become NaN (pandas Series.map behaviour).
    df[label_column] = df[label_column].map(code_map)

df.sample(5)

Unnamed: 0,name,size,date,Cuisine,Happiness_Level,Meal_Course,Sugar,Salt,Healthy,Processing_level,Preparation_Method,Dominant_Color,Food_Diversity,image_month
57,Food/image(44).jpg,2292199,NaT,Unknown,4,Drink,1,1,1,1,Unknown,Red,1,
259,Food/PXL_20221222_142005359.MP.jpg,4058173,2022-12-22,Unknown,4,Drink,2,1,1,2,Unknown,Brown,1,2022-12
60,Food/PXL_20240620_110553482.jpg,3109757,2024-06-20,Fusion,4,Main Course,1,2,1,1,"Steamed, Fried",Brown,3,2024-06
271,Food/PXL_20250414_110304284.jpg,1601182,2025-04-14,Indian,3,Side Dish,1,2,2,1,Unknown,Green,1,2025-04
218,Food/image(8).jpg,1546412,NaT,Unknown,4,Dessert,3,1,0,2,Unknown,Brown,1,


In [61]:
# Pairwise correlation matrix of the numerically encoded features.
numeric_columns = [
    'Happiness_Level', 'Sugar', 'Salt', 'Healthy',
    'Processing_level', 'Food_Diversity',
]
df_n = df[numeric_columns]
df_n.corr()

Unnamed: 0,Happiness_Level,Sugar,Salt,Healthy,Processing_level,Food_Diversity
Happiness_Level,1.0,0.290237,-0.037341,-0.029132,0.108371,0.093414
Sugar,0.290237,1.0,-0.55117,-0.514042,0.664263,-0.306616
Salt,-0.037341,-0.55117,1.0,0.096176,-0.331087,0.432985
Healthy,-0.029132,-0.514042,0.096176,1.0,-0.618172,0.260545
Processing_level,0.108371,0.664263,-0.331087,-0.618172,1.0,-0.30299
Food_Diversity,0.093414,-0.306616,0.432985,0.260545,-0.30299,1.0


In [62]:
def t_test_features(s1, s2, features, equal_var=True, nan_policy="propagate"):
    """Compare the means of two samples, feature by feature, with an independent-samples t-test.

    Args:
        s1 (dataframe): sample 1
        s2 (dataframe): sample 2
        features (list): names of the columns (present in both samples) to test
        equal_var (bool): forwarded to ``scipy.stats.ttest_ind``. ``True`` (the
            default, matching the previous behaviour) runs Student's t-test;
            ``False`` runs Welch's t-test, which does not assume equal group
            variances.
        nan_policy (str): forwarded to ``scipy.stats.ttest_ind``. ``"propagate"``
            (the default, matching the previous behaviour) yields a NaN p-value
            when a column contains NaN; ``"omit"`` ignores NaN values.

    Returns:
        dict: maps each feature name to the p-value of its t-test. The t
        statistic itself is not returned.
    """
    return {
        f: st.ttest_ind(s1[f], s2[f], equal_var=equal_var, nan_policy=nan_policy).pvalue
        for f in features
    }

### Hypothesis: Cuisines vs. rest
H0: Cuisine is not related to healthy choices

H0: Cuisine is not related to happiness

In [63]:
f_lst = ['Happiness_Level', 'Sugar', 'Salt', 'Healthy', 'Processing_level']

# Row subsets by encoded food-diversity score and by cuisine label.
food_div3 = df.loc[df['Food_Diversity'].eq(3)]
food_div1 = df.loc[df['Food_Diversity'].eq(1)]
food_crse_fusion = df.loc[df['Cuisine'].eq('Fusion')]
food_crse_indian = df.loc[df['Cuisine'].eq('Indian')]
food_crse_Italian = df.loc[df['Cuisine'].eq('Italian')]

# Per-feature p-values: Food_Diversity == 3 rows vs. Food_Diversity == 1 rows.
res = t_test_features(s1=food_div3, s2=food_div1, features=f_lst)
res


{'Happiness_Level': np.float64(0.17760715468273314),
 'Sugar': np.float64(9.99017229978088e-06),
 'Salt': np.float64(6.267788895285081e-10),
 'Healthy': np.float64(1.0319079422408495e-05),
 'Processing_level': np.float64(2.1141829263598602e-05)}

#### Happiness level does not differ significantly between high-diversity (Food_Diversity = 3) and low-diversity (Food_Diversity = 1) food (p ≈ 0.18)

Sugar, Salt, Healthy and Processing level all differ significantly between high-diversity and low-diversity food (p < 0.001)

### Summary: mean happiness level is similar across the two food-diversity groups, while the nutritional features differ

In [64]:
# Per-feature p-values: Fusion rows vs. Indian rows.
res = t_test_features(s1=food_crse_fusion, s2=food_crse_indian, features=f_lst)
res

{'Happiness_Level': np.float64(0.20775180166121093),
 'Sugar': np.float64(0.34302454194036813),
 'Salt': np.float64(0.046241135549500705),
 'Healthy': np.float64(6.829220051709067e-05),
 'Processing_level': np.float64(0.6287771166261774)}

In [89]:
# Per-feature p-values: Italian rows vs. Indian rows.
res = t_test_features(s1=food_crse_Italian, s2=food_crse_indian, features=f_lst)
res

{'Happiness_Level': np.float64(0.8475792953458055),
 'Sugar': np.float64(0.0059095115126605235),
 'Salt': np.float64(5.512573370762072e-06),
 'Healthy': np.float64(0.1781992830930098),
 'Processing_level': np.float64(3.1994738348061325e-06)}

In [90]:
# Per-feature p-values: Italian rows vs. Fusion rows.
res = t_test_features(s1=food_crse_Italian, s2=food_crse_fusion, features=f_lst)
res

{'Happiness_Level': np.float64(0.3816913824253566),
 'Sugar': np.float64(0.21141473968454622),
 'Salt': np.float64(0.04632000103069966),
 'Healthy': np.float64(5.147404125830818e-05),
 'Processing_level': np.float64(0.0018592877484584418)}

# chi square and chi contingency test

### Data Prep

In [65]:
f_lst = ['Happiness_Level', 'Sugar', 'Salt', 'Healthy', 'Processing_level']
cuisines = [
    'Fusion', 'Indian', 'Italian', 'Thai',
    'German', 'American', 'Japanese', 'Mexican',
]
# Keep only rows whose Cuisine is one of the listed cuisines.
happy_food = df.loc[df['Cuisine'].isin(cuisines)]

### Validate relationships

In [66]:
def perform_and_print_chi2(data_name, contingency_table):
    """Run a chi-squared test on a contingency table and print a short report.

    Args:
        data_name: label used in the printed report.
        contingency_table: table of observed counts; it is converted with
            ``np.array`` before being passed to ``scipy.stats.chi2_contingency``.

    The statistic, p-value and degrees of freedom are printed. A ``ValueError``
    raised while running the test is caught and printed rather than
    propagated. Nothing is returned.
    """
    try:
        observed = np.array(contingency_table)
        statistic, p_value, dof, _expected = st.chi2_contingency(observed)
        report = [
            f"Chi-squared Test for {data_name}:",
            f"  Chi-squared Statistic: {statistic:.2f}",
            f"  P-value: {p_value:.2e}",  # scientific notation keeps tiny p-values readable
            f"  Degrees of Freedom: {dof}",
            "-" * 40,
        ]
        print("\n".join(report))
    except ValueError as err:
        print(f"Error in chi-square test for {data_name}: {err}")

# --- Run the Chi-squared tests and print results ---
# perform_and_print_chi2("Sugar", sugar_chi_df)

def analyze_feature_relation(df, index_feature, column_feature):
    """Cross-tabulate two features and run a chi-squared test on the counts.

    Args:
        df: DataFrame containing both features and a 'name' column, whose
            entries are counted per (index_feature, column_feature) cell.
        index_feature: column whose values become the table rows.
        column_feature: column whose values become the table columns; also
            used as the label in the printed output.

    Prints the count table, then the chi-squared report for it.
    """
    counts = df.pivot_table(
        values='name',
        index=index_feature,
        columns=column_feature,
        aggfunc='count',
        fill_value=0,  # combinations with no rows count as 0
    )
    print(f"Data Table for {column_feature}:\n", counts, "\n")
    perform_and_print_chi2(column_feature, counts)

# Pair Happiness_Level (table rows) with each encoded nutrition feature (table columns).
feature_pairs = [
    ('Happiness_Level', other_feature)
    for other_feature in ('Sugar', 'Salt', 'Healthy', 'Processing_level')
]

# --- Run the analysis for each pair of features ---
for index_feature, column_feature in feature_pairs:
    analyze_feature_relation(df=happy_food, index_feature=index_feature, column_feature=column_feature)

# res_salt = st.chi2_contingency(np.array(salt_chi_df))

Data Table for Sugar:
 Sugar            0    1   2   3
Happiness_Level                
2                1    1   0   0
3                0    6   5   1
4                1  150  16  13
5                0    5   9   9 

Chi-squared Test for Sugar:
  Chi-squared Statistic: 105.14
  P-value: 1.43e-18
  Degrees of Freedom: 9
----------------------------------------
Data Table for Salt:
 Salt             0   1    2
Happiness_Level            
2                1   0    1
3                2   7    3
4                2  36  142
5                4  15    4 

Chi-squared Test for Salt:
  Chi-squared Statistic: 64.09
  P-value: 6.61e-12
  Degrees of Freedom: 6
----------------------------------------
Data Table for Healthy:
 Healthy           0    1   2
Happiness_Level             
2                 2    0   0
3                 2    5   5
4                32  106  42
5                 8    9   6 

Chi-squared Test for Healthy:
  Chi-squared Statistic: 14.37
  P-value: 2.58e-02
  Degrees of Freedom:

#### What Does This Mean?

Strong Evidence of Association: The extremely low p-value provides very strong evidence to reject the null hypothesis. This means there is a statistically significant association between Happiness, Sugar and Salt.
Happiness, Sugar and Salt Are NOT Independent:  data clearly suggests that the level of happiness is related to the level of sugar and salt. This isn't due to random chance.

Happiness, Healthy Are NOT Independent: data clearly suggests that the level of happiness is related to the level of Healthy, but not as strong as with salt.

Happiness, Processing_level Are NOT Independent: the p-value is the probability of observing an association at least this strong if there were no real association between happiness and processing level. A very small p-value is a strong indication against the null hypothesis, i.e. a highly significant association between happiness and processing level.

In [96]:
cuisines = [
    'Fusion', 'Indian', 'Italian', 'Thai',
    'German', 'American', 'Japanese', 'Mexican',
]
happy_food = df.loc[df['Cuisine'].isin(cuisines)]
# Number of rows per meal course within the selected cuisines.
happy_food['Meal_Course'].value_counts()

Meal_Course
Main Course    140
Dessert         30
Drink           25
Snack           13
Starter          5
Side Dish        3
Breakfast        1
Name: count, dtype: int64

In [97]:
# Pair Meal_Course (table rows) with each feature to be tested (table columns).
feature_pairs = [
    ('Meal_Course', other_feature)
    for other_feature in ('Sugar', 'Salt', 'Healthy', 'Happiness_Level', 'Processing_level')
]

# --- Run the analysis for each pair of features ---
for index_feature, column_feature in feature_pairs:
    analyze_feature_relation(df=happy_food, index_feature=index_feature, column_feature=column_feature)

Data Table for Sugar:
 Sugar        0    1   2   3
Meal_Course                
Breakfast    0    1   0   0
Dessert      0    0   9  21
Drink        1    8  14   2
Main Course  0  136   4   0
Side Dish    1    2   0   0
Snack        0   10   3   0
Starter      0    5   0   0 

Chi-squared Test for Sugar:
  Chi-squared Statistic: 246.31
  P-value: 4.60e-42
  Degrees of Freedom: 18
----------------------------------------
Data Table for Salt:
 Salt         0   1    2
Meal_Course            
Breakfast    0   0    1
Dessert      3  27    0
Drink        6  19    0
Main Course  0   9  131
Side Dish    0   0    3
Snack        0   3   10
Starter      0   0    5 

Chi-squared Test for Salt:
  Chi-squared Statistic: 176.13
  P-value: 2.66e-31
  Degrees of Freedom: 12
----------------------------------------
Data Table for Healthy:
 Healthy       0   1   2
Meal_Course            
Breakfast     0   1   0
Dessert      21   4   5
Drink         4  11  10
Main Course  11  94  35
Side Dish     0   1   2

#### ANOVA test on happiness level across meal courses (Main Course, Dessert, Drink)

In [99]:
# One-way ANOVA on Happiness_Level. The groups are defined by Meal_Course
# (Main Course, Dessert, Drink), so the conclusion is worded in terms of
# meal courses rather than cuisines.
main_course_happiness = df[df['Meal_Course'] == 'Main Course']['Happiness_Level'].dropna()
dessert_happiness = df[df['Meal_Course'] == 'Dessert']['Happiness_Level'].dropna()
drink_happiness = df[df['Meal_Course'] == 'Drink']['Happiness_Level'].dropna()

f_statistic, p_value = st.f_oneway(
        main_course_happiness,
        dessert_happiness,
        drink_happiness
    )
print(f_statistic, p_value)
alpha = 0.05  # significance level
if p_value < alpha:
    print(f"  Conclusion: Reject the null hypothesis with p value {p_value}. There is a significant difference in mean Happiness_Level across Meal_Course groups.")
else:
    print(f"  Conclusion: Fail to reject the null hypothesis with p value {p_value}. There is no significant difference in mean Happiness_Level across Meal_Course groups.")

24.49415354291875 1.7339749710226944e-10
  Conclusion: Reject the null hypothesis with p value 1.7339749710226944e-10. There is a significant difference in mean Happiness_Level across Cuisines.


### Ignore: Wrong relationship testing

#### H0: Happiness is not related to processing level

In [68]:
# Paired t-test on the Happiness_Level and Processing_level columns of the same rows.
# NOTE(review): this compares the two columns' means; confirm that is the intended
# check for "relatedness" (the enclosing section is marked as wrong testing).
st.ttest_rel(a=df['Happiness_Level'], b=df['Processing_level'])


TtestResult(statistic=np.float64(73.9427175522059), pvalue=np.float64(5.7395574138600655e-199), df=np.int64(310))

#### Happiness is related to processing level of food

#### H0 : Happiness is not related to sugar

In [69]:
# Keep only the rows that have a parsed date.
df_datanotna = df.dropna(subset=['date'])
print(df_datanotna.sample(5))
# Paired t-test on the Happiness_Level and Sugar columns of those rows.
st.ttest_rel(a=df_datanotna['Happiness_Level'], b=df_datanotna['Sugar'])


                                name     size       date  Cuisine  \
197  Food/PXL_20240719_173354236.jpg  1890695 2024-07-19  Italian   
231  Food/PXL_20221228_111532739.jpg  2805508 2022-12-28   Fusion   
237  Food/PXL_20250102_110121805.jpg  2203035 2025-01-02   Fusion   
256  Food/PXL_20240329_113229444.jpg  2202127 2024-03-29   Indian   
281  Food/PXL_20250209_133651581.jpg  2022228 2025-02-09  Chinese   

     Happiness_Level  Meal_Course  Sugar  Salt  Healthy  Processing_level  \
197                4  Main Course      1     2        0                 2   
231                4  Main Course      1     2        1                 1   
237                4  Main Course      1     2        1                 1   
256                4  Main Course      1     2        1                 1   
281                3        Drink      0     0        2                 1   

    Preparation_Method Dominant_Color  Food_Diversity image_month  
197              Baked          Brown               1 

TtestResult(statistic=np.float64(49.90009025434486), pvalue=np.float64(6.398083242902239e-116), df=np.int64(203))

In [70]:
def t_test_rel_features(s1,  features):
    """Run a paired t-test between every ordered pair of distinct features.

    Args:
        s1 (dataframe): dataframe holding all the feature columns
        features (list): column names (strings) to compare pairwise

    Returns:
        dict: maps "first:second" to the p-value of ``scipy.stats.ttest_rel``
        on those two columns. Both orderings of each pair are included.
    """
    return {
        first + ":" + second: st.ttest_rel(s1[first], s1[second]).pvalue
        for first in features
        for second in features
        if first != second
    }

In [71]:
f_lst = ['Happiness_Level', 'Sugar', 'Salt', 'Healthy', 'Processing_level']

res_rel = t_test_rel_features(df_datanotna,f_lst)

# Use the 5% significance level stated at the top of the notebook and used by
# the other cells, instead of an ad-hoc 0.04.
# NOTE(review): a paired t-test p-value above the threshold means the two
# columns' means are not significantly different; confirm the "related" wording.
for k,v in res_rel.items():
    if v>0.05:
        print(f"related features within dataset are {k} with p-value {v}")


related features within dataset are Sugar:Salt with p-value 0.27967839352321594
related features within dataset are Salt:Sugar with p-value 0.27967839352321594


#### Let's test only drinks (Meal_Course == 'Drink')

In [72]:
# Rows whose Meal_Course is 'Drink'.
df_it = df[df['Meal_Course']=='Drink']

f_lst = ['Happiness_Level', 'Sugar', 'Salt', 'Healthy', 'Processing_level']

res_rel = t_test_rel_features(df_it,f_lst)

# Use the 5% significance level stated at the top of the notebook and used by
# the other cells, instead of an ad-hoc 0.04.
# NOTE(review): a paired t-test p-value above the threshold means the two
# columns' means are not significantly different; confirm the "related" wording.
for k,v in res_rel.items():
    if v>0.05:
        print(f"related features within dataset are {k} with p-value {v}")

related features within dataset are Healthy:Processing_level with p-value 0.05138641776155715
related features within dataset are Processing_level:Healthy with p-value 0.05138641776155715


#### Is dessert related to sugar/salt/happiness

H0: Happiness in dessert is not related to Sugar

In [73]:
# Rows whose Meal_Course is 'Dessert'.
df_desse = df[df['Meal_Course']=='Dessert']

f_lst = ['Happiness_Level', 'Sugar', 'Salt', 'Healthy', 'Processing_level']

res_rel = t_test_rel_features(df_desse,f_lst)

# Use the 5% significance level stated at the top of the notebook and used by
# the other cells, instead of an ad-hoc 0.04.
# NOTE(review): a paired t-test p-value above the threshold means the two
# columns' means are not significantly different; confirm the "related" wording.
for k,v in res_rel.items():
    if v>0.05:
        print(f"related features within dataset are {k} with p-value {v}")

In [74]:
# Peek at five randomly chosen dessert rows.
df_desse.sample(n=5)

Unnamed: 0,name,size,date,Cuisine,Happiness_Level,Meal_Course,Sugar,Salt,Healthy,Processing_level,Preparation_Method,Dominant_Color,Food_Diversity,image_month
266,Food/PXL_20240201_092137435~2.jpg,1983375,2024-02-01,Italian,5,Dessert,3,1,0,2,Baked,Brown,1,2024-02
108,Food/image(59).jpg,3027369,NaT,Fusion,5,Dessert,2,1,2,1,Unknown,White,2,
27,Food/image(12).jpg,1959360,NaT,Unknown,5,Dessert,3,1,0,2,Unknown,Yellow,1,
62,Food/PXL_20241231_140113873.MP.jpg,4987029,2024-12-31,Indian,5,Dessert,3,0,1,1,Boiled,Orange,1,2024-12
305,Food/PXL_20230823_204211650.jpg,2180192,2023-08-23,Unknown,5,Dessert,3,1,0,2,Unknown,Pink,1,2023-08


#### Null hypothesis stays and there is no relation of Desserts with happiness/Sugar/Salt

In [75]:
# Row subsets per cuisine; oth_food pools five less frequent cuisines.
fus_food = df.loc[df['Cuisine'].eq('Fusion')]
ind_food = df.loc[df['Cuisine'].eq('Indian')]
ity_food = df.loc[df['Cuisine'].eq('Italian')]
oth_food = df.loc[df['Cuisine'].isin(['Thai', 'German', 'American', 'Japanese', 'Mexican'])]

In [76]:
f_lst = ['Happiness_Level', 'Sugar', 'Salt', 'Healthy', 'Processing_level']

# Per-feature p-values: Fusion rows vs. Indian rows.
res = t_test_features(s1=fus_food, s2=ind_food, features=f_lst)
res


{'Happiness_Level': np.float64(0.20775180166121093),
 'Sugar': np.float64(0.34302454194036813),
 'Salt': np.float64(0.046241135549500705),
 'Healthy': np.float64(6.829220051709067e-05),
 'Processing_level': np.float64(0.6287771166261774)}

In [77]:
# Per-feature p-values: Fusion rows vs. Italian rows.
res = t_test_features(s1=fus_food, s2=ity_food, features=f_lst)
res

{'Happiness_Level': np.float64(0.3816913824253566),
 'Sugar': np.float64(0.21141473968454622),
 'Salt': np.float64(0.04632000103069966),
 'Healthy': np.float64(5.147404125830818e-05),
 'Processing_level': np.float64(0.0018592877484584418)}

In [78]:
# Per-feature p-values: Fusion rows vs. the pooled oth_food rows.
res = t_test_features(s1=fus_food, s2=oth_food, features=f_lst)
res

{'Happiness_Level': np.float64(0.26484280989603937),
 'Sugar': np.float64(0.20565009452090766),
 'Salt': np.float64(0.13323977550167482),
 'Healthy': np.float64(3.8472512427164056e-05),
 'Processing_level': np.float64(0.003254062994258967)}

In [79]:
# Per-feature p-values: Indian rows vs. the pooled oth_food rows.
res = t_test_features(s1=ind_food, s2=oth_food, features=f_lst)
res

{'Happiness_Level': np.float64(0.4872656913564608),
 'Sugar': np.float64(0.006467791699839403),
 'Salt': np.float64(0.00017723709118319577),
 'Healthy': np.float64(0.06822082450518487),
 'Processing_level': np.float64(8.439293169212436e-06)}

In [80]:
# Per-feature p-values: Italian rows vs. the pooled oth_food rows.
res = t_test_features(s1=ity_food, s2=oth_food, features=f_lst)
res

{'Happiness_Level': np.float64(0.7271351368114876),
 'Sugar': np.float64(0.8906441005896278),
 'Salt': np.float64(0.8717053070054425),
 'Healthy': np.float64(0.6523014820402091),
 'Processing_level': np.float64(1.0)}

### Null hypothesis is not rejected
Italian food does not differ significantly from Thai, German, American, Japanese and Mexican food in happiness level, sugar, salt, healthiness or processing level (all p > 0.05)

### Fusion, Indian, Italian, Others are highly correlated with happiness and sugar

### 

In [81]:
cuisines = [
    'Fusion', 'Indian', 'Italian', 'Thai',
    'German', 'American', 'Japanese', 'Mexican',
]
# Split the rows into the listed cuisines and everything else.
in_listed_cuisines = df['Cuisine'].isin(cuisines)
happy_food = df[in_listed_cuisines]
oth_food = df[~in_listed_cuisines]

res = t_test_features(s1=happy_food, s2=oth_food, features=f_lst)
res

{'Happiness_Level': np.float64(0.2604969033207468),
 'Sugar': np.float64(5.126423265455729e-05),
 'Salt': np.float64(6.389219444635156e-11),
 'Healthy': np.float64(0.24300084035847863),
 'Processing_level': np.float64(1.2168877989392894e-05)}

#### The selected cuisines and all other food do not differ significantly in happiness (p ≈ 0.26) or healthiness (p ≈ 0.24), but do differ in sugar, salt and processing level

In [82]:
cuisines = [
    'Fusion', 'Indian', 'Italian', 'Thai',
    'German', 'American', 'Japanese', 'Mexican',
]
# Within the listed cuisines, split the rows by meal course.
in_listed_cuisines = df['Cuisine'].isin(cuisines)
happy_drink = df[in_listed_cuisines & df['Meal_Course'].eq('Drink')]
happy_dessert = df[in_listed_cuisines & df['Meal_Course'].eq('Dessert')]
happy_food = df[in_listed_cuisines & df['Meal_Course'].eq('Main Course')]

In [83]:
f_lst = ['Happiness_Level', 'Sugar', 'Salt', 'Healthy', 'Processing_level']

# Pairwise paired t-tests between the feature columns of the drink rows;
# report only the pairs whose p-value exceeds 0.05.
res_rel = t_test_rel_features(s1=happy_drink, features=f_lst)
for k, v in res_rel.items():
    if not v > 0.05:
        continue
    print(f"related features within dataset are {k} with p-value {v}")

related features within dataset are Sugar:Healthy with p-value 0.0694431909615676
related features within dataset are Sugar:Processing_level with p-value 0.05587675230714424
related features within dataset are Healthy:Sugar with p-value 0.0694431909615676
related features within dataset are Healthy:Processing_level with p-value 0.36375613199600754
related features within dataset are Processing_level:Sugar with p-value 0.05587675230714424
related features within dataset are Processing_level:Healthy with p-value 0.36375613199600754


#### All drink are related with Sugar, Healthy levels, Processing_level but not related to Happiness levels

In [84]:
f_lst = ['Happiness_Level', 'Sugar', 'Salt', 'Healthy', 'Processing_level']
# Pairwise paired t-tests between the feature columns of the dessert rows;
# report only the pairs whose p-value exceeds 0.05.
res_rel = t_test_rel_features(s1=happy_dessert, features=f_lst)
for k, v in res_rel.items():
    if not v > 0.05:
        continue
    print(f"related features within dataset are {k} with p-value {v}")