### Define hypothesis test types

Independent-samples t-test (`ttest_ind`):
compares the **means** of two different groups. The groups should not be related.
If p < 0.05, the means are significantly different.

### Paired t-test: two related groups — like before and after (`ttest_rel`) within the same population
Test scores before and after an experiment.
If p < 0.05, the change is statistically significant.



In [135]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import scipy.stats as st
import numpy as np


In [136]:
def read_jsonl_to_dataframe(filepath):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Date conversion is disabled (``convert_dates=False``). If reading fails
    for any reason, the error is printed and ``None`` is returned instead of
    raising, so callers must check for ``None``.
    """
    try:
        frame = pd.read_json(filepath, lines=True, convert_dates=False)
    except Exception as err:
        print(f"Error reading jsonl to dataframe: {err}")
        return None
    return frame

In [137]:
food_data = "llm_output.json"

df = read_jsonl_to_dataframe(food_data)
# Parse YYYYMMDD dates, then derive a 'YYYY-MM' month label per row.
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
df['image_month'] = df['date'].dt.strftime('%Y-%m')

# Ordinal encoding of the 'Healthy' label, grouped by target value:
# 2 = Healthy, 1 = Balanced, 0 = 'Unhealthy'
health_value_map = {
    'Healthy': 2, 'healthy': 2, 'Yes': 2,
    'Balanced': 1, 'balanced': 1, 'Unknown': 1,
    'Unhealthy': 0, 'unhealthy': 0, 'No': 0,
}

# Shared ordinal scale for Salt, Sugar and Food_Diversity.
gen_map = {'No': 0, 'Low': 1, 'Medium': 2, 'High': 3}

processing_map = {'Unprocessed': 0, 'Minimally Processed': 1, 'Processed': 2}

# Keep the un-encoded labels around before replacing them with numbers.
df_orig = df.copy()
# Labels missing from a map become NaN after .map().
df = df.assign(
    Healthy=df['Healthy'].map(health_value_map),
    Salt=df['Salt'].map(gen_map),
    Sugar=df['Sugar'].map(gen_map),
    Food_Diversity=df['Food_Diversity'].map(gen_map),
    Processing_level=df['Processing_level'].map(processing_map),
)

df.sample(5)

Unnamed: 0,name,size,date,Cuisine,Happiness_Level,Meal_Course,Sugar,Salt,Healthy,Processing_level,Preparation_Method,Dominant_Color,Food_Diversity,image_month
250,Food/PXL_20240108_162844962.jpg,3155193,2024-01-08,Unknown,3,Snack,1,1,1,1,Roasted,Brown,1,2024-01
162,Food/image(54).jpg,1152156,NaT,Unknown,3,Snack,2,1,0,2,Baked,Brown,1,
91,Food/image(109).jpg,3172340,NaT,Fusion,4,Main Course,1,2,1,1,Baked,Red,3,
3,Food/image(1).jpg,3282700,NaT,Unknown,4,Dessert,3,1,0,2,Baked,Brown,1,
213,Food/image(60).jpg,1913004,NaT,Unknown,5,Dessert,3,1,0,2,Baked,Yellow,1,


In [138]:
# Restrict to the numeric / ordinal-encoded columns and show their pairwise
# correlation matrix (DataFrame.corr() defaults to Pearson correlation).
df_n = df[['Happiness_Level', 'Sugar', 'Salt', 'Healthy','Processing_level', 'Food_Diversity']]
df_n.corr()

Unnamed: 0,Happiness_Level,Sugar,Salt,Healthy,Processing_level,Food_Diversity
Happiness_Level,1.0,0.290237,-0.037341,-0.029132,0.108371,0.093414
Sugar,0.290237,1.0,-0.55117,-0.514042,0.664263,-0.306616
Salt,-0.037341,-0.55117,1.0,0.096176,-0.331087,0.432985
Healthy,-0.029132,-0.514042,0.096176,1.0,-0.618172,0.260545
Processing_level,0.108371,0.664263,-0.331087,-0.618172,1.0,-0.30299
Food_Diversity,0.093414,-0.306616,0.432985,0.260545,-0.30299,1.0


In [139]:
def t_test_features(s1, s2, features, equal_var=True, nan_policy="propagate"):
    """Run an independent-samples t-test on each feature of two samples.

    Args:
        s1 (dataframe): sample 1
        s2 (dataframe): sample 2
        features (list): column names to test; each must exist in both samples
        equal_var (bool): forwarded to ``scipy.stats.ttest_ind``. True (the
            default, same as before) is the pooled-variance Student's t-test;
            False is Welch's t-test, which does not assume equal variances.
        nan_policy (str): forwarded to ``scipy.stats.ttest_ind``. The default
            'propagate' returns a NaN p-value when a column contains NaN;
            'omit' ignores NaN observations; 'raise' raises an error.

    Returns:
        dict: feature name -> two-sided p-value (the t statistic is discarded)
    """
    return {
        f: st.ttest_ind(
            s1[f], s2[f], equal_var=equal_var, nan_policy=nan_policy
        ).pvalue
        for f in features
    }

### Hypothesis: Cuisines vs. rest
H0: Cuisine is not related to healthy choices

H0: Cuisine is not related to happiness

In [140]:
# Features whose group means are compared.
f_lst = ['Happiness_Level', 'Sugar', 'Salt', 'Healthy', 'Processing_level']

# Row subsets by encoded diversity score (3 = High, 1 = Low) and by cuisine label.
food_div3 = df.loc[df['Food_Diversity'].eq(3)]
food_div1 = df.loc[df['Food_Diversity'].eq(1)]
food_crse_fusion = df.loc[df['Cuisine'].eq('Fusion')]
food_crse_indian = df.loc[df['Cuisine'].eq('Indian')]
food_crse_Italian = df.loc[df['Cuisine'].eq('Italian')]

# p-value per feature: high-diversity vs. low-diversity food.
res = t_test_features(food_div3, food_div1, f_lst)
res


{'Happiness_Level': np.float64(0.17760715468273314),
 'Sugar': np.float64(9.99017229978088e-06),
 'Salt': np.float64(6.267788895285081e-10),
 'Healthy': np.float64(1.0319079422408495e-05),
 'Processing_level': np.float64(2.1141829263598602e-05)}

#### Happiness level does not differ significantly between low-diversity (1) and high-diversity (3) food (p ≈ 0.18)

Sugar, Salt, Healthy and Processing level all differ significantly between low-diversity and high-diversity food (p < 0.05)

### Summary: food diversity is associated with sugar, salt, healthiness and processing level, but not with happiness level

In [141]:
# Independent-samples t-test per feature in f_lst: Fusion rows vs. Indian rows.
# The displayed dict maps feature name -> p-value.
res = t_test_features(food_crse_fusion,food_crse_indian,f_lst)
res

{'Happiness_Level': np.float64(0.20775180166121093),
 'Sugar': np.float64(0.34302454194036813),
 'Salt': np.float64(0.046241135549500705),
 'Healthy': np.float64(6.829220051709067e-05),
 'Processing_level': np.float64(0.6287771166261774)}

#### Cuisine type affects health level

# chi square and chi contingency test

### Data Prep

In [142]:
f_lst = ['Happiness_Level', 'Sugar', 'Salt', 'Healthy', 'Processing_level']
# Only rows whose cuisine is one of these labels are kept for the chi-squared tests.
cuisines = ['Fusion', 'Indian', 'Italian', 'Thai', 'German', 'American', 'Japanese', 'Mexican']
happy_food = df.loc[df['Cuisine'].isin(cuisines)]

### Validate relationships

In [143]:
def perform_and_print_chi2(data_name, contingency_table):
    """
    Run a Chi-squared test on a contingency table and print a short report.

    Args:
      data_name (str): Label shown in the printed report.
      contingency_table (array-like): Observed counts; converted with np.array
        and passed to scipy.stats.chi2_contingency.

    Prints the statistic, p-value and degrees of freedom. A ValueError raised
    while testing is reported on stdout instead of propagating. Returns None.
    """
    try:
        observed = np.array(contingency_table)
        statistic, p_value, dof, _expected = st.chi2_contingency(observed)
        report = [
            f"Chi-squared Test for {data_name}:",
            f"  Chi-squared Statistic: {statistic:.2f}",
            f"  P-value: {p_value:.2e}",  # scientific notation for small values
            f"  Degrees of Freedom: {dof}",
            "-" * 40,  # separator for readability
        ]
        print("\n".join(report))
    except ValueError as err:
        print(f"Error in chi-square test for {data_name}: {err}")

# --- Run the Chi-squared tests and print results ---
# perform_and_print_chi2("Sugar", sugar_chi_df)

def analyze_feature_relation(df, index_feature, column_feature):
    """
    Cross-tabulate two features and run a Chi-squared test on the counts.

    Rows of the table are the values of `index_feature`, columns are the
    values of `column_feature`, and each cell is the count of 'name' entries
    for that combination (0 where the combination does not occur).

    Note: the printed report is labelled with `column_feature` only, so runs
    with different `index_feature` values produce identically titled reports.
    """
    counts = df.pivot_table(
        values='name',
        index=index_feature,
        columns=column_feature,
        aggfunc='count',
        fill_value=0,
    )
    perform_and_print_chi2(column_feature, counts)

# Pair Happiness_Level with each nutrition / processing feature.
feature_pairs = [
    ('Happiness_Level', feature)
    for feature in ('Sugar', 'Salt', 'Healthy', 'Processing_level')
]

# --- Run the analysis for each pair of features ---
for index_feature, column_feature in feature_pairs:
    analyze_feature_relation(happy_food, index_feature, column_feature)

# res_salt = st.chi2_contingency(np.array(salt_chi_df))

Chi-squared Test for Sugar:
  Chi-squared Statistic: 105.14
  P-value: 1.43e-18
  Degrees of Freedom: 9
----------------------------------------
Chi-squared Test for Salt:
  Chi-squared Statistic: 64.09
  P-value: 6.61e-12
  Degrees of Freedom: 6
----------------------------------------
Chi-squared Test for Healthy:
  Chi-squared Statistic: 14.37
  P-value: 2.58e-02
  Degrees of Freedom: 6
----------------------------------------
Chi-squared Test for Processing_level:
  Chi-squared Statistic: 128.16
  P-value: 3.14e-25
  Degrees of Freedom: 6
----------------------------------------


#### What Does This Mean?

Strong Evidence of Association: The extremely low p-value provides very strong evidence to reject the null hypothesis. This means there is a statistically significant association between Happiness, Sugar and Salt.
Happiness, Sugar and Salt Are NOT Independent:  data clearly suggests that the level of happiness is related to the level of sugar and salt. This isn't due to random chance.

Happiness, Healthy Are NOT Independent: data clearly suggests that the level of happiness is related to the level of Healthy, but not as strong as with salt.

Happiness and Processing_level are NOT independent: the p-value is the probability of observing data at least this extreme if there were no real association between happiness and processing level. A very small p-value is strong evidence against the null hypothesis, i.e. a highly significant association between happiness and processing level.

In [144]:
# Pair Meal_Course with each feature of interest.
feature_pairs = [
    ('Meal_Course', feature)
    for feature in ('Sugar', 'Salt', 'Healthy', 'Happiness_Level', 'Processing_level')
]

# --- Run the analysis for each pair of features ---
for index_feature, column_feature in feature_pairs:
    analyze_feature_relation(happy_food, index_feature, column_feature)

Chi-squared Test for Sugar:
  Chi-squared Statistic: 246.31
  P-value: 4.60e-42
  Degrees of Freedom: 18
----------------------------------------
Chi-squared Test for Salt:
  Chi-squared Statistic: 176.13
  P-value: 2.66e-31
  Degrees of Freedom: 12
----------------------------------------
Chi-squared Test for Healthy:
  Chi-squared Statistic: 76.93
  P-value: 1.58e-11
  Degrees of Freedom: 12
----------------------------------------
Chi-squared Test for Happiness_Level:
  Chi-squared Statistic: 127.53
  P-value: 1.57e-18
  Degrees of Freedom: 18
----------------------------------------
Chi-squared Test for Processing_level:
  Chi-squared Statistic: 76.83
  P-value: 1.65e-11
  Degrees of Freedom: 12
----------------------------------------


##

### Ignore: Wrong relationship testing

#### H0: Happiness is not related to processing level

In [145]:
# NOTE(review): ttest_rel pairs the two columns row by row and tests whether their
# means differ. A small p-value therefore says the columns have different means
# (they are on different scales), not that the two features are associated.
st.ttest_rel(df['Happiness_Level'], df['Processing_level'])


TtestResult(statistic=np.float64(73.9427175522059), pvalue=np.float64(5.7395574138600655e-199), df=np.int64(310))

#### Happiness is related to processing level of food

#### H0 : Happiness is not related to sugar

In [146]:
# Keep only the rows whose date is present (drops NaT rows).
df_datanotna = df.loc[df['date'].notna()]
print(df_datanotna.sample(5))
# Paired t-test between the Happiness_Level and Sugar columns of those rows.
st.ttest_rel(df_datanotna['Happiness_Level'], df_datanotna['Sugar'])


                                name     size       date  Cuisine  \
135  Food/PXL_20240302_114131464.jpg  2776312 2024-03-02  Italian   
43   Food/PXL_20240218_131135420.jpg  2230451 2024-02-18   Indian   
233     Food/IMG-20220427-WA0000.jpg   375020 2022-04-27   Fusion   
114  Food/PXL_20240202_133657537.jpg  2494949 2024-02-02  Italian   
215     Food/IMG-20211219-WA0000.jpg   290727 2021-12-19  Mexican   

     Happiness_Level  Meal_Course  Sugar  Salt  Healthy  Processing_level  \
135                4  Main Course      1     2        1                 1   
43                 4  Main Course      1     2        2                 1   
233                4  Main Course      1     2        2                 1   
114                3        Drink      1     1        2                 1   
215                3  Main Course      1     2        1                 1   

    Preparation_Method Dominant_Color  Food_Diversity image_month  
135              Baked          Green               2 

TtestResult(statistic=np.float64(49.90009025434486), pvalue=np.float64(6.398083242902239e-116), df=np.int64(203))

In [147]:
def t_test_rel_features(s1,  features):
    """Paired t-test between every ordered pair of distinct features of one dataframe.

    Args:
        s1 (dataframe): dataframe holding all feature columns
        features (list): feature (column) names to compare pairwise

    Returns:
        dict: maps "f1:f2" to the p-value of st.ttest_rel(s1[f1], s1[f2]).
        Both orderings of every pair are present ("a:b" and "b:a").
    """
    return {
        first + ":" + second: st.ttest_rel(s1[first], s1[second]).pvalue
        for first in features
        for second in features
        if first != second
    }

In [148]:
f_lst = ['Happiness_Level', 'Sugar', 'Salt', 'Healthy', 'Processing_level']

# Paired t-tests between every pair of features on the rows that have a date.
res_rel = t_test_rel_features(df_datanotna,f_lst)

# Use the 5% significance level stated at the top of the notebook.
# p > 0.05 only means the paired test cannot distinguish the two columns' means.
for k,v in res_rel.items():
    if v>0.05:
        print(f"related features within dataset are {k} with p-value {v}")


related features within dataset are Sugar:Salt with p-value 0.27967839352321594
related features within dataset are Salt:Sugar with p-value 0.27967839352321594


#### Let's test only dishes whose Meal_Course is Drink

In [149]:
# Rows whose Meal_Course is 'Drink'.
df_it = df[df['Meal_Course']=='Drink']

f_lst = ['Happiness_Level', 'Sugar', 'Salt', 'Healthy', 'Processing_level']

res_rel = t_test_rel_features(df_it,f_lst)

# Use the 5% significance level stated at the top of the notebook.
# p > 0.05 only means the paired test cannot distinguish the two columns' means.
for k,v in res_rel.items():
    if v>0.05:
        print(f"related features within dataset are {k} with p-value {v}")

related features within dataset are Healthy:Processing_level with p-value 0.05138641776155715
related features within dataset are Processing_level:Healthy with p-value 0.05138641776155715


#### Is dessert related to sugar/salt/happiness

H0: Happiness in dessert is not related to Sugar

In [150]:
# Rows whose Meal_Course is 'Dessert'.
df_desse = df[df['Meal_Course']=='Dessert']

f_lst = ['Happiness_Level', 'Sugar', 'Salt', 'Healthy', 'Processing_level']

res_rel = t_test_rel_features(df_desse,f_lst)

# Use the 5% significance level stated at the top of the notebook.
# p > 0.05 only means the paired test cannot distinguish the two columns' means.
for k,v in res_rel.items():
    if v>0.05:
        print(f"related features within dataset are {k} with p-value {v}")

In [151]:
# Display 5 randomly chosen dessert rows for a visual check.
df_desse.sample(5)

Unnamed: 0,name,size,date,Cuisine,Happiness_Level,Meal_Course,Sugar,Salt,Healthy,Processing_level,Preparation_Method,Dominant_Color,Food_Diversity,image_month
186,Food/PXL_20240206_184112283.jpg,2650715,2024-02-06,Indian,5,Dessert,2,1,0,2,Baked,Yellow,1,2024-02
67,Food/PXL_20241002_091026266.jpg,2111803,2024-10-02,Indian,4,Dessert,3,1,0,2,Baked,Brown,3,2024-10
3,Food/image(1).jpg,3282700,NaT,Unknown,4,Dessert,3,1,0,2,Baked,Brown,1,
85,Food/image(43).jpg,2917973,NaT,Unknown,3,Dessert,3,1,0,2,Unknown,Yellow,1,
164,Food/PXL_20240110_114704871.jpg,3171499,2024-01-10,Unknown,4,Dessert,2,1,0,2,Unknown,Purple,1,2024-01


#### Null hypothesis stays and there is no relation of Desserts with happiness/Sugar/Salt

In [152]:
# One subset per major cuisine, plus a pooled subset of the remaining named cuisines.
fus_food = df.loc[df['Cuisine'].eq('Fusion')]
ind_food = df.loc[df['Cuisine'].eq('Indian')]
ity_food = df.loc[df['Cuisine'].eq('Italian')]
oth_food = df.loc[df['Cuisine'].isin(['Thai', 'German', 'American', 'Japanese', 'Mexican'])]

In [153]:
f_lst = ['Happiness_Level', 'Sugar', 'Salt', 'Healthy', 'Processing_level']

# Independent-samples t-test per feature: Fusion rows vs. Indian rows (p-values displayed).
res = t_test_features(fus_food,ind_food,f_lst)
res


{'Happiness_Level': np.float64(0.20775180166121093),
 'Sugar': np.float64(0.34302454194036813),
 'Salt': np.float64(0.046241135549500705),
 'Healthy': np.float64(6.829220051709067e-05),
 'Processing_level': np.float64(0.6287771166261774)}

In [154]:
# Independent-samples t-test per feature: Fusion rows vs. Italian rows (p-values displayed).
res = t_test_features(fus_food,ity_food,f_lst)
res

{'Happiness_Level': np.float64(0.3816913824253566),
 'Sugar': np.float64(0.21141473968454622),
 'Salt': np.float64(0.04632000103069966),
 'Healthy': np.float64(5.147404125830818e-05),
 'Processing_level': np.float64(0.0018592877484584418)}

In [155]:
# Independent-samples t-test per feature: Fusion rows vs. the pooled oth_food rows (p-values displayed).
res = t_test_features(fus_food,oth_food,f_lst)
res

{'Happiness_Level': np.float64(0.26484280989603937),
 'Sugar': np.float64(0.20565009452090766),
 'Salt': np.float64(0.13323977550167482),
 'Healthy': np.float64(3.8472512427164056e-05),
 'Processing_level': np.float64(0.003254062994258967)}

In [156]:
# Independent-samples t-test per feature: Indian rows vs. the pooled oth_food rows (p-values displayed).
res = t_test_features(ind_food,oth_food,f_lst)
res

{'Happiness_Level': np.float64(0.4872656913564608),
 'Sugar': np.float64(0.006467791699839403),
 'Salt': np.float64(0.00017723709118319577),
 'Healthy': np.float64(0.06822082450518487),
 'Processing_level': np.float64(8.439293169212436e-06)}

In [157]:
# Independent-samples t-test per feature: Italian rows vs. the pooled oth_food rows (p-values displayed).
res = t_test_features(ity_food,oth_food,f_lst)
res

{'Happiness_Level': np.float64(0.7271351368114876),
 'Sugar': np.float64(0.8906441005896278),
 'Salt': np.float64(0.8717053070054425),
 'Healthy': np.float64(0.6523014820402091),
 'Processing_level': np.float64(1.0)}

### Null hypothesis fails 
Italian, German, American, Chinese, Mexican foods are highly related with happiness levels, sugar and salt

### Fusion, Indian, Italian, Others are highly correlated with happiness and sugar

### 

In [158]:
cuisines = ['Fusion', 'Indian', 'Italian', 'Thai', 'German', 'American', 'Japanese', 'Mexican']
# Rows with one of the listed cuisines vs. every remaining row.
# Note: oth_food is rebound here to the complement of `cuisines`.
happy_food = df.loc[df['Cuisine'].isin(cuisines)]
oth_food = df.loc[~df['Cuisine'].isin(cuisines)]

res = t_test_features(happy_food, oth_food, f_lst)
res

{'Happiness_Level': np.float64(0.2604969033207468),
 'Sugar': np.float64(5.126423265455729e-05),
 'Salt': np.float64(6.389219444635156e-11),
 'Healthy': np.float64(0.24300084035847863),
 'Processing_level': np.float64(1.2168877989392894e-05)}

#### Other categories of food are only related to happiness and no other relation

In [159]:
cuisines = ['Fusion', 'Indian', 'Italian', 'Thai', 'German', 'American', 'Japanese', 'Mexican']
# Listed cuisines only, split by meal course.
# Note: happy_food is rebound here to the Main Course rows.
happy_drink = df.loc[df['Cuisine'].isin(cuisines) & df['Meal_Course'].eq('Drink')]
happy_dessert = df.loc[df['Cuisine'].isin(cuisines) & df['Meal_Course'].eq('Dessert')]
happy_food = df.loc[df['Cuisine'].isin(cuisines) & df['Meal_Course'].eq('Main Course')]

In [160]:
f_lst = ['Happiness_Level', 'Sugar', 'Salt', 'Healthy', 'Processing_level']

# Paired t-tests between every pair of features on the drink rows;
# report only the pairs whose p-value exceeds 0.05.
res_rel = t_test_rel_features(happy_drink, f_lst)
for k, v in res_rel.items():
    if not v > 0.05:
        continue
    print(f"related features within dataset are {k} with p-value {v}")

related features within dataset are Sugar:Healthy with p-value 0.0694431909615676
related features within dataset are Sugar:Processing_level with p-value 0.05587675230714424
related features within dataset are Healthy:Sugar with p-value 0.0694431909615676
related features within dataset are Healthy:Processing_level with p-value 0.36375613199600754
related features within dataset are Processing_level:Sugar with p-value 0.05587675230714424
related features within dataset are Processing_level:Healthy with p-value 0.36375613199600754


#### All drinks are related with Sugar, Healthy levels, Processing_level but not related to Happiness levels

In [161]:
f_lst = ['Happiness_Level', 'Sugar', 'Salt', 'Healthy', 'Processing_level']
# Paired t-tests between every pair of features on the dessert rows;
# report only the pairs whose p-value exceeds 0.05.
res_rel = t_test_rel_features(happy_dessert, f_lst)
for k, v in res_rel.items():
    if not v > 0.05:
        continue
    print(f"related features within dataset are {k} with p-value {v}")