In [5]:
import pandas as pd

# Load the uploaded Output.csv file
df = pd.read_csv("Output.csv")

# Select only the necessary columns and remove invalid values
df_filtered = df[
    (df["HW1"] <= 59) &
    (df["HW70"] < 9990) &
    (df["HW71"] < 9990) &
    (df["HW72"] < 9990)
].copy()

# Create malnutrition indicators
df_filtered["stunted"] = (df_filtered["HW70"] < -200).astype(int)
df_filtered["underweight"] = (df_filtered["HW71"] < -200).astype(int)
df_filtered["wasting"] = (df_filtered["HW72"] < -200).astype(int)
df_filtered["any_undernutrition"] = (
    (df_filtered["stunted"] == 1) |
    (df_filtered["underweight"] == 1) |
    (df_filtered["wasting"] == 1)
).astype(int)
df_filtered["all_undernutrition"] = (
    (df_filtered["stunted"] == 1) &
    (df_filtered["underweight"] == 1) &
    (df_filtered["wasting"] == 1)
).astype(int)

# Create parental education comparison
df_filtered["edu_diff"] = pd.cut(
    df_filtered["V701"] - df_filtered["V106"],
    bins=[-float("inf"), -1, 0, float("inf")],
    labels=["Father < Mother", "Father = Mother", "Father > Mother"]
)

# Add residence label
df_filtered["residence"] = df_filtered["V025"].map({1: "Urban", 2: "Rural"})

# Group and summarize
summary = df_filtered.groupby(["residence", "edu_diff"]).agg(
    stunting_rate=("stunted", lambda x: round(x.mean() * 100, 1)),
    underweight_rate=("underweight", lambda x: round(x.mean() * 100, 1)),
    wasting_rate=("wasting", lambda x: round(x.mean() * 100, 1)),
    any_undernutrition_rate=("any_undernutrition", lambda x: round(x.mean() * 100, 1)),
    all_undernutrition_rate=("all_undernutrition", lambda x: round(x.mean() * 100, 1)),
    n=("stunted", "count")
).reset_index()

# Add totals for each residence group
totals = df_filtered.groupby("residence").agg(
    stunting_rate=("stunted", lambda x: round(x.mean() * 100, 1)),
    underweight_rate=("underweight", lambda x: round(x.mean() * 100, 1)),
    wasting_rate=("wasting", lambda x: round(x.mean() * 100, 1)),
    any_undernutrition_rate=("any_undernutrition", lambda x: round(x.mean() * 100, 1)),
    all_undernutrition_rate=("all_undernutrition", lambda x: round(x.mean() * 100, 1)),
    n=("stunted", "count")
).reset_index()

totals["edu_diff"] = "Total"
final_summary = pd.concat([summary, totals], ignore_index=True)
# To view the table
print(final_summary)


  residence         edu_diff  stunting_rate  underweight_rate  wasting_rate  \
0     Rural  Father < Mother           19.3               6.0          10.7   
1     Rural  Father = Mother           20.3               6.6          10.1   
2     Rural  Father > Mother           21.6               5.0           8.2   
3     Urban  Father < Mother           19.1               6.6          10.3   
4     Urban  Father = Mother           18.2               6.9          12.3   
5     Urban  Father > Mother           19.9               7.6          12.2   
6     Rural            Total           20.5               6.1           9.6   
7     Urban            Total           18.7               6.9          11.9   

   any_undernutrition_rate  all_undernutrition_rate     n  
0                     29.4                      0.9   879  
1                     29.9                      0.9  3227  
2                     29.4                      0.5  1542  
3                     28.8                      

  summary = df_filtered.groupby(["residence", "edu_diff"]).agg(


In [3]:

# Recode father's occupation
df_filtered["Father_occupation"] = pd.cut(
    df_filtered["V704"],
    bins=[0, 54, 96],
    labels=["Service/Business", "Agriculture/Others"]
)

# Recode mother's working status
df_filtered["Mother_working_status"] = df_filtered["V731"].map({1: "Working", 0: "Not working"})

# Recode residence
df_filtered["residence"] = df_filtered["V025"].map({1: "Urban", 2: "Rural"})

# Group and calculate percentages
summary = df_filtered.groupby(["residence", "Father_occupation", "Mother_working_status"]).agg(
    Stunted=("stunted", lambda x: round(x.mean() * 100, 1)),
    Underweight=("underweight", lambda x: round(x.mean() * 100, 1)),
    Wasting=("wasting", lambda x: round(x.mean() * 100, 1)),
    Any_undernutrition=("any_undernutrition", lambda x: round(x.mean() * 100, 1)),
    All_undernutrition=("all_undernutrition", lambda x: round(x.mean() * 100, 1)),
    N=("stunted", "count")
).reset_index()

print(summary)


  residence   Father_occupation Mother_working_status  Stunted  Underweight  \
0     Rural    Service/Business           Not working     18.3          5.2   
1     Rural    Service/Business               Working     18.2          9.1   
2     Rural  Agriculture/Others           Not working     21.4          6.5   
3     Rural  Agriculture/Others               Working     26.3          5.3   
4     Urban    Service/Business           Not working     18.0          7.2   
5     Urban    Service/Business               Working     12.5         12.5   
6     Urban  Agriculture/Others           Not working     19.7          7.4   
7     Urban  Agriculture/Others               Working     15.4          7.7   

   Wasting  Any_undernutrition  All_undernutrition     N  
0      8.6                26.7                 0.6  1716  
1     18.2                36.4                 0.0    11  
2     10.6                31.4                 0.9  3213  
3      5.3                31.6                 0.0  

  summary = df_filtered.groupby(["residence", "Father_occupation", "Mother_working_status"]).agg(


In [4]:

# Recode maternal education
df_filtered["Maternal_education"] = pd.cut(
    df_filtered["V106"],
    bins=[-1, 0, 1, 3],
    labels=["No education", "Primary", "Secondary and above"]
)

# Recode mother's working status (V714)
df_filtered["Mother_working_status"] = df_filtered["V714"].map({1: "Working", 0: "Not working"})

# Recode residence
df_filtered["residence"] = df_filtered["V025"].map({1: "Urban", 2: "Rural"})

# Group and summarize the results
summary_table = df_filtered.groupby(["residence", "Maternal_education", "Mother_working_status"]).agg(
    Stunted=("stunted", lambda x: round(x.mean() * 100, 1)),
    Underweight=("underweight", lambda x: round(x.mean() * 100, 1)),
    Wasting=("wasting", lambda x: round(x.mean() * 100, 1)),
    Any_Undernutrition=("any_undernutrition", lambda x: round(x.mean() * 100, 1)),
    All_Undernutrition=("all_undernutrition", lambda x: round(x.mean() * 100, 1)),
    N=("stunted", "count")
).reset_index()

print(summary_table)

   residence   Maternal_education Mother_working_status  Stunted  Underweight  \
0      Rural         No education           Not working     24.2          6.8   
1      Rural         No education               Working     28.2          8.3   
2      Rural              Primary           Not working     24.1          8.6   
3      Rural              Primary               Working     34.5          3.6   
4      Rural  Secondary and above           Not working     18.0          5.3   
5      Rural  Secondary and above               Working     17.6          5.6   
6      Urban         No education           Not working     21.3          7.9   
7      Urban         No education               Working     32.3          3.2   
8      Urban              Primary           Not working     21.3          8.9   
9      Urban              Primary               Working     18.2          0.0   
10     Urban  Secondary and above           Not working     18.1          7.1   
11     Urban  Secondary and 

  summary_table = df_filtered.groupby(["residence", "Maternal_education", "Mother_working_status"]).agg(


In [6]:
# Recode father's education
df_filtered["Father_education"] = pd.cut(
    df_filtered["V701"],
    bins=[-1, 0, 1, 3],
    labels=["No education", "Primary", "Secondary and above"]
)

# Recode father's occupation
df_filtered["Father_occupation"] = pd.cut(
    df_filtered["V704"],
    bins=[0, 54, 96],
    labels=["Service/Business", "Agriculture/Others"]
)

# Recode residence
df_filtered["residence"] = df_filtered["V025"].map({1: "Urban", 2: "Rural"})

# Group and summarize the data
summary_table = df_filtered.groupby(["residence", "Father_education", "Father_occupation"]).agg(
    Stunted=("stunted", lambda x: round(x.mean() * 100, 1)),
    Underweight=("underweight", lambda x: round(x.mean() * 100, 1)),
    Wasting=("wasting", lambda x: round(x.mean() * 100, 1)),
    Any_Undernutrition=("any_undernutrition", lambda x: round(x.mean() * 100, 1)),
    All_Undernutrition=("all_undernutrition", lambda x: round(x.mean() * 100, 1)),
    N=("stunted", "count")
).reset_index()

print(summary_table)

   residence     Father_education   Father_occupation  Stunted  Underweight  \
0      Rural         No education    Service/Business     23.6         12.6   
1      Rural         No education  Agriculture/Others     25.3          7.1   
2      Rural              Primary    Service/Business     24.3          7.2   
3      Rural              Primary  Agriculture/Others     22.3          7.4   
4      Rural  Secondary and above    Service/Business     17.7          4.8   
5      Rural  Secondary and above  Agriculture/Others     19.9          5.8   
6      Urban         No education    Service/Business     20.6          8.8   
7      Urban         No education  Agriculture/Others     19.8          6.3   
8      Urban              Primary    Service/Business     14.1          7.8   
9      Urban              Primary  Agriculture/Others     23.0          8.7   
10     Urban  Secondary and above    Service/Business     17.8          6.5   
11     Urban  Secondary and above  Agriculture/Other

  summary_table = df_filtered.groupby(["residence", "Father_education", "Father_occupation"]).agg(


In [6]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import Probit


# Filter and create outcome indicators
probit_data = df[
    (df["HW1"] <= 59) &
    (df["HW70"] < 9990) & (df["HW71"] < 9990) & (df["HW72"] < 9990) &
    (df["V106"].between(0, 3)) & (df["V701"].between(0, 3)) &
    (df["V714"].isin([0, 1])) & (df["V704"].between(1, 96)) &
    (df["V025"].isin([1, 2]))
].copy()

# Recode residence
probit_data["residence"] = probit_data["V025"].map({1: "Urban", 2: "Rural"})

# Outcome indicators
probit_data["stunted"] = (probit_data["HW70"] < -200).astype(int)
probit_data["underweight"] = (probit_data["HW71"] < -200).astype(int)
probit_data["wasting"] = (probit_data["HW72"] < -200).astype(int)
probit_data["any_undernutrition"] = (
    probit_data["stunted"] | probit_data["underweight"] | probit_data["wasting"]
).astype(int)

# Add recoded media access
probit_data["media_access"] = (
    ((probit_data["V157"].between(1, 3)) |
     (probit_data["V158"].between(1, 3)) |
     (probit_data["V159"].between(1, 3)))
).astype(int)

# Add BMI placeholder
probit_data["BMI"] = probit_data.get("Mother_BMI", pd.NA)

# Define variables
outcomes = ["stunted", "underweight", "wasting", "any_undernutrition"]
parental_vars = ["V106", "V701", "V714", "V704"]
control_vars = ["media_access", "V012", "V101", "V190", "V511", "V212", "HW1", "BORD", "V218", "B11"]

# Run models and store results
all_results = []
for outcome in outcomes:
    for model in ["Model 1", "Model 2", "Model 3"]:
        for res in ["Urban", "Rural"]:
            df_sub = probit_data[probit_data["residence"] == res].copy()
            y = df_sub[outcome]

            # Choose predictors based on model
            if model == "Model 1":
                for var in parental_vars:
                    X = sm.add_constant(df_sub[[var]])
                    try:
                        probit = Probit(y, X).fit(disp=False)
                        coef = round(probit.params[var], 3)
                    except:
                        coef = None
                    all_results.append([outcome, var, model, res, coef])
            elif model == "Model 2":
                X = sm.add_constant(df_sub[parental_vars])
                try:
                    probit = Probit(y, X).fit(disp=False)
                    for var in parental_vars:
                        coef = round(probit.params[var], 3)
                        all_results.append([outcome, var, model, res, coef])
                except:
                    for var in parental_vars:
                        all_results.append([outcome, var, model, res, None])
            elif model == "Model 3":
                model3_vars = parental_vars + control_vars
                available = [v for v in model3_vars if v in df_sub.columns and df_sub[v].notna().sum() > 0]
                X = sm.add_constant(df_sub[available])
                try:
                    probit = Probit(y, X).fit(disp=False)
                    for var in parental_vars:
                        if var in probit.params:
                            coef = round(probit.params[var], 3)
                        else:
                            coef = None
                        all_results.append([outcome, var, model, res, coef])
                except:
                    for var in parental_vars:
                        all_results.append([outcome, var, model, res, None])

# Convert results to DataFrame
flat_results_df = pd.DataFrame(all_results, columns=["Outcome", "Variable", "Model", "Residence", "Coefficient"])

# Pivot into table format like Table 7
final_table = flat_results_df.pivot_table(
    index=["Outcome", "Variable"],
    columns=["Model", "Residence"],
    values="Coefficient",
    aggfunc="first"
)[[
    ("Model 1", "Urban"), ("Model 1", "Rural"),
    ("Model 2", "Urban"), ("Model 2", "Rural"),
    ("Model 3", "Urban"), ("Model 3", "Rural")
]]

# Rename columns
final_table.columns = ['M1_Urban', 'M1_Rural', 'M2_Urban', 'M2_Rural', 'M3_Urban', 'M3_Rural']
final_table = final_table.reset_index()

# Optional: rename variables for display
variable_labels = {
    "V106": "Mother's education",
    "V701": "Father's education",
    "V714": "Mother's working status",
    "V704": "Father's occupation"
}
final_table["Variable"] = final_table["Variable"].map(variable_labels)

# Display or export
print(final_table)
# final_table.to_csv("probit_results_table7_style.csv", index=False)


               Outcome                 Variable  M1_Urban  M1_Rural  M2_Urban  \
0   any_undernutrition       Mother's education    -0.039    -0.079    -0.029   
1   any_undernutrition       Father's education    -0.028    -0.079    -0.002   
2   any_undernutrition      Father's occupation     0.001     0.002     0.001   
3   any_undernutrition  Mother's working status    -0.041    -0.031    -0.018   
4              stunted       Mother's education    -0.031    -0.108    -0.019   
5              stunted       Father's education    -0.026    -0.086    -0.004   
6              stunted      Father's occupation     0.001     0.003     0.001   
7              stunted  Mother's working status    -0.026     0.034    -0.005   
8          underweight       Mother's education    -0.040    -0.056    -0.013   
9          underweight       Father's education    -0.027    -0.098     0.011   
10         underweight      Father's occupation     0.002     0.001     0.002   
11         underweight  Moth