
# Step 2 – Data Cleaning & Variable Construction  
Swiss Household Panel (2019–2023)

**Input:** analysis_dataset_step1_panel_2019_2023.csv  
**Output:** analysis_dataset_step2_clean.csv


In [1]:

import pandas as pd
import numpy as np
pd.set_option("display.max_columns", 200)


In [2]:

df = pd.read_csv("analysis_dataset_step1_panel_2019_2023.csv")
print("Loaded shape:", df.shape)
df.head()


Loaded shape: (90910, 13)


Unnamed: 0,idpers,idhous,year,wave,age,sex,edyear,isced,sport_raw,health_raw,income_imputed,nbpers,nbkid
0,5101,51,2019,21,58.0,1.0,19.0,51.0,2.0,2.0,117000.0,,
1,5103,52,2019,21,27.0,1.0,9.0,20.0,,,,,
2,5104,51,2019,21,58.0,2.0,12.0,32.0,,,,,
3,5201,52,2019,21,25.0,2.0,12.0,32.0,,,,,
4,13101,131,2019,21,47.0,1.0,12.0,32.0,,,,,


In [3]:

df = df[(df["age"] >= 18) & (df["age"] <= 64)].copy()
print("After age restriction:", df.shape)


After age restriction: (55624, 13)


In [4]:

df["sport_active"] = np.where(df["sport_raw"] >= 1, 1, 0)

df["sport_level"] = pd.cut(
    df["sport_raw"],
    bins=[-0.1, 0, 2, 4, 7],
    labels=["none", "low", "medium", "high"]
)
df[["sport_raw", "sport_active", "sport_level"]].value_counts().head()


sport_raw  sport_active  sport_level
2.0        1             low            7894
3.0        1             medium         7626
1.0        1             low            4411
4.0        1             medium         4069
7.0        1             high           3594
Name: count, dtype: int64

In [5]:

df["log_income"] = np.where(df["income_imputed"] > 0,
                            np.log(df["income_imputed"]),
                            np.nan)
df[["income_imputed", "log_income"]].describe()


  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,income_imputed,log_income
count,41298.0,40064.0
mean,80458.77,10.911786
std,140029.0,1.028231
min,0.0,4.60517
25%,33300.0,10.513253
50%,64700.0,11.086747
75%,98400.0,11.512925
max,14926010.0,16.518616


In [6]:

df["health_good"] = np.where(df["health_raw"] <= 2, 1, 0)
df[["health_raw", "health_good"]].value_counts(dropna=False)


health_raw  health_good
2.0         1              25473
NaN         0              14444
1.0         1               9696
3.0         0               5258
4.0         0                668
5.0         0                 85
Name: count, dtype: int64

In [7]:

df["female"] = np.where(df["sex"] == 2, 1, 0)
df["education_years"] = df["edyear"]
df["household_size"] = df["nbpers"]
df["num_children"] = df["nbkid"]
df[["female", "education_years", "household_size", "num_children"]].describe()


Unnamed: 0,female,education_years,household_size,num_children
count,55624.0,54931.0,0.0,0.0
mean,0.512189,14.405818,,
std,0.499856,3.582757,,
min,0.0,0.0,,
25%,0.0,12.0,,
50%,1.0,13.0,,
75%,1.0,19.0,,
max,1.0,21.0,,


In [8]:

df.isna().mean().sort_values(ascending=False)


nbpers             1.000000
household_size     1.000000
num_children       1.000000
nbkid              1.000000
sport_raw          0.416025
sport_level        0.416025
log_income         0.279735
health_raw         0.259672
income_imputed     0.257551
edyear             0.012459
education_years    0.012459
isced              0.012459
sex                0.000072
age                0.000000
wave               0.000000
year               0.000000
idhous             0.000000
idpers             0.000000
sport_active       0.000000
health_good        0.000000
female             0.000000
dtype: float64

In [9]:

analysis_vars = [
    "idpers","idhous","year","wave",
    "age","female",
    "education_years","isced",
    "sport_raw","sport_active","sport_level",
    "health_raw","health_good",
    "income_imputed","log_income",
    "household_size","num_children"
]

df_final = df[analysis_vars].copy()
print("Final dataset shape:", df_final.shape)
df_final.head()


Final dataset shape: (55624, 17)


Unnamed: 0,idpers,idhous,year,wave,age,female,education_years,isced,sport_raw,sport_active,sport_level,health_raw,health_good,income_imputed,log_income,household_size,num_children
0,5101,51,2019,21,58.0,0,19.0,51.0,2.0,1,low,2.0,1,117000.0,11.669929,,
1,5103,52,2019,21,27.0,0,9.0,20.0,,0,,,0,,,,
2,5104,51,2019,21,58.0,1,12.0,32.0,,0,,,0,,,,
3,5201,52,2019,21,25.0,1,12.0,32.0,,0,,,0,,,,
4,13101,131,2019,21,47.0,0,12.0,32.0,,0,,,0,,,,


In [10]:

OUTPUT = "analysis_dataset_step2_clean.csv"
df_final.to_csv(OUTPUT, index=False)
print("Saved:", OUTPUT)


Saved: analysis_dataset_step2_clean.csv
