In [47]:
import pandas as pd

# 1. Load original adult file
df = pd.read_csv("../1_datasets/row_data/ppcs/data_18plus_public.csv")

clean = df.copy()

# 2. Keep only the columns we need (note: we add sec1_q1_10 + depression_50)
key_cols = [
    "sec1_hhid",
    "sec1_gov_n",
    "sec1_q1",          # age
    "sec1_q1_10",       # gender: '1. Male', '2. Female'
    "depression",
    "depression_50",    # 1 = poor wellbeing, 0 = good wellbeing
    "distress_swin",
    "life_satisfaction",
    "aggression_swin",
    "ptsd_index",
    "ptsd",
    "ptsd_index_sd",
    "depression_sd",
    "distress_swin_sd",
    "psychological_distress_aggregate",
    "wgt18cal",
]

mh = clean[key_cols].copy()

# 3. Clean depression_50: extract the 0/1 from strings like "1. poor well being"
mh["depression_50"] = (
    mh["depression_50"]
    .astype(str)
    .str.extract(r"(\d+)")[0]   # get the first digit
    .astype(float)
)

# 4. Create a clean gender column: "Male" / "Female"
mh["gender"] = (
    mh["sec1_q1_10"]
    .astype(str)
    .str.extract(r"\.\s*(.*)")[0]  # take text after "1. " or "2. "
)

# 5. Convert numeric columns
num_cols = [
    "sec1_q1",
    "depression",
    "depression_50",
    "distress_swin",
    "life_satisfaction",
    "aggression_swin",
    "ptsd_index",
    "ptsd",
    "ptsd_index_sd",
    "depression_sd",
    "distress_swin_sd",
    "psychological_distress_aggregate",
    "wgt18cal",
]

mh[num_cols] = mh[num_cols].apply(pd.to_numeric, errors="coerce")

# 6. Quick check
print(mh["depression_50"].value_counts(dropna=False))
print(mh["gender"].value_counts(dropna=False))

# 7. Save clean file (overwrite old one)
mh.to_csv("../1_datasets/row_data/ppcs/ppcs_mh_clean.csv", index=False)


depression_50
1.0    3937
0.0    2201
Name: count, dtype: int64
gender
Female    3048
Male      2832
NaN        258
Name: count, dtype: int64


  df = pd.read_csv("../1_datasets/row_data/ppcs/data_18plus_public.csv")


In [39]:
df["depression_50"] = (
    df["depression_50"]
    .astype(str)                 # make sure it’s string
    .str.extract(r"(\d)")        # take the digit (0 or 1)
    .astype(float)               # turn into number
)


In [40]:
df["depression_50"].value_counts(dropna=False)


depression_50
1.0    3937
0.0    2201
Name: count, dtype: int64

In [41]:
df["gender"] = (
    df["sec1_q1_10"]
    .astype(str)
    .str.extract(r"\.\s*(.*)")   # everything after "1. " or "2. "
    [0]
)

df["gender"].value_counts(dropna=False)


gender
Female    3048
Male      2832
NaN        258
Name: count, dtype: int64

In [21]:
# Look at raw depression_50 to confirm format
print(df["depression_50"].head(10))

# Extract the first digit (0 or 1) from strings like "1. Poor well-being"
df["depression_50_clean"] = (
    df["depression_50"]
    .astype(str)
    .str.extract(r"^(\d)")   # grab first digit at start of string
    .astype(float)
)

# Check result
print(df["depression_50_clean"].value_counts(dropna=False))


0    1. Poor well being
1    1. Poor well being
2    0. Good well being
3    0. Good well being
4    0. Good well being
5    1. Poor well being
6    1. Poor well being
7    0. Good well being
8    1. Poor well being
9    1. Poor well being
Name: depression_50, dtype: object
depression_50_clean
1.0    3937
0.0    2201
Name: count, dtype: int64


In [5]:
[c for c in df.columns if "ptsd" in c.lower()]


['ptsd_index', 'ptsd', 'ptsd_index_sd']

In [6]:
import pandas as pd

# 1) Load main adult survey file
df = pd.read_csv("../1_datasets/row_data/ppcs/data_18plus_public.csv")

# 2) Quick checks
print(df.shape)   # rows, columns
df.head()


(6138, 212)


  df = pd.read_csv("../1_datasets/row_data/ppcs/data_18plus_public.csv")


Unnamed: 0,sec1_hhid,sec1_gov_n,sec1_interview,indiv_id,indiv_outcome,indiv_outcome_oth,sec1_q1,sec1_q1_8,sec1_q1_9,sec1_q1_10,...,current_threat,affective_dysregulation,negative_self_concept,disturbance_relation,functional_impairment,ptsd,ptsd_index_sd,depression_sd,distress_swin_sd,psychological_distress_aggregate
0,4.0,1. West Bank,1. Completed,2,1. Completed,,1. Yes,1. Yes,3. Married,2. Female,...,0. No,0. No,0. No,0. No,0. No,0. No,0. No,0.6000000238418579,0.3340757191181183,0.3113585710525512
1,9.0,1. West Bank,1. Completed,1,1. Completed,,1. Yes,1. Yes,3. Married,1. Male,...,0. No,0. No,0. No,0. No,0. No,0. No,0. No,0.6800000071525574,0.3494806885719299,0.3431602418422699
2,14.0,1. West Bank,1. Completed,5,1. Completed,,1. Yes,2. No,1. Never married,1. Male,...,0. No,0. No,0. No,0. No,0. No,0. No,0. No,0.4000000059604645,0.3340757191181183,0.2446919083595276
3,15.0,1. West Bank,1. Completed,2,1. Completed,,1. Yes,1. Yes,3. Married,2. Female,...,0. No,0. No,0. No,0. No,0. No,0. No,0. No,0.4799999892711639,0.2966199517250061,0.25887331366539
4,17.0,1. West Bank,2. Partially completed,1,1. Completed,,1. Yes,1. Yes,3. Married,1. Male,...,0. No,0. No,0. No,0. No,0. No,0. No,0.20000000298023224,0.4399999976158142,0.3340757191181183,0.324691891670227


In [7]:
# See column names and types
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6138 entries, 0 to 6137
Columns: 212 entries, sec1_hhid to psychological_distress_aggregate
dtypes: float64(16), int64(1), object(195)
memory usage: 9.9+ MB


In [8]:
clean = df.copy()


In [42]:
key_cols = [
    "sec1_hhid",          # household id
    "sec1_gov_n",         # region / governorate
    "sec1_q1",
    "gender",    
    "sec1_q1_10",   # age (we will confirm from questionnaire)
    # mental health outcomes:
    "depression",
    "depression_50",
    "distress_swin",
    "life_satisfaction",
    "aggression_swin",
    "ptsd_index",
    "ptsd",
    "ptsd_index_sd",
    "depression_sd",
    "distress_swin_sd",
    "psychological_distress_aggregate",
    # survey weight (useful later)
    "wgt18cal"
]

# keep only columns that really exist (in case some are missing)
key_cols = [c for c in key_cols if c in clean.columns]

mh = clean[key_cols].copy()

mh.head()


Unnamed: 0,sec1_hhid,sec1_gov_n,sec1_q1,sec1_q1_10,depression,depression_50,distress_swin,life_satisfaction,aggression_swin,ptsd_index,ptsd,ptsd_index_sd,depression_sd,distress_swin_sd,psychological_distress_aggregate,wgt18cal
0,4.0,1. West Bank,1. Yes,2. Female,60,1. Poor well being,2.0,7.0,1.206873,0.0,0. No,0. No,0.6000000238418579,0.3340757191181183,0.3113585710525512,262.356268
1,9.0,1. West Bank,1. Yes,1. Male,68,1. Poor well being,2.046266,7.5,1.125676,0.0,0. No,0. No,0.6800000071525574,0.3494806885719299,0.3431602418422699,174.979536
2,14.0,1. West Bank,1. Yes,1. Male,40,0. Good well being,2.0,8.0,1.275704,0.0,0. No,0. No,0.4000000059604645,0.3340757191181183,0.2446919083595276,2164.916169
3,15.0,1. West Bank,1. Yes,2. Female,48,0. Good well being,1.887507,7.0,1.129536,0.0,0. No,0. No,0.4799999892711639,0.2966199517250061,0.25887331366539,184.5811
4,17.0,1. West Bank,1. Yes,1. Male,44,0. Good well being,2.0,7.0,1.088409,1.0,0. No,0.20000000298023224,0.4399999976158142,0.3340757191181183,0.324691891670227,218.105682


In [45]:
num_cols = [
    "sec1_q1",
    "depression",
    "depression_50",
    "distress_swin",
    "life_satisfaction",
    "aggression_swin",
    "ptsd_index",
    "ptsd",
    "ptsd_index_sd",
    "depression_sd",
    "distress_swin_sd",
    "psychological_distress_aggregate",
    "wgt18cal"
]

num_cols = [c for c in num_cols if c in mh.columns]

mh[num_cols] = mh[num_cols].apply(pd.to_numeric, errors="coerce")

mh[num_cols].describe()


Unnamed: 0,sec1_q1,depression,depression_50,distress_swin,life_satisfaction,aggression_swin,ptsd_index,ptsd,ptsd_index_sd,depression_sd,distress_swin_sd,psychological_distress_aggregate,wgt18cal
count,0.0,6041.0,0.0,5879.0,5879.0,5879.0,6138.0,0.0,3032.0,5542.0,5866.0,6137.0,5877.0
mean,,59.63847,,2.008524,6.909764,1.886562,0.959035,,0.384667,0.560043,0.335956,0.374085,505.587386
std,,22.375285,,0.358844,1.945842,0.719263,1.120076,,0.156406,0.196424,0.116185,0.131128,1534.970535
min,,4.0,,0.996656,1.0,0.963075,0.0,,0.2,0.04,0.001114,0.000371,3.29905
25%,,40.0,,1.932493,5.5,1.256107,0.0,,0.244444,0.4,0.31159,0.278016,65.116119
50%,,60.0,,2.0,7.0,1.90996,0.0,,0.377778,0.6,0.334076,0.368253,142.180483
75%,,76.0,,2.112601,8.0,2.264049,1.888889,,0.488889,0.72,0.371568,0.470659,331.095347
max,,100.0,,4.0,10.0,5.0,5.0,,0.955556,0.96,0.969385,0.989795,28663.205157


In [44]:
mh.to_csv("../1_datasets/row_data/ppcs/ppcs_mh_clean.csv", index=False)


In [14]:
mh.to_csv("../1_datasets/row_data/ppcs/ppcs_mh_clean.csv", index=False)


In [24]:
import pandas as pd

# Load the original full dataset again
df_full = pd.read_csv("../1_datasets/row_data/ppcs/data_18plus_public.csv")

# Quick check of raw values:
df_full["depression_50"].value_counts(dropna=False).head(10)


  df_full = pd.read_csv("../1_datasets/row_data/ppcs/data_18plus_public.csv")


depression_50
1. Poor well being    3937
0. Good well being    2201
Name: count, dtype: int64

In [17]:
mh["depression_50"] = (
    df_full["depression_50"]          # original text column
        .astype(str)                  # make sure it's a string
        .str.extract(r"(\d)")         # grab the first digit (0 or 1)
        .astype(float)                # convert to numeric
)
