In [188]:
import pandas as pd
# `plt` must alias the pyplot submodule; the top-level `matplotlib` package
# does not expose plotting functions such as `plt.figure` or `plt.subplots`.
import matplotlib.pyplot as plt
import seaborn as sns

In [189]:
# Load the Stata file from the working directory into a DataFrame.
meps = pd.read_stata(filepath_or_buffer="h243.dta")

# Bare last expression: render the (truncated) frame as the cell output.
meps

Unnamed: 0,DUID,PID,DUPERSID,PANEL,DATAYEAR,FAMID31,FAMID42,FAMID53,FAMID22,FAMIDYR,...,RXOSR22,RXPTR22,RXOTH22,PERWT22F,FAMWT22F,FAMWT22C,SAQWT22F,DIABW22F,VARSTR,VARPSU
0,2460002,101,2460002101,24 PANEL 24,2022 YEAR,A,A,A,A,A,...,0,0,0,5728.309495,5232.211986,5232.211986,3994.687140,6034.636755,2082,1
1,2460006,101,2460006101,24 PANEL 24,2022 YEAR,A,A,A,A,A,...,0,0,0,15648.881461,16017.881691,16017.881691,0.000000,0.000000,2001,4
2,2460006,102,2460006102,24 PANEL 24,2022 YEAR,A,A,A,A,A,...,0,0,0,14123.720178,16017.881691,12580.731740,0.000000,0.000000,2001,4
3,2460010,101,2460010101,24 PANEL 24,2022 YEAR,A,A,A,A,A,...,0,5288,299,16982.054917,21905.758877,21905.758877,0.000000,0.000000,2038,3
4,2460018,101,2460018101,24 PANEL 24,2022 YEAR,A,A,A,A,A,...,0,10,0,10682.619947,11344.291012,11344.291012,17152.439412,0.000000,2041,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22426,2799694,101,2799694101,27 PANEL 27,2022 YEAR,A,A,A,A,A,...,0,231,0,21490.058502,24757.243908,24757.243908,30689.883735,0.000000,2015,2
22427,2799694,102,2799694102,27 PANEL 27,2022 YEAR,A,A,A,A,A,...,0,79,0,18828.710297,24757.243908,24757.243908,22648.236877,0.000000,2015,2
22428,2799695,101,2799695101,27 PANEL 27,2022 YEAR,A,A,A,A,A,...,0,0,0,15992.756873,18382.130170,18382.130170,20512.235130,0.000000,2045,1
22429,2799698,101,2799698101,27 PANEL 27,2022 YEAR,A,A,A,A,A,...,0,578,0,15771.415106,15430.797933,15430.797933,0.000000,0.000000,2103,3


In [200]:
# Work under a short alias; this is the same object as `meps`, not a copy.
df = meps

# Columns requested for the modelling frame, grouped by theme.
# The order matters: later cells preview each group by column position.
wanted_cols = [
    # ID (keep DUPERSID)
    "DUPERSID",

    # Demographics
    "AGE22X", "SEX", "RACEV2X", "HISPANX", "MARRY22X",
    "EDUCYR", "FTSTU22X", "REGION22",

    # Socioeconomic / Income
    "POVCAT22", "FAMINC22", "TTLP22X", "WAGEP22X", "BUSNP22X", "EMPST53",

    # Self-reported health & mental health
    "RTHLTH53", "MNHLTH53", "IADLHP31", "ADLHLP31",
    "ANYLMI22", "K6SUM42", "PHQ242",

    # Chronic conditions
    "HIBPDX", "CHDDX", "ANGIDX", "MIDX", "STRKDX", "EMPHDX",
    "CHOLDX", "CANCERDX", "ARTHDX", "ARTHTYPE", "ASTHDX",
    "ADHDADDX", "DIABDX_M18",

    # Insurance coverage
    "INSCOV22", "INSURC22", "PRVEV22", "TRIEV22",
    "MCREV22", "MCDEV22", "VAEV22", "UNINS22",

    # Utilization / Other
    "ERTOT22", "IPDIS22", "RXTOT22", "PERWT22F",

    # Target
    "TOTEXP22",
]

# Keep only the requested columns that actually exist, preserving list order.
model_cols = [c for c in wanted_cols if c in df.columns]

# Report anything that was skipped instead of dropping it silently: a missing
# column shifts every positional `.iloc` column slice used on `meps_model`.
missing_cols = [c for c in wanted_cols if c not in df.columns]
if missing_cols:
    print(
        f"WARNING: {len(missing_cols)} requested column(s) not found and skipped: "
        f"{missing_cols}"
    )

# Independent copy so later edits to `meps_model` do not touch `meps` / `df`.
meps_model = df[model_cols].copy()

In [201]:
# Demographics preview: first five rows of the first nine columns
# (the ID column followed by the demographic fields).
# Note: age is top-coded at 85, i.e. a value of 85 means 85 or older.
meps_model.iloc[:5, 0:9]

Unnamed: 0,DUPERSID,AGE22X,SEX,RACEV2X,HISPANX,MARRY22X,EDUCYR,FTSTU22X,REGION22
0,2460002101,77,2 FEMALE,2 BLACK - NO OTHER RACE REPORTED,2 NOT HISPANIC,2 WIDOWED,6,-1 INAPPLICABLE,2 MIDWEST
1,2460006101,64,2 FEMALE,1 WHITE - NO OTHER RACE REPORTED,2 NOT HISPANIC,3 DIVORCED,14 2 YEARS COLLEGE,-1 INAPPLICABLE,2 MIDWEST
2,2460006102,67,1 MALE,1 WHITE - NO OTHER RACE REPORTED,2 NOT HISPANIC,3 DIVORCED,17 5+ YEARS COLLEGE,-1 INAPPLICABLE,2 MIDWEST
3,2460010101,29,1 MALE,12 MULTIPLE RACES REPORTED,2 NOT HISPANIC,5 NEVER MARRIED,16 4 YEARS COLLEGE,-1 INAPPLICABLE,4 WEST
4,2460018101,51,2 FEMALE,1 WHITE - NO OTHER RACE REPORTED,2 NOT HISPANIC,3 DIVORCED,16 4 YEARS COLLEGE,-1 INAPPLICABLE,1 NORTHEAST


In [202]:
# Socioeconomic / income preview: first five rows, column positions 9-14.
meps_model.iloc[:5, 9:15]

Unnamed: 0,POVCAT22,FAMINC22,TTLP22X,WAGEP22X,BUSNP22X,EMPST53
0,3 LOW INCOME,22000,22000,0,0,4 NOT EMPLOYED DURING RD 5/3
1,1 POOR/NEGATIVE,5000,5000,0,-4000,4 NOT EMPLOYED DURING RD 5/3
2,4 MIDDLE INCOME,30740,30740,20240,0,3 JOB DURING RD 5/3 REF PERIOD
3,5 HIGH INCOME,106483,106483,106483,0,1 EMPLOYED AT RD 5/3 INT DATE
4,4 MIDDLE INCOME,45000,45000,45000,0,1 EMPLOYED AT RD 5/3 INT DATE


In [203]:
# Self-reported health & mental health preview: first five rows,
# column positions 15-21.
meps_model.iloc[:5, 15:22]

Unnamed: 0,RTHLTH53,MNHLTH53,IADLHP31,ADLHLP31,ANYLMI22,K6SUM42,PHQ242
0,3 GOOD,3 GOOD,1 YES,2 NO,1 YES,12,2
1,5 POOR,1 EXCELLENT,2 NO,2 NO,1 YES,-1 INAPPLICABLE,-1 INAPPLICABLE
2,1 EXCELLENT,1 EXCELLENT,2 NO,2 NO,2 NO,-1 INAPPLICABLE,-1 INAPPLICABLE
3,4 FAIR,3 GOOD,2 NO,2 NO,2 NO,-1 INAPPLICABLE,-1 INAPPLICABLE
4,2 VERY GOOD,4 FAIR,2 NO,2 NO,2 NO,5,1


In [204]:
# Chronic conditions preview: first five rows, column positions 22-34.
meps_model.iloc[:5, 22:35]

Unnamed: 0,HIBPDX,CHDDX,ANGIDX,MIDX,STRKDX,EMPHDX,CHOLDX,CANCERDX,ARTHDX,ARTHTYPE,ASTHDX,ADHDADDX,DIABDX_M18
0,1 YES,2 NO,2 NO,2 NO,1 YES,2 NO,1 YES,1 YES,1 YES,3 NOT SPECIFIED,2 NO,-1 INAPPLICABLE,1 YES
1,1 YES,2 NO,2 NO,1 YES,2 NO,2 NO,2 NO,1 YES,1 YES,2 OSTEOARTHRITIS,2 NO,-1 INAPPLICABLE,2 NO
2,2 NO,2 NO,2 NO,2 NO,2 NO,2 NO,2 NO,2 NO,2 NO,-1 INAPPLICABLE,2 NO,-1 INAPPLICABLE,2 NO
3,2 NO,2 NO,2 NO,2 NO,2 NO,2 NO,2 NO,2 NO,2 NO,-1 INAPPLICABLE,2 NO,-1 INAPPLICABLE,2 NO
4,2 NO,2 NO,2 NO,2 NO,2 NO,2 NO,1 YES,2 NO,1 YES,2 OSTEOARTHRITIS,1 YES,-1 INAPPLICABLE,2 NO


In [205]:
# Insurance coverage preview: first five rows, column positions 35-42.
meps_model.iloc[:5, 35:43]

Unnamed: 0,INSCOV22,INSURC22,PRVEV22,TRIEV22,MCREV22,MCDEV22,VAEV22,UNINS22
0,2 PUBLIC ONLY,6 65+ EDITED MEDICARE AND OTH PUB ONLY,2 NO,2 NO,1 YES,1 YES,2 NO,2 NO
1,2 PUBLIC ONLY,2 <65 PUBLIC ONLY,2 NO,2 NO,1 YES,1 YES,2 NO,2 NO
2,2 PUBLIC ONLY,6 65+ EDITED MEDICARE AND OTH PUB ONLY,2 NO,2 NO,1 YES,1 YES,2 NO,2 NO
3,1 ANY PRIVATE,1 <65 ANY PRIVATE,1 YES,2 NO,2 NO,2 NO,2 NO,2 NO
4,1 ANY PRIVATE,1 <65 ANY PRIVATE,1 YES,2 NO,2 NO,2 NO,2 NO,2 NO


In [209]:
# Utilization / weight preview: first five rows, column positions 43-46.
meps_model.iloc[:5, 43:47]

Unnamed: 0,ERTOT22,IPDIS22,RXTOT22,PERWT22F
0,0,0,40,5728.309495
1,2,0,43,15648.881461
2,0,0,1,14123.720178
3,0,0,36,16982.054917
4,0,0,23,10682.619947


In [210]:
# Target preview: first five rows of TOTEXP22, kept as a one-column DataFrame
# (list selector) so it renders as a table rather than a Series.
meps_model.head()[["TOTEXP22"]]

Unnamed: 0,TOTEXP22
0,15766
1,12697
2,3405
3,9265
4,3362
