# Example data parsing

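This notebook shows how to read the fixed-width text files from the NHEFS study, using the accompanying SAS input-statement files to determine the column layout, and load each dataset into a pandas DataFrame.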




In [None]:
import os
import re
import pandas as pd

def _parse_sas_layout(sas_file_path):
    """Collect the column layout declared in a SAS input-statement file.

    Parameters
    ----------
    sas_file_path : str or path-like
        Text file containing SAS variable definitions.

    Returns
    -------
    list of tuple
        ``(start_pos, width, var_name)`` for every matching line, sorted by
        ``start_pos``. ``start_pos`` is the 1-based column given after ``@``.
    """
    # SAS code lines defining variables look like:
    #  @1  VAR1  $CHAR3.
    # or
    #  @1  VAR1  3.
    pattern = re.compile(r'@(\d+)\s+(\w+)\s+(\$?CHAR\d+|\d+\.?)', re.IGNORECASE)

    layout = []
    with open(sas_file_path, 'r') as f:
        for line in f:
            match = pattern.search(line)
            if not match:
                continue
            start_pos = int(match.group(1))
            var_name = match.group(2)
            # Character and numeric formats both carry the width as their
            # first run of digits.
            width = int(re.search(r'\d+', match.group(3)).group())
            layout.append((start_pos, width, var_name))

    # The SAS code might not list variables in column order.
    layout.sort(key=lambda item: item[0])
    return layout


def parse_sas_code(sas_file_path):
    """Return the column widths and names defined in a SAS input file.

    Parameters
    ----------
    sas_file_path : str or path-like
        Text file containing SAS variable definitions such as
        ``@1  VAR1  $CHAR3.`` or ``@1  VAR1  3.``.

    Returns
    -------
    tuple of (list of int, list of str)
        ``(widths, names)``, both ordered by the variables' start positions.
    """
    layout = _parse_sas_layout(sas_file_path)
    widths = [width for _, width, _ in layout]
    names = [name for _, _, name in layout]
    return widths, names


def load_data(txt_path, sas_code_path):
    """Load a fixed-width text file using the layout from a SAS input file.

    Columns are located by the explicit start positions in the SAS code
    rather than by laying the widths end to end, so gaps between variables
    do not shift the columns that follow.

    Parameters
    ----------
    txt_path : str or path-like
        Fixed-width data file.
    sas_code_path : str or path-like
        SAS input-statement file describing the columns of ``txt_path``.

    Returns
    -------
    pandas.DataFrame
        One column per variable found in the SAS code.

    Raises
    ------
    ValueError
        If no variable definitions are found in ``sas_code_path``.
    """
    layout = _parse_sas_layout(sas_code_path)
    if not layout:
        raise ValueError(f"No variable definitions found in {sas_code_path}")

    # SAS start positions are 1-based; read_fwf expects 0-based half-open
    # (start, end) intervals.
    colspecs = [(start - 1, start - 1 + width) for start, width, _ in layout]
    names = [name for _, _, name in layout]
    return pd.read_fwf(txt_path, colspecs=colspecs, names=names)


#
# NHEFS datasets
#
# Maps each dataset name to (fixed-width data file, SAS layout file).
# Both files live in a folder named after the dataset, so the paths are
# assembled from (folder, data file, layout file) triples.
datasets = {
    folder: (f"{folder}/{data_file}", f"{folder}/{layout_file}")
    for folder, data_file, layout_file in (
        ("1992_Nhefs_Vital_and_Tracing_Status", "n92vitl.txt", "vitl.inputs.labels.txt"),
        ("1992_Nhefs_Mortality", "N92mort.txt", "mort.inputs.labels.txt"),
        ("1992_Nhefs_Interview", "N92int.txt", "intv92.inputs.labels.txt"),
        ("1987_Nhefs_Interview", "n87int.txt", "intv87.inputs.labels.txt"),
        ("1986_Nhefs_Interview", "n86int.txt", "intv86.inputs.labels.txt"),
        ("1982_1984_Nhefs_Interview", "N82int.txt", "intv82.inputs.labels.txt"),
        ("1992_Nhefs_Health_Care_Facility_Stay", "n92hcfs.txt", "hcfs92.inputs.labels.txt"),
        ("1987_Nhefs_Health_Care_Facility_Stay", "n87hcfs.txt", "hcfs87.inputs.labels.txt"),
        ("1986_Nhefs_Health_Care_Facility_Stay", "n86hcfs.txt", "hcfs86.inputs.labels.txt"),
        ("1982_1984_Nhefs_Revised_Health_Care_Facility_Stay", "n82hcfs.txt", "hcfs82.inputs.labels.txt"),
        ("Nhefs_Supplemental_Health_Care_Facility_Stay", "n92hcfsp.txt", "hcfssup.inputs.labels.txt"),
    )
}

# Parse every dataset listed in `datasets` and keep the resulting frames,
# keyed by dataset name.
dataframes = {}
for ds in datasets:
    txt_file, sas_file = datasets[ds]
    print(f"Loading {ds} ...")
    df = load_data(txt_file, sas_file)
    print(f"{ds} loaded: {df.shape} rows x cols")
    dataframes[ds] = df




Loading 1992_Nhefs_Vital_and_Tracing_Status ...
1992_Nhefs_Vital_and_Tracing_Status loaded: (14407, 116) rows x cols
Loading 1992_Nhefs_Mortality ...
1992_Nhefs_Mortality loaded: (4497, 84) rows x cols
Loading 1992_Nhefs_Interview ...
1992_Nhefs_Interview loaded: (9281, 1870) rows x cols
Loading 1987_Nhefs_Interview ...
1987_Nhefs_Interview loaded: (9998, 1010) rows x cols
Loading 1986_Nhefs_Interview ...
1986_Nhefs_Interview loaded: (3608, 841) rows x cols
Loading 1982_1984_Nhefs_Interview ...
1982_1984_Nhefs_Interview loaded: (12220, 1663) rows x cols
Loading 1992_Nhefs_Health_Care_Facility_Stay ...
1992_Nhefs_Health_Care_Facility_Stay loaded: (10535, 104) rows x cols
Loading 1987_Nhefs_Health_Care_Facility_Stay ...
1987_Nhefs_Health_Care_Facility_Stay loaded: (7361, 104) rows x cols
Loading 1986_Nhefs_Health_Care_Facility_Stay ...
1986_Nhefs_Health_Care_Facility_Stay loaded: (5405, 109) rows x cols
Loading 1982_1984_Nhefs_Revised_Health_Care_Facility_Stay ...
1982_1984_Nhefs_Revised

In [3]:
# Preview the first five rows of a single dataset, looked up by its key.
print(dataframes['1992_Nhefs_Vital_and_Tracing_Status'].head(n=5))

   BLANK1  SEQNUM  CVITALST  DTLKAMO  DTLKADY  DTLKAYR  SCVSDKLA  NDOBMO  \
0     NaN       1         1        6       16       92         4      10   
1     NaN       2         1        6       15       92         4       9   
2     NaN       3         4        4       28       71         9       1   
3     NaN       7         3        8       28       86         3      12   
4     NaN       8         1        9        3       92         4      10   

   BLANK2  NDOBYR  ...  RVSTRTA1  REVPSU1  RVSTRTA2  REVPSU2  POVARIND  \
0     NaN    1931  ...       3.0      1.0         3        1         1   
1     NaN    1909  ...       3.0      1.0         3        1         1   
2     NaN    1946  ...       3.0      3.0         3        3         1   
3     NaN    1935  ...       3.0      1.0         3        1         1   
4     NaN    1935  ...       3.0      1.0         3        1         2   

   STRSNEXI  STRS82  STRS86  STRS87  STRS92  
0        PA      PA      98      PA      PA  
1     

In [None]:
# Show the first rows of every loaded dataset, each under its own heading
# and followed by blank lines as a separator.
for ds_name in dataframes:
    df = dataframes[ds_name]
    print(f"\n{ds_name} DataFrame Head:", df.head(), "\n\n", sep="\n")


1992_Nhefs_Vital_and_Tracing_Status DataFrame Head:
   BLANK1  SEQNUM  CVITALST  DTLKAMO  DTLKADY  DTLKAYR  SCVSDKLA  NDOBMO  \
0     NaN       1         1        6       16       92         4      10   
1     NaN       2         1        6       15       92         4       9   
2     NaN       3         4        4       28       71         9       1   
3     NaN       7         3        8       28       86         3      12   
4     NaN       8         1        9        3       92         4      10   

   BLANK2  NDOBYR  NEXAMMO  NEXAMDY  NEXAMYR  NEXAMAGE  STRATA  PSU  SEXSUBJ  \
0     NaN    1931        5       20       71        39       3    2        2   
1     NaN    1909        5       22       71        61       3    2        1   
2     NaN    1946        4       28       71        25       3   35        2   
3     NaN    1935        5       19       71        35       3   39        2   
4     NaN    1935        4       28       71        35       3   84        2   

   REVRAC