In [None]:
import os
import re
from typing import List, Dict

import pandas as pd
from IPython.display import display

In [2]:
def convert_stata_file(input_path, output_format='csv', output_path=None):
    """Convert a Stata ``.dta`` file to CSV or pickle and return the output path.

    Args:
        input_path: Path of the Stata file to read; it must already exist.
        output_format: ``'csv'`` or ``'pkl'``, compared case-insensitively.
        output_path: Destination file. When ``None`` the destination is
            ``input_path`` with its extension replaced by the lower-cased
            ``output_format``.

    Returns:
        The path the converted file was written to.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
        ValueError: If ``output_format`` is neither ``'csv'`` nor ``'pkl'``.
        Exception: Any error raised while reading or writing is printed and
            then re-raised unchanged.
    """
    # The existence check deliberately comes first, before the format is inspected.
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input file not found: {input_path}")

    fmt = output_format.lower()
    if fmt not in ('csv', 'pkl'):
        raise ValueError("Output format must be 'csv' or 'pkl'")

    if output_path is None:
        stem, _ext = os.path.splitext(input_path)
        output_path = f"{stem}.{fmt}"

    try:
        print(f"Reading Stata file: {input_path}")
        frame = pd.read_stata(input_path)

        if fmt == 'csv':
            print(f"Converting to CSV: {output_path}")
            # index=False: the row index is not written as a CSV column.
            frame.to_csv(output_path, index=False)
        else:
            print(f"Converting to pickle: {output_path}")
            frame.to_pickle(output_path)

        print(f"Conversion complete. File saved as: {output_path}")
    except Exception as err:
        # Report the failure, then let the caller see the original exception.
        print(f"Error during conversion: {str(err)}")
        raise

    return output_path
    

Reading Stata file: /Users/michaelzhu/Desktop/Stanford/CS229/randhrs1992_2020v2_STATA/randhrs1992_2020v2.dta
Converting to pickle: /Users/michaelzhu/Desktop/Stanford/CS229/randhrs1992_2020v2_STATA/randhrs1992_2020v2.pkl
Conversion complete. File saved as: /Users/michaelzhu/Desktop/Stanford/CS229/randhrs1992_2020v2_STATA/randhrs1992_2020v2.pkl


In [None]:
# One-off conversion of the Stata file to a pickle next to it.
# Arguments, in order: input_path, output_format, output_path.
# Left as the cell's final expression so the returned path is displayed.
convert_stata_file(
    "randhrs1992_2020v2_STATA/randhrs1992_2020v2.dta",
    "pkl",
    "randhrs1992_2020v2_STATA/randhrs1992_2020v2.pkl",
)

In [3]:
# Load the pickled frame and report its dimensions.
# NOTE: unpickling can execute arbitrary code; only load pickle files you created yourself.
df = pd.read_pickle("randhrs1992_2020v2_STATA/randhrs1992_2020v2.pkl")
print(f"{df.shape[0]} rows and {df.shape[1]} columns")

In [6]:
def get_selected_vars(df: pd.DataFrame, wave: int, prefix: str) -> List[str]:
    """Return the column names to extract for one wave and one variable prefix.

    The result always begins with ``'hhidpn'`` and ``f'inw{wave}'`` (whether or
    not they exist in ``df``), followed, in ``df`` column order, by every
    column whose name starts with either

    * ``{prefix}{wave}`` immediately followed by a letter, or
    * ``{prefix}a`` immediately followed by a letter (presumably the
      variables that are not wave-specific -- confirm against the codebook).

    Requiring a letter after the wave number keeps wave ``1`` from also
    picking up ``10``..``15``. Both alternatives are anchored at the start of
    the name: the wave pattern used to be matched anywhere in the name, so a
    column such as ``sr1shlt`` was wrongly selected for prefix ``r``, wave 1.

    Args:
        df: Frame whose column names are scanned (names are expected to be
            strings).
        wave: Wave number embedded in the wave-specific column names.
        prefix: Leading text of the variable family; it is matched literally.

    Returns:
        List of column names, starting with the two key columns.
    """
    # One compiled, start-anchored pattern (re.match) covers both families.
    wanted = re.compile(rf"{re.escape(prefix)}(?:{wave}|a)[A-Za-z]")

    selected_vars = ['hhidpn', f'inw{wave}']
    selected_vars.extend(
        var for var in df.columns.tolist() if wanted.match(var)
    )
    return selected_vars

def process_wave_data(df: pd.DataFrame, wave: int, prefix: str) -> pd.DataFrame:
    """Extract one wave of one variable family with wave-free column names.

    Steps:
      1. Pick the columns returned by ``get_selected_vars(df, wave, prefix)``.
      2. Keep only the rows where ``inw{wave} == 1``.
      3. Add a ``wave`` column holding ``wave``.
      4. Rename every selected column except ``hhidpn`` and ``inw{wave}`` by
         replacing ``{prefix}{wave}`` with ``{prefix}``.
      5. For prefixes ``'r'`` and ``'h'`` drop ``inw{wave}``; it is kept for
         every other prefix.

    For prefix ``'s'`` the columns ``s{wave}tr20`` / ``s{wave}tr40`` come out
    as ``srtr20`` / ``srtr40``. Previously that special case was computed and
    then unconditionally overwritten by the generic rename, so those columns
    ended up as ``str20`` / ``str40``. (Presumably the extra ``r`` exists
    because ``str20``/``str40`` are storage-type names in Stata -- confirm.)

    Args:
        df: Wide frame containing ``hhidpn``, ``inw{wave}`` and the variables.
        wave: Wave number to extract.
        prefix: Variable-family prefix (the notebook uses ``'s'``, ``'r'``, ``'h'``).

    Returns:
        A new frame; ``df`` itself is not modified.

    Raises:
        KeyError: If ``inw{wave}`` or a selected column is missing from ``df``.
    """
    inw_col = f'inw{wave}'
    selected_vars = get_selected_vars(df, wave, prefix)

    # Filter rows and pick columns in one step; .copy() detaches the result
    # so that adding the 'wave' column cannot touch the caller's frame.
    wave_df = df.loc[df[inw_col] == 1, selected_vars].copy()
    wave_df['wave'] = wave

    wave_prefix = f'{prefix}{wave}'
    special_vars = (f's{wave}tr20', f's{wave}tr40')

    rename_dict = {}
    for var in selected_vars:
        if var in ('hhidpn', inw_col):
            continue  # key columns keep their names

        new_name = var
        if prefix == 's' and var in special_vars:
            # Special case for tr20 and tr40: insert the extra 'r' first ...
            new_name = new_name.replace(f's{wave}tr', f's{wave}rtr')
        # ... then strip the wave number, so both renames take effect.
        rename_dict[var] = new_name.replace(wave_prefix, prefix)

    wave_df = wave_df.rename(columns=rename_dict)

    # Drop inw column
    if prefix in ('r', 'h'):
        wave_df = wave_df.drop(columns=[inw_col])

    return wave_df
    

In [None]:
# Work on the wide frame loaded from the pickle.
hrs_data = df

prefixes = ['s', 'r', 'h']

# For each prefix, stack waves 1-15 row-wise into a single long frame.
combined_data = {}
for prefix in prefixes:
    prefix_data = [
        process_wave_data(hrs_data, wave, prefix) for wave in range(1, 16)
    ]
    combined_data[prefix] = pd.concat(prefix_data, axis=0, ignore_index=True)

# Start from the 's' frame and inner-join the 'r' and 'h' frames on
# (hhidpn, wave). validate='1:1' makes pandas raise if that key pair is
# duplicated on either side of a join.
final_data = combined_data['s']
for prefix in ['r', 'h']:
    final_data = final_data.merge(
        combined_data[prefix],
        on=['hhidpn', 'wave'],
        how='inner',
        validate='1:1',
    )

# Persist the merged long-format frame.
final_data.to_pickle('randhrs1992_2020v2_STATA/randhrs1992_2020v2_long_compact.pkl')