# NYTD Data Validation and Cleaning - All-in-One Notebook

This notebook combines configuration, S3 data loading, validation, and processing logic for NYTD Outcomes data.

- **Source S3 Bucket:** `bdc-public-raw/ndacan/nytd/outcomes/`
- **Supported Datasets:** 202, 228, 266, 297
- **Framework:** Belmont Data Collaborative

In [2]:
"""
NYTD Dataset {DATASET_NUMBER} Processing Documentation
Cohort Year: {COHORT_YEAR}
Processing Date: {DATE}
Version: 1.0.0
"""

'\nNYTD Dataset {DATASET_NUMBER} Processing Documentation\nCohort Year: {COHORT_YEAR}\nProcessing Date: {DATE}\nVersion: 1.0.0\n'

In [35]:
# 1. Imports and Setup
import pandas as pd
import numpy as np
import boto3
import io
import json
from datetime import datetime
import logging
from pathlib import Path
import re
from typing import Dict, List, Optional
# light version tag for your cleaning code
__version__ = "1.0.0"
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
s3 = boto3.client('s3')

In [2]:
# 3. NYTD Dataset Configuration


NYTD_DATASETS = {
    '202': {'cohort_year': 2011, 'waves': [1,2,3], 'has_fips5': False},
    '228': {'cohort_year': 2014, 'waves': [1,2,3], 'has_fips5': False},
    '266': {'cohort_year': 2017, 'waves': [1,2,3], 'has_fips5': True},
    '297': {'cohort_year': 2020, 'waves': [1,2,3], 'has_fips5': True},
}

# These are the actual columns in the .tab.gz files:
# 1) Core & Outcome columns (unchanged)
NYTD_CORE_COLUMNS = [
    'Wave','StFCID','StFIPS','St','RecNumbr','RepDate','DOB','Sex','AmIAKN',
    'Asian','BlkAfrAm','HawaiiPI','White','RaceUnkn','RaceDcln',
    'HisOrgin','OutcmRpt','OutcmDte','OutcmFCS'
]

NYTD_OUTCOME_VARIABLES = [
    'CurrFTE','CurrPTE','EmplySklls','SocSecrty','EducAid','PubFinAs',
    'PubFoodAs','PubHousAs','OthrFinAs','HighEdCert','CurrenRoll',
    'CnctAdult','Homeless','SubAbuse','Incarc','Children','Marriage',
    'Medicaid','OthrHlthIn','MedicalIn','MentlHlthIn','PrescripIn'
]

# 2) Base derived (present in every cohort)
NYTD_BASE_DERIVED = ['Baseline','Elig19','Elig21','SampleState','InSample','Responded']






In [5]:
# 2. S3 Utilities & Data Loader

def __init__(self, raw_bucket: str = 'bdc-public-raw', 
                 curated_bucket: str = 'bdc-public-curated'):
        self.raw_bucket = raw_bucket
        self.curated_bucket = curated_bucket
        self.s3 = boto3.client('s3')



In [33]:
class NYTDDataLoader:
    def __init__(self, raw_bucket='bdc-public-raw', curated_bucket='bdc-public-curated'):
        self.raw_bucket = raw_bucket
        self.curated_bucket = curated_bucket
        self.s3 = boto3.client('s3')

    def list_s3_files(self, bucket, prefix="", suffix=None):
        """List files in S3 bucket"""
        try:
            paginator = self.s3.get_paginator('list_objects_v2')
            keys = []
            for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
                for obj in page.get('Contents', []):
                    key = obj['Key']
                    if not suffix or key.lower().endswith(suffix.lower()):
                        keys.append(key)
            print(f"Found {len(keys)} files in s3://{bucket}/{prefix}")
            return keys
        except Exception as e:
            print(f"Error listing files: {e}")
            return []

    def load_gz_from_s3(self, bucket, key):
        try:
            obj = self.s3.get_object(Bucket=bucket, Key=key)
            compressed = io.BytesIO(obj['Body'].read())
            df = pd.read_csv(compressed, sep='\t', compression='gzip', dtype=str)
            print(f"Loaded {df.shape[0]} rows, {df.shape[1]} columns from {key}")
            return df
        except Exception as e:
            print(f"Error loading {key}: {e}")
            return None

    def get_expected_columns(self, dataset_number):
        """Get expected columns for dataset"""
        if dataset_number not in NYTD_DATASETS:
            return []
        meta = NYTD_DATASETS[dataset_number]
        cols = NYTD_CORE_COLUMNS + NYTD_OUTCOME_VARIABLES + NYTD_BASE_DERIVED
        yy = str(meta['cohort_year'])[-2:]
        cols.append(f"FY{yy}Cohort")
        if meta.get('has_fips5'):
            cols += ['Race', 'RaceEthn', 'FIPS5']
        return cols


In [19]:
def clean_data(df, dataset_number):
    """Clean the dataset - FIXED VERSION"""
    df_clean = df.copy()
    
    print(f"Starting cleaning for dataset {dataset_number}")
    print(f"Original shape: {df_clean.shape}")
    
    # 1. Handle missing values
    print("Step 1: Handling missing values...")
    for col in df_clean.columns:
        if df_clean[col].isna().sum() == 0:
            continue
        
        # Numeric columns
        if col in ['Wave', 'StFIPS', 'RecNumbr']:
            df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')
            df_clean[col] = df_clean[col].fillna(df_clean[col].median())
        else:
            # Categorical - use mode or 'Unknown'
            mode = df_clean[col].mode()
            fill_value = mode.iloc[0] if len(mode) > 0 else "Unknown"
            df_clean[col] = df_clean[col].fillna(fill_value)
    
    # 2. Standardize variables
    print("Step 2: Standardizing variables...")
    if "Sex" in df_clean.columns:
        df_clean["Sex"] = df_clean["Sex"].str.strip().str.upper()
    
    if "St" in df_clean.columns:
        df_clean["St"] = df_clean["St"].str.strip().str.upper()
    
    # Standardize Yes/No outcome variables
    outcome_cols = [col for col in NYTD_OUTCOME_VARIABLES if col in df_clean.columns]
    for col in outcome_cols:
        df_clean[col] = df_clean[col].str.strip().str.upper()
    
    # 3. Validate data
    print("Step 3: Validating data...")
    meta = NYTD_DATASETS[dataset_number]
    
    # Filter valid waves
    if "Wave" in df_clean.columns:
        df_clean["Wave"] = pd.to_numeric(df_clean["Wave"], errors='coerce')
        valid_waves = df_clean["Wave"].isin(meta["waves"])
        df_clean = df_clean[valid_waves]
        print(f"Filtered to valid waves {meta['waves']}: {valid_waves.sum()} records")
    
    # 4. Fix date formats - THE KEY FIX
    print("Step 4: Fixing date formats...")
    
    # Fix RepDate - it's in YYYYMM.0 format
    if "RepDate" in df_clean.columns:
        def fix_repdate(date_val):
            if pd.isna(date_val):
                return pd.NaT
            try:
                date_str = str(date_val).replace('.0', '')
                if len(date_str) == 6:  # YYYYMM format
                    year = date_str[:4]
                    month = date_str[4:6]
                    return pd.to_datetime(f"{year}-{month}-01")
                return pd.NaT
            except:
                return pd.NaT
        
        df_clean["RepDate"] = df_clean["RepDate"].apply(fix_repdate)
        print(f"Fixed RepDate format: {df_clean['RepDate'].notna().sum()} valid dates")
    
    # Handle other dates
    if "DOB" in df_clean.columns:
        df_clean["DOB"] = pd.to_datetime(df_clean["DOB"], errors="coerce")
    
    if "OutcmDte" in df_clean.columns:
        df_clean["OutcmDte"] = pd.to_datetime(df_clean["OutcmDte"], errors="coerce")
    
    # Only remove records that were originally missing RepDate
    if "RepDate" in df_clean.columns:
        before_filter = len(df_clean)
        original_repdate_missing = df["RepDate"].isna()
        df_clean = df_clean[~original_repdate_missing]
        print(f"Removed {before_filter - len(df_clean)} records with originally missing RepDate")
    
    # Add metadata
    df_clean["processed_date"] = datetime.now().strftime("%Y-%m-%d")
    
    print(f"Final shape: {df_clean.shape}")
    return df_clean

In [11]:
def save_to_s3_csv(df, bucket, key):
    """Save DataFrame as CSV to S3"""
    try:
        csv_buffer = io.StringIO()
        df.to_csv(csv_buffer, index=False)
        s3.put_object(Bucket=bucket, Key=key, Body=csv_buffer.getvalue())
        print(f"Saved to s3://{bucket}/{key}")
        return True
    except Exception as e:
        print(f"Error saving to S3: {e}")
        return False

print("Step 3: Helper functions loaded")

Step 3: Helper functions loaded


In [12]:
# Function to just load and peek at data without processing
def peek_dataset(dataset_number, raw_bucket='bdc-public-raw'):
    """Just load and show head() of a dataset"""
    print(f"\n{'='*40}")
    print(f"PEEKING AT DATASET {dataset_number}")
    print(f"{'='*40}")
    
    # Find the file
    prefix = 'ndacan/nytd/outcomes/'
    files = list_s3_files(raw_bucket, prefix, suffix='.tab.gz')
    
    file_patterns = {
        '202': 'outcomes_C14.tab.gz',
        '228': 'outcomes_C14.tab.gz', 
        '266': 'outcomes_C17.tab.gz',
        '297': 'Outcomes20_w3.tab.gz'
    }
    
    target_file = None
    if dataset_number in file_patterns:
        expected_pattern = file_patterns[dataset_number]
        for file_key in files:
            if f'/{dataset_number}/' in file_key and expected_pattern in file_key:
                target_file = file_key
                break
    
    if not target_file:
        print(f"No file found for dataset {dataset_number}")
        return None
    
    # Load and show data
    df = load_gz_from_s3(raw_bucket, target_file)
    if df is not None:
        print(f"Dataset shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")
        print("\nFirst 5 rows:")
        print(df.head())
        print("\nData types:")
        print(df.dtypes.head(10))
        return df
    return None

# Peek at each dataset
for dataset_num in ['202', '228', '266', '297']:
    df = peek_dataset(dataset_num)
    if df is not None:
        print(f"\nüìä Dataset {dataset_num} sample:")
        print(df.head(3))  # Show just 3 rows for brevity


PEEKING AT DATASET 202
Found 5 files in s3://bdc-public-raw/ndacan/nytd/outcomes/
Loaded 58231 rows, 48 columns from ndacan/nytd/outcomes/202/outcomes_C14.tab.gz
Dataset shape: (58231, 48)
Columns: ['Wave', 'StFCID', 'StFIPS', 'St', 'RecNumbr', 'RepDate', 'DOB', 'Sex', 'AmIAKN', 'Asian', 'BlkAfrAm', 'HawaiiPI', 'White', 'RaceUnkn', 'RaceDcln', 'HisOrgin', 'OutcmRpt', 'OutcmDte', 'OutcmFCS', 'CurrFTE', 'CurrPTE', 'EmplySklls', 'SocSecrty', 'EducAid', 'PubFinAs', 'PubFoodAs', 'PubHousAs', 'OthrFinAs', 'HighEdCert', 'CurrenRoll', 'CnctAdult', 'Homeless', 'SubAbuse', 'Incarc', 'Children', 'Marriage', 'Medicaid', 'OthrHlthIn', 'MedicalIn', 'MentlHlthIn', 'PrescripIn', 'Baseline', 'FY11Cohort', 'Elig19', 'Elig21', 'SampleState', 'InSample', 'Responded']

First 5 rows:
  Wave          StFCID StFIPS  St      RecNumbr   RepDate         DOB Sex  \
0    1  AK450290395006      2  AK  450290395006  201103.0  1993-10-15   2   
1    1  AK450448396586      2  AK  450448396586  201103.0  1993-12-15   

In [13]:
# Main processing function
def process_dataset(dataset_number, raw_bucket='bdc-public-raw', curated_bucket='bdc-public-curated'):
    """Process a single dataset"""
    print(f"\n{'='*60}")
    print(f"PROCESSING DATASET {dataset_number}")
    print(f"{'='*60}")
    
    # Find the file
    prefix = 'ndacan/nytd/outcomes/'
    files = list_s3_files(raw_bucket, prefix, suffix='.tab.gz')
    
    # Map dataset numbers to their actual file patterns
    file_patterns = {
        '202': 'outcomes_C14.tab.gz',
        '228': 'outcomes_C14.tab.gz', 
        '266': 'outcomes_C17.tab.gz',
        '297': 'Outcomes20_w3.tab.gz'
    }
    
    target_file = None
    if dataset_number in file_patterns:
        expected_pattern = file_patterns[dataset_number]
        # Look for files that contain the dataset number in the path AND match the pattern
        for file_key in files:
            if f'/{dataset_number}/' in file_key and expected_pattern in file_key:
                target_file = file_key
                break
    
    if not target_file:
        print(f"No file found for dataset {dataset_number}")
        print("Available files:")
        for f in files:
            if f.endswith('.tab.gz'):
                print(f"  {f}")
        return None
    
    print(f"Found file: {target_file}")
    
    # Load data
    df = load_gz_from_s3(raw_bucket, target_file)
    if df is None:
        return None
    
    # Check expected columns
    expected_cols = get_expected_columns(dataset_number)
    missing_cols = [col for col in expected_cols if col not in df.columns]
    extra_cols = [col for col in df.columns if col not in expected_cols]
    
    print(f"Expected columns: {len(expected_cols)}")
    print(f"Actual columns: {len(df.columns)}")
    if missing_cols:
        print(f"Missing columns: {missing_cols}")
    if extra_cols:
        print(f"Extra columns: {len(extra_cols)} (first 5: {extra_cols[:5]})")
    
    # Analyze missing values
    print("\nMissing value analysis:")
    missing_report = analyze_missing_values(df)
    print(missing_report.head(10))
    
    # Clean data
    df_clean = clean_data(df, dataset_number)
    
    # Save to S3
    output_key = f"nytd/outcomes/cleaned/nytd_outcomes_{dataset_number}_cleaned.csv"
    success = save_to_s3_csv(df_clean, curated_bucket, output_key)
    
    # Save locally as backup
    local_file = f"nytd_outcomes_{dataset_number}_cleaned.csv"
    df_clean.to_csv(local_file, index=False)
    print(f"Also saved locally: {local_file}")
    
    # Create summary
    summary = {
        "dataset": dataset_number,
        "cohort_year": NYTD_DATASETS[dataset_number]['cohort_year'],
        "processed_date": datetime.now().isoformat(),
        "original_shape": df.shape,
        "cleaned_shape": df_clean.shape,
        "missing_columns": missing_cols,
        "records_removed": df.shape[0] - df_clean.shape[0],
        "s3_saved": success,
        "local_file": local_file
    }
    
    # Save summary
    summary_file = f"processing_summary_{dataset_number}.json"
    with open(summary_file, 'w') as f:
        json.dump(summary, f, indent=2, default=str)
    
    print(f"\nProcessing complete! Summary saved to {summary_file}")
    return summary

print("‚úÖ Step 4: Main processing function loaded")
print("\nReady to process! Use: process_dataset('202') to start with dataset 202")
print("Available datasets:", list(NYTD_DATASETS.keys()))

‚úÖ Step 4: Main processing function loaded

Ready to process! Use: process_dataset('202') to start with dataset 202
Available datasets: ['202', '228', '266', '297']


In [14]:
def simple_process_dataset(dataset_number):
    """Simplified version to identify the bottleneck, now only TN rows."""
    print(f"üîÑ Processing {dataset_number}...")
    
    # Step 1: Load data
    print("  Step 1: Loading data...")
    prefix = 'ndacan/nytd/outcomes/'
    files = list_s3_files('bdc-public-raw', prefix, suffix='.tab.gz')
    
    file_patterns = {
        '202': 'outcomes_C14.tab.gz',
        '228': 'outcomes_C14.tab.gz', 
        '266': 'outcomes_C17.tab.gz',
        '297': 'Outcomes20_w3.tab.gz'
    }
    
    target_file = None
    if dataset_number in file_patterns:
        expected_pattern = file_patterns[dataset_number]
        for file_key in files:
            if f'/{dataset_number}/' in file_key and expected_pattern in file_key:
                target_file = file_key
                break
    
    if not target_file:
        print("  ‚ùå No file found")
        return None
    
    df = load_gz_from_s3('bdc-public-raw', target_file)
    if df is None:
        print("  ‚ùå Failed to load data")
        return None
    
    print(f"  ‚úÖ Loaded {df.shape[0]} rows")
    
    # Step 2: Basic cleaning + filter to TN only
    print("  Step 2: Basic cleaning + filtering to TN...")
    df_clean = df.copy()
    
    # <-- Here‚Äôs the filter:
    df_clean = df_clean[df_clean["St"] == "TN"]
    print(f"     ‚Ä¢ Kept only TN ‚Üí {df_clean.shape[0]} rows remain")
    
    # Add processed date
    df_clean["processed_date"] = datetime.now().strftime("%Y-%m-%d")
    
    # Step 3: Save locally
    print("  Step 3: Saving...")
    local_file = f"simple_{dataset_number}_cleaned.csv"
    df_clean.to_csv(local_file, index=False)
    print(f"  ‚úÖ Saved: {local_file}")
    
    return {
        "dataset": dataset_number,
        "shape": df_clean.shape,
        "file": local_file
    }


In [15]:
# Force save each dataset as separate CSV files
def save_all_datasets_separately():
    """Load and save each dataset as a separate CSV file"""
    
    file_patterns = {
        '202': 'outcomes_C14.tab.gz',
        '228': 'outcomes_C14.tab.gz', 
        '266': 'outcomes_C17.tab.gz',
        '297': 'Outcomes20_w3.tab.gz'
    }
    
    saved_files = []
    
    for dataset_num in ['202', '228', '266', '297']:
        print(f"\nüîÑ Processing Dataset {dataset_num}...")
        
        try:
            # Find and load the file
            prefix = 'ndacan/nytd/outcomes/'
            files = list_s3_files('bdc-public-raw', prefix, suffix='.tab.gz')
            
            target_file = None
            expected_pattern = file_patterns[dataset_num]
            for file_key in files:
                if f'/{dataset_num}/' in file_key and expected_pattern in file_key:
                    target_file = file_key
                    break
            
            if not target_file:
                print(f"‚ùå No file found for dataset {dataset_num}")
                continue
            
            # Load data
            df = load_gz_from_s3('bdc-public-raw', target_file)
            if df is None:
                print(f"‚ùå Failed to load dataset {dataset_num}")
                continue
            
            print(f"‚úÖ Loaded {df.shape[0]:,} rows, {df.shape[1]} columns")
            
            # Clean the data
            df_clean = clean_data(df, dataset_num)
            print(f"‚úÖ Cleaned to {df_clean.shape[0]:,} rows, {df_clean.shape[1]} columns")
            
            # Save as separate CSV files
            csv_filename = f"NYTD_Dataset_{dataset_num}_Cleaned.csv"
            df_clean.to_csv(csv_filename, index=False)
            
            file_size = os.path.getsize(csv_filename)
            print(f"‚úÖ Saved: {csv_filename} ({file_size:,} bytes)")
            
            saved_files.append({
                'dataset': dataset_num,
                'filename': csv_filename,
                'rows': df_clean.shape[0],
                'columns': df_clean.shape[1],
                'file_size_bytes': file_size
            })
            
        except Exception as e:
            print(f"‚ùå Error processing dataset {dataset_num}: {e}")
    
    return saved_files

# Run the function to save all files
print("üöÄ Saving all datasets as separate CSV files...")
saved_files = save_all_datasets_separately()

# Show summary
print(f"\n{'='*60}")
print("SAVED FILES SUMMARY")
print(f"{'='*60}")

if saved_files:
    summary_df = pd.DataFrame(saved_files)
    print(summary_df)
    
    print(f"\n‚úÖ Successfully saved {len(saved_files)} datasets!")
    print("\nFiles created:")
    for file_info in saved_files:
        print(f"  üìÑ {file_info['filename']} - {file_info['rows']:,} rows")
else:
    print("‚ùå No files were saved successfully")

# Verify files exist
print(f"\n{'='*30}")
print("FILE VERIFICATION")
print(f"{'='*30}")

import os
csv_files = [f for f in os.listdir('.') if f.startswith('NYTD_Dataset_') and f.endswith('_Cleaned.csv')]
print(f"Found {len(csv_files)} CSV files:")
for f in csv_files:
    size = os.path.getsize(f)
    print(f"  ‚úÖ {f} ({size:,} bytes)")

üöÄ Saving all datasets as separate CSV files...

üîÑ Processing Dataset 202...
Found 5 files in s3://bdc-public-raw/ndacan/nytd/outcomes/
Loaded 58231 rows, 48 columns from ndacan/nytd/outcomes/202/outcomes_C14.tab.gz
‚úÖ Loaded 58,231 rows, 48 columns
Starting cleaning for dataset 202
Original shape: (58231, 48)
Step 1: Handling missing values...
Step 2: Standardizing variables...
Step 3: Validating data...
Filtered to valid waves [1, 2, 3]: 58231 records
Step 4: Fixing date formats...
Fixed RepDate format: 58231 valid dates
Removed 360 records with originally missing RepDate
Final shape: (57871, 49)
‚úÖ Cleaned to 57,871 rows, 49 columns
‚ùå Error processing dataset 202: name 'os' is not defined

üîÑ Processing Dataset 228...
Found 5 files in s3://bdc-public-raw/ndacan/nytd/outcomes/
Loaded 52199 rows, 49 columns from ndacan/nytd/outcomes/228/outcomes_C14.tab.gz
‚úÖ Loaded 52,199 rows, 49 columns
Starting cleaning for dataset 228
Original shape: (52199, 49)
Step 1: Handling missi

In [24]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

# Build a pipeline: first impute, then fit logistic regression
pipeline = make_pipeline(
    SimpleImputer(strategy="median"),            # fill numeric NaNs with median
    LogisticRegression(penalty="l1", solver="saga", max_iter=10000)
)

# Fit on training data
pipeline.fit(X_train, y_train)

# Evaluate
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

# Extract coefficients (after imputation, pipeline.named_steps["logisticregression"].coef_)
lr = pipeline.named_steps["logisticregression"]
coef = pd.Series(lr.coef_[0], index=X.columns)
top = coef.abs().sort_values(ascending=False).head(10)
print("Top 10 predictors of harder life in Wave 3:")
print(top.to_frame("|coef|").join(coef.to_frame("coef")))


 'Overall_Food_Insecurity_Rate_3.0'
 'n_of_Food_Insecure_Persons_Overall_1.0'
 'n_of_Food_Insecure_Persons_Overall_2.0'
 'n_of_Food_Insecure_Persons_Overall_3.0'
 'Food_Insecurity_Rate_among_Black_Persons_(all_ethnicities)_1.0'
 'Food_Insecurity_Rate_among_Black_Persons_(all_ethnicities)_2.0'
 'Food_Insecurity_Rate_among_Black_Persons_(all_ethnicities)_3.0'
 'Food_Insecurity_Rate_among_Hispanic_Persons_(any_race)_1.0'
 'Food_Insecurity_Rate_among_Hispanic_Persons_(any_race)_2.0'
 'Food_Insecurity_Rate_among_Hispanic_Persons_(any_race)_3.0'
 'Food_Insecurity_Rate_among_White,_non-Hispanic_Persons_1.0'
 'Food_Insecurity_Rate_among_White,_non-Hispanic_Persons_2.0'
 'Food_Insecurity_Rate_among_White,_non-Hispanic_Persons_3.0'
 'SNAP_Threshold_1.0' 'SNAP_Threshold_2.0' 'SNAP_Threshold_3.0'
 'pct_FI_‚â§_SNAP_Threshold_1.0' 'pct_FI_‚â§_SNAP_Threshold_2.0'
 'pct_FI_‚â§_SNAP_Threshold_3.0' 'pct_FI_>_SNAP_Threshold_1.0'
 'pct_FI_>_SNAP_Threshold_2.0' 'pct_FI_>_SNAP_Threshold_3.0'
 'Child_Food_In

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(0)

In [25]:
# Option 1: Drop rows with any NaNs in X
X_train_clean = X_train.dropna()
y_train_clean = y_train.loc[X_train_clean.index]

pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    LogisticRegression(penalty="l1", solver="saga", max_iter=10000)
)
pipeline.fit(X_train_clean, y_train_clean)


ValueError: Found array with 0 sample(s) (shape=(0, 135)) while a minimum of 1 is required by SimpleImputer.

In [23]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 1. Load (low_memory=False to silence mixed‚Äêtype warnings)
df = pd.read_csv(
    "NYTD_S3_Integrated_Long_Format.csv",
    parse_dates=["RepDate","DOB","OutcmDte","processed_date"],
    low_memory=False
)

# 2. Keep only Tennessee
df = df[df["St"] == "TN"]

# 3. Dedupe so pivot won‚Äôt error
df = df.drop_duplicates(subset=["StFCID","Wave"], keep="first")

# 4. Pivot to wide format
df_feat = df.pivot(
    index="StFCID",
    columns="Wave",
    # include all your FI + NYTD flags here
    values=[
        # FOOD INSECURITY METRICS
        "Overall Food Insecurity Rate",
        "# of Food Insecure Persons Overall",
        "Food Insecurity Rate among Black Persons (all ethnicities)",
        "Food Insecurity Rate among Hispanic Persons (any race)",
        "Food Insecurity Rate among White, non-Hispanic Persons",
        "SNAP Threshold",
        "% FI ‚â§ SNAP Threshold",
        "% FI > SNAP Threshold",
        "Child Food Insecurity Rate",
        "# of Food Insecure Children",
        "% food insecure children in HH w/ HH incomes below 185 FPL",
        "% food insecure children in HH w/ HH incomes above 185 FPL",
        "Cost Per Meal",
        "Weighted weekly $ needed by FI",
        "Weighted Annual Food Budget Shortfall",
        # NYTD FLAGS
        "Sex",
        "AmIAKN","Asian","BlkAfrAm","HawaiiPI","White","RaceUnkn","RaceDcln","HisOrgin",
        "EmplySklls","SocSecrty","EducAid","PubFinAs","PubFoodAs","PubHousAs","OthrFinAs",
        "HighEdCert","CurrenRoll","CnctAdult","Homeless","SubAbuse","Incarc","Children",
        "Marriage","Medicaid","OthrHlthIn","MedicalIn","MentlHlthIn","PrescripIn"
    ]
)

# 5. Flatten column names: e.g. "Homeless_1", "Overall Food Insecurity Rate_2", etc.
df_feat.columns = [f"{col.replace(' ', '_').replace('%','pct').replace('#','n').replace('/','_')}_{wave}"
                   for col, wave in df_feat.columns]

# 6a. Ensure the ‚Äú_3‚Äù columns exist, even if all zeros
for base in ["Homeless", "Incarc", "SubAbuse"]:
    col3 = f"{base}_3"
    if col3 not in df_feat.columns:
        df_feat[col3] = 0

# 6b. Now build the target safely
df_feat["harder3"] = (
    (df_feat["Homeless_3"] == 1) |
    (df_feat["Incarc_3"]   == 1) |
    (df_feat["SubAbuse_3"] == 1)
).astype(int)


# 7. Keep only those with complete Wave 1 & 2 data
mask = df_feat[[c for c in df_feat.columns if c.endswith("_1") or c.endswith("_2")]].notnull().all(axis=1)
df_model = df_feat[mask].dropna(subset=["harder3"])

# 8. Split into X & y
X = df_model.drop(columns=["harder3"])
y = df_model["harder3"]

# 9. Train‚Äêtest split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# 10. Fit a logistic regression with L1 penalty
clf = LogisticRegression(penalty="l1", solver="saga", max_iter=10000)
clf.fit(X_train, y_train)

# 11. Evaluate
print(classification_report(y_test, clf.predict(X_test)))

# 12. Show top 10 predictors by absolute coefficient
import numpy as np
coef = pd.Series(clf.coef_[0], index=X.columns)
top = coef.abs().sort_values(ascending=False).head(10)
print("Top 10 predictors of harder life in Wave 3:")
print(top.to_frame("|coef|").join(coef.to_frame("coef")))


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values