# Notebook 02: Data Preprocessing & NEET Labeling

**Purpose**: Load raw LFS 2020-21 data, detect variables, create NEET labels, clean data, and save processed dataset.

**Outputs**:
- `data/processed/lfs_youth_cleaned.csv` - Cleaned dataset with NEET labels
- `outputs/tables/neet_distribution.csv` - NEET statistics by demographics
- `outputs/figures/neet_by_demographics.png` - Visualization of NEET rates

## 1. Setup and Imports

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add src directory to path
sys.path.append('../src')

from data_preprocessing import (
    load_raw,
    detect_variable_names,
    create_neet_label,
    clean_vars,
    remove_pii
)

# Notebook-wide plot defaults: seaborn's white-grid theme first, then the
# default figure size (width, height) for figures created without figsize.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✓ Imports complete")

## 2. Load Raw Data

In [None]:
# Location of the raw LFS 2020-21 extract (Stata .dta file).
raw_data_path = '../data/raw/LFS2020-21.dta'

print("Loading data from: " + raw_data_path)
# load_raw hands back two values: the data itself and a metadata object.
df_raw, metadata = load_raw(raw_data_path, verbose=True)

# Report the dimensions of what was just loaded.
rule = '=' * 60
n_rows = df_raw.shape[0]
n_cols = df_raw.shape[1]
print("\n" + rule)
print(f"Raw data shape: {df_raw.shape}")
print(f"Columns: {n_cols}")
print(f"Rows: {n_rows:,}")
print(rule)

## 3. Detect Variable Names

LFS datasets may have different column names. We'll auto-detect the relevant variables.

In [None]:
# Build the variable mapping that the labeling and cleaning steps below
# receive as `var_mapping`.
var_mapping = detect_variable_names(df_raw, verbose=True)

rule = "=" * 60
print(f"\n{rule}")
print("DETECTED VARIABLES:")
print(rule)
# One line per mapping entry; the key is left-aligned in a 20-character field.
for role, column in var_mapping.items():
    print(f"{role:20s} -> {column}")

## 4. Create NEET Label

**NEET Definition**: Youth aged 15-24 who are:
- **NOT** in education
- **NOT** employed
- **NOT** in training

In [None]:
# Options for the labeling step. The 15-24 bounds are the youth age range
# stated in the NEET definition for this section.
label_options = {
    'var_mapping': var_mapping,
    'age_min': 15,
    'age_max': 24,
    'verbose': True,
}
df_labeled = create_neet_label(df_raw, **label_options)

# Report the size of the labeled frame.
rule = '=' * 60
n_youth = df_labeled.shape[0]
print("\n" + rule)
print(f"Labeled data shape: {df_labeled.shape}")
print(f"Youth population (15-24): {n_youth:,}")
print(rule)

## 5. Verify NEET Label Logic

In [None]:
# Sanity-check the NEET label: overall split, share of each component flag,
# and a preview of the first records.

# Absolute counts and percentage shares of each NEET value.
neet_counts = df_labeled['NEET'].value_counts()
neet_pct = df_labeled['NEET'].value_counts(normalize=True) * 100

print("\n" + "="*60)
print("NEET STATUS DISTRIBUTION")
print("="*60)
# .get(..., 0) keeps the report working when one of the two classes is absent.
print(f"Not NEET: {neet_counts.get(0, 0):,} ({neet_pct.get(0, 0):.1f}%)")
print(f"NEET:     {neet_counts.get(1, 0):,} ({neet_pct.get(1, 0):.1f}%)")
print(f"Total:    {df_labeled.shape[0]:,}")

# Component flags that may be present on the labeled frame, paired with the
# (pre-padded) text printed in front of their figures.
component_columns = [
    ("In education:", 'in_education'),
    ("Employed:    ", 'employed'),
    ("In training: ", 'in_training'),
]

print("\n" + "="*60)
print("COMPONENT BREAKDOWN")
print("="*60)
for prefix, flag_col in component_columns:
    if flag_col in df_labeled.columns:
        flag = df_labeled[flag_col]
        print(f"{prefix} {flag.sum():,} ({flag.mean()*100:.1f}%)")

# Show sample records
print("\n" + "="*60)
print("SAMPLE RECORDS (First 10)")
print("="*60)
# var_mapping.get() yields None for a role that was not detected, and a
# detected column is not guaranteed to still exist after labeling. Selecting
# either one would raise KeyError, so keep only columns that are present.
preview_candidates = [var_mapping.get('age'), var_mapping.get('sex')]
preview_candidates.extend(flag_col for _, flag_col in component_columns)
preview_candidates.append('NEET')
cols_to_show = [
    col for col in preview_candidates
    if col is not None and col in df_labeled.columns
]

display(df_labeled[cols_to_show].head(10))

## 6. Clean Variables

Standardize sex, province, district, and urban/rural variables.

In [None]:
# Run the cleaning step on the labeled frame, using the detected mapping to
# locate the demographic columns.
df_cleaned = clean_vars(df_labeled, var_mapping=var_mapping, verbose=True)

rule = '=' * 60
print("\n" + rule)
print(f"Cleaned data shape: {df_cleaned.shape}")
print(rule)

## 7. Remove PII

Remove personally identifiable information for privacy protection.

In [None]:
# Strip personally identifiable information from the cleaned frame.
# NOTE(review): create_hash_id=True presumably adds a hashed record id in
# place of the raw identifiers - confirm in data_preprocessing.remove_pii.
df_final = remove_pii(df_cleaned, create_hash_id=True, verbose=True)

rule = '=' * 60
n_final_cols = df_final.shape[1]
print("\n" + rule)
print(f"Final data shape: {df_final.shape}")
print(f"Columns: {n_final_cols}")
print(rule)

## 8. Analyze NEET by Demographics

In [None]:
def _neet_rate_table(frame, by):
    """Aggregate the NEET column per value of *by*.

    Returns a frame indexed by group with three columns: Total (non-null NEET
    values), NEET_Count (their sum) and NEET_Rate (their mean, in percent).
    """
    table = frame.groupby(by)['NEET'].agg(
        Total='count',
        NEET_Count='sum',
        NEET_Rate='mean',
    )
    table['NEET_Rate'] *= 100
    return table


def _print_section(title, body):
    """Print *body* under a banner carrying *title*."""
    print("\n" + "="*60)
    print(title)
    print("="*60)
    print(body)


# Gender breakdown (skipped when the column was not detected or is absent).
sex_col = var_mapping.get('sex')
if sex_col and sex_col in df_final.columns:
    neet_by_sex = _neet_rate_table(df_final, sex_col)
    _print_section("NEET RATE BY GENDER", neet_by_sex.to_string())

# Province breakdown, highest NEET rate first; only the first 10 rows are shown.
prov_col = var_mapping.get('province')
if prov_col and prov_col in df_final.columns:
    neet_by_prov = _neet_rate_table(df_final, prov_col).sort_values(
        'NEET_Rate', ascending=False
    )
    _print_section("NEET RATE BY PROVINCE", neet_by_prov.head(10).to_string())

# Urban/rural breakdown.
urban_col = var_mapping.get('urban_rural')
if urban_col and urban_col in df_final.columns:
    neet_by_urban = _neet_rate_table(df_final, urban_col)
    _print_section("NEET RATE BY URBAN/RURAL", neet_by_urban.to_string())

## 9. Visualize NEET Rates

In [None]:
# One row of three panels: gender, province (top 5) and area type.
# A panel whose demographic column is unavailable is hidden so the saved
# figure does not contain an empty frame.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
ax1, ax2, ax3 = axes

# Plot 1: NEET by Gender
if sex_col and sex_col in df_final.columns:
    neet_by_sex_pct = df_final.groupby(sex_col)['NEET'].mean() * 100
    # rot=0 keeps the category labels horizontal.
    neet_by_sex_pct.plot(kind='bar', ax=ax1, color=['#3498db', '#e74c3c'], rot=0)
    ax1.set_title('NEET Rate by Gender', fontsize=14, fontweight='bold')
    ax1.set_xlabel('Gender', fontsize=12)
    ax1.set_ylabel('NEET Rate (%)', fontsize=12)
    for i, v in enumerate(neet_by_sex_pct):
        # A group with no usable NEET values has a NaN rate; there is no
        # bar to label, so skip it instead of placing text at NaN.
        if pd.notna(v):
            ax1.text(i, v + 1, f'{v:.1f}%', ha='center', fontsize=10)
else:
    ax1.set_visible(False)

# Plot 2: NEET by Province (Top 5)
if prov_col and prov_col in df_final.columns:
    neet_by_prov_pct = df_final.groupby(prov_col)['NEET'].mean() * 100
    neet_by_prov_pct = neet_by_prov_pct.sort_values(ascending=False).head(5)
    neet_by_prov_pct.plot(kind='barh', ax=ax2, color='#9b59b6')
    ax2.set_title('NEET Rate by Province (Top 5)', fontsize=14, fontweight='bold')
    ax2.set_xlabel('NEET Rate (%)', fontsize=12)
    ax2.set_ylabel('Province', fontsize=12)
    for i, v in enumerate(neet_by_prov_pct):
        if pd.notna(v):
            ax2.text(v + 1, i, f'{v:.1f}%', va='center', fontsize=10)
else:
    ax2.set_visible(False)

# Plot 3: NEET by Urban/Rural
if urban_col and urban_col in df_final.columns:
    neet_by_urban_pct = df_final.groupby(urban_col)['NEET'].mean() * 100
    neet_by_urban_pct.plot(kind='bar', ax=ax3, color=['#2ecc71', '#f39c12'], rot=0)
    ax3.set_title('NEET Rate by Area Type', fontsize=14, fontweight='bold')
    ax3.set_xlabel('Area Type', fontsize=12)
    ax3.set_ylabel('NEET Rate (%)', fontsize=12)
    for i, v in enumerate(neet_by_urban_pct):
        if pd.notna(v):
            ax3.text(i, v + 1, f'{v:.1f}%', ha='center', fontsize=10)
else:
    ax3.set_visible(False)

plt.tight_layout()

# Save figure (the output folder is created on first run).
os.makedirs('../outputs/figures', exist_ok=True)
plt.savefig('../outputs/figures/neet_by_demographics.png', dpi=300, bbox_inches='tight')
print("\n✓ Figure saved to: outputs/figures/neet_by_demographics.png")

plt.show()

## 10. Save Processed Data

In [None]:
# Write the final frame to CSV without the row index, creating the target
# folder first if it does not exist yet.
os.makedirs('../data/processed', exist_ok=True)
output_path = '../data/processed/lfs_youth_cleaned.csv'

df_final.to_csv(output_path, index=False)

# Confirmation report: location, dimensions, on-disk size and column list.
rule = '=' * 60
size_mb = os.stat(output_path).st_size / 1024 / 1024
n_saved_cols = df_final.shape[1]

print("\n" + rule)
print("DATA SAVED SUCCESSFULLY")
print(rule)
print("Output file: " + output_path)
print(f"Shape: {df_final.shape}")
print(f"Size: {size_mb:.2f} MB")
print(f"\nColumns saved ({n_saved_cols}):")
for position, column_name in enumerate(df_final.columns, start=1):
    print(f"  {position:2d}. {column_name}")

## 11. Save Summary Statistics

In [None]:
# Assemble one row per group: an overall row first, then every group of each
# available demographic column.
overall_neet = df_final['NEET'].mean() * 100
summary_stats = [{
    'Category': 'Overall',
    'Group': 'All Youth',
    'Total': df_final.shape[0],
    'NEET_Count': df_final['NEET'].sum(),
    'NEET_Rate': overall_neet,
}]

# (category label, grouping column) in output order. Note that ALL provinces
# are written to the table, not just the top five shown in the figure.
breakdowns = [
    ('Gender', sex_col),
    ('Area', urban_col),
    ('Province', prov_col),
]
for category, column in breakdowns:
    # Skip breakdowns whose column was not detected or is no longer present.
    if not column or column not in df_final.columns:
        continue
    summary_stats.extend(
        {
            'Category': category,
            'Group': label,
            'Total': len(members),
            'NEET_Count': members['NEET'].sum(),
            'NEET_Rate': members['NEET'].mean() * 100,
        }
        for label, members in df_final.groupby(column)
    )

summary_df = pd.DataFrame(summary_stats)

# Persist the table, creating the output folder if needed.
os.makedirs('../outputs/tables', exist_ok=True)
summary_df.to_csv('../outputs/tables/neet_distribution.csv', index=False)

rule = "=" * 60
print("\n✓ Summary statistics saved to: outputs/tables/neet_distribution.csv")
print("\n" + rule)
print("SUMMARY STATISTICS")
print(rule)
display(summary_df)

## 12. Data Quality Checks

In [None]:
rule = "=" * 60
print("\n" + rule)
print("DATA QUALITY CHECKS")
print(rule)

# Missing values: per-column null count and its share of all rows, keeping
# only columns that actually have gaps, worst first.
null_counts = df_final.isnull().sum()
missing_df = pd.DataFrame({
    'Missing_Count': null_counts,
    'Missing_Pct': (null_counts / len(df_final)) * 100,
})
has_gaps = missing_df['Missing_Count'] > 0
missing_df = missing_df.loc[has_gaps].sort_values('Missing_Count', ascending=False)

if missing_df.empty:
    print("\n✓ No missing values found!")
else:
    print("\nColumns with missing values:")
    display(missing_df)

# Class balance of the NEET label.
print("\nNEET label distribution:")
print(df_final['NEET'].value_counts())

# Fully identical rows.
n_duplicates = df_final.duplicated().sum()
print(f"\nDuplicate rows: {n_duplicates}")

# How many columns there are of each dtype.
print("\nData types:")
print(df_final.dtypes.value_counts())

print("\n" + rule)
print("✓ DATA PREPROCESSING COMPLETE!")
print(rule)
print("\nNext step: Run notebooks/03_Modeling_and_Explainability.ipynb")