# 01 - Exploratory Data Analysis (EDA)
# NEET Predictor - LFS 2020-21

**Objective**: Understand the structure and patterns in the Labour Force Survey (LFS) 2020-21 data, with a focus on youth (ages 15-24) and the factors related to NEET (Not in Education, Employment, or Training) status.

**Contents**:
1. Data Loading & Schema Inspection
2. Data Quality Assessment
3. Demographic Distributions
4. Geographic Patterns
5. Education & Employment Patterns
6. Pre-NEET Analysis (before labeling)
7. Correlation & Relationships
8. Summary Statistics & Key Insights

**Author**: Data Science Team  
**Date**: October 2025

In [None]:
# Import libraries
import sys
import os
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path.cwd().parent / 'src'))

# Core libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Custom modules
from data_preprocessing import load_raw, save_schema, detect_variable_names

# Seed NumPy's global random generator for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Plot appearance: seaborn grid theme, then default figure size and font size
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Confirm the imports worked and record the library versions in the output
print("✓ Libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

## 1. Data Loading & Schema Inspection

Load the LFS 2020-21 Stata file and inspect its structure.

In [None]:
# Raw inputs live under <parent of working directory>/data/raw
raw_dir = Path.cwd().parent / 'data' / 'raw'
data_path = raw_dir / 'LFS2020-21.dta'

print(f"Loading data from: {data_path}")
print("=" * 70)

# Project helper: returns the data together with its metadata
df, metadata = load_raw(str(data_path), verbose=True)

# Hand the metadata to save_schema, targeting a file next to the raw data
schema_path = raw_dir / 'schema.txt'
save_schema(metadata, str(schema_path))

# Summarise what was loaded: dimensions and deep memory usage in MB
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print("\n✓ Data loaded successfully!")
print(f"  Shape: {df.shape}")
print(f"  Memory: {memory_mb:.2f} MB")

In [None]:
# Shared horizontal rule for the section headings below
separator = "=" * 70

# Show the first five rows
print("First 5 rows of the dataset:")
print(separator)
display(df.head(5))

# Structural summary as reported by df.info()
print("\nDataset Information:")
print(separator)
df.info()

## 2. Data Quality Assessment

Check for missing values, duplicates, and other data quality issues.

In [None]:
# Missing value analysis
# Percentage of null entries per column, highest first. The mean of the
# boolean null mask is the null fraction (same value as sum() / len(df)).
missing_pct = (df.isnull().mean() * 100).sort_values(ascending=False)
missing_df = pd.DataFrame({
    'Column': missing_pct.index,
    'Missing %': missing_pct.values,
}).head(20)

print("Top 20 Columns with Missing Values:")
print("=" * 70)
print(missing_df.to_string(index=False))

# Resolve and create the plot directory before branching, so `output_dir`
# is defined whether or not a plot is produced here.
# NOTE(review): later cells presumably reuse `output_dir` - confirm.
output_dir = Path.cwd().parent / 'outputs' / 'eda_plots'
output_dir.mkdir(parents=True, exist_ok=True)

# Plot missing values
# Only columns that actually have nulls are plotted. The figure is created
# inside the branch so that a dataset without missing values does not leave
# an empty figure open.
missing_df_plot = missing_df[missing_df['Missing %'] > 0]
if missing_df_plot.empty:
    print("\n✓ No missing values found!")
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(missing_df_plot['Column'], missing_df_plot['Missing %'])
    ax.set_xlabel('Missing Percentage (%)')
    ax.set_title('Top Columns with Missing Values', fontsize=14, fontweight='bold')
    ax.invert_yaxis()  # largest bar at the top, matching the sorted table
    fig.tight_layout()

    # Save plot
    plot_path = output_dir / 'missing_values.png'
    fig.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\n✓ Plot saved to {plot_path}")
    plt.show()