# In [None]:
# 1_data_prep.py

import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

# Configuration: location of the input CSV and the seed used for the
# train/test split later in this script.
DATA_PATH = "data/bisnode_firms_clean.csv"
SEED = 42

# Read the raw table and report its dimensions.
print("Loading data...")
data = pd.read_csv(DATA_PATH)
print(f"Loaded dataset with shape: {data.shape}")

# Keep only rows that have a target value, then store the target as int.
# The drop has to come first: casting a column that still holds NaN to int
# raises.
data = data.dropna(subset=["fast_growth"]).assign(
    fast_growth=lambda frame: frame["fast_growth"].astype(int)
)

# Feature groups. Only the columns in `categorical_features` are one-hot
# encoded; the numerical and binary columns are passed through unchanged.
numerical_features = [
    "sales_mil_log", "sales_mil_log_sq", "age", "age2",
    "growth_1y", "growth_1y_sq", "ceo_age", "foreign"
]
categorical_features = ["ind2_cat", "urban_m", "gender_m", "m_region_loc"]
binary_features = ["new", "ceo_young", "foreign_management"]

# Select the model inputs and one-hot encode the categoricals.
# drop_first=True drops one level per categorical column so the dummies are
# not perfectly collinear.
X = data[numerical_features + categorical_features + binary_features].copy()
X = pd.get_dummies(X, columns=categorical_features, drop_first=True)
y = data["fast_growth"]

# Train-test split (stratified on the target so both parts keep the same
# class balance). The split is done BEFORE imputation on purpose: row
# assignment depends only on the number of rows, `y` and SEED, so the same
# rows land in each part as before.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# Mean-impute missing values using statistics from the training rows only.
# Computing the means on the full dataset would let test-set values influence
# the training features (data leakage) and bias the held-out evaluation.
# NOTE: `X` itself keeps its missing values; only the two splits are imputed.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Persist the four splits as CSV files so later steps can reload them.
# index=False keeps the row index out of the files.
_split_outputs = (
    ("data/X_train.csv", X_train),
    ("data/X_test.csv", X_test),
    ("data/y_train.csv", y_train),
    ("data/y_test.csv", y_test),
)
for _path, _split in _split_outputs:
    _split.to_csv(_path, index=False)

print("Data preprocessing complete. Splits saved to data/")