In [3]:
import sys
sys.path.append('../')
from src.preprocessing import TitanicPreprocessor
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Example workflow: load the processed CSVs, carve out a validation split,
# run both splits through TitanicPreprocessor, and persist the results.

# --- Load -----------------------------------------------------------------
train = pd.read_csv('../data/processed/train_processed.csv')
test = pd.read_csv('../data/processed/test_processed.csv')

# --- Features / target ----------------------------------------------------
# The target plus the Name, Ticket, Cabin and PassengerId columns are left
# out of the feature frame; every other column is kept.
y = train['Survived']
X = train.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin', 'PassengerId'])

# --- Hold-out split -------------------------------------------------------
# 20% of the rows go to validation; the fixed random_state keeps the split
# the same from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Preprocess -----------------------------------------------------------
# fit_transform is called on the training split only; the validation split
# goes through transform with the already-fitted preprocessor.
preprocessor = TitanicPreprocessor()
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)

# --- Report ---------------------------------------------------------------
print("Processed shapes:")
for split_label, split_matrix in (
    ("X_train:", X_train_processed),
    ("X_val:", X_val_processed),
):
    print(split_label, split_matrix.shape)

# --- Persist --------------------------------------------------------------
# Written in the same order as before: features first, then targets.
for out_path, out_array in (
    ('../data/processed/X_train_processed.npy', X_train_processed),
    ('../data/processed/X_val_processed.npy', X_val_processed),
    ('../data/processed/y_train.npy', y_train),
    ('../data/processed/y_val.npy', y_val),
):
    np.save(out_path, out_array)

Processed shapes:
X_train: (712, 24)
X_val: (179, 24)


In [5]:
# Sanity-check the preprocessor output produced earlier in the notebook:
# first the shape of each split, then the total NaN count of each split.
for shape_caption, processed in (
    ('Processed training data shape: ', X_train_processed),
    ('Processed validation data shape: ', X_val_processed),
):
    print(shape_caption, processed.shape)

# np.isnan(...).sum() adds up the element-wise NaN mask for each split.
print('Missing values in processed data:')
for nan_caption, processed in (
    ('Training: ', X_train_processed),
    ('Validation: ', X_val_processed),
):
    print(nan_caption, np.isnan(processed).sum())

Processed training data shape:  (712, 24)
Processed validation data shape:  (179, 24)
Missing values in processed data:
Training:  0
Validation:  0
