In [8]:
import sys
sys.path.append('../')
from src.preprocessing import TitanicPreprocessor
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Example usage
# Load the processed train/test CSVs from disk.
train = pd.read_csv('../data/processed/train_processed.csv')
test = pd.read_csv('../data/processed/test_processed.csv')

# Split features and target: 'Survived' is the target; the other dropped
# columns are excluded from the feature matrix.
X = train.drop(['Survived', 'Name', 'Ticket', 'Cabin', 'PassengerId'], axis=1)
y = train['Survived']

# Drop the same non-feature columns from the test dataset
# ('Survived' is not dropped here).
X_test = test.drop(['Name', 'Ticket', 'Cabin', 'PassengerId'], axis=1)

# Hold out 20% of the training rows for validation; fixed seed for repeatability.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and use preprocessor
preprocessor = TitanicPreprocessor()

# Correct usage: fit_transform on training, transform on validation and test data
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)
X_test_processed = preprocessor.transform(X_test)

print("Processed shapes:")
print("X_train: ", X_train_processed.shape)
print("X_val: ", X_val_processed.shape)
print("X_test: ", X_test_processed.shape)

# Check for missing values in each processed matrix.
print('Missing values in processed data:')
print('Training: ', np.isnan(X_train_processed).sum())
print('Validation: ', np.isnan(X_val_processed).sum())
# Fixed: this line previously re-counted X_val_processed, so the test
# matrix was never actually checked for NaNs.
print('Test: ', np.isnan(X_test_processed).sum())

Processed shapes:
X_train:  (712, 24)
X_val:  (179, 24)
X_test:  (418, 24)
Missing values in processed data:
Training:  0
Validation:  0
Test:  0


In [9]:
# Persist the processed feature matrices and the target splits as .npy files.
# Each entry pairs a destination path with the object to write; the order
# matches the order in which the files are written.
_outputs = [
    ('../data/processed/X_train_processed.npy', X_train_processed),
    ('../data/processed/X_val_processed.npy', X_val_processed),
    ('../data/processed/X_test_processed.npy', X_test_processed),
    ('../data/processed/y_train.npy', y_train),
    ('../data/processed/y_val.npy', y_val),
]
for _destination, _payload in _outputs:
    np.save(_destination, _payload)