In [None]:
# Determine how many epochs to retrain for.
# EarlyStopping.stopped_epoch is only set when the callback actually halts
# training; it stays 0 if all epochs ran, which made the previous
# `stopped_epoch - patience + 1` formula evaluate to a negative epoch count.
# Reading the best epoch from the recorded validation loss works in both cases
# (argmin returns the first minimum, i.e. the epoch early stopping treats as best).
val_loss_history = history.history['val_loss']
best_epoch_count = int(np.argmin(val_loss_history)) + 1  # 0-based index -> number of epochs
print(f'Retraining on full dataset for {best_epoch_count} epochs...')

# Retrain a fresh model on the full dataset (no validation split this time)
final_model = create_model()
final_model.fit(X, y, epochs=best_epoch_count, batch_size=32, verbose=0)

# Predictions: the sigmoid output is thresholded at 0.5 to get 0/1 classes
predictions = final_model.predict(X_test_submission)
predicted_classes = (predictions > 0.5).astype(int)

# Create submission DataFrame (flatten turns the (n, 1) predictions into a 1-D column)
submission_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': predicted_classes.flatten()
})

# Save to CSV
submission_df.to_csv('submission.csv', index=False)
print("Submission file saved successfully as 'submission.csv'")
print(submission_df.head())

## 7. Submission Preparation

Train on the full dataset for the optimal number of epochs (found during validation) and generate predictions for the test set.

In [None]:
# Per-epoch metrics recorded by model.fit
val_loss = history.history['val_loss']
val_accuracy = history.history['val_accuracy']
loss = history.history['loss']
accuracy = history.history['accuracy']

# One figure, two side-by-side panels: loss on the left, accuracy on the right
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(10, 4))

# Plot Loss
ax_loss.plot(loss, label='Loss')
ax_loss.plot(val_loss, label='Validation Loss')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.set_title('Training and Validation Loss')
ax_loss.legend()

# Plot Accuracy
ax_acc.plot(accuracy, label='Accuracy')
ax_acc.plot(val_accuracy, label='Validation Accuracy')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.set_title('Training and Validation Accuracy')
ax_acc.legend()

fig.tight_layout()

# Save the plot for README.
# exist_ok=True replaces the separate exists() check, so re-running the cell
# never fails because the directory is already there.
os.makedirs('images', exist_ok=True)
fig.savefig('images/loss_plot.png')
plt.show()

# best_epoch is a 0-based index, matching the x-axis of the plots above
best_epoch = np.argmax(val_accuracy)
print(f'Best Validation Accuracy: {max(val_accuracy):.4f} at epoch {best_epoch}')

## 6. Evaluation and Visualization

Plotting the training history (Loss and Accuracy).

In [None]:
# Hold out 20% of the samples for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fresh network, plus a callback that watches val_loss with a patience of 10
# epochs and restores the best weights when it stops the run
model = create_model()
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# 150 epochs is an upper bound; early stopping is expected to end the run sooner
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=150,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1,
)

## 5. Training

We split the data into training and validation sets (80/20).
We use Early Stopping to prevent overfitting.

In [None]:
def create_model(input_dim=6, dropout_rate=0.3):
    """Build and compile the feed-forward binary classifier.

    Architecture: Input -> Dense(64, relu) -> Dropout -> Dense(32, relu)
    -> Dropout -> Dense(16, relu) -> Dense(1, sigmoid).

    Args:
        input_dim: Number of input features. Defaults to 6, the number of
            columns produced by the preprocessing step in this notebook.
        dropout_rate: Dropout rate applied after the first two hidden layers.
            Defaults to 0.3.

    Returns:
        A compiled ``tf.keras.Sequential`` model (Adam optimizer,
        binary cross-entropy loss, accuracy metric).
    """
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')  # Binary output
    ])

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model

# Build one instance with the defaults just to display the layer summary
model = create_model()
model.summary()

## 4. Model Definition

We define a sequential Keras model with:
- Input layer (6 features)
- Dense layers with ReLU activation and Dropout for regularization
- Output layer with Sigmoid activation (for binary classification)

In [None]:
# Columns that are min-max scaled, and the subset whose gaps are median-filled
NUMERIC_COLUMNS = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
IMPUTED_COLUMNS = ['Age', 'Fare']


# Preprocessing function
def preprocess_data(df, is_training=True, scaler=None, fill_values=None):
    """Turn a raw Titanic DataFrame into the 6-column model input.

    Steps: select Pclass/Age/SibSp/Parch/Fare/Sex, fill missing Age and Fare,
    min-max scale the numeric columns to [-1, 1], and encode Sex as
    -1 (female) / 1 (male).

    Args:
        df: DataFrame containing the feature columns (and 'Survived' when
            ``is_training`` is True).
        is_training: When True, also return the 'Survived' labels. When a
            ``scaler`` is supplied, it is fitted only if this flag is True.
        scaler: Optional MinMaxScaler shared between calls. If given, it is
            fitted on the training call and merely applied (``transform``) on
            non-training calls, so train and test end up on the same scale.
            If omitted, a new scaler is fitted on ``df`` itself (the previous
            behaviour).
        fill_values: Optional mapping/Series with 'Age' and 'Fare' entries used
            to fill missing values. If omitted, the medians of ``df`` are used
            (the previous behaviour).

    Returns:
        ``(features, labels)`` when ``is_training`` is True, otherwise
        ``features``.
    """
    features = df[NUMERIC_COLUMNS + ['Sex']].copy()

    # Fill missing Age / Fare; default to this frame's own medians
    if fill_values is None:
        fill_values = features[IMPUTED_COLUMNS].median()
    for column in IMPUTED_COLUMNS:
        features[column] = features[column].fillna(fill_values[column])

    # Normalize numeric features to [-1, 1].
    # A caller-supplied scaler is fitted only on training data; other data is
    # transformed with the already-learned min/max so both sets share one scale.
    if scaler is None:
        scaler = MinMaxScaler(feature_range=(-1, 1))
        fit_scaler = True
    else:
        fit_scaler = is_training
    if fit_scaler:
        features[NUMERIC_COLUMNS] = scaler.fit_transform(features[NUMERIC_COLUMNS])
    else:
        features[NUMERIC_COLUMNS] = scaler.transform(features[NUMERIC_COLUMNS])

    # Encode Sex category: 'female' -> -1, 'male' -> 1.
    # An explicit mapping keeps the encoding fixed regardless of which
    # categories happen to be present in this particular frame.
    features['Sex'] = features['Sex'].map({'female': -1, 'male': 1})

    if is_training:
        labels = df['Survived'].copy()
        return features, labels
    else:
        return features

# Process the data.
# Imputation medians and scaling ranges are learned from the training set only
# and then reused for the test set.
train_fill_values = train_df[IMPUTED_COLUMNS].median()
feature_scaler = MinMaxScaler(feature_range=(-1, 1))
features, labels = preprocess_data(train_df, is_training=True,
                                   scaler=feature_scaler, fill_values=train_fill_values)
features_test = preprocess_data(test_df, is_training=False,
                                scaler=feature_scaler, fill_values=train_fill_values)

# Convert to NumPy arrays
X = features.to_numpy()
y = labels.to_numpy()
X_test_submission = features_test.to_numpy().astype(float32)

print("Data preprocessed and converted to NumPy arrays.")

## 3. Data Preprocessing

We need to predict survival.
We fill missing Age values with the median.
Numeric features are min-max scaled to the range [-1, 1].
Sex is encoded as -1 (female) and 1 (male).

In [None]:
# 1. Import data
def load_data():
    """Locate the Titanic CSV files and load them.

    Locations are tried in this order:
      1. ``/kaggle/input/titanic-dataset/`` (Kaggle environment)
      2. the current working directory (local fallback)
      3. a ``kagglehub`` download of ``wiktorabka/titanic-dataset``

    A location from (1) or (2) is chosen when its ``train.csv`` exists;
    ``test.csv`` is then read from the same directory.

    Returns:
        tuple: ``(train_df, test_df)`` as pandas DataFrames.

    Raises:
        FileNotFoundError: if the files are found nowhere and the kagglehub
            download fails, or if ``test.csv`` is missing next to a found
            ``train.csv``.
    """
    candidate_dirs = [
        '/kaggle/input/titanic-dataset',
        '',  # current working directory
    ]

    for directory in candidate_dirs:
        train_path = os.path.join(directory, 'train.csv')
        if os.path.exists(train_path):
            # test.csv is expected to sit next to train.csv
            test_path = os.path.join(directory, 'test.csv')
            return pd.read_csv(train_path), pd.read_csv(test_path)

    # Nothing on disk: fall back to kagglehub (downloads, or reuses its cache)
    try:
        # Imported here because kagglehub is an optional dependency
        import kagglehub
        dataset_dir = kagglehub.dataset_download('wiktorabka/titanic-dataset')
        train_path = os.path.join(dataset_dir, 'train.csv')
        test_path = os.path.join(dataset_dir, 'test.csv')
        return pd.read_csv(train_path), pd.read_csv(test_path)
    except Exception as download_error:
        # Surface one clear error while keeping the underlying cause chained
        raise FileNotFoundError(
            "Could not find train.csv/test.csv in '/kaggle/input/titanic-dataset' "
            "or the current directory, and the kagglehub download failed: "
            f"{download_error}"
        ) from download_error

# Load both datasets. On failure, print a hint and re-raise so the notebook
# stops here instead of failing later with a NameError on train_df.
try:
    train_df, test_df = load_data()
except Exception as e:
    print(f"Error loading data: {e}")
    print("Make sure to run the kagglehub download cell above or place train.csv/test.csv in the directory.")
    raise
else:
    print("Data loaded successfully")
    print(train_df.head())

## 2. Load Data

Load the training and test datasets.
Ensure you have the data downloaded (via kagglehub or manually placed).

In [None]:
import tensorflow as tf
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from numpy import float32
import os

print("Imports successful")

<a href="https://colab.research.google.com/github/Fortland2018/ML-Projects/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Titanic - Machine Learning from Disaster

This notebook demonstrates a solution to the classic Titanic survival prediction problem.
We will use a Neural Network to predict passenger survival based on features like Age, Sex, Ticket Class, and Fare.

## 1. Imports and Setup