In [None]:
pip show pdpbox


In [None]:
pip install pdpbox==0.2.0


In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import pdpbox
from pdpbox import pdp, get_dataset, info_plots
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [None]:
# Load the training and test CSV files from the current working directory
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Data Analysis
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
train_data.info()

# Display summary statistics for numerical features
print(train_data.describe())

# Display the first few rows of the data
print(train_data.head())

In [None]:
# Data Visualization
# Readable names for the two values of the 'Survived' column.
# NOTE(review): assumes 'Survived' is coded 0 = did not survive, 1 = survived;
# any other value would map to NaN and drop out of the plots — confirm.
SURVIVED_LABELS = {0: 'No', 1: 'Yes'}
SURVIVED_ORDER = ['No', 'Yes']

# Plot from a relabelled copy so seaborn builds the legend itself (titled
# 'Survived' after the hue column). Overriding it with
# plt.legend(labels=[...]) pairs labels with artists purely by drawing order,
# which can attach 'No'/'Yes' to the wrong colours, notably for histplot.
# train_data itself is left unchanged.
survival_plot_data = train_data.assign(
    Survived=train_data['Survived'].map(SURVIVED_LABELS)
)


def plot_survival_counts(column, title, xlabel):
    """Draw a count plot of one column, split by survival outcome.

    Args:
        column: Name of the column in ``survival_plot_data`` to put on the x axis.
        title: Figure title.
        xlabel: Label for the x axis.
    """
    plt.figure(figsize=(8, 6))
    sns.countplot(
        x=column,
        hue='Survived',
        hue_order=SURVIVED_ORDER,
        data=survival_plot_data,
    )
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Count')
    plt.show()


# Visualize survival by gender
plot_survival_counts('Sex', 'Survival by Gender', 'Gender')

# Visualize survival by passenger class (Pclass)
plot_survival_counts('Pclass', 'Survival by Passenger Class', 'Pclass')

# Visualize survival by age (histogram with a KDE overlay, so not a count plot)
plt.figure(figsize=(8, 6))
sns.histplot(
    data=survival_plot_data,
    x='Age',
    hue='Survived',
    hue_order=SURVIVED_ORDER,
    kde=True,
)
plt.title('Survival by Age')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()

# Visualize survival by the number of siblings/spouses (SibSp)
plot_survival_counts('SibSp', 'Survival by Siblings/Spouses', 'SibSp')

# Visualize survival by the number of parents/children (Parch)
plot_survival_counts('Parch', 'Survival by Parents/Children', 'Parch')



In [None]:
# Cleaning Data
# Fill missing values in the 'Age' column with the median age.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on the `frame['Age']` selection is chained
# assignment, which pandas warns about and which leaves the frame unchanged
# when Copy-on-Write is enabled.
# The training-set median is reused for the test set so both splits are
# imputed with the same value the model is fitted on.
age_median = train_data['Age'].median()
train_data['Age'] = train_data['Age'].fillna(age_median)
test_data['Age'] = test_data['Age'].fillna(age_median)

In [None]:
# Replace the 'Sex' column with one indicator column per category,
# using identical settings for the train and test frames
sex_encoding = dict(columns=['Sex'], prefix=['Sex'])
train_data = pd.get_dummies(train_data, **sex_encoding)
test_data = pd.get_dummies(test_data, **sex_encoding)


In [None]:
# Columns fed to the model as inputs.
# NOTE(review): the 'Sex_*' indicator columns produced by the get_dummies
# step are not listed here — confirm that leaving them out is intentional.
features = [
    'Pclass',
    'SibSp',
    'Parch',
    'Age',
]

In [None]:
# Separate the label column from the selected input columns
y = train_data['Survived']
X = train_data[features]

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical from run to run
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Define the neural network model
# Seed TensorFlow's global random generator before any layer is created so
# that weight initialisation (and the TensorFlow random ops that follow it)
# repeat on a fresh run; 42 matches the seed used for the train/validation split.
tf.random.set_seed(42)

model = keras.Sequential([
    keras.layers.Input(shape=(X_train.shape[1],)),  # One input per feature column
    keras.layers.Dense(4, activation='relu'),  # First hidden layer: 4 units, ReLU activation
    keras.layers.Dense(32, activation='relu'),  # Second hidden layer: 32 units, ReLU activation
    keras.layers.Dense(1, activation='sigmoid')  # Single sigmoid unit for binary classification
])

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, and
# accuracy reported as the monitored metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Fit for 50 epochs in mini-batches of 32, scoring the validation split after
# every epoch; the returned History object is kept for plotting later
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_val, y_val),
)


In [None]:
# Partial dependence plots
# Partial dependence of the model output on 'Age', computed on the validation rows
age_pdp = pdp.pdp_isolate(
    model=model,
    dataset=X_val,
    model_features=X_val.columns,
    feature='Age',
)
pdp.pdp_plot(age_pdp, 'Age')
plt.show()

In [None]:
# Partial dependence of the model output on 'Pclass', computed on the validation rows
pclass_pdp = pdp.pdp_isolate(
    model=model,
    dataset=X_val,
    model_features=X_val.columns,
    feature='Pclass',
)
pdp.pdp_plot(pclass_pdp, 'Pclass')
plt.show()

In [None]:
# Get the weights of the first hidden layer.
# Sequential.layers omits the InputLayer, so index 1 would be the *second*
# Dense layer, whose kernel rows correspond to hidden units rather than input
# features. The first Dense layer is looked up by type instead of by position.
first_dense_layer = next(
    layer for layer in model.layers if isinstance(layer, keras.layers.Dense)
)
# get_weights() returns [kernel, bias]; the kernel has one row per input feature
first_hidden_layer_weights = first_dense_layer.get_weights()[0]

# Get the feature names
feature_names = X.columns

# Guard against labelling the wrong layer's weights with feature names
assert first_hidden_layer_weights.shape[0] == len(feature_names), (
    "first-layer kernel rows do not match the number of input features"
)

# Calculate feature importance as the mean absolute outgoing weight of each input
feature_importance = np.mean(np.abs(first_hidden_layer_weights), axis=1)

# Sort and print feature importance
sorted_feature_importance = pd.Series(feature_importance, index=feature_names).sort_values(ascending=False)
print("Feature Importance:")
print(sorted_feature_importance)


In [None]:
# Training vs. validation accuracy per epoch (optional diagnostic)
plt.figure(figsize=(10, 6))
for metric_key, curve_label in (
    ('accuracy', 'Train Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    plt.plot(history.history[metric_key], label=curve_label)
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Training History')
plt.legend()
plt.show()

In [None]:
# Model outputs for the validation rows
y_pred = model.predict(X_val)

# Threshold the outputs at 0.5 to obtain 0/1 class predictions
y_pred_binary = (y_pred >= 0.5).astype(int)

# Per-class precision / recall / F1 on the validation split
report = classification_report(y_val, y_pred_binary)
print(report)


In [None]:
# Score the test set using the same feature columns the model was trained on
X_test = test_data[features]
y_test_pred = model.predict(X_test)

# Threshold the outputs at 0.5 to obtain 0/1 class predictions
y_test_pred_binary = (y_test_pred >= 0.5).astype(int)

In [None]:
# Put the validation labels next to the flattened 0/1 predictions for inspection
comparison = pd.DataFrame({
    'Actual': y_val,
    'Predicted': y_pred_binary.ravel(),
})


In [None]:
# Split the comparison table into rows the model got right and rows it got wrong
is_correct = comparison['Actual'] == comparison['Predicted']
correct_predictions = comparison[is_correct]
incorrect_predictions = comparison[~is_correct]


In [None]:
# Assemble the submission table: one row per test passenger with its 0/1 prediction
passenger_ids = test_data["PassengerId"]
survived_flags = y_test_pred_binary.ravel()  # flatten the prediction array to 1D
submission = pd.DataFrame({"PassengerId": passenger_ids, "Survived": survived_flags})


In [None]:
# Write the submission table to disk, leaving out the DataFrame index column
submission.to_csv(path_or_buf="titanic_submission.csv", index=False)