# The goal is to predict which passengers survived the Titanic shipwreck.

## Variable Notes

- **pclass:** A proxy for socio-economic status (SES)
  - 1st = Upper
  - 2nd = Middle
  - 3rd = Lower

- **age:** Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5.

- **sibsp:** Number of siblings/spouses aboard. The dataset defines these family relations as follows:
  - Sibling = brother, sister, stepbrother, stepsister
  - Spouse = husband, wife (mistresses and fiancés were ignored)

- **parch:** Number of parents/children aboard. The dataset defines these family relations as follows:
  - Parent = mother, father
  - Child = daughter, son, stepdaughter, stepson
  - Some children traveled only with a nanny, therefore parch=0 for them.


In [None]:
# Let's take a look at the data

import pandas as pd

# Location of the Kaggle training file
file_path = '/kaggle/input/titanic/train.csv'

# Read the training CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first ten rows
preview = df.head(10)
print(preview)

In [None]:
# Summary of dtypes and non-null counts for every column
df.info()

# Keep only the columns that contain at least one missing value
has_missing = df.isnull().any()
missing_column_values_df = df.loc[:, has_missing]
print(missing_column_values_df.columns, "\n", missing_column_values_df.dtypes)

In [None]:
# Share of male and female passengers in the full training file
total_passengers = len(df)
is_male = df['Sex'] == 'male'
is_female = df['Sex'] == 'female'
male_percentage = is_male.sum() / total_passengers * 100
female_percentage = is_female.sum() / total_passengers * 100

# Report both shares with two decimals
print(f"Percentage of male passengers: {male_percentage:.2f}%")
print(f"Percentage of female passengers: {female_percentage:.2f}%")


In [None]:
# Split the dataset
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors
y = df['Survived']
X = df.drop(columns='Survived')

# Hold out 20% of the rows; the split is stratified on the Sex column
# (not on the target), with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=df['Sex'],
)


In [None]:
# Verify that the Sex-stratified split kept the gender balance in both
# partitions: compute the male/female share separately for train and test.
male_percentage_train = (X_train['Sex'] == 'male').sum() / len(X_train) * 100
female_percentage_train = (X_train['Sex'] == 'female').sum() / len(X_train) * 100

# The test figures must come from X_test; computing them from X_train
# would just repeat the training numbers and hide any imbalance.
male_percentage_test = (X_test['Sex'] == 'male').sum() / len(X_test) * 100
female_percentage_test = (X_test['Sex'] == 'female').sum() / len(X_test) * 100

# Display the percentages
print(f"Percentage of male passengers train: {male_percentage_train:.2f}%")
print(f"Percentage of female passengers train: {female_percentage_train:.2f}%")
print(f"Percentage of male passengers test: {male_percentage_test:.2f}%")
print(f"Percentage of female passengers test: {female_percentage_test:.2f}%")


In [None]:
# Preprocessing pipeline

from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# Name of the label column and the (sorted, unique) names of all other columns
target = 'Survived'
features = np.setdiff1d(df.columns, ['Survived'])

# Split the feature columns by dtype: numeric ones vs. object (string) ones
feature_frame = df[features]
select_numeric = make_column_selector(dtype_include="number")
select_object = make_column_selector(dtype_include=object)
num_features = select_numeric(feature_frame)
cat_features = select_object(feature_frame)

# Transformer that removes a fixed set of columns from a DataFrame
class ColumnDropper(BaseEstimator, TransformerMixin):
    """Drop the columns named in ``columns_to_drop`` from the input frame."""

    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` without the configured columns.

        Raises:
            ValueError: if any configured column is absent from ``X``.
        """
        missing_columns = list(set(self.columns_to_drop) - set(X.columns))
        if missing_columns:
            raise ValueError(f"Columns {missing_columns} not found in the DataFrame")

        return X.drop(columns=self.columns_to_drop)

    def get_feature_names_out(self, input_features=None):
        """Return the input feature names minus the dropped ones, as a list."""
        kept = []
        for col in input_features:
            if col not in self.columns_to_drop:
                kept.append(col)
        return kept

# Custom transformer that adds the sum of two columns as a new column
class ColumnCombiner(BaseEstimator, TransformerMixin):
    """Append a column holding the sum of two existing columns.

    Parameters:
        columns_to_combine: sequence whose first two entries name the
            columns to add together.
        new_column_name: name of the column that receives the sum.
    """

    def __init__(self, columns_to_combine, new_column_name):
        self.columns_to_combine = columns_to_combine
        self.new_column_name = new_column_name

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the combined column appended.

        The input frame is left untouched: writing the new column directly
        into ``X`` would modify the caller's data (or a slice of it, which
        makes pandas emit chained-assignment warnings).
        """
        first, second = self.columns_to_combine[0], self.columns_to_combine[1]
        X_transformed = X.copy()
        X_transformed[self.new_column_name] = X_transformed[first] + X_transformed[second]
        return X_transformed

    def get_feature_names_out(self, input_features=None):
        """Return the input feature names followed by the new column name.

        ``input_features`` is required. It is converted to a list first
        because ``ndarray + list`` concatenates the strings element-wise
        instead of appending a name.
        """
        return list(input_features) + [self.new_column_name]


# Custom transformer to apply log(1 + x) to selected columns
class LogTransformer(BaseEstimator, TransformerMixin):
    """Replace the listed columns with their ``np.log1p`` values.

    Parameters:
        columns_to_log: names of the columns to transform. When the input
            is not a DataFrame these names are also used to label its
            columns, so they must then match the input's column count.
    """

    def __init__(self, columns_to_log):
        self.columns_to_log = columns_to_log

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the listed columns log-transformed.

        Array input (e.g. the output of a preceding imputer) is wrapped in
        a DataFrame labelled with ``columns_to_log``.
        """
        # Convert X to DataFrame if it's not already
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.columns_to_log)

        X_copy = X.copy()
        # Overwrite each listed column in place; no new columns are created
        for column in self.columns_to_log:
            X_copy[column] = np.log1p(X_copy[column])

        return X_copy

    def get_feature_names_out(self, input_features=None):
        """Return the output feature names as a list.

        ``transform`` overwrites columns rather than appending ``*_log``
        ones, so the output names equal the input names. Reporting extra
        names would disagree with the real column count and break any
        downstream step that validates it.
        """
        if input_features is None:
            # Fallback for array input, where transform labels the columns
            # with columns_to_log. NOTE(review): for DataFrame input with
            # additional columns, pass input_features explicitly.
            return list(self.columns_to_log)
        return list(input_features)

class DenseTransformer(BaseEstimator, TransformerMixin):
    """Convert inputs that expose ``toarray`` into dense arrays."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X.toarray()`` when available, otherwise ``X`` unchanged."""
        if not hasattr(X, 'toarray'):
            return X
        return X.toarray()

    def get_feature_names_out(self, input_features=None):
        """Feature names pass through unchanged."""
        return input_features

# Numeric branch: fill gaps with the median, apply log1p, then standardize
num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("log_transform", LogTransformer(columns_to_log=num_features)),
    ("standardize", StandardScaler()),
])

# Categorical branch: fill gaps with the mode, one-hot encode (categories
# unseen during fit are ignored), and return a dense array
cat_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("one_hot_encode", OneHotEncoder(handle_unknown="ignore")),
    ("to_dense", DenseTransformer())
])

# Columns that must not reach the model
dropped_columns = ['Name', "Ticket"]

# cat_features holds every object-dtype column. A ColumnTransformer feeds each
# branch independently, so listing a column under ColumnDropper does not stop
# another branch from using it. Exclude the dropped columns explicitly,
# otherwise they would still be one-hot encoded by cat_pipeline.
encoded_cat_features = [col for col in cat_features if col not in dropped_columns]

# NOTE(review): SibSp and Parch are emitted twice - raw (plus their sum) by
# ColumnCombiner and scaled by num_pipeline. Confirm this is intended.
preprocessing = make_column_transformer(
    (ColumnDropper(columns_to_drop=dropped_columns), dropped_columns),
    (ColumnCombiner(columns_to_combine=['SibSp', 'Parch'], new_column_name='Dependents'), ['SibSp', 'Parch']),
    (num_pipeline, num_features),
    (cat_pipeline, encoded_cat_features),
    remainder='passthrough'
)

In [None]:
# Fit the preprocessing on the training features and transform them
df_processed = preprocessing.fit_transform(X_train)

# Print the processed shape, then the raw DataFrame shape
for shape in (df_processed.shape, df.shape):
    print(shape)

In [None]:
# Pipeline
# Bare expression: as the last statement of a notebook cell it displays the
# preprocessing ColumnTransformer as the cell output.
preprocessing

# Correlation Summary

The correlation matrix provides insights into the relationships between different features and the target variable 'Survived'. Here are some key correlations:

### Positive Correlations with Survival:

- Being Female (pipeline-2__Sex_female): 0.692025
  - This suggests a strong positive correlation between being female and survival. Female passengers were more likely to survive.

- Higher Fare (pipeline-1__Fare): 0.414994
  - Passengers who paid higher fares had a positive correlation with survival, indicating a potential association between fare and survival.

- Embarked at Cherbourg (pipeline-2__Embarked_C): 0.286461
  - Passengers who embarked at Cherbourg had a positive correlation with survival.

### Negative Correlations with Survival:

- Lower Passenger Class (pipeline-1__Pclass): -0.472895
  - There is a negative correlation with passenger class, indicating that lower-class passengers were less likely to survive.

- Being Male (pipeline-2__Sex_male): -0.692025
  - This strong negative correlation suggests that being male is associated with a lower likelihood of survival.

### Other Correlations:

- Parch (pipeline-1__Parch): 0.163889
  - A positive correlation with survival, but not as strong as being female or having a higher fare.

- Embarked at Southampton (pipeline-2__Embarked_S): -0.244726
  - Negative correlation with survival. Passengers who embarked at Southampton had a lower chance of survival.

- Cabin B96 B98 (pipeline-2__Cabin_B96 B98): -0.460145
  - Negative correlation with survival. Passengers with this cabin had a lower likelihood of survival.

- Ticket 3101295 (pipeline-2__Ticket_3101295): -0.093152
  - Negative correlation with survival, but not as strong.

These correlations provide valuable insights into the factors influencing survival on the Titanic.


In [None]:
# TODO: label the columns with real feature names once the custom
# transformers report correct names via get_feature_names_out; until then
# the feature columns are positional integers.

# Wrap the processed training matrix in a DataFrame (fresh 0..n-1 index)
df_processed_dense = pd.DataFrame(df_processed)

# Attach the training labels by position. df_processed contains only the
# shuffled training rows, so assigning df['Survived'] would align on the
# index and pair each row with a different passenger's label.
df_processed_dense['Survived'] = y_train.to_numpy()

# Correlation matrix of the processed features and the target
corr_matrix_processed = df_processed_dense.corr()

# Use the matrix as is: calling .corr() on it a second time would compare
# correlation profiles with each other instead of features with the target.
corr_matrix = corr_matrix_processed
corr_matrix["Survived"].sort_values(ascending=False)


In [None]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV

# Search space for the tree. Every entry except ccp_alpha is pinned to a
# single value, so only the pruning strength varies. Wider candidate lists
# (currently disabled):
#   criterion: gini / entropy          splitter: best / random
#   max_depth: None, 10, 20, 30, 40, 50
#   min_samples_split: 2, 5, 10        min_samples_leaf: 1, 2, 4
#   max_features: auto, sqrt, log2, None
param_grid = {
    'classifier__criterion': ['entropy'],
    'classifier__splitter': ['best'],
    'classifier__max_depth': [30],
    'classifier__min_samples_split': [5],
    'classifier__min_samples_leaf': [1],
    'classifier__max_features': [None],
    'classifier__ccp_alpha': [0.0, 0.01, 0.1, 0.2, 0.5, 1.0],
}

# Full model: preprocessing followed by a seeded decision tree
tree_model = Pipeline(steps=[
    ("preprocessing", preprocessing),
    ("classifier", DecisionTreeClassifier(random_state=42)),
])

# Randomized search over the pipeline, scored by 5-fold CV accuracy
random_search = RandomizedSearchCV(
    estimator=tree_model,
    param_distributions=param_grid,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# Run the search on the training split
random_search.fit(X_train, y_train)

# Report the winning parameter combination
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

# Keep the pipeline refitted with the best parameters
best_tree_model = random_search.best_estimator_

Best Hyperparameters: {'classifier__splitter': 'best', 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 1, 'classifier__max_features': None, 'classifier__max_depth': 30, 'classifier__criterion': 'entropy'}

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score

# Predict the hold-out split with the tuned pipeline
y_pred = best_tree_model.predict(X_test)

# Compute the scalar metrics up front
accuracy_tree = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Headline accuracy
print(f'Best Decision Tree Accuracy: {accuracy_tree:.2f}')

# Per-class precision / recall / F1 table
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Counts of correct and incorrect predictions per class
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Unrounded scalar metrics
print("\nPrecision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


In [None]:
# Let's take a look at the data

import pandas as pd

# Location of the Kaggle test file
file_path = '/kaggle/input/titanic/test.csv'

# Read the test CSV into its own DataFrame
test_df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Preview the first five rows of the test file (shown as the cell output)
test_df.head(5)

In [None]:
# Predict survival for the rows of the Kaggle test file with the tuned
# pipeline; nothing is scored here, the predictions feed the submission.
test_predictions = best_tree_model.predict(test_df)

In [None]:
# Pair each test passenger id with its predicted label
submission_columns = {
    'PassengerId': test_df['PassengerId'],
    'Survived': test_predictions,
}
submission_df = pd.DataFrame(submission_columns)

# Write the submission file without the index column
submission_df.to_csv('/kaggle/working/submission.csv', index=False)