# The goal is to predict which passengers survived the Titanic shipwreck.

## Variable Notes

- **pclass:** A proxy for socio-economic status (SES)
  - 1st = Upper
  - 2nd = Middle
  - 3rd = Lower

- **age:** Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5.

- **sibsp:** The dataset defines family relations in this way...
  - Sibling = brother, sister, stepbrother, stepsister
  - Spouse = husband, wife (mistresses and fiancés were ignored)

- **parch:** The dataset defines family relations in this way...
  - Parent = mother, father
  - Child = daughter, son, stepdaughter, stepson
  - Some children traveled only with a nanny, therefore parch=0 for them.


In [1]:
# Let's take a look at the data

import pandas as pd

file_path = '/kaggle/input/titanic/train.csv'

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame to inspect the data
print(df.head(10))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   
5                                   Moran, Mr. James    male   NaN      0   
6                            McCarthy, Mr. Timothy J    male  54

In [2]:
df.info()
# identify missing values
missing_column_values_df = df.loc[:, df.isnull().any()]
print(missing_column_values_df.columns, "\n", missing_column_values_df.dtypes)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
Index(['Age', 'Cabin', 'Embarked'], dtype='object') 
 Age         float64
Cabin        object
Embarked     object
dtype: object


In [3]:
df["Dependents"] = df["SibSp"] + df["Parch"]

print(f'Passengers who travelled without family members: {df[df["Dependents"] == 0].shape[0]}')
print(f'Passengers who travelled with family members: {df[df["Dependents"] > 0].shape[0]}')

Passengers who travelled without family members: 537
Passengers who travelled with family members: 354


In [29]:
# Calculate the percentage of male and female passengers
male_percentage = (df['Sex'] == 'male').sum() / len(df) * 100
female_percentage = (df['Sex'] == 'female').sum() / len(df) * 100

# Display the percentages
print(f"Percentage of male passengers: {male_percentage:.2f}%")
print(f"Percentage of female passengers: {female_percentage:.2f}%")


Percentage of male passengers: 64.76%
Percentage of female passengers: 35.24%


In [27]:
# Split the dataset
from sklearn.model_selection import train_test_split

# Define features (X) and target variable (y)
X = df.drop('Survived', axis=1)
y = df['Survived']

# Split the data into training and testing sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=df['Sex'])


In [30]:
# Calculate the percentage of male and female passengers
male_percentage_train = (X_train['Sex'] == 'male').sum() / len(X_train) * 100
female_percentage_train = (X_train['Sex'] == 'female').sum() / len(X_train) * 100

male_percentage_test = (X_train['Sex'] == 'male').sum() / len(X_train) * 100
female_percentage_test = (X_train['Sex'] == 'female').sum() / len(X_train) * 100

# Display the percentages
print(f"Percentage of male passengers train: {male_percentage_train:.2f}%")
print(f"Percentage of female passengers train: {female_percentage_train:.2f}%")
print(f"Percentage of male passengers test: {male_percentage_test:.2f}%")
print(f"Percentage of female passengers test: {female_percentage_test:.2f}%")


Percentage of male passengers train: 64.75%
Percentage of female passengers train: 35.25%
Percentage of male passengers test: 64.75%
Percentage of female passengers test: 35.25%


In [17]:
# preprocesing pipeline

from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
import numpy as np

# Define the features and target column
features = df.columns[df.columns != 'Survived']
target = 'Survived'

num_features = make_column_selector(dtype_include=np.number)(df[features])
cat_features = make_column_selector(dtype_include=object)(df[features])


num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("standardize", StandardScaler()),
])

cat_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("one_hot_encode", OneHotEncoder(handle_unknown="ignore"))
])

preprocessing = make_column_transformer(
    (num_pipeline, num_features),
    (cat_pipeline, cat_features),
)

In [34]:
df_processed = preprocessing.fit_transform(df)

In [35]:
print(df_processed.shape)
print(df.shape)

(891, 1731)
(891, 13)


# Correlation Summary

The correlation matrix provides insights into the relationships between different features and the target variable 'Survived'. Here are some key correlations:

### Positive Correlations with Survival:

- Being Female (pipeline-2__Sex_female): 0.692025
  - This suggests a strong positive correlation between being female and survival. Female passengers were more likely to survive.

- Higher Fare (pipeline-1__Fare): 0.414994
  - Passengers who paid higher fares had a positive correlation with survival, indicating a potential association between fare and survival.

- Embarked at Cherbourg (pipeline-2__Embarked_C): 0.286461
  - Passengers who embarked at Cherbourg had a positive correlation with survival.

### Negative Correlations with Survival:

- Lower Passenger Class (pipeline-1__Pclass): -0.472895
  - There is a negative correlation with passenger class, indicating that lower-class passengers were less likely to survive.

- Being Male (pipeline-2__Sex_male): -0.692025
  - This strong negative correlation suggests that being male is associated with a lower likelihood of survival.

### Other Correlations:

- Parch (pipeline-1__Parch): 0.163889
  - A positive correlation with survival, but not as strong as being female or having a higher fare.

- Embarked at Southampton (pipeline-2__Embarked_S): -0.244726
  - Negative correlation with survival. Passengers who embarked at Southampton had a lower chance of survival.

- Cabin B96 B98 (pipeline-2__Cabin_B96 B98): -0.460145
  - Negative correlation with survival. Passengers with this cabin had a lower likelihood of survival.

- Ticket 3101295 (pipeline-2__Ticket_3101295): -0.093152
  - Negative correlation with survival, but not as strong.

These correlations provide valuable insights into the factors influencing survival on the Titanic.


In [36]:
# Convert the sparse matrix to a dense Pandas DataFrame with correct column names
df_processed_dense = pd.DataFrame(df_processed.toarray(), columns=preprocessing.get_feature_names_out())

# Add the 'Survived' column to the processed DataFrame
df_processed_dense['Survived'] = df['Survived']

# Compute the correlation matrix for the processed data
corr_matrix_processed = df_processed_dense.corr()

# Coorelation
corr_matrix = corr_matrix_processed.corr(numeric_only=True)
corr_matrix["Survived"].sort_values(ascending=False)


Survived                      1.000000
pipeline-2__Sex_female        0.692025
pipeline-1__Fare              0.414994
pipeline-2__Embarked_C        0.286461
pipeline-1__Parch             0.163889
                                ...   
pipeline-2__Ticket_3101295   -0.093152
pipeline-2__Embarked_S       -0.244726
pipeline-2__Cabin_B96 B98    -0.460145
pipeline-1__Pclass           -0.472895
pipeline-2__Sex_male         -0.692025
Name: Survived, Length: 1732, dtype: float64

In [None]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV

# Define the hyperparameter grid
param_grid = {
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__splitter': ['best', 'random'],
    'classifier__max_depth': [None, 10, 20, 30, 40, 50],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['auto', 'sqrt', 'log2', None]
}

# Decision tree pipeline with RandomizedSearchCV
tree_model = Pipeline([
    ("preprocessing", preprocessing),
    ("classifier", DecisionTreeClassifier(random_state=42))
])

# Set up RandomizedSearchCV on the pipeline
random_search = RandomizedSearchCV(tree_model, param_distributions=param_grid, n_iter=100, scoring='accuracy', cv=5, n_jobs=-1, random_state=42)

# Perform the search on the training data
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best model
best_tree_model = random_search.best_estimator_

Best Hyperparameters: {'classifier__splitter': 'best', 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 1, 'classifier__max_features': None, 'classifier__max_depth': 30, 'classifier__criterion': 'entropy'}

In [42]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score

# Evaluate the best model on the test set
y_pred = best_tree_model.predict(X_test)

# Evaluate the Decision Tree model
accuracy_tree = accuracy_score(y_test, y_pred)
print(f'Best Decision Tree Accuracy: {accuracy_tree:.2f}')

# Display additional evaluation metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Display the confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Additional metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Display additional metrics
print("\nPrecision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Best Decision Tree Accuracy: 0.84

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.86      0.87       114
           1       0.76      0.80      0.78        65

    accuracy                           0.84       179
   macro avg       0.82      0.83      0.83       179
weighted avg       0.84      0.84      0.84       179


Confusion Matrix:
[[98 16]
 [13 52]]

Precision: 0.7647058823529411
Recall: 0.8
F1 Score: 0.7819548872180452
