<a href="https://www.kaggle.com/code/sairoshinikandregula/titanic-survival-rate-prediction?scriptVersionId=211852415" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

***The Titanic Survival Rate Prediction project blends historical data with modern machine learning algorithms to predict the likelihood of survival for passengers aboard the Titanic. By following a systematic process from data exploration to model optimization, this project demonstrates the power of data science in extracting actionable insights from complex datasets.***


****I have crafted this project overview for your ease with sub divisions out above:****

1. Data Import and Setup
2. Data Exploration and Insights
3. Feature Refinement
4. Model Training and Optimization
5. Result Generation


****1. DATA IMPORT AND SETUP****

In [None]:
import warnings

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

plt.style.use('fivethirtyeight')

%matplotlib inline

warnings.filterwarnings('ignore')

In [None]:
train = pd.read_csv("/kaggle/input/titanic-dataset/train.csv")
test = pd.read_csv("/kaggle/input/titanic-dataset/test.csv")
train.shape


In [None]:
train.info()
train.isnull().sum()

****2. DATA EXPLORATION AND INSIGHTS****

In [None]:
from matplotlib import cm
from matplotlib.colors import Normalize
import numpy as np

# Unique Pie Chart with Gradient Effect
f, ax = plt.subplots(1, 2, figsize=(16, 6))

# Gradient colors for the pie chart
survival_counts = train['Survived'].value_counts()
labels = survival_counts.index
sizes = survival_counts.values
norm = Normalize(vmin=min(sizes), vmax=max(sizes))
colors = cm.coolwarm(norm(sizes))  # Gradient color map

# Pie chart customization
wedges, texts, autotexts = ax[0].pie(
    sizes,
    explode=[0.05, 0.1],  # Create separation for more effect
    labels=['Not Survived', 'Survived'], 
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
    colors=colors,
    wedgeprops={'edgecolor': 'black', 'linewidth': 1.5}
)

# Styling the annotations
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontsize(14)
for text in texts:
    text.set_fontsize(12)

# Adding a radial annotation
ax[0].set_title('Survival Distribution with Gradient Colors', fontsize=14)
ax[0].annotate(
    'Survival Status',
    xy=(0, 0),
    ha='center',
    va='center',
    fontsize=16,
    fontweight='bold',
    color='black',
    bbox=dict(boxstyle='circle', facecolor='white', edgecolor='gray', pad=0.3)
)

# Bar plot for context
sns.barplot(
    x=labels, 
    y=sizes, 
    ax=ax[1], 
    palette=cm.coolwarm(norm(sizes))
)
ax[1].set_title('Survival Counts', fontsize=14)
ax[1].set_xlabel('Survived (0 = No, 1 = Yes)', fontsize=12)
ax[1].set_ylabel('Count', fontsize=12)
ax[1].bar_label(ax[1].containers[0], fmt='%d', fontsize=12, padding=3)

# Overall layout
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Assume `train` is your DataFrame
f, ax = plt.subplots(1, 2, figsize=(14, 6))

# Bar Chart: Mean Survival by Sex
survival_by_sex = train[['Sex', 'Survived']].groupby(['Sex']).mean()
colors = ['#FF6F61', '#6A5ACD']  # Custom colors for bars
survival_by_sex.plot.bar(
    ax=ax[0], 
    color=colors, 
    edgecolor='black', 
    alpha=0.8, 
    legend=False
)
ax[0].set_title('Mean Survival Rate by Sex', fontsize=14, fontweight='bold')
ax[0].set_ylabel('Mean Survival Rate', fontsize=12)
ax[0].set_xlabel('Sex', fontsize=12)
ax[0].grid(axis='y', linestyle='--', alpha=0.6)

# Annotate bar chart
for p in ax[0].patches:
    ax[0].annotate(
        f'{p.get_height():.2f}',
        (p.get_x() + p.get_width() / 2, p.get_height()),
        ha='center',
        va='bottom',
        fontsize=11,
        fontweight='bold',
        color='black'
    )

# Countplot: Survival Distribution by Sex
sns.countplot(
    x='Sex', 
    hue='Survived', 
    data=train, 
    ax=ax[1], 
    palette=['#FF6F61', '#6A5ACD']
)
ax[1].set_title('Survival Counts by Sex', fontsize=14, fontweight='bold')
ax[1].set_ylabel('Count', fontsize=12)
ax[1].set_xlabel('Sex', fontsize=12)

# Annotate countplot
for container in ax[1].containers:
    ax[1].bar_label(
        container, 
        fmt='%d', 
        fontsize=11, 
        fontweight='bold'
    )

# Add a legend with better styling
ax[1].legend(
    title='Survival Status', 
    labels=['Deceased (0)', 'Survived (1)'], 
    loc='upper right', 
    fontsize=12, 
    title_fontsize=12,
    frameon=True, 
    shadow=True, 
    borderpad=1
)

# Enhance overall layout
plt.tight_layout()
plt.subplots_adjust(wspace=0.3)
plt.show()


****3. Feature Refinement****

In [None]:
train = train.drop(['Cabin'], axis=1)

test = test.drop(['Cabin'], axis=1)

train = train.drop(['Ticket'], axis=1)

test = test.drop(['Ticket'], axis=1)

train = train.fillna({"Embarked": "S"})

In [None]:
# sort the ages into logical categories

train["Age"] = train["Age"].fillna(-0.5)

test["Age"] = test["Age"].fillna(-0.5)

bins = [-1, 0, 5, 12, 18, 24, 35, 60, np.inf]

labels = ['Unknown', 'Baby', 'Child', 'Teenager',

          'Student', 'Young Adult', 'Adult', 'Senior']

train['AgeGroup'] = pd.cut(train["Age"], bins, labels=labels)

test['AgeGroup'] = pd.cut(test["Age"], bins, labels=labels)

In [None]:
# create a combined group of both datasets

combine = [train, test]



# extract a title for each Name in the

# train and test datasets

for dataset in combine:

    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)



pd.crosstab(train['Title'], train['Sex'])



# replace various titles with more common names

for dataset in combine:

    dataset['Title'] = dataset['Title'].replace(['Lady', 'Capt', 'Col',

                                                 'Don', 'Dr', 'Major',

                                                 'Rev', 'Jonkheer', 'Dona'],

                                                'Rare')



    dataset['Title'] = dataset['Title'].replace(

        ['Countess', 'Lady', 'Sir'], 'Royal')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')

    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')

    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')



train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()



# map each of the title groups to a numerical value

title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3,

                 "Master": 4, "Royal": 5, "Rare": 6}

for dataset in combine:

    dataset['Title'] = dataset['Title'].map(title_mapping)

    dataset['Title'] = dataset['Title'].fillna(0)

In [None]:
mr_age = train[train["Title"] == 1]["AgeGroup"].mode()  # Young Adult

miss_age = train[train["Title"] == 2]["AgeGroup"].mode()  # Student

mrs_age = train[train["Title"] == 3]["AgeGroup"].mode()  # Adult

master_age = train[train["Title"] == 4]["AgeGroup"].mode()  # Baby

royal_age = train[train["Title"] == 5]["AgeGroup"].mode()  # Adult

rare_age = train[train["Title"] == 6]["AgeGroup"].mode()  # Adult



age_title_mapping = {1: "Young Adult", 2: "Student",

                     3: "Adult", 4: "Baby", 5: "Adult", 6: "Adult"}



for x in range(len(train["AgeGroup"])):

    if train["AgeGroup"][x] == "Unknown":

        train["AgeGroup"][x] = age_title_mapping[train["Title"][x]]



for x in range(len(test["AgeGroup"])):

    if test["AgeGroup"][x] == "Unknown":

        test["AgeGroup"][x] = age_title_mapping[test["Title"][x]]

In [None]:
# map each Age value to a numerical value

age_mapping = {'Baby': 1, 'Child': 2, 'Teenager': 3,

               'Student': 4, 'Young Adult': 5, 'Adult': 6,

               'Senior': 7}

train['AgeGroup'] = train['AgeGroup'].map(age_mapping)

test['AgeGroup'] = test['AgeGroup'].map(age_mapping)



train.head()



# dropping the Age feature for now, might change

train = train.drop(['Age'], axis=1)

test = test.drop(['Age'], axis=1)

In [None]:
train = train.drop(['Name'], axis=1)

test = test.drop(['Name'], axis=1)

In [None]:
sex_mapping = {"male": 0, "female": 1}

train['Sex'] = train['Sex'].map(sex_mapping)

test['Sex'] = test['Sex'].map(sex_mapping)



embarked_mapping = {"S": 1, "C": 2, "Q": 3}

train['Embarked'] = train['Embarked'].map(embarked_mapping)

test['Embarked'] = test['Embarked'].map(embarked_mapping)

In [None]:
for x in range(len(test["Fare"])):

    if pd.isnull(test["Fare"][x]):

        pclass = test["Pclass"][x]  # Pclass = 3

        test["Fare"][x] = round(

            train[train["Pclass"] == pclass]["Fare"].mean(), 4)



# map Fare values into groups of

# numerical values

train['FareBand'] = pd.qcut(train['Fare'], 4,

                            labels=[1, 2, 3, 4])

test['FareBand'] = pd.qcut(test['Fare'], 4,

                           labels=[1, 2, 3, 4])



# drop Fare values

train = train.drop(['Fare'], axis=1)

test = test.drop(['Fare'], axis=1)

****4. MODEL TRAINING AND OPTIMIZATION****

In [None]:
from sklearn.model_selection import train_test_split



# Drop the Survived and PassengerId

# column from the trainset

predictors = train.drop(['Survived', 'PassengerId'], axis=1)

target = train["Survived"]

x_train, x_val, y_train, y_val = train_test_split(

    predictors, target, test_size=0.2, random_state=0)

In [None]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score



randomforest = RandomForestClassifier()



# Fit the training data along with its output

randomforest.fit(x_train, y_train)

y_pred = randomforest.predict(x_val)



# Find the accuracy score of the model

acc_randomforest = round(accuracy_score(y_pred, y_val) * 100, 2)

print(acc_randomforest)

****5. RESULT GENERATION****

In [33]:
# Re-create any derived features used during training
test['AgeGroup'] = pd.cut(test['Age'], bins=[0, 12, 18, 35, 60, 100], labels=['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior'])
test['FareBand'] = pd.cut(test['Fare'], bins=[0, 50, 100, 150, 200, 1000], labels=['Low', 'Medium', 'High', 'Very High', 'Elite'])
test['Title'] = test['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())

# Ensure all features in test match those used during training
test = test[['Pclass', 'Sex', 'AgeGroup', 'FareBand', 'Title']]  # Make sure these columns match the train features
test = pd.get_dummies(test)  # One-hot encoding if needed (like for 'Sex', 'AgeGroup', etc.)

# Now you can make predictions
ids = test['PassengerId']
predictions = randomforest.predict(test.drop('PassengerId', axis=1))


KeyError: 'PassengerId'