Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree

Data Collection and Processing

In [None]:
# Load the passenger records from the CSV file next to this notebook.
titanic_data = pd.read_csv(filepath_or_buffer="titanic.csv")

In [None]:
# First five rows of the raw data
titanic_data.head(n=5)

In [None]:
# Last five rows of the raw data
titanic_data.tail(n=5)

In [None]:
# (number of rows, number of columns) of the loaded frame
titanic_data.shape

In [None]:
# Column names, dtypes and non-null counts
titanic_data.info()

In [None]:
# Summary statistics of the raw data, before any cleaning
titanic_data.describe()

In [None]:
# Number of missing values in each column
titanic_data.isna().sum()

Finding Data Correlations

In [None]:
# Pairwise Pearson correlations between the numeric columns
titanic_data.corr(method='pearson', numeric_only=True)

In [None]:
# Draw the numeric-column correlation matrix as a heatmap
correlation_matrix = titanic_data.corr(numeric_only=True)
sns.heatmap(correlation_matrix)

Handling The Missing Values

In [None]:
# The null counts above show missing values in Age and Cabin.
# Cabin is missing for most rows, so the whole column is dropped.
# errors='ignore' keeps this cell safe to re-run after Cabin is already gone,
# and assigning the result back avoids mutating the frame in place.
titanic_data = titanic_data.drop(columns='Cabin', errors='ignore')

In [None]:
# First five rows after removing Cabin
titanic_data.head(n=5)

In [None]:
# Passenger class column, used below to impute Age per class
titanic_data.loc[:, 'Pclass']

In [None]:
# Age is missing in some rows. Each gap is filled with the average age of the
# passenger's class (Pclass), truncated to a whole number of years.
#
# Why this form:
# - Missing values are handled as NaN directly instead of being replaced by a
#   0 placeholder first, so a genuine age of 0 can never be mistaken for
#   "missing" and the placeholder cannot leak into the class averages.
# - The result is assigned back to the column. Calling
#   fillna(..., inplace=True) on titanic_data['Age'] operates on a column
#   selection, which pandas warns about and which does not update the frame
#   under Copy-on-Write.
# - groupby/map works for whatever Pclass values are present and does not
#   depend on the frame having a default 0..n-1 index.

# Mean age per class (NaN ages are skipped by mean()), truncated like int().
class_avg_age = np.trunc(titanic_data.groupby('Pclass')['Age'].mean())

# Look up each row's class average and use it only where Age is missing.
titanic_data['Age'] = titanic_data['Age'].fillna(titanic_data['Pclass'].map(class_avg_age))




In [None]:
# Embarked is missing in two rows; fill them with the most frequent value (the mode).
# The filled column is assigned back rather than using fillna(..., inplace=True) on
# titanic_data['Embarked'], because an in-place call on a column selection is
# deprecated in pandas and does not update the frame under Copy-on-Write.
most_common_embarked = titanic_data['Embarked'].mode()[0]
titanic_data['Embarked'] = titanic_data['Embarked'].fillna(most_common_embarked)

In [None]:
# Re-check the per-column missing-value counts after imputation
titanic_data.isna().sum()

Data Analysis

In [None]:
# Summary statistics again, now that the missing values have been handled
titanic_data.describe()

In [None]:
# Count how many passengers survived and how many did not.
# In the standard Titanic dataset encoding, 0 means did NOT survive and 1 means survived.
titanic_data['Survived'].value_counts()

In [None]:
# Number of male and female passengers
titanic_data['Sex'].value_counts()

Data Visualization

In [None]:
# Neon colour cycle used by the seaborn plots in this notebook.
neon_palette = [
    "#F5D300",  # laser yellow
    "#FF5F1F",  # blaze orange
    "#FE53BB",  # hot magenta
    "#6F2CF3",  # violet pulse
    "#08F7FE",  # electric cyan
    "#00FF41",  # matrix green
]

# Apply the seaborn theme first; the rcParams overrides below are set afterwards
# so the black background and white text win over the theme's own values.
sns.set_theme(style="dark", palette=neon_palette, context="talk")

background_color = "#000000"
foreground_color = "#FFFFFF"

plt.rcParams.update({
    "figure.facecolor": background_color,  # whole canvas
    "axes.facecolor": background_color,    # plot panel
    "axes.edgecolor": "#444444",
    "grid.color": "#333333",
    "text.color": foreground_color,
    "axes.labelcolor": foreground_color,
    "xtick.color": foreground_color,
    "ytick.color": foreground_color,
})

# OPTIONAL: make grid lines dotted & subtle
#plt.rcParams["grid.linestyle"] = ":"
#plt.rcParams["grid.alpha"]     = 0.25


In [None]:
# Bar chart of the number of passengers per "Survived" value
sns.countplot(data=titanic_data, x='Survived')

In [None]:
# Bar chart of the number of passengers per "Sex" value
sns.countplot(data=titanic_data, x='Sex')

In [None]:
# Passenger counts per sex, split by the "Survived" value
sns.countplot(data=titanic_data, x='Sex', hue='Survived')

In [None]:
# Bar chart of the number of passengers per "Pclass" value
sns.countplot(data=titanic_data, x='Pclass')

In [None]:
# Passenger counts per class, split by the "Survived" value
sns.countplot(data=titanic_data, x='Pclass', hue='Survived')

Encoding The Categorical Columns

In [None]:
# Number of male and female passengers, checked again before encoding the column
titanic_data['Sex'].value_counts()

In [None]:
# Number of rows per category (S, C, Q) in the "Embarked" column
titanic_data['Embarked'].value_counts()

In [None]:
# Encode the categorical columns as integers.
#
# An explicit per-column mapping is used instead of
# DataFrame.replace(..., inplace=True): replace() on object columns relies on
# implicit downcasting of the result, which pandas has deprecated.
# Values that are not keys of a mapping pass through unchanged, so re-running
# this cell on already-encoded data is harmless; astype(int) then fails loudly
# if any unexpected category or missing value is still present.
category_codes = {
    'Sex': {'male': 0, 'female': 1},
    'Embarked': {'S': 0, 'C': 1, 'Q': 2},
}

for column, codes in category_codes.items():
    encoded = titanic_data[column].map(lambda value, codes=codes: codes.get(value, value))
    titanic_data[column] = encoded.astype(int)

In [None]:
# First five rows after encoding
titanic_data.head(n=5)

Separating Features and Targets

In [None]:
# Target: the Survived column.
# Features: every remaining column except PassengerId, Name and Ticket.
Y = titanic_data['Survived']
X = titanic_data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Survived'])

In [None]:
# Preview the feature matrix
X

In [None]:
# Preview the target labels
Y

Splitting the data into training data and test data

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Shapes of the full, training and test feature matrices, in that order
print(*(frame.shape for frame in (X, X_train, X_test)))

### Model Training

Logistic Regression

In [None]:
# The features are not scaled anywhere in this notebook, so the solver may stop at
# scikit-learn's default iteration limit before converging (ConvergenceWarning).
# Use the same, larger iteration budget as the later model-comparison cell.
model = LogisticRegression(max_iter=1000)

In [None]:
# Fit the logistic regression model on the training split
model.fit(X=X_train, y=Y_train)

### Model Evaluation

Accuracy Score

In [None]:
# Predict labels for the training rows (scored in the accuracy cell that follows)
X_train_prediction = model.predict(X=X_train)

In [None]:
# Predicted labels for the training rows
X_train_prediction

In [None]:
# Fraction of training rows whose predicted label matches the true label
training_data_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_prediction)
print('Accuracy of the training data is ', training_data_accuracy)

In [None]:
# Predict labels for the held-out test rows (scored in the accuracy cell that follows)
X_test_prediction = model.predict(X=X_test)

In [None]:
# Predicted labels for the test rows
X_test_prediction

In [None]:
# Fraction of test rows whose predicted label matches the true label
test_data_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_prediction)
print('Accuracy of the test data is ', test_data_accuracy)

Training the model with RandomForestClassifier

In [None]:
# Random forest with default hyperparameters; random_state fixes the seed so results are repeatable
rf_model = RandomForestClassifier(random_state=2)


In [None]:
# Fit the random forest on the training split
rf_model.fit(X=X_train, y=Y_train)


In [None]:
# Random forest predictions for the training split and the test split, in that order
train_predictions, test_predictions = (
    rf_model.predict(split) for split in (X_train, X_test)
)


In [None]:
# Score the random forest predictions against the true labels of each split
train_accuracy = accuracy_score(y_true=Y_train, y_pred=train_predictions)
test_accuracy = accuracy_score(y_true=Y_test, y_pred=test_predictions)

print("Training Accuracy (Random Forest):", train_accuracy)
print("Testing Accuracy (Random Forest):", test_accuracy)


In [None]:
# Horizontal bar chart of the random forest's feature importances,
# ordered by ascending importance (argsort), one bar per feature column.
feature_names = X.columns
importances = rf_model.feature_importances_
indices = np.argsort(importances)

bar_positions = range(len(indices))
sorted_labels = list(feature_names[indices])

plt.figure(figsize=(8, 6))
plt.barh(bar_positions, importances[indices], color="#08F7FE", edgecolor="white")
plt.yticks(bar_positions, sorted_labels)
plt.title("Feature Importances - Random Forest")
plt.xlabel("Relative Importance")
plt.grid(False)
plt.show()

In [None]:
# Logistic regression for the model comparison: fit, then score both splits
log_model = LogisticRegression(max_iter=1000).fit(X_train, Y_train)
log_train_acc = accuracy_score(y_true=Y_train, y_pred=log_model.predict(X_train))
log_test_acc = accuracy_score(y_true=Y_test, y_pred=log_model.predict(X_test))


In [None]:
# Decision tree for the model comparison: Gini splits, depth capped at 5,
# fixed seed for repeatable results. Fit on the training split, then score both splits.
# (DecisionTreeClassifier is imported in the notebook's top import cell.)
dt_model = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=2)
dt_model.fit(X_train, Y_train)
dt_train_acc = accuracy_score(Y_train, dt_model.predict(X_train))
dt_test_acc  = accuracy_score(Y_test,  dt_model.predict(X_test))


In [None]:
# Random forest for the model comparison: 100 trees, depth capped at 5.
# Note: this rebinds `rf_model`, replacing the default-hyperparameter forest fitted earlier.
rf_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=2).fit(X_train, Y_train)
rf_train_acc, rf_test_acc = (
    accuracy_score(labels, rf_model.predict(features))
    for features, labels in ((X_train, Y_train), (X_test, Y_test))
)


In [None]:
# Side-by-side train/test accuracy of the three models, one line per model
print(
    f"Logistic Regression: train={log_train_acc:.3f}, test={log_test_acc:.3f}",
    f"Decision Tree:       train={dt_train_acc:.3f}, test={dt_test_acc:.3f}",
    f"Random Forest:       train={rf_train_acc:.3f}, test={rf_test_acc:.3f}",
    sep="\n",
)
