In [7]:
#Import Necessary Packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

#Load the dataset ('titanic.csv' is resolved relative to the current working directory)
df_titanic = pd.read_csv('titanic.csv')
df_titanic.head()

#Find Missing Values in the Dataframe
#info() prints dtypes and non-null counts; isnull().sum() gives the number of missing values per column
df_titanic.info()
df_titanic.isnull().sum() #Output shows there are missing values in Age, Cabin and Embarked columns.

#Strip '$' and ',' from the Fare column and convert it to float
#The pattern is a raw string so that '\$' reaches the regex engine as an escaped dollar sign
#instead of being parsed as an invalid Python string escape
df_titanic['Fare'] = df_titanic['Fare'].replace(r'[\$,]', '', regex=True).astype(float)

#Display the modified DataFrame
df_titanic.head()

#Plot Bar Chart For Perished vs Survived. Survived = 0 means Perished, Survived = 1 means Survived.
sns.set_style("whitegrid")
plt.figure(figsize=(6, 4))

#This counts the occurrences of each unique value in the "Survived" column
#value_counts() orders by frequency, while seaborn draws numeric categories in ascending order;
#sorting by index keeps position i in the labelling loop below aligned with the i-th bar
survival_counts = df_titanic['Survived'].value_counts().sort_index()

#Create a bar plot
ax = sns.barplot(x=survival_counts.index, y=survival_counts.values)
plt.title('Survival Distribution')
plt.xlabel('(0: Perished, 1: Survived)')
plt.ylabel('Number of Passengers')

#Add labels for total count on top of the bars (placed 10 units above each bar)
for i, count in enumerate(survival_counts):
    ax.text(i, count + 10, str(count), ha='center', va='bottom')
sns.despine()
plt.show()

#Plot Bar Chart on Number of Male and Female Passengers
plt.figure(figsize=(8, 6))

#This creates a count plot for the "Sex" column
ax = sns.countplot(x='Sex', data=df_titanic, palette='Set2')
plt.title('Number of Male and Female Passengers')
plt.xlabel('Sex')
plt.ylabel('Number of Passengers')

#Add labels for total count on top of the bars, placed 5 units above each bar
#Bar heights are floats, so format with no decimals to show a whole-number count (e.g. "577" instead of "577.0")
for p in ax.patches:
    ax.annotate(f'{p.get_height():.0f}', (p.get_x() + p.get_width() / 2., p.get_height() + 5),
                ha='center', va='bottom')
sns.despine()
plt.show()

#Survival rate of male and female passengers
#Mean of the 'Survived' column within each 'Sex' group, printed as a fraction
survival_rate_by_sex = df_titanic.groupby('Sex')['Survived'].mean()
print(survival_rate_by_sex)

#Rescale the same per-sex mean to a percentage for plotting
survival_rate_by_sex = survival_rate_by_sex * 100

plt.figure(figsize=(8, 6))

#One bar per sex, bar height is the survival percentage
ax = sns.barplot(
    x=survival_rate_by_sex.index,
    y=survival_rate_by_sex.values,
    palette='Set2',
)
ax.set_title('Survival Rate by Sex')
ax.set_xlabel('Sex')
ax.set_ylabel('Survival Rate (%)')

#Write each bar's percentage just above it (1 unit higher, horizontally centred)
for bar in ax.patches:
    rate = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{rate:.2f}%', (bar_center, rate + 1), ha='center', va='bottom')
sns.despine()
plt.show()

#Plot Perished vs Survived Barchart for Male and Female
plt.figure(figsize=(8, 6))

#This creates a count plot with one bar per (Sex, Survived) combination
ax = sns.countplot(x='Sex', hue='Survived', data=df_titanic, palette={0: 'lightcoral', 1: 'skyblue'})
plt.title('Perished vs. Survived by Sex')
plt.xlabel('Sex')
plt.ylabel('Number of Passengers')

#Add labels for total count on top of the bars
#Iterate the bar containers rather than ax.patches so that only real bars are annotated
#(any extra patches added to the axes for the legend are skipped)
#Heights are floats, so format with no decimals to show whole-number counts
for container in ax.containers:
    for bar in container:
        height = bar.get_height()
        ax.annotate(f'{height:.0f}', (bar.get_x() + bar.get_width() / 2., height + 10),
                    ha='center', va='bottom')

#Pass the legend handles explicitly so each label is tied to its hue level (0, then 1)
#instead of relying on the implicit order of artists on the axes
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=['Perished', 'Survived'], title='Legend')
plt.show()

#Survival rate for each passenger class
#Average of the 'Survived' column per 'Pclass' value, shown as a fraction
survival_rate_by_pclass = df_titanic.groupby('Pclass')['Survived'].mean()
print(survival_rate_by_pclass)

#Convert the per-class average to a percentage for the chart
survival_rate_by_pclass = survival_rate_by_pclass * 100

plt.figure(figsize=(8, 6))

#Bar chart of survival percentage, one bar per Pclass
ax = sns.barplot(
    x=survival_rate_by_pclass.index,
    y=survival_rate_by_pclass.values,
    palette='viridis',
)
ax.set_title('Survival Rate by Pclass')
ax.set_xlabel('Pclass')
ax.set_ylabel('Survival Rate (%)')

#Label every bar with its percentage, centred and 1 unit above the bar top
for bar in ax.patches:
    pct = bar.get_height()
    x_mid = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{pct:.2f}%', (x_mid, pct + 1), ha='center', va='bottom')
sns.despine()
plt.show()

#Plot Perished vs Survived for each Pclass
plt.figure(figsize=(10, 6))

#This creates a count plot with one bar per (Pclass, Survived) combination
ax = sns.countplot(x='Pclass', hue='Survived', data=df_titanic, palette={0: 'lightcoral', 1: 'skyblue'})
plt.title('Perished vs. Survived by Pclass')
plt.xlabel('Pclass')
plt.ylabel('Number of Passengers')

#Add labels for total count on top of the bars
#Iterate the bar containers rather than ax.patches so that only real bars are annotated
#(any extra patches added to the axes for the legend are skipped)
#Heights are floats, so format with no decimals to show whole-number counts
for container in ax.containers:
    for bar in container:
        height = bar.get_height()
        ax.annotate(f'{height:.0f}', (bar.get_x() + bar.get_width() / 2., height + 10),
                    ha='center', va='bottom')

#Pass the legend handles explicitly so each label is tied to its hue level (0, then 1)
#instead of relying on the implicit order of artists on the axes
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=['Perished', 'Survived'], title='Survived')
plt.show()
#Histogram of passenger ages
plt.figure(figsize=(10, 6))

#Draw the 'Age' column as a 20-bin histogram and keep the axes it was drawn on
age_ax = df_titanic['Age'].hist(bins=20, color='skyblue', edgecolor='black')
age_ax.set_title('Age Distribution')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')
plt.show()

#Plot Histogram for Age, then keep only the Survived passengers and plot their Age histogram on the same axis.
#Each histogram gets its own color and legend label.
plt.figure(figsize=(10, 6))

#Use one common age range for both histograms so their 20 bins share the same edges;
#otherwise each call would derive its bins from its own min/max and the bars would not be comparable
age_range = (df_titanic['Age'].min(), df_titanic['Age'].max())

#Plot overall age histogram
df_titanic['Age'].hist(bins=20, range=age_range, color='skyblue', edgecolor='black',
                       label='Overall Age Distribution')

#Filter DataFrame for survived passengers (Survived == 1)
survived_df = df_titanic[df_titanic['Survived'] == 1]

#Plot age histogram for survived passengers on the same axis
survived_df['Age'].hist(bins=20, range=age_range, color='lightcoral', edgecolor='black',
                        label='Survived Age Distribution')
plt.title('Overall and Survived Age Distribution')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.legend()
plt.show()