TASK 2

Perform data cleaning and exploratory data analysis (EDA) on a dataset of your choice, such as the Titanic dataset from Kaggle. Explore the relationships between variables and identify patterns and trends in the data.


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  # must be `sns`: every plotting cell below calls seaborn through that alias

In [None]:
# Load dataset
# NOTE(review): absolute path looks like a hosted-notebook working directory —
# confirm it exists before running this notebook elsewhere.
titanic_data = pd.read_csv("/content/train.csv")

In [None]:
# Preview the first rows to sanity-check that the CSV parsed into sensible columns
titanic_data.head()

In [None]:
# Data description
# A notebook cell only renders its LAST bare expression, so the intermediate
# summaries are printed explicitly; otherwise describe/shape/columns would be
# computed and silently discarded.
print(titanic_data.describe())
titanic_data.info()  # info() writes its report to stdout by itself
print(f"\nShape: {titanic_data.shape}")
print(f"\nColumns: {titanic_data.columns.tolist()}")
titanic_data.dtypes  # last expression: rendered as the cell output

In [None]:
# Missing entries per column; the bare Series on the last line is rendered as the cell output
print("NULL VALUES COUNT:")
titanic_data.isna().sum()

In [None]:
# Count rows that exactly repeat an earlier row
duplicate_count = titanic_data.duplicated().sum()
print(f'duplicate values count : {duplicate_count}')

In [None]:
# DATA CLEANING AND PREPROCESSING

# Columns removed outright: Cabin (high missingness) and Ticket / Name (treated as non-predictive)
columns_to_drop = ['Cabin', 'Ticket', 'Name']
# Columns stored as `category` dtype for memory efficiency
categorical_features = ['Sex', 'Embarked', 'Pclass']

# drop() returns a new frame, so the loaded `titanic_data` is never modified
df_cleaned = titanic_data.drop(columns=columns_to_drop)

# Imputation values: median for Age (robust to outliers), most frequent value for Embarked
age_median = df_cleaned['Age'].median()
embarked_mode = df_cleaned['Embarked'].mode()[0]
df_cleaned = df_cleaned.fillna({'Age': age_median, 'Embarked': embarked_mode})

# Cast the categorical features in one pass
df_cleaned = df_cleaned.astype({feature: 'category' for feature in categorical_features})

# Validate cleaning process
print("\nPOST-CLEANING MISSING VALUE REPORT:")
print(df_cleaned.isnull().sum())
print(f"\nDataset shape: {df_cleaned.shape}")
print(f"\nData types after conversion:\n{df_cleaned.dtypes}")

In [None]:
# EXPLORATORY DATA ANALYSIS (UNIVARIATE)

# Shared look for the univariate charts: white grid background, default figure size and font size
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 12,
})
# Palette reused by the plotting cells below (blue, red, green)
colors = ["#3498db", "#e74c3c", "#2ecc71"]

In [None]:
# 1. Survival Distribution
plt.figure()
ax = sns.countplot(data=df_cleaned, x='Survived', palette=[colors[1], colors[0]])
plt.title('Passenger Survival Distribution', fontsize=14, fontweight='bold')
plt.xticks([0, 1], ['Perished', 'Survived'])
plt.xlabel('Survival Status')
plt.ylabel('Count')

# Annotate each bar with its count and its share of all passengers.
# `total` is also read by later univariate cells, so it stays a cell-level name.
total = len(df_cleaned)
for p in ax.patches:
    # Bar heights can come back as floats; cast so the label shows a whole
    # count (e.g. "12" rather than "12.0").
    count = int(p.get_height())
    percentage = f'{100 * count / total:.1f}%'
    x = p.get_x() + p.get_width() / 2
    y = count + 10  # offset in data units so the text sits above the bar
    ax.annotate(f'{count}\n({percentage})', (x, y), ha='center')

plt.tight_layout()
plt.show()


In [None]:
# 2. Gender Distribution
plt.figure()
ax = sns.countplot(data=df_cleaned, x='Sex', palette=[colors[0], colors[2]])
plt.title('Passenger Gender Distribution', fontsize=14, fontweight='bold')
plt.xlabel('Gender')
plt.ylabel('Count')

# Annotate each bar with its count and its share of all passengers.
# The count is read from the bar itself: value_counts() sorts by frequency,
# which need not match the order in which the bars are drawn, so indexing a
# value_counts() result by bar position can attach a count to the wrong bar.
total = len(df_cleaned)
for p in ax.patches:
    count = int(p.get_height())
    percentage = f'{100 * count / total:.1f}%'
    ax.annotate(f'{count}\n({percentage})',
                (p.get_x() + p.get_width() / 2, count + 10),
                ha='center')

plt.tight_layout()
plt.show()


In [None]:
# 3. Age Distribution
plt.figure()
ax = sns.histplot(data=df_cleaned, x='Age', bins=30, kde=True, color=colors[0])
ax.set_title('Passenger Age Distribution', fontsize=14, fontweight='bold')
ax.set_xlabel('Age (Years)')
ax.set_ylabel('Frequency')

# Vertical reference lines for the central tendency (median first, then mean)
median_age = df_cleaned['Age'].median()
mean_age = df_cleaned['Age'].mean()
for stat_value, line_color, line_style, stat_name in (
    (median_age, 'r', '--', 'Median'),
    (mean_age, 'g', '-', 'Mean'),
):
    ax.axvline(stat_value, color=line_color, linestyle=line_style,
               label=f'{stat_name}: {stat_value:.1f}')
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# 4. Passenger Class Distribution
plt.figure()
# Counts keyed by class label, sorted so they follow the 1, 2, 3 bar order
class_counts = df_cleaned['Pclass'].value_counts().sort_index()
ax = sns.countplot(data=df_cleaned, x='Pclass', order=[1, 2, 3], palette=colors)
ax.set_title('Passenger Class Distribution', fontsize=14, fontweight='bold')
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(['First Class', 'Second Class', 'Third Class'])
ax.set_xlabel('Ticket Class')
ax.set_ylabel('Count')

# Annotate each bar with the class count and its share of all passengers
# (`total` is defined in the survival-distribution cell)
for pclass, bar in enumerate(ax.patches, start=1):
    share = f'{100 * bar.get_height()/total:.1f}%'
    anchor = (bar.get_x() + bar.get_width()/2, bar.get_height() + 10)
    ax.annotate(f'{class_counts[pclass]}\n({share})', anchor, ha='center')

plt.tight_layout()
plt.show()


In [None]:
# 5. Fare Distribution
plt.figure()
ax = sns.histplot(data=df_cleaned, x='Fare', bins=30, kde=True, color=colors[0])
ax.set_title('Ticket Fare Distribution', fontsize=14, fontweight='bold')
ax.set_xlabel('Fare (Currency Units)')
ax.set_ylabel('Frequency')

# Mark the median fare
median_fare = df_cleaned['Fare'].median()
ax.axvline(median_fare, color='r', linestyle='--', label=f'Median: {median_fare:.1f}')
# Only the view is clipped to 0-300; fares above that stay in the data but are not shown
ax.set_xlim(0, 300)
ax.legend()

plt.tight_layout()
plt.show()


In [None]:
# 6. Embarkation Port Distribution
plt.figure()
ax = sns.countplot(data=df_cleaned, x='Embarked', palette=colors,
                  order=['C', 'Q', 'S'])
plt.title('Embarkation Port Distribution', fontsize=14, fontweight='bold')
plt.xticks([0, 1, 2], ['Cherbourg', 'Queenstown', 'Southampton'])
plt.xlabel('Port of Embarkation')
plt.ylabel('Count')

# Annotate each bar with its count and its share of all passengers.
# The count is read from the bar itself: value_counts() is sorted by frequency,
# whereas the bars are drawn in the explicit C, Q, S order, so indexing the
# value_counts() result by bar position would label bars with another port's count.
total = len(df_cleaned)
for p in ax.patches:
    count = int(p.get_height())
    percentage = f'{100 * count / total:.1f}%'
    ax.annotate(f'{count}\n({percentage})',
                (p.get_x() + p.get_width() / 2, count + 10),
                ha='center')

plt.tight_layout()
plt.show()

In [None]:
# BIVARIATE ANALYSIS - RELATIONSHIPS WITH SURVIVAL

# Larger default canvas and a softer palette for the grouped charts
plt.rcParams['figure.figsize'] = (12, 8)
sns.set_palette("pastel")
# Use seaborn's own API for the white-grid look: the Matplotlib style sheet
# name 'seaborn-whitegrid' was deprecated in Matplotlib 3.6 and is rejected by
# later releases, while sns.set_style works regardless of Matplotlib version.
sns.set_style("whitegrid")


In [None]:
# 1. Survival by Gender - Enhanced with percentages
plt.figure()
ax = sns.countplot(data=df_cleaned, x='Sex', hue='Survived',
                  hue_order=[0, 1])
plt.title('Survival Distribution by Gender', fontsize=16, fontweight='bold')
plt.xlabel('Gender', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.legend(title='Survival Status', labels=['Perished', 'Survived'])

# Label every bar with its share of ALL passengers (not of its own gender group).
# Bars are taken from ax.containers rather than ax.patches: with `hue` set,
# newer seaborn releases may add zero-sized legend proxy patches to ax.patches,
# which would otherwise receive stray "0.0%" labels.
n_passengers = len(df_cleaned)
for container in ax.containers:
    share_labels = [f'{bar.get_height() / n_passengers * 100:.1f}%' for bar in container]
    ax.bar_label(container, labels=share_labels, padding=3, fontsize=10)

plt.tight_layout()
plt.show()


In [None]:
# 2. Survival by Passenger Class - Professional visualization
plt.figure()
ax = sns.countplot(data=df_cleaned, x='Pclass', hue='Survived',
                  order=[1, 2, 3], hue_order=[0, 1])
plt.title('Survival Distribution by Passenger Class', fontsize=16, fontweight='bold')
plt.xlabel('Passenger Class', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.xticks([0, 1, 2], ['First Class', 'Second Class', 'Third Class'])
plt.legend(title='Survival Status', labels=['Perished', 'Survived'])

# Label every bar with its share of ALL passengers (not of its own class).
# Bars are taken from ax.containers rather than ax.patches: with `hue` set,
# newer seaborn releases may add zero-sized legend proxy patches to ax.patches,
# which would otherwise receive stray "0.0%" labels.
n_passengers = len(df_cleaned)
for container in ax.containers:
    share_labels = [f'{bar.get_height() / n_passengers * 100:.1f}%' for bar in container]
    ax.bar_label(container, labels=share_labels, padding=3, fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
# 3. Age vs Survival - Enhanced visualization
plt.figure(figsize=(10, 6))
# NOTE(review): `split=True` is passed without a `hue`; how that renders appears
# to depend on the seaborn version — confirm the intended look.
ax = sns.violinplot(data=df_cleaned, x='Survived', y='Age',
                    palette='Set2', split=True, inner='quartile')
ax.set_title('Age Distribution by Survival Status', fontsize=16, fontweight='bold')
ax.set_xlabel('Survival Status', fontsize=12)
ax.set_ylabel('Age', fontsize=12)
ax.set_xticks([0, 1])
ax.set_xticklabels(['Perished', 'Survived'])

# Median age per outcome (0 = perished, 1 = survived), shown in a text box on the plot
age_medians = df_cleaned.groupby('Survived')['Age'].median()
median_age_perished = age_medians.loc[0]
median_age_survived = age_medians.loc[1]
median_summary = (
    f"Median Age (Perished): {median_age_perished:.1f}\n"
    f"Median Age (Survived): {median_age_survived:.1f}"
)
ax.text(0.5, 80, median_summary, bbox=dict(facecolor='white', alpha=0.8))

plt.tight_layout()
plt.show()

In [None]:
# 4. Survival by Embarkation Port
plt.figure()
ax = sns.countplot(data=df_cleaned, x='Embarked', hue='Survived',
                   hue_order=[0, 1], order=['C', 'Q', 'S'])
ax.set_title('Survival Distribution by Embarkation Port', fontsize=16, fontweight='bold')
ax.set_xlabel('Port of Embarkation', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
# Replace the single-letter port codes with readable names, in the plotted C, Q, S order
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(['Cherbourg', 'Queenstown', 'Southampton'])
ax.legend(title='Survival Status', labels=['Perished', 'Survived'])
plt.tight_layout()
plt.show()

In [None]:
# 5. CORRELATION ANALYSIS - ENHANCED

# Build a numeric view for the correlation. Sex and Pclass are listed as
# categorical features in the cleaning step, and corr(numeric_only=True) skips
# `category` columns, so they are re-encoded as integer codes here. Codes follow
# the sorted category order (Pclass keeps its 1 < 2 < 3 ordering; for Sex the
# sign of a correlation depends on which label sorts first).
# NOTE(review): PassengerId is assumed to be a row identifier with no analytic
# meaning — confirm; it is dropped only if present.
corr_input = df_cleaned.drop(columns=['PassengerId'], errors='ignore')
for encoded_col in ('Sex', 'Pclass'):
    corr_input[encoded_col] = corr_input[encoded_col].astype('category').cat.codes

# Create correlation matrix
corr_matrix = corr_input.corr(numeric_only=True)

# Mask the upper triangle (diagonal included) so each pair is shown only once
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='coolwarm',
            fmt=".2f", linewidths=0.5, vmin=-1, vmax=1,
            annot_kws={"size": 12})
plt.title('Feature Correlation Matrix', fontsize=16, fontweight='bold')
plt.xticks(fontsize=10, rotation=45)
plt.yticks(fontsize=10, rotation=0)
plt.tight_layout()
plt.show()


In [None]:
# 6. GROUP ANALYSIS - SURVIVAL RATES

#  Survival Rate by Gender and Class
plt.figure(figsize=(10, 6))
# Mean of the 0/1 Survived flag per (Sex, Pclass) group = survival rate.
# `observed=True` is passed explicitly: the grouping keys are categorical, and
# the default for `observed` differs between pandas versions (recent releases
# warn when it is omitted). Only combinations present in the data are kept.
survival_rates = (
    df_cleaned
    .groupby(['Sex', 'Pclass'], observed=True)['Survived']
    .mean()
    .reset_index()
)
sns.barplot(data=survival_rates, x='Sex', y='Survived', hue='Pclass',
            palette='viridis', hue_order=[1, 2, 3])
plt.title('Survival Rate by Gender and Passenger Class', fontsize=16, fontweight='bold')
plt.xlabel('Gender', fontsize=12)
plt.ylabel('Survival Rate', fontsize=12)
plt.ylim(0, 1)  # rates are proportions, so pin the axis to the full 0-1 range
plt.legend(title='Passenger Class', labels=['First', 'Second', 'Third'])
plt.tight_layout()
plt.show()



**✅ Conclusion from Titanic Dataset EDA**



After cleaning and exploring the Titanic dataset, several meaningful insights were discovered that highlight patterns in passenger survival:

🚹 Gender Impact:
Females had a significantly higher survival rate than males.

Survival rate among women was especially high in first and second class.

🛳️ Passenger Class (Pclass):
1st class passengers had the highest survival rates, followed by 2nd, then 3rd.

3rd class passengers were least likely to survive, regardless of gender.

🧒 Age Factor:
Age distribution showed that children under 10 had slightly better survival chances.

However, there wasn't a strict linear relationship between age and survival.

📊 Correlations:
Sex and Pclass were the strongest predictors of survival.

Other features like age and number of siblings/spouses aboard had moderate influence.

🌍 Embarked Port:
Most passengers embarked from Southampton, and survival rates varied slightly by port.

Passengers from Cherbourg (C) had better survival odds (likely due to class/gender mix).

📌 Final Insights:
Women and children in higher classes were prioritized during evacuation.

Being male and in 3rd class significantly reduced survival chances.

Strong patterns exist between demographic features (Sex, Age, Class) and survival outcome, which can guide predictive modeling.
