# Project 6 Custom EDA
Beth Spornitz
September 23, 2024

This code uses a combination of Python and Markdown to create an initial data story in a Jupyter Notebook. The project includes a project virtual environment with popular libraries for data analytics including pandas, matplotlib, and seaborn, and introduces a common process for starting exploratory data analysis projects.

Specific to the titanic dataset, this file will walk through how to perform these functions:
1.  Data Acquisition
2.  Initial Data Inspection
3.  Initial Descriptive Statistics
4.  Initial Data Distribution for Numerical Columns
5.  Initial Data Distribution for Categorical Columns
6.  Initial Data Preparation (or Data Preprocessing)
7.  Initial Visualizations
8.  Initial Storytelling and Presentation

These steps will be followed by visualizations and explanations of those visualizations.

### Import Dependencies

In [501]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

### Data Acquisition

In [None]:
# Pull seaborn's example Titanic data into a DataFrame.
DATASET_NAME = 'titanic'
df = sns.load_dataset(DATASET_NAME)

# Sanity check: preview the first five rows.
print(df.head(n=5))

### Initial Data Inspection

In [None]:
# Print, in order: the first ten rows, the (rows, columns) shape,
# and the dtype of every column.
for summary in (df.head(10), df.shape, df.dtypes):
    print(summary)

### Initial Descriptive Statistics

In [None]:
# Summary statistics as computed by DataFrame.describe().
descriptive_stats = df.describe()
print(descriptive_stats)

### Initial Data Distribution for Numerical Columns

#### Pair Plot for Age, Fare, and Survived

In [None]:
# Pairwise relationships between age, fare and the survival flag.
pairplot_columns = ['age', 'fare', 'survived']
sns.pairplot(df.loc[:, pairplot_columns])
plt.show()

#### Density Plots
This first density plot is based on age alone.

In [None]:
# Filled kernel density estimate of passenger age.
age_ax = sns.kdeplot(df['age'], fill=True)

# Label the axes object returned by seaborn directly.
age_ax.set_title('Age Density on the Titanic')
age_ax.set_xlabel('age')
age_ax.set_ylabel('Density')
plt.show()

##### Observations
The age density plot is showing that most passengers were between the ages of 20 and 40.

The next density plot combines age with passenger class.

In [None]:
# Age density, one filled curve per passenger class.
class_fig, class_ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(
    data=df,
    x='age',
    hue='pclass',
    fill=True,
    common_norm=False,  # normalize each class's curve independently
    alpha=0.5,
    palette='crest',
    ax=class_ax,
)

# Titles, axis labels and x-range.
class_ax.set_title('Density Plot of Age by Passenger Class')
class_ax.set_xlabel('Age')
class_ax.set_ylabel('Density')
oldest_age = df['age'].max()
class_ax.set_xlim(0, oldest_age)  # clip the x-axis at the oldest passenger

plt.show()

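The age and class density plot suggests:  
- There were more 3rd class passengers than 1st or 2nd class, and there were more 2nd class passengers than there were 1st class passengers.  
- The 3rd class passengers were generally younger than those in 2nd class and 1st class.  

Note: the plot is drawn with `common_norm=False`, so each class's curve is normalized on its own. Confirm the relative class sizes with `df['pclass'].value_counts()` rather than reading them from the curve heights.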


#### Initial Data Distribution for Categorical Columns

In [None]:
# Inspect value counts and draw a count plot for every categorical column
# (columns whose dtype is 'object' or 'category').
categorical_columns = df.select_dtypes(include=['object', 'category']).columns

for col in categorical_columns:
    print(f'Value counts for column: {col}')
    print(df[col].value_counts())
    print()

    # One count plot per column. Each figure is shown inside the loop,
    # so no trailing plt.show() is needed once the loop finishes.
    sns.countplot(x=col, data=df)
    plt.title(f'Distribution of {col}')
    plt.show()

##### Observations

#### Initial Data Transformation and Feature Engineering

In [None]:
# Presentation-friendly labels for seaborn's terse column names
# (old name -> new name).
new_column_names = dict(
    pclass='Passenger Class',
    sex='Sex',
    age='Age',
    sibsp='Siblings/Spouses Aboard',
    parch='Parents/Children Aboard',
    fare='Fare Amount',
    survived='Survival Status',
)

# Apply the mapping in place and confirm the new labels.
df.rename(columns=new_column_names, inplace=True)
print(df.columns)

# Engineered features:
#   Family Size     = siblings/spouses + parents/children + the passenger
#   Fare Per Person = fare divided by family size
siblings_spouses = df['Siblings/Spouses Aboard']
parents_children = df['Parents/Children Aboard']
df['Family Size'] = siblings_spouses.add(parents_children).add(1)
df['Fare Per Person'] = df['Fare Amount'].div(df['Family Size'])

# Preview the frame with the renamed and added columns.
print(df.head())

#### Initial Data Visualization

In [None]:
# Age vs. fare: colour encodes survival, marker style encodes sex.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Age',
    y='Fare Amount',
    hue='Survival Status',
    style='Sex',
    palette='deep',
    alpha=0.7,
    ax=scatter_ax,
)

# Title, axis labels, legend heading and grid.
scatter_ax.set_title('Scatter Plot of Age vs. Fare and Survival on the Titanic')
scatter_ax.set_xlabel('Age')
scatter_ax.set_ylabel('Fare Amount')
scatter_ax.legend(title='Survival Status')
scatter_ax.grid()

plt.show()

#### Pair Plot

In [None]:
# Columns shown in the pair plot; rows missing any of them are dropped first.
pair_vars = [
    'Age',
    'Fare Amount',
    'Passenger Class',
    'Siblings/Spouses Aboard',
    'Parents/Children Aboard',
]

# NOTE: this rebinds `df`, so every later cell that uses `df` works on the
# filtered rows only.
df = df.dropna(subset=pair_vars)

# Pairwise plots coloured by survival.
sns.pairplot(df, hue='Survival Status', vars=pair_vars, palette='deep')

# Place the title slightly above the grid so it does not overlap the top row.
plt.suptitle('Pair Plot of Titanic Dataset', y=1.02)
plt.show()

##### Observations

#### Correlation Heatmap

In [None]:
# Columns that take part in the correlation analysis.
heatmap_columns = ['Survival Status', 'Passenger Class', 'Age', 'Fare Amount']
numeric_cols = df[heatmap_columns]

# Pairwise correlation between the selected columns.
correlation = numeric_cols.corr()

# Annotated heatmap of the correlation matrix with square cells.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    square=True,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Matrix Heatmap for Titanic Dataset')
plt.show()

##### Observations

### Bar Plot

In [None]:
# Passenger counts per (class, sex) row, with one column per survival status.
group_keys = ['Passenger Class', 'Sex', 'Survival Status']
survival_counts = df.groupby(group_keys).size().unstack(fill_value=0)
print(survival_counts)

# Row totals: passengers in each (class, sex) group.
total_counts = survival_counts.sum(axis=1)

# Turn the counts into within-group proportions.
survival_proportions = survival_counts.div(total_counts, axis=0)
print(survival_proportions)

# Stacked bars: red for status 0, green for status 1.
ax = survival_proportions.plot.bar(
    stacked=True,
    color=['red', 'green'],
    figsize=(10, 6),
)

# Title, axis labels and horizontal tick labels.
ax.set_title('Survival Rate by Passenger Class and Sex')
ax.set_xlabel('Passenger Class')
ax.set_ylabel('Proportion of Passengers')
plt.setp(ax.get_xticklabels(), rotation=0)

# Replace the 0/1 legend entries with readable labels.
ax.legend(title='Survived', labels=['No', 'Yes'], loc='upper right')

plt.show()

##### Observations

##### Bar Plot for Survival Counts by Passenger Class

In [None]:
# Contingency table: rows are passenger classes, columns are survival status.
# fill_value=0 keeps a class/status combination with no passengers as a zero
# count instead of NaN.
contingency_titanic = (
    df.groupby(['Passenger Class', 'Survival Status'])
    .size()
    .unstack(fill_value=0)
)

# Relabel by value rather than by position, so each label stays attached to
# the right class/status even if a category is missing or the order changes.
contingency_titanic = contingency_titanic.rename(
    index={1: '1st Class', 2: '2nd Class', 3: '3rd Class'},
    columns={0: 'Dead', 1: 'Survived'},
)

# Create a stacked bar plot
titanic_barplot = contingency_titanic.plot.bar(stacked=True,
                                               color=["lightblue", "lightpink"])

plt.ylabel("Counts")
plt.xlabel('Passenger Class')
plt.xticks(rotation=0)
plt.title('Survival Counts by Passenger Class')
plt.show()

#### Male vs. Female Survivors

In [None]:
# Percentage split of survivors by sex.
#
# The columns were renamed earlier in the notebook, so the frame must be
# indexed with 'Survival Status' / 'Sex'. The original 'survived' / 'sex'
# labels no longer exist on `df` and raise KeyError.
survivors = df[df['Survival Status'] == 1]

# Calculate the counts of males and females among survivors
gender_counts = survivors['Sex'].value_counts()

# Convert the counts to percentages and to a two-column DataFrame for plotting
gender_percentages = (gender_counts / gender_counts.sum() * 100).reset_index()
gender_percentages.columns = ['Gender', 'Percentage']

# Create a bar plot
plt.figure(figsize=(8, 6))
sns.barplot(x='Gender', hue='Gender', y='Percentage', data=gender_percentages, palette='pastel')

# Customize the plot
plt.title('Percentage of Survivors by Gender')
plt.xlabel('Gender')
plt.ylabel('Percentage (%)')

# Add percentage labels just above each bar (bars are drawn in row order)
for index, value in enumerate(gender_percentages['Percentage']):
    plt.text(index, value + 1, f'{value:.1f}%', ha='center')

plt.ylim(0, 100)  # Set y-axis limit to 100%
plt.show()

#### Count Plot

In [None]:
# Passenger counts per class, split by survival status.
count_fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='class', hue='Survival Status', palette='pastel', ax=ax)

# Title and axis labels.
ax.set_title('Count of Passengers by Class and Survival Status')
ax.set_xlabel('Passenger Class')
ax.set_ylabel('Count')

# Keep seaborn's legend handles but replace the 0/1 labels with words.
handles, labels = ax.get_legend_handles_labels()
readable_labels = ['Dead', 'Alive']
ax.legend(handles, readable_labels, title='Survival Status')

plt.show()

##### Observation

#### Box Plot of Fare by Class and Survival
A box plot can help show the distribution of fares paid by passengers in different classes, highlighting survival status.

- `0`: Represents Did Not Survive (Dead)
- `1`: Represents Survived (Alive)

Column names used in this notebook (original name → renamed label):

- `pclass` → `Passenger Class`
- `sex` → `Sex`
- `age` → `Age`
- `sibsp` → `Siblings/Spouses Aboard`
- `parch` → `Parents/Children Aboard`
- `fare` → `Fare Amount`
- `survived` → `Survival Status`

In [None]:
# Fare distribution per passenger class, with one box per survival status.
box_fig, box_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=df,
    x='class',
    y='Fare Amount',
    hue='Survival Status',
    palette='pastel',
    ax=box_ax,
)
box_ax.set_title('Box Plot of Fare by Passenger Class and Survival')
box_ax.set_xlabel('Passenger Class')
box_ax.set_ylabel('Fare')
box_ax.legend(title='Survived')
plt.show()

#### Violin Plot of Age by Class and Survival

In [None]:
# Age distribution per passenger class. split=True puts the two survival
# groups on opposite halves of each violin.
violin_fig, violin_ax = plt.subplots(figsize=(10, 6))
sns.violinplot(
    data=df,
    x='class',
    y='Age',
    hue='Survival Status',
    split=True,
    palette='pastel',
    ax=violin_ax,
)
violin_ax.set_title('Violin Plot of Age by Passenger Class and Survival')
violin_ax.set_xlabel('Passenger Class')
violin_ax.set_ylabel('Age')
violin_ax.legend(title='Survived')
plt.show()

#### FacetGrid of Survival by Age and Class
**Key Features of the FacetGrid**

- **Separate plots for each class**
  - Each column in the grid represents a different passenger class (1st, 2nd, and 3rd class).
  - This allows for easy comparison between classes.
- **Age distribution**
  - Each plot shows the age distribution of passengers using a histogram with a kernel density estimate (KDE) overlay.
  - This helps to visualize how ages are distributed within each class.
- **Survival status**
  - The data is differentiated by survival status (e.g., Survived vs. Did Not Survive).
  - Typically, different colors are used to represent the two groups, making it easy to see how many survived and how many did not.

**Insights You Can Derive**

- **Age trends**
  - You can observe whether certain age groups had higher survival rates in specific classes.
  - For example, you might find that younger passengers had higher survival rates in 1st class compared to older passengers in 3rd class.
- **Class differences**
  - The grid allows you to see how survival chances varied not only by age but also by the class of travel.
  - This can highlight socioeconomic factors influencing survival (e.g., wealthier passengers in higher classes had better survival rates).
- **Survival patterns**
  - By examining the distribution shapes (e.g., peaks, spread), you can identify potential patterns or anomalies in survival based on age and class.


In [None]:
# One facet per passenger class; within each facet, an age histogram with a
# KDE overlay is drawn for each survival status.
g = sns.FacetGrid(df, col='class', hue='Survival Status', height=4, aspect=1)
g.map(sns.histplot, 'Age', bins=30, kde=True)
g.add_legend()

# Adjust and title the grid's own figure explicitly rather than relying on
# pyplot's "current figure". `FacetGrid.figure` replaces the deprecated
# `FacetGrid.fig` alias.
g.figure.subplots_adjust(top=0.85)  # leave headroom for the suptitle
g.figure.suptitle('Age Distribution by Passenger Class and Survival')
plt.show()

#### Analyze Survival of Mothers vs. Women Without Children

In [None]:
# Start again from a fresh copy of the dataset.
# NOTE: this rebinds `df`, so the renamed columns and engineered features
# created earlier in the notebook are no longer present from here on.
df = sns.load_dataset('titanic')

# Encode sex numerically so it can take part in the correlation.
sex_codes = {'male': 1, 'female': 0}
df['sex'] = df['sex'].map(sex_codes)

# Confirm the dtypes after the conversion.
print(df.dtypes)

# Keep only the float64/int64 columns for the correlation matrix.
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
correlation_matrix = df.loc[:, numeric_cols].corr()

# Plot just the 'survived' column of the matrix: each numeric feature's
# correlation with survival.
survival_corr = correlation_matrix.loc[:, ['survived']]
corr_fig, corr_ax = plt.subplots(figsize=(10, 10))
sns.heatmap(survival_corr, cmap="RdBu_r", center=0.0, annot=True, ax=corr_ax)
corr_ax.set_title('Correlation Heatmap with Survival Status')
plt.show()

In [None]:
# Compare the survival of women travelling with parents/children against
# women travelling without any.
#
# The preceding cell reloads `df` from seaborn, so the columns carry their
# original names ('parch', 'survived') rather than the renamed labels used
# earlier, and 'sex' has been re-encoded as male=1 / female=0. Both sex
# encodings are accepted so the cell still works if that mapping was skipped.
is_female = df['sex'].isin([0, 'female'])

# 'parch' counts parents AND children aboard, so 'is_mother' is only a proxy:
# a daughter travelling with her parents is flagged as well.
df['is_mother'] = is_female & (df['parch'] > 0)

# Women with no parents/children aboard
df['is_woman_without_children'] = is_female & (df['parch'] == 0)

# Check counts of each group
num_mothers = df['is_mother'].sum()
num_women_without_children = df['is_woman_without_children'].sum()
print(f"Number of Mothers: {num_mothers}, Number of Women Without Children: {num_women_without_children}")

# Survival rate = mean of the 0/1 'survived' flag within each group
mothers_survival_rate = df.loc[df['is_mother'], 'survived'].mean()
women_without_children_survival_rate = df.loc[df['is_woman_without_children'], 'survived'].mean()

# Print survival rates
print(f"Survival Rate for Mothers: {mothers_survival_rate}")
print(f"Survival Rate for Women Without Children: {women_without_children_survival_rate}")

# Create a DataFrame for the survival statistics
survival_stats = pd.DataFrame({
    'Group': ['Mothers', 'Women without Children'],
    'Survival Rate': [mothers_survival_rate, women_without_children_survival_rate]
})

# Create a bar plot if we have valid data
if survival_stats['Survival Rate'].notnull().any():
    plt.figure(figsize=(8, 6))
    # hue mirrors x so the palette is applied per bar, matching the
    # survivors-by-gender plot earlier in the notebook.
    sns.barplot(x='Group', hue='Group', y='Survival Rate', data=survival_stats, palette='pastel')

    # Customize the plot
    plt.title('Survival Rate: Mothers vs. Women Without Children')
    plt.xlabel('Group')
    plt.ylabel('Survival Rate')

    # Add percentage labels on top of the bars; skip a group with no rate
    for index, value in enumerate(survival_stats['Survival Rate']):
        if pd.notna(value):
            plt.text(index, value + 0.02, f'{value:.1%}', ha='center')

    plt.ylim(0, 1)  # Set y-axis limit to 1
    plt.show()
else:
    print("No valid survival rates to plot.")