# Hands-On Data Analysis Tutorial for Beginners: Titanic Dataset

**Introduction:**
This Jupyter Notebook provides a practical introduction to data analysis using Python with Pandas,
covering data cleaning, visualization, and feature engineering. It's designed for beginners and
should take approximately 2 hours to complete.

**Prerequisites:**
* Basic Python knowledge
* Jupyter Notebook environment set up
* Install required libraries: pandas, numpy, scipy, matplotlib, seaborn, scikit-learn, and mlxtend

In [None]:
!pip install pandas matplotlib seaborn scikit-learn numpy

## 1. Pandas Basics

In [17]:
# Import required libraries
import pandas as pd
import numpy as np

# for Box-Cox Transformation
from scipy import stats

# for min_max scaling
from mlxtend.preprocessing import minmax_scaling

# plotting modules
import seaborn as sns
import matplotlib.pyplot as plt

#for feature engineering
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Read the Titanic passenger CSV directly from the Stanford CS109 class archive
url = "https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv"
titanic_data = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Preview the top of the table (head() returns the first rows of the frame)
first_rows = titanic_data.head()
print(first_rows)

In [None]:
# Get a summary of the data (columns, non-null counts, dtypes).
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line after the summary.
titanic_data.info()

In [None]:
# Descriptive statistics, as computed by DataFrame.describe()
summary_stats = titanic_data.describe()
print(summary_stats)

In [None]:
# Data Selection and Filtering
# Indexing with a single column label returns that column on its own
ages = titanic_data['Age']
age_preview = ages.head()
print(age_preview)

In [None]:
# Indexing with a list of labels keeps just those columns
selected_columns = ['Name', 'Sex', 'Age', 'Fare', 'Survived']
subset = titanic_data[selected_columns]
print(subset.head())

In [None]:
# Keep only the rows whose 'Survived' value equals 1, via a boolean mask
survivor_mask = titanic_data['Survived'].eq(1)
survived = titanic_data[survivor_mask]
print(survived.head())

## 2. Data Cleaning

In [None]:
# Load the penguin dataset (this section uses it for the missing-value examples)
# NOTE(review): the "/content/" path presumably targets a Colab-style working
# directory -- adjust it to wherever penguins_lter.csv is stored.
penguin_data = pd.read_csv("/content/penguins_lter.csv")


In [None]:
# Count the missing entries in every penguin column
missing_per_column = penguin_data.isnull().sum()
print(missing_per_column)

In [10]:
# Handle missing values in the numeric measurement columns.

# The median is used for the culmen and isotope (Delta 15 N / Delta 13 C) columns.
median_filled_columns = [
    'Culmen Length (mm)',
    'Culmen Depth (mm)',
    'Delta 15 N (o/oo)',
    'Delta 13 C (o/oo)',
]
for column in median_filled_columns:
    penguin_data[column] = penguin_data[column].fillna(penguin_data[column].median())

# The mode (most frequent value) is used for flipper length and body mass.
# Series.mode() returns a Series (several values can tie), and fillna() with a
# Series aligns on the index -- so only rows labelled 0, 1, ... would be
# filled. Taking the first mode as a scalar fills every missing entry.
mode_filled_columns = ['Flipper Length (mm)', 'Body Mass (g)']
for column in mode_filled_columns:
    most_frequent = penguin_data[column].mode().iloc[0]
    penguin_data[column] = penguin_data[column].fillna(most_frequent)

In [None]:
# Re-count the missing entries per column
remaining_missing = penguin_data.isnull().sum()
print(remaining_missing)

In [13]:
# Text columns get an explicit placeholder instead of NaN:
# 'Sex' is filled with "N/A" and 'Comments' with "No Comments"
text_placeholders = {'Sex': "N/A", 'Comments': "No Comments"}
for column_name, placeholder in text_placeholders.items():
    penguin_data[column_name] = penguin_data[column_name].fillna(placeholder)


In [None]:
# Count the missing entries per column once more
final_missing = penguin_data.isnull().sum()
print(final_missing)

In [None]:
# Count the missing entries in every column of the Titanic data
titanic_missing = titanic_data.isnull().sum()
print(titanic_missing)

In [None]:
# Data Type Conversion
# Convert 'Survived' to string labels for easier interpretation.
# replace() is used rather than map() so the cell can safely be re-run:
# map() turns every value that is not a key of the dict (including the
# 'No'/'Yes' labels left by a previous run) into NaN, whereas replace()
# leaves unmatched values untouched.
titanic_data['Survived'] = titanic_data['Survived'].replace({0: 'No', 1: 'Yes'})

In [None]:
# Handle Duplicates
# duplicated() marks repeated rows; summing the marks gives their count
duplicate_count = titanic_data.duplicated().sum()
print(duplicate_count)

In [None]:
# Drop repeated rows, keeping the first occurrence of each (the default)
titanic_data = titanic_data.drop_duplicates(keep='first')

## 3. Data Visualization

In [None]:
# Use seaborn's "whitegrid" style for the plots
sns.set_style(style="whitegrid")

In [None]:
# Survival Distribution: passenger count for each 'Survived' value
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=titanic_data, x='Survived', ax=ax)
ax.set_title('Survival Distribution')
plt.show()

In [None]:
# Age Distribution: histogram of 'Age' with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(titanic_data['Age'], kde=True, ax=ax)
ax.set_title('Age Distribution')
plt.show()

In [None]:
# Survival by Sex: counts per 'Sex', split by 'Survived'
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=titanic_data, x='Sex', hue='Survived', ax=ax)
ax.set_title('Survival by Sex')
plt.show()

In [None]:
# Survival by Passenger Class: counts per 'Pclass', split by 'Survived'
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=titanic_data, x='Pclass', hue='Survived', ax=ax)
ax.set_title('Survival by Passenger Class')
plt.show()

In [None]:
# Correlation Heatmap: pairwise correlations of the numeric columns
numeric_features = ['Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']
correlation_matrix = titanic_data[numeric_features].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

## 4. Scaling and Normalization

In [18]:
# fix NumPy's global random seed so the sampled data is reproducible
np.random.seed(seed=0)

In [None]:
# draw 1000 random samples from an exponential distribution
original_data = np.random.exponential(size=1000)

# min-max scale the data between 0 and 1
scaled_data = minmax_scaling(original_data, columns=[0])

# show the original and the scaled histogram side by side
fig, (original_ax, scaled_ax) = plt.subplots(1, 2, figsize=(15, 3))
for samples, axis, heading in (
    (original_data, original_ax, "Original Data"),
    (scaled_data, scaled_ax, "Scaled data"),
):
    sns.histplot(samples, ax=axis, kde=True, legend=False)
    axis.set_title(heading)
plt.show()

In [None]:
# Box-Cox transform the exponential samples; the result is indexed below,
# with element 0 holding the transformed values that get plotted
normalized_data = stats.boxcox(original_data)
transformed_values = normalized_data[0]

# show the original and the transformed histogram side by side
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 3))
sns.histplot(original_data, ax=ax[0], kde=True, legend=False)
sns.histplot(transformed_values, ax=ax[1], kde=True, legend=False)
ax[0].set_title("Original Data")
ax[1].set_title("Normalized data")
plt.show()

## 5. Feature Engineering

In [None]:
# Create a new feature: Family Size = siblings/spouses + parents/children + 1
# (the extra 1 presumably counts the passenger themselves)
relatives_aboard = titanic_data['Siblings/Spouses Aboard'].add(titanic_data['Parents/Children Aboard'])
titanic_data['FamilySize'] = relatives_aboard.add(1)

In [None]:
# Bucket 'Age' into four labelled groups using the bin edges below
age_bin_edges = [0, 18, 35, 50, 100]
age_bin_labels = ['Child', 'Young Adult', 'Adult', 'Senior']
titanic_data['AgeGroup'] = pd.cut(titanic_data['Age'], bins=age_bin_edges, labels=age_bin_labels)

In [None]:
# Extract title from Name: the first run of letters that is followed by a
# period. The pattern is written as a raw string because '\.' inside a normal
# string literal is an invalid escape sequence (a warning on recent Pythons).
titanic_data['Title'] = titanic_data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

In [None]:
# Collapse the listed titles into a single 'Rare' value
rare_titles = [
    'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
    'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona',
]
titanic_data['Title'] = titanic_data['Title'].replace(to_replace=rare_titles, value='Rare')

In [None]:
# Show the engineered columns next to 'Name' for the first few rows
engineered_columns = ['Name', 'FamilySize', 'AgeGroup', 'Title']
print(titanic_data[engineered_columns].head())

## 6. Preparing Data for Machine Learning

In [None]:
# Choose the model inputs (X) and the prediction target (y)
features = [
    'Pclass',
    'Sex',
    'Age',
    'Siblings/Spouses Aboard',
    'Parents/Children Aboard',
    'Fare',
    'FamilySize',
    'Title',
]
y = titanic_data['Survived']
X = titanic_data[features]

In [None]:
# Column groups for the preprocessing pipelines: numeric and categorical
# columns are listed separately so each group can get its own transformer
categorical_features = ['Pclass', 'Sex', 'Title']
numeric_features = [
    'Age',
    'Siblings/Spouses Aboard',
    'Parents/Children Aboard',
    'Fare',
    'FamilySize',
]

In [None]:
# Numeric pipeline: median imputation followed by standard scaling
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [None]:
# Categorical pipeline: fill gaps with the constant 'missing', then one-hot
# encode (handle_unknown='ignore' is passed to the encoder)
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [None]:
# Route each column group to its own transformer
column_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [None]:
# Fit the preprocessor on X and transform X in the same call
X_processed = preprocessor.fit_transform(X=X)

In [None]:
# Report the shape of the transformed feature matrix
processed_shape = X_processed.shape
print("Shape of processed features:", processed_shape)

## Conclusion

In this tutorial, we've covered:
1. Loading and exploring data with Pandas
2. Cleaning data by handling missing values and duplicates
3. Visualizing data using Matplotlib and Seaborn
4. Scaling and Normalizing data
5. Feature engineering to create new informative features
6. Preparing data for machine learning

Next steps could include (both are explored in the optional extended part below):
* Trying different visualization techniques
* Experimenting with more feature engineering ideas

## References
* [Pandas](https://www.kaggle.com/learn/pandas)
* [Data Cleaning](https://www.kaggle.com/learn/data-cleaning)
* [Data Visualization](https://www.kaggle.com/learn/data-visualization)
* [Feature Engineering](https://www.kaggle.com/learn/feature-engineering)
* [Titanic Dataset](https://www.kaggle.com/c/titanic/data)
* [Penguin Dataset](https://www.kaggle.com/code/parulpandey/penguin-dataset-the-new-iris/input?select=penguins_lter.csv)

## Optional Extended Part: Advanced Visualizations and Feature Engineering

### 6. Advanced Visualization Techniques

#### 6.1 Pairplot for Multivariate Relationships
A pairplot is useful for exploring relationships between multiple numeric features.

In [None]:
# pairplot() is a figure-level function: it creates its own figure, so a
# preceding plt.figure() call would only leave an empty extra figure behind
# (and its figsize would never reach the pairplot).
sns.pairplot(titanic_data, hue='Survived', vars=['Age', 'Fare', 'FamilySize'], palette='Set2')
# The pairplot's figure is the current figure at this point, so suptitle()
# lands on it; y=1.02 places the title slightly above the grid.
plt.suptitle('Pairplot of Age, Fare, and Family Size by Survival', y=1.02)
plt.show()

#### 6.2 Violin Plot for Distribution and Categories
A violin plot shows the distribution of a numeric variable for different categories.

In [None]:
# Violin plot of 'Age' per 'Pclass', with each violin split by 'Survived'
fig, ax = plt.subplots(figsize=(8, 6))
sns.violinplot(
    data=titanic_data,
    x='Pclass',
    y='Age',
    hue='Survived',
    split=True,
    palette='muted',
    ax=ax,
)
ax.set_title('Age Distribution by Passenger Class and Survival')
plt.show()

#### 6.3 Heatmap for Missing Data
Visualizing missing data can help identify patterns in missingness.

In [None]:
# Heatmap of the boolean missing-value mask (one cell per row/column entry)
missing_mask = titanic_data.isnull()
plt.figure(figsize=(10, 6))
sns.heatmap(missing_mask, cbar=False, cmap='viridis')
plt.title('Missing Data Heatmap')
plt.show()

#### 6.4 Boxen Plot for Outlier Detection
A boxen plot is an enhanced version of a box plot that shows more quantiles.

In [None]:
# Boxen plot of 'Fare' for each passenger class
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxenplot(data=titanic_data, x='Pclass', y='Fare', palette='coolwarm', ax=ax)
ax.set_title('Fare Distribution by Passenger Class')
plt.show()

In [None]:
# #### 6.5 Survival Rate by Age Group (Bar Plot)
# 'AgeGroup' is a categorical column (built with pd.cut). Passing
# observed=False explicitly keeps the current grouping behavior and avoids the
# pandas FutureWarning about the changing default for categorical groupers.
age_group_survival = (
    titanic_data.groupby('AgeGroup', observed=False)['Survived']
    .value_counts(normalize=True)
    .unstack()
)
age_group_survival.plot(kind='bar', stacked=True, figsize=(10, 6), color=['red', 'green'])
plt.title('Survival Rate by Age Group')
plt.ylabel('Proportion')
plt.xlabel('Age Group')
# NOTE(review): the labels assume the first unstacked column is the
# "did not survive" outcome -- confirm against age_group_survival.columns.
plt.legend(['Did Not Survive', 'Survived'], title="Survival")
plt.show()

In [None]:
# #### 6.6 Fare Distribution by Passenger Class (Swarm Plot)
# The x-axis is 'Pclass' (no embarkation-port column is used in this plot);
# each point is one row, coloured by 'Survived'.
plt.figure(figsize=(10, 6))
sns.swarmplot(x='Pclass', y='Fare', hue='Survived', data=titanic_data, palette='Set1')
plt.title('Fare Distribution by Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd) and Survival')
plt.show()

### 7. Experimenting with More Feature Engineering Ideas

#### 7.1 Creating an IsAlone Feature
This feature indicates whether a passenger was traveling alone or with family.

In [None]:
# Flag solo travellers: 1 when FamilySize equals 1, otherwise 0
travels_alone = titanic_data['FamilySize'].eq(1)
titanic_data['IsAlone'] = travels_alone.astype(int)

In [None]:
# Show 'FamilySize' next to the derived 'IsAlone' flag for the first rows
alone_preview = titanic_data[['FamilySize', 'IsAlone']].head()
print(alone_preview)

In [None]:
# Stacked bars of the normalized 'Survived' counts within each 'IsAlone' group
alone_survival = (
    titanic_data.groupby('IsAlone')['Survived']
    .value_counts(normalize=True)
    .unstack()
)
bar_colors = ['red', 'green']
alone_survival.plot(kind='bar', stacked=True, figsize=(8, 6), color=bar_colors)
plt.title('Survival Rate Based on Traveling Alone or Not')
plt.xlabel('Is Alone (1 = Yes)')
plt.ylabel('Proportion')
plt.legend(['Did Not Survive', 'Survived'], title="Survival")
plt.show()

#### 7.2 Creating a Fare Per Person Feature
Divide the fare by the family size to get a normalized fare per person.

In [None]:
# Fare divided by family size, stored as 'FarePerPerson'
titanic_data['FarePerPerson'] = titanic_data['Fare'].div(titanic_data['FamilySize'])
fare_preview = titanic_data[['Fare', 'FamilySize', 'FarePerPerson']].head()
print(fare_preview)

In [None]:
# Histogram (with KDE curve) of the per-person fare
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(titanic_data['FarePerPerson'], kde=True, ax=ax)
ax.set_title('Distribution of Fare Per Person')
ax.set_xlabel('Fare Per Person')
ax.set_ylabel('Frequency')
plt.show()

#### 7.4 Combining Features into Interaction Terms
Combine Pclass and Sex into a single feature to capture interaction effects.

In [None]:
# Join the class (as text) and the sex with an underscore into one label
pclass_text = titanic_data['Pclass'].astype(str)
titanic_data['Pclass_Sex'] = pclass_text + '_' + titanic_data['Sex']
interaction_preview = titanic_data[['Pclass', 'Sex', 'Pclass_Sex']].head()
print(interaction_preview)

In [None]:
# Visualize survival rates based on this new feature
pclass_sex_survival = titanic_data.groupby('Pclass_Sex')['Survived'].value_counts(normalize=True).unstack()
pclass_sex_survival.plot(kind='bar', stacked=True, figsize=(10, 6), color=['red', 'green'])
plt.title('Survival Rate by Pclass and Sex Combination')
plt.ylabel('Proportion')
plt.xlabel('Pclass_Sex')
plt.legend(['Did Not Survive', 'Survived'], title="Survival")
plt.show()

#### 7.5 Encoding Titles into Categories
Map titles into broader categories (e.g., Royalty, Military).

In [None]:
# Map each title to a broader category: the four common titles map to
# themselves, the rest are grouped as Professional, Military or Royalty.
_titles_by_category = {
    "Professional": ("Dr", "Rev"),
    "Military": ("Col", "Major", "Capt"),
    "Royalty": ("Lady", "Countess", "Sir", "Jonkheer", "Don", "Dona"),
}
title_mapping = {title: title for title in ("Mr", "Miss", "Mrs", "Master")}
for _category, _titles in _titles_by_category.items():
    title_mapping.update(dict.fromkeys(_titles, _category))

In [None]:
# The 'Title' column had its uncommon titles collapsed into 'Rare' earlier in
# the notebook, so mapping it directly would send every Dr/Col/Lady/... row to
# "Other" and the Professional/Military/Royalty categories would never
# appear. Re-extract the raw title from 'Name' and map that instead; titles
# without an entry in title_mapping still fall back to "Other".
raw_titles = titanic_data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
titanic_data['TitleCategory'] = raw_titles.map(title_mapping).fillna("Other")
print(titanic_data[['Title', 'TitleCategory']].head())

In [None]:
# Stacked bars of the normalized 'Survived' counts per title category
title_category_survival = (
    titanic_data.groupby('TitleCategory')['Survived']
    .value_counts(normalize=True)
    .unstack()
)
bar_colors = ['red', 'green']
title_category_survival.plot(kind='bar', stacked=True, figsize=(10, 6), color=bar_colors)
plt.title('Survival Rate by Title Category')
plt.xlabel('Title Category')
plt.ylabel('Proportion')
plt.legend(['Did Not Survive', 'Survived'], title="Survival")
plt.show()