# 🎓 AI Bootcamp - Week 5 Day 4
## Feature Engineering: Complete Hands-on Guide

Today you'll learn:
- ✅ Categorical variable encoding
- ✅ Feature scaling techniques  
- ✅ PCA for dimensionality reduction
- ✅ Real Titanic dataset analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Seed NumPy's global random number generator so results repeat between runs
np.random.seed(42)
# Apply matplotlib's seaborn-style dark grid theme to every plot in the notebook
plt.style.use('seaborn-v0_8-darkgrid')
# Status message (the check mark was previously stored as mis-decoded bytes)
print('✅ Ready to go!')

## Part 1: Categorical Encoding

ML models need numbers, not text! Let's convert categories.

In [None]:
# Toy example: one categorical column, encoded two different ways
colors = pd.Series(['Red', 'Blue', 'Green', 'Red', 'Blue'])

# Label encoding -- learn the set of categories first, then map every value
# to its integer code
le = LabelEncoder()
le.fit(colors)
colors_encoded = le.transform(colors)
print('Label Encoded:', colors_encoded)

# One-hot encoding -- one indicator column per category, prefixed with 'color'
colors_onehot = pd.get_dummies(colors, prefix='color')
print('\nOne-Hot Encoded:\n', colors_onehot)

## Part 2: Feature Scaling

Put all features on the same scale!

In [None]:
# Two numeric columns whose values live on very different scales
data = pd.DataFrame({'Age': [25, 30, 35, 40],
                     'Salary': [50000, 60000, 75000, 90000]})
print('Original data:\n', data)

# Standardization: rescale each column to zero mean and unit variance
scaler = StandardScaler()
standardized_values = scaler.fit_transform(data)
data_std = pd.DataFrame(standardized_values,
                        columns=[f'{name}_std' for name in data.columns])
print('\nStandardized:\n', data_std)

# Normalization: rescale each column into the 0-1 range
normalizer = MinMaxScaler()
normalized_values = normalizer.fit_transform(data)
data_norm = pd.DataFrame(normalized_values,
                         columns=[f'{name}_norm' for name in data.columns])
print('\nNormalized:\n', data_norm)

## Part 3: PCA - Principal Component Analysis

Reduce dimensions while keeping variance!

In [None]:
# Generate sample high-dimensional data
from sklearn.datasets import make_classification

# 200 synthetic samples with 10 features (8 informative, 2 redundant)
X, y = make_classification(n_samples=200, n_features=10,
                           n_informative=8, n_redundant=2,
                           random_state=42)

print(f'Original shape: {X.shape}')

# Apply PCA to reduce to 2 dimensions
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

print(f'After PCA: {X_pca.shape}')
print(f'Variance explained: {pca.explained_variance_ratio_}')
print(f'Total variance: {pca.explained_variance_ratio_.sum():.2%}')

# Visualize the projected samples, colored by class label
plt.figure(figsize=(10, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', alpha=0.6)
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
# Take the dimensions from the arrays so the title stays correct if
# n_features or n_components is changed above
plt.title(f'PCA: {X.shape[1]}D → {X_pca.shape[1]}D')
plt.colorbar(label='Class')
plt.show()

## Part 4: TITANIC DATASET - Complete Example

Let's apply everything to the famous Titanic dataset!

In [None]:
# Read the Titanic passenger CSV from its GitHub URL into a DataFrame
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df = pd.read_csv(url)

# Report the table dimensions, then preview the top rows as the cell output
print(f'Shape: {df.shape}')
print('\nFirst few rows:')
df.head()

In [None]:
# Per-column count of missing entries, followed by each column's dtype
print('Missing values:\n', df.isna().sum())
print('\nData types:\n', df.dtypes)

In [None]:
# Data preprocessing
# Keep the target column plus the columns used as model inputs
selected_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
df_clean = df[selected_columns].copy()

# Handle missing values: median for the numeric columns, most frequent value
# for the categorical one. The filled column is assigned back instead of
# calling `df_clean[col].fillna(..., inplace=True)`: that chained in-place
# form operates on a temporary column object, raises a FutureWarning on
# pandas 2.x and leaves df_clean unchanged when Copy-on-Write is active.
for column in ('Age', 'Fare'):
    df_clean[column] = df_clean[column].fillna(df_clean[column].median())
df_clean['Embarked'] = df_clean['Embarked'].fillna(df_clean['Embarked'].mode()[0])

# Encode categorical variables as integer codes (a fresh encoder per column)
for column in ('Sex', 'Embarked'):
    df_clean[column] = LabelEncoder().fit_transform(df_clean[column])

print('Clean data:')
df_clean.head()

In [None]:
# Split the cleaned table into the label column (y) and the model inputs (X)
y = df_clean['Survived']
X = df_clean.drop(columns='Survived')

print(f'Features shape: {X.shape}')
print(f'Target shape: {y.shape}')

In [None]:
# Standardize features (required for PCA): learn the per-column statistics
# from X, then apply them to X
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Summary statistics of the scaled columns, labelled with the original names
scaled_summary = pd.DataFrame(X_scaled, columns=X.columns).describe()
print('Features after standardization:')
print(scaled_summary)

In [None]:
# Fit PCA without a component limit so every component's variance share is available
pca_full = PCA()
X_pca_full = pca_full.fit_transform(X_scaled)

variance_ratios = pca_full.explained_variance_ratio_
cumulative_variance = np.cumsum(variance_ratios)
component_numbers = range(1, len(variance_ratios) + 1)

# Two side-by-side panels: per-component variance (left), running total (right)
fig, (ax_bar, ax_cum) = plt.subplots(1, 2, figsize=(12, 5))

ax_bar.bar(component_numbers, variance_ratios)
ax_bar.set_xlabel('Principal Component')
ax_bar.set_ylabel('Variance Explained')
ax_bar.set_title('Variance by Component')

ax_cum.plot(component_numbers, cumulative_variance, 'bo-')
ax_cum.set_xlabel('Number of Components')
ax_cum.set_ylabel('Cumulative Variance Explained')
ax_cum.set_title('Cumulative Variance')
# Dashed reference line at the 95% threshold
ax_cum.axhline(y=0.95, color='r', linestyle='--', label='95% variance')
ax_cum.legend()

fig.tight_layout()
plt.show()

# argmax over the boolean array gives the position of the first True, i.e. the
# first component count whose cumulative variance reaches 0.95 (0-based, so +1)
components_for_95 = np.argmax(cumulative_variance >= 0.95) + 1
print(f'Variance by component: {variance_ratios}')
print(f'\nComponents needed for 95% variance: {components_for_95}')

In [None]:
# Reduce to 2D for visualization
pca_2d = PCA(n_components=2)
X_pca_2d = pca_2d.fit_transform(X_scaled)

# Boolean mask of survivors as a plain NumPy array, so indexing the NumPy
# projection does not rely on implicit conversion of a pandas object
survived = np.asarray(y == 1)

# Visualize: one scatter layer per outcome so each gets its own legend entry
plt.figure(figsize=(10, 6))
plt.scatter(X_pca_2d[survived, 0], X_pca_2d[survived, 1],
            c='green', label='Survived', alpha=0.6)
plt.scatter(X_pca_2d[~survived, 0], X_pca_2d[~survived, 1],
            c='red', label='Did not survive', alpha=0.6)
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
# Take the input dimensionality from the data so the title stays correct
# when features are added or removed
plt.title(f'Titanic Data: {X_scaled.shape[1]}D → 2D with PCA')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print(f'2D variance explained: {sum(pca_2d.explained_variance_ratio_):.2%}')

In [None]:
# Compare model performance: Original vs PCA
# Number of principal components kept for the reduced model
n_pca_components = 5

# Split the unscaled features first and fit the scaler on the training rows
# only. Splitting an array that was scaled with statistics from ALL rows
# lets test-set information leak into preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
split_scaler = StandardScaler()
X_train = split_scaler.fit_transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

# Train on original (standardized) features
model_original = LogisticRegression(max_iter=1000)
model_original.fit(X_train, y_train)
acc_original = accuracy_score(y_test, model_original.predict(X_test))

# Train on PCA features; PCA is fitted on the training split and only
# applied to the test split
pca_5 = PCA(n_components=n_pca_components)
X_train_pca = pca_5.fit_transform(X_train)
X_test_pca = pca_5.transform(X_test)

model_pca = LogisticRegression(max_iter=1000)
model_pca.fit(X_train_pca, y_train)
acc_pca = accuracy_score(y_test, model_pca.predict(X_test_pca))

print(f'Accuracy with {X_train.shape[1]} original features: {acc_original:.3f}')
print(f'Accuracy with {n_pca_components} PCA components: {acc_pca:.3f}')
print(f'\nVariance kept with {n_pca_components} components: '
      f'{sum(pca_5.explained_variance_ratio_):.2%}')
print(f'\n✅ Using {n_pca_components/X_train.shape[1]:.1%} of features, '
      f'we kept {acc_pca/acc_original:.1%} of performance!')

## 🎯 Your Challenge

Try the following:
1. Add more features from Titanic (Name length, Cabin, etc.)
2. Try different numbers of PCA components
3. Compare StandardScaler vs MinMaxScaler for PCA
4. Visualize feature importance before and after PCA

## 📚 Summary

Today you learned:
- ✅ Label encoding for ordinal data
- ✅ One-hot encoding for nominal data
- ✅ Standardization & normalization
- ✅ PCA for dimensionality reduction
- ✅ Real-world application on Titanic!

**Key Takeaways:**
- Always standardize before PCA
- Check variance explained
- Balance dimensionality vs performance
- Feature engineering is iterative!

**Next:** Week 6 - More ML algorithms! 🚀