### PCA in Machine Learning Workflows
#### Machine Learning I - Maestría en Analítica Aplicada
#### Universidad de la Sabana
#### Prof: Hugo Franco
#### Example: Principal Component Analysis

<img src="culmen_depth.png" width="50%">

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer

# Load the penguins dataset
penguins = sns.load_dataset('penguins')

# Display initial information
print("Dataset Overview:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would add a stray
# "None" line after the report.
penguins.info()
print("\nClass Distribution:")
print(penguins['species'].value_counts())

#### Imputation strategies implemented in SimpleImputer 
* mean (default for numeric data)
* median (usually more robust than mean)
* most_frequent
* constant (requires the filling value)

In [None]:
# Columns routed to each branch of the preprocessor
numeric_features = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]
categorical_features = ['sex', 'island']

# Numeric branch: fill gaps with the column median, then standardize
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: missing values become their own 'missing' category,
# which is then one-hot encoded together with the observed categories.
#   drop='first'            -> drop one level per feature (avoids multicollinearity)
#   sparse_output=False     -> dense array instead of a sparse matrix
#   handle_unknown='ignore' -> tolerate categories first seen in test data
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(drop='first',
                             sparse_output=False,
                             handle_unknown='ignore')),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Helper to recover the encoded feature names
def get_feature_names(preprocessor):
    """Return the output feature names: numeric first, then one-hot columns.

    Reads the module-level ``numeric_features`` and ``categorical_features``
    lists. ``preprocessor`` must already be fitted, because the lookup goes
    through its ``named_transformers_`` attribute and the 'onehot' step of
    its 'cat' branch.
    """
    onehot_step = preprocessor.named_transformers_['cat'].named_steps['onehot']
    encoded_names = onehot_step.get_feature_names_out(categorical_features)

    # Numeric columns keep their names; encoded categorical names follow
    return [*numeric_features, *encoded_names]

In [None]:
# End-to-end model: preprocessing followed by a random forest
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100,
                                          max_depth=5,
                                          random_state=42)),
])

# Separate the target from the features
y = penguins['species']
X = penguins.drop(columns=['species'])

# Hold out 20% of the rows, keeping class proportions equal in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Train model
model.fit(X_train, y_train)

# Get predictions
y_pred = model.predict(X_test)

# Print performance metrics
print("\nModel Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Visualize confusion matrix.
# The matrix rows/columns and the tick labels are both taken from the
# fitted model's class order. Labelling the axes with
# penguins['species'].unique() would use order of appearance in the data,
# which is not guaranteed to match the order confusion_matrix uses.
class_labels = model.classes_
plt.figure(figsize=(10, 8))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# Get feature names after encoding (the preprocessor was fitted in place
# by model.fit above)
feature_names = get_feature_names(preprocessor)
print("\nEncoded Feature Names:")
print(feature_names)

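A (risky) method to deal with potential outliers: percentile capping (winsorization), i.e. clipping each numeric feature to the range between its 1st and 99th percentiles.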

In [None]:
def cap_outliers(df, columns, lower_percentile=1, upper_percentile=99,
                 reference=None):
    """Return a copy of ``df`` with the given columns clipped to percentile bounds.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose values are clipped. It is not modified.
    columns : iterable of str
        Names of the columns to cap.
    lower_percentile, upper_percentile : float
        Percentiles (0-100) used as the lower and upper clipping bounds.
    reference : pandas.DataFrame, optional
        Frame from which the bounds are computed. Defaults to ``df`` itself.
        Pass the training frame here when capping a test frame, so that the
        bounds are not influenced by held-out rows.

    Returns
    -------
    pandas.DataFrame
        Capped copy of ``df``. Missing values stay missing.
    """
    bounds_source = df if reference is None else reference
    df_capped = df.copy()
    for column in columns:
        observed = bounds_source[column].dropna()
        if observed.empty:
            # No observed values means no bounds to compute; leave the column
            # untouched rather than handing np.percentile an empty array.
            continue
        # One call yields both bounds
        lower, upper = np.percentile(
            observed, [lower_percentile, upper_percentile])
        df_capped[column] = df_capped[column].clip(lower=lower, upper=upper)
    return df_capped

# Get numerical columns from features only (excluding target)
numerical_cols_features = X.select_dtypes(include=['float64', 'int64']).columns

# Whole-dataset capped copy, for exploration only. It is NOT used for
# model evaluation below, because its bounds are computed from every row,
# including rows that end up in the test set.
X_capped = cap_outliers(X, numerical_cols_features)

# Split the raw data first, with the same settings as the baseline split
# (including stratify=y), so both models are scored on the same test rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Cap the training rows using bounds learned from the training rows only
X_train_capped = cap_outliers(X_train_raw, numerical_cols_features)

# Apply those same training bounds (cap_outliers' default 1st / 99th
# percentiles) to the test rows, so no test information leaks in.
train_lower = X_train_raw[numerical_cols_features].quantile(0.01)
train_upper = X_train_raw[numerical_cols_features].quantile(0.99)
X_test_capped = X_test_raw.copy()
X_test_capped[numerical_cols_features] = (
    X_test_raw[numerical_cols_features]
    .clip(lower=train_lower, upper=train_upper, axis=1))

# Create the pipeline applied to the capped data (the capping itself is
# done above, outside the pipeline)
capped_pipe = Pipeline([
    ('preprocessor', ColumnTransformer([
        ('num', Pipeline([
            # Scaling comes before KNN imputation on purpose: StandardScaler
            # skips NaNs when fitting and keeps them when transforming, so
            # neighbour distances are computed on comparable scales.
            ('scaler', StandardScaler()),
            ('imputer', KNNImputer(n_neighbors=5))
        ]), numeric_features),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(
                drop='first',
                sparse_output=False,
                handle_unknown='ignore'
            ))
        ]), categorical_features)
    ])),
    ('classifier', RandomForestClassifier(random_state=42))
])

capped_pipe.fit(X_train_capped, y_train)
y_pred_capped = capped_pipe.predict(X_test_capped)

# Evaluate results with visualization
print("\nResults with Outlier Capping:")
print("Accuracy:", accuracy_score(y_test, y_pred_capped))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_capped, zero_division=0))

# Visualize confusion matrix for capped results, with rows, columns and
# tick labels all in the fitted classifier's class order
capped_labels = capped_pipe.classes_
plt.figure(figsize=(8, 6))
cm_capped = confusion_matrix(y_test, y_pred_capped, labels=capped_labels)
sns.heatmap(cm_capped, annot=True, fmt='d', cmap='Blues',
            xticklabels=capped_labels,
            yticklabels=capped_labels)
plt.title('Confusion Matrix - With Outlier Capping')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

#### Challenge (Workshop)
1. Use the following code stub to perform the same task on the Cleveland Heart Disease dataset. Test the impact of each imputation strategy on model performance.
2. Compare the performance of Random Forests vs. XGBoost.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Location of the processed Cleveland heart-disease file (UCI repository)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

# Column names are passed explicitly to read_csv below
columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol',
    'fbs', 'restecg', 'thalach', 'exang', 'oldpeak',
    'slope', 'ca', 'thal', 'target',
]

# '?' entries in the file are read as NaN
df = pd.read_csv(url, na_values='?', names=columns)