### PCA in Machine Learning Workflows
#### Machine Learning I - Maestría en Analítica Aplicada
#### Universidad de la Sabana
#### Prof: Hugo Franco
#### Example: XGBoost

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from xgboost import plot_importance  # For feature importance visualization
import matplotlib.pyplot as plt

In [None]:
# Load the Iris table from the working directory and show the first rows.
df = pd.read_csv("iris.csv")
print(df.head())

print("\nColumns:", df.columns)

# Target is the "Species" column; every other column is kept as a feature.
# NOTE(review): if this CSV carries a row-identifier column (e.g. "Id"), it
# would be fed to the model as a feature too -- check the printed column list.
y = df["Species"]
X = df.drop(columns=["Species"])

In [None]:
# Learn the label -> integer mapping first, then apply it to the target.
# The fitted encoder stays available as `le`, so `le.classes_` can be used
# to translate integer classes back to their original labels.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

In [None]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# encoded labels and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

In [None]:
# Hyperparameters for the boosted-tree classifier, gathered in one mapping so
# they are easy to scan and tweak.
# NOTE(review): `use_label_encoder` is deprecated in newer XGBoost releases
# and may only trigger an "unused parameter" warning there -- confirm against
# the installed version before removing it.
xgb_params = {
    "num_class": len(le.classes_),  # one class per label seen by the encoder
    "eval_metric": 'mlogloss',
    "use_label_encoder": False,
    "max_depth": 4,
    "learning_rate": 0.3,
    "n_estimators": 100,
    "random_state": 42,
}
model = XGBClassifier(**xgb_params)

# Train on the training split only; the test split is kept for evaluation.
model.fit(X_train, y_train)

In [None]:
 # Predict class labels
y_pred = model.predict(X_test)

# Predict class probabilities
y_prob = model.predict_proba(X_test)


print("\n Accuracy:", accuracy_score(y_test, y_pred))
print("\n Classification Report:\n",
      classification_report(y_test, y_pred, target_names=le.classes_))

print("\n Predicted class labels:", y_pred[:5])
print(" Actual class labels   :", y_test[:5])

print("\n Predicted probabilities for first 5 samples:\n", y_prob[:5])

In [None]:
# Plot feature importance using the 'weight' importance type and label the
# x-axis accordingly. The title is set on the Axes object that
# plot_importance returns.
ax = plot_importance(model, importance_type='weight', xlabel='F-Score')
ax.set_title("Feature Importance (by F-Score)")
plt.show()