# Combining PCA and logistic regression on the wine dataset

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector

# Toggle z-score scaling of the features used by the cells below
STANDARDIZE_FEATURES = True

# Fetch the wine dataset and pull out the feature matrix and the class labels
data = datasets.load_wine()
X, y = data.data, data.target

# Hold out 40% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# Scale with statistics learned from the training split only, then apply the
# same transform to the test split
if STANDARDIZE_FEATURES:
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

## PCA and Logistic Regression

In [None]:
# Fit a two-component PCA on the training split and project both splits onto it
pca = PCA(n_components=2)
pca.fit(X_train)
X_train_pca, X_test_pca = pca.transform(X_train), pca.transform(X_test)

# Fit a logistic regression classifier on the 2-D projection
model = LogisticRegression()
model.fit(X_train_pca, y_train)

# One panel per split: decision regions of the classifier, the samples
# coloured by their true class, and the accuracy on that split in the title
fig, ax = plt.subplots(1, 2, figsize=(8, 4))
panels = (
    (ax[0], "Training", X_train_pca, y_train),
    (ax[1], "Test", X_test_pca, y_test),
)
for panel, split_name, points, labels in panels:
    DecisionBoundaryDisplay.from_estimator(model, points, ax=panel, alpha=0.5)
    panel.scatter(points[:, 0], points[:, 1], c=labels)
    panel.set_xlabel("Principal Component 1")
    panel.set_ylabel("Principal Component 2")
    accuracy = accuracy_score(labels, model.predict(points))
    panel.set_title(f"{split_name} data, accuracy: {accuracy:.2f}")

fig.tight_layout()
plt.show()

In [None]:
# Box plot of the training features, one box per column of X_train.
# NOTE: when the cells are run top to bottom, X_train has already been
# z-scored if STANDARDIZE_FEATURES is True, so this shows the data as it was
# fed to PCA, not necessarily the raw measurements. The title states which
# variant is displayed.
fig, ax = plt.subplots()
ax.boxplot(X_train)
ax.set_xticklabels(data.feature_names, rotation=90)
ax.set_title(
    "Training features (standardized)"
    if STANDARDIZE_FEATURES
    else "Training features (raw)"
)
plt.show()

In [None]:
# Inspect the loadings of the first two principal components to see which
# original features contribute most to each of them
print("Loadings on the first two principal components:")

# Scale each component's direction vector by the square root of the variance
# it explains; the result has one row per feature and one column per component
component_scale = np.sqrt(pca.explained_variance_)
loadings = pca.components_.T * component_scale

num_features = len(data.feature_names)
feature_positions = range(num_features)

# One horizontal bar chart per component, sharing the feature axis
fig, ax = plt.subplots(1, 2, figsize=(8, 4), sharey=True)
for component_number, (panel, component_loadings) in enumerate(
    zip(ax, loadings.T), start=1
):
    panel.barh(feature_positions, component_loadings)
    panel.set_yticks(feature_positions)
    panel.set_yticklabels(data.feature_names)
    panel.set_title(f"Principal Component {component_number}")
    panel.set_xlabel("Loading")

fig.tight_layout()
plt.show()

In [None]:
# Pair plot of the two principal-component scores of the training split,
# coloured by the class label
X_train_pca_df = pd.DataFrame(X_train_pca, columns=["PC1", "PC2"]).assign(
    target=y_train
)
sns.pairplot(data=X_train_pca_df, hue="target", height=2, palette="viridis")

## Sequential Feature Selection and Logistic Regression

In [None]:
# Standardize into cell-local arrays instead of overwriting X_train / X_test.
# Overwriting the shared arrays made this cell silently change the input of
# every earlier cell on re-run (notably when STANDARDIZE_FEATURES is False).
# If X_train is already standardized, scaling it again is effectively a no-op,
# so the results of this cell are unchanged.
sfs_scaler = StandardScaler()
X_train_std = sfs_scaler.fit_transform(X_train)
X_test_std = sfs_scaler.transform(X_test)

# Instead of PCA, use backward sequential feature selection to keep the two
# original features that work best with logistic regression
sfs = SequentialFeatureSelector(
    LogisticRegression(), n_features_to_select=2, direction="backward"
)
sfs.fit(X_train_std, y_train)
X_train_sfs = sfs.transform(X_train_std)
X_test_sfs = sfs.transform(X_test_std)

# Print selected features (get_support returns column indices here, which are
# used to look up the corresponding feature names)
print("Selected features:")
features = np.array(data.feature_names)[sfs.get_support(indices=True)]
print(features)

# Train a logistic regression model on the two selected features only
model = LogisticRegression()
model.fit(X_train_sfs, y_train)

# One panel per split: decision regions of the classifier, the samples
# coloured by their true class, and the accuracy on that split in the title
fig, ax = plt.subplots(1, 2, figsize=(8, 4))
panels = (
    (ax[0], "Training", X_train_sfs, y_train),
    (ax[1], "Test", X_test_sfs, y_test),
)
for panel, split_name, points, labels in panels:
    DecisionBoundaryDisplay.from_estimator(model, points, ax=panel, alpha=0.5)
    panel.scatter(points[:, 0], points[:, 1], c=labels)
    panel.set_xlabel(f"{features[0]}")
    panel.set_ylabel(f"{features[1]}")
    accuracy = accuracy_score(labels, model.predict(points))
    panel.set_title(f"{split_name} data, accuracy: {accuracy:.2f}")

fig.tight_layout()
plt.show()