In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split, KFold, cross_val_score, LeaveOneOut, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

In [None]:
# Load the breast-cancer dataset and assemble one frame:
# feature columns named after data.feature_names, plus the class label as 'Target'.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(Target=data.target)

In [None]:
# Separate the label column from the feature matrix.
y = df['Target']
X = df.drop(columns='Target')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Learn the standardisation statistics from the training split only,
# then apply that same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# --- Dimensionality Reduction Techniques ---

# 1. Principal Component Analysis (PCA)
# The projection is fitted on the scaled training split only and then re-used,
# unchanged, to project the scaled test split.
pca = PCA(n_components=2) # Alternative: PCA(0.95) to pick the component count by retained variance
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# 2. t-SNE (for visualization)
# t-SNE provides no transform() for unseen rows: each fit_transform() call builds an
# independent embedding. Fitting the test split on its own would therefore put it in a
# coordinate system unrelated to the training embedding. Embed both splits in a single
# run and slice the result so train and test points share one 2-D space.
# The embedding is for plotting only; it must not be used as model input.
tsne = TSNE(n_components=2, random_state=42)
n_train_rows = X_train_scaled.shape[0]
tsne_embedding = tsne.fit_transform(np.vstack([X_train_scaled, X_test_scaled]))
X_train_tsne = tsne_embedding[:n_train_rows]
X_test_tsne = tsne_embedding[n_train_rows:]

# 3. Linear Discriminant Analysis (LDA)
# Supervised projection onto a single discriminant axis: fitted with the training
# labels, then applied to both scaled splits.
lda = LDA(n_components=1).fit(X_train_scaled, y_train)
X_train_lda = lda.transform(X_train_scaled)
X_test_lda = lda.transform(X_test_scaled)

In [None]:
# Small toy frame with two categorical columns and a binary label,
# written row by row as (Color, Size, Label).
cat_data = pd.DataFrame(
    [
        ('Red', 'S', 1),
        ('Blue', 'M', 0),
        ('Green', 'L', 1),
        ('Blue', 'M', 0),
        ('Red', 'S', 1),
    ],
    columns=['Color', 'Size', 'Label'],
)

In [None]:
# 1. Label Encoding
# Maps each distinct Color to an integer code. The codes follow the sorted class
# order, so they carry an arbitrary ordering a model may pick up on.
label_encoder = LabelEncoder()
cat_data['Color_Label'] = label_encoder.fit_transform(cat_data['Color'])

# 2. One-Hot Encoding
# One indicator column per distinct Size value, returned as a dense array.
onehot_encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = onehot_encoder.fit_transform(cat_data[['Size']])

# Give the indicator frame cat_data's own index so the join aligns row-for-row
# whatever that index is, and drop any Size_* columns left by a previous run of
# this cell so re-running it does not raise on overlapping column names.
size_indicators = pd.DataFrame(
    one_hot_encoded,
    columns=onehot_encoder.get_feature_names_out(['Size']),
    index=cat_data.index,
)
cat_data = cat_data.drop(columns=size_indicators.columns, errors='ignore').join(size_indicators)

In [None]:
# Display the frame now that the Color_Label and one-hot Size_* columns have been added
# (a bare last expression is rendered as the cell's output).
cat_data

Unnamed: 0,Color,Size,Label,Color_Label,Size_L,Size_M,Size_S
0,Red,S,1,2,0.0,0.0,1.0
1,Blue,M,0,0,0.0,1.0,0.0
2,Green,L,1,1,1.0,0.0,0.0
3,Blue,M,0,0,0.0,1.0,0.0
4,Red,S,1,2,0.0,0.0,1.0


In [None]:
# --- Cross-Validation Techniques ---
# All three schemes score the same estimator on the scaled training split; only the
# way rows are assigned to folds differs.
# NOTE(review): X_train_scaled was standardised with statistics from the whole training
# split, so each validation fold has already contributed to the scaling. For strictly
# leak-free estimates, cross-validate a scaler+model Pipeline on the unscaled X_train.

# Model for cross-validation
model = LogisticRegression(max_iter=200)

# 1. K-Fold Cross-Validation
# Five shuffled folds, class balance per fold not controlled.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
kf_scores = cross_val_score(model, X_train_scaled, y_train, cv=kf)

# 2. Stratified K-Fold Cross-Validation
# Five shuffled folds that each preserve the class proportions of y_train.
# (StratifiedKFold is imported with the other sklearn imports at the top of the notebook.)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
skf_scores = cross_val_score(model, X_train_scaled, y_train, cv=skf)

# 3. Leave-One-Out Cross-Validation (LOO)
# One fit per training row, each scored on the single held-out row, so every
# individual score is 0 or 1 and only the mean is informative. This is the slow step.
loo = LeaveOneOut()
loo_scores = cross_val_score(model, X_train_scaled, y_train, cv=loo)

In [None]:
# Summarise the fitted reducers and the cross-validation runs.
# LOO yields one 0/1 score per held-out row, so only its mean is reported.
report_rows = [
    ("PCA Explained Variance Ratio:", pca.explained_variance_ratio_),
    ("LDA Explained Variance Ratio:", lda.explained_variance_ratio_),
    ("K-Fold CV Scores:", kf_scores),
    ("Stratified K-Fold CV Scores:", skf_scores),
    ("Leave-One-Out CV Scores (Mean):", np.mean(loo_scores)),
]
for row_label, row_value in report_rows:
    print(row_label, row_value)

PCA Explained Variance Ratio: [0.43502782 0.19500007]
LDA Explained Variance Ratio: [1.]
K-Fold CV Scores: [0.95604396 0.98901099 0.97802198 0.97802198 0.98901099]
Stratified K-Fold CV Scores: [0.95604396 0.94505495 0.97802198 0.98901099 0.94505495]
Leave-One-Out CV Scores (Mean): 0.978021978021978
