## 0. Import Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from scipy.spatial import distance
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

## 1. Load dataset and describe data distribution (Wine Data)

### 1.1 Define analysis functions

In [2]:
# 1. Class distribution
def class_distribution(df):
    """
    Print the number of samples per class and show it as a bar chart.

    Parameters:
    - df: DataFrame with a 'label' column; its value counts are reported.
    """
    print("\n### Class Distribution ###")
    per_class = df['label'].value_counts()
    print(per_class)

    # Draw the counts on a fresh 6x4 figure
    plt.figure(figsize=(6, 4))
    bar_axes = per_class.plot.bar(title='Class Distribution')
    bar_axes.set_xlabel('Class')
    bar_axes.set_ylabel('Number of Samples')
    plt.show()

# 2. Descriptive statistics and feature distributions
def descriptive_statistics_and_distribution(df, features):
    """
    Print df.describe() and draw one 20-bin histogram per feature.

    Parameters:
    - df: DataFrame holding the data.
    - features: sized collection of column names of df to plot.

    The histograms are laid out on a grid that is three columns wide; grid
    cells left over after the last feature are removed from the figure.
    """
    print("\n### Descriptive Statistics ###")
    print(df.describe())

    print("\n### Feature Distributions ###")
    feature_count = len(features)
    # Ceiling division: enough rows of three panels to hold every feature
    grid_rows, leftover = divmod(feature_count, 3)
    if leftover:
        grid_rows += 1

    fig, grid = plt.subplots(grid_rows, 3, figsize=(15, 5 * grid_rows))
    panels = grid.flatten()  # 1-D view order so panels can be walked linearly

    # One histogram per feature, in the order given
    for panel, feature in zip(panels, features):
        df[feature].hist(bins=20, ax=panel)
        panel.set_title(f'Distribution of {feature}')
        panel.set_xlabel(feature)
        panel.set_ylabel('Frequency')

    # Drop the unused trailing panels
    for spare in panels[feature_count:]:
        fig.delaxes(spare)

    # Tidy the layout so panels do not overlap
    plt.tight_layout()
    plt.show()

# 3. Box plots
def boxplot_of_features(df, features):
    """
    Draw one box plot per feature, grouped by the 'label' column.

    Parameters:
    - df: DataFrame containing the feature columns and a 'label' column.
    - features: sized collection of column names of df to plot.

    Panels are arranged three per row; unused panels are removed.
    """
    print("\n### Boxplots of Features by Class ###")

    n_features = len(features)
    n_rows = (n_features + 2) // 3  # round up to whole rows of three

    fig, grid = plt.subplots(n_rows, 3, figsize=(15, 5 * n_rows))
    panels = grid.flatten()

    # One grouped box plot per feature, in the order given
    for panel, feature in zip(panels, features):
        df.boxplot(column=feature, by='label', ax=panel)
        panel.set_title(f'{feature} Distribution by Class')
        panel.set_xlabel('Class')
        panel.set_ylabel(feature)

    # Drop the unused trailing panels
    for spare in panels[n_features:]:
        fig.delaxes(spare)

    plt.tight_layout()
    # Blank the figure-level title (presumably added by the grouped boxplot calls)
    plt.suptitle('')
    plt.show()

# 4. Correlation analysis
def correlation_analysis(df):
    """
    Print the correlation matrix of df and show it as an annotated heatmap.

    Parameters:
    - df: DataFrame whose numeric (and boolean) columns are correlated.

    Non-numeric columns are left out before calling corr(), so a
    string-typed column (for example a textual 'label') does not make the
    computation fail.
    """
    print("\n### Correlation Matrix ###")

    # Correlate numeric/boolean columns only; other dtypes cannot be correlated
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    correlation_matrix = numeric_df.corr()
    print(correlation_matrix)

    # Heatmap of the matrix with each coefficient printed to two decimals
    plt.figure(figsize=(12, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    plt.title('Feature Correlation Matrix')
    plt.show()

In [6]:
def describe_data(data_path):
    """
    Load a CSV dataset and run the four exploratory reports on it, in order:
    1. Class distribution.
    2. Descriptive statistics and per-feature histograms.
    3. Per-feature box plots grouped by class.
    4. Correlation matrix and heatmap.

    Parameters:
    - data_path: str, path to the CSV file of the dataset.

    Every column except 'label' is treated as a feature.
    """
    # Load the data
    df = pd.read_csv(data_path)

    # Feature list = all columns other than the label column
    features = list(filter(lambda col: col != 'label', df.columns))

    # (banner, report) pairs, executed top to bottom
    steps = (
        ("\nStep 1: Class Distribution",
         lambda: class_distribution(df)),
        ("\nStep 2: Descriptive Statistics and Feature Distribution",
         lambda: descriptive_statistics_and_distribution(df, features)),
        ("\nStep 3: Boxplots of Features by Class",
         lambda: boxplot_of_features(df, features)),
        ("\nStep 4: Correlation Analysis",
         lambda: correlation_analysis(df)),
    )
    for banner, run_report in steps:
        print(banner)
        run_report()

### 1.2 Load and analyze data

In [None]:
# Test run: generate the full exploratory report for the wine dataset
data_path = 'data/wine_data.csv'
describe_data(data_path)

## 2. Data preprocessing

### 2.1 Wine Data

In [13]:
# Load the data
df_wine = pd.read_csv('data/wine_data.csv')

# df_wine is assumed to be a DataFrame of features plus a 'label' column
X = df_wine.drop('label', axis=1)  # feature data
y = df_wine['label']  # labels

# NOTE(review): oversampling and scaling below are fitted on the whole
# dataset. If X_wine / y_wine are later split into train and test sets
# (compare_classifiers does such a split), the test rows will already have
# influenced the synthetic samples and the scaler statistics — confirm this
# is acceptable for the experiment.

# 1. Oversample with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_wine = smote.fit_resample(X, y)

# 2. Standardize the resampled features
scaler = StandardScaler()
X_wine = scaler.fit_transform(X_resampled)

### 2.2 MNIST Data

In [None]:
# Load the MNIST dataset
mnist = fetch_openml('mnist_784', version=1)

# Pull out the data and the labels
X_mnist, y_mnist = mnist['data'], mnist['target']

# Normalize the data by dividing by 255
X_mnist = X_mnist.astype('float32') / 255.0

# Show the dataset shapes
print(f"数据集形状: {X_mnist.shape}, 标签形状: {y_mnist.shape}")

# Convert the labels to integers (they are strings by default).
# This must target y_mnist: the earlier `y = y.astype(int)` touched the wine
# labels `y` instead and left the MNIST labels as strings.
y_mnist = y_mnist.astype(int)

# The dataset is already flattened: each image has 784 features



## 3. PCA and LDA approaches

### 3.1 PCA

In [None]:
# Parameterized helper for visualizing PCA results
def visualize_3d_pca(X_pca, y_resampled, title="PCA 3D 可视化"):
    """
    Show a 3-D scatter plot of the first three columns of X_pca.

    Parameters:
    - X_pca: 2-D array indexed as X_pca[:, k]; columns 0, 1 and 2 are plotted.
    - y_resampled: per-point values used to color the markers.
    - title: str, title placed on the axes.
    """
    figure = plt.figure(figsize=(10, 8))
    axes3d = figure.add_subplot(111, projection='3d')

    # Coordinates along the first three components
    first = X_pca[:, 0]
    second = X_pca[:, 1]
    third = X_pca[:, 2]

    # Scatter the points, colored by label
    points = axes3d.scatter(first, second, third, c=y_resampled, cmap='viridis', edgecolor='k')

    # Axis labels and title
    axes3d.set_title(title)
    axes3d.set_xlabel('第一个主成分')
    axes3d.set_ylabel('第二个主成分')
    axes3d.set_zlabel('第三个主成分')

    # Color bar keyed to the class labels
    plt.colorbar(points, label='类别标签')

    # Display the figure
    plt.show()

#### wine data

In [15]:
# 3. Reduce dimensionality with PCA
# Three reducers: fixed 3 components, fixed 4 components, and a 0.95
# setting (the plot titles in this notebook describe it as retaining 0.95
# of the variance).
pca3 = PCA(n_components=3)
pca4 = PCA(n_components=4)
pca95 = PCA(n_components=0.95)

# Fit each reducer on the standardized wine features and project the data
X_wine_pca3 = pca3.fit_transform(X_wine)
X_wine_pca4 = pca4.fit_transform(X_wine)
X_wine_pca95 = pca95.fit_transform(X_wine)

In [None]:
# Call the visualization helper to show a 3-D scatter plot for each PCA result.
# Only the first three components are drawn, whatever the projection's width.
visualize_3d_pca(X_wine_pca3, y_wine, title="PCA 3D 可视化 - 3 主成分")
visualize_3d_pca(X_wine_pca4, y_wine, title="PCA 3D 可视化 - 4 主成分 (选择前三个维度)")
visualize_3d_pca(X_wine_pca95[:, :3], y_wine, title="PCA 3D 可视化 - 0.95 方差保留 (选择前三个维度)")

#### MNIST

In [None]:
# NOTE: this re-fits the same pca95 object that was fitted on the wine data,
# so from here on pca95 holds the MNIST fit.
X_mnist_pca95 = pca95.fit_transform(X_mnist)

# Show the 3-D PCA result for MNIST.
# The labels may still be strings at this point (the loading cell notes that
# they are strings by default); cast them to integers so they can be mapped
# through the colormap. The cast is a no-op if they are already integers.
visualize_3d_pca(X_mnist_pca95[:, :3], np.asarray(y_mnist).astype(int), title="MNIST 数据集 PCA 3D 可视化 - 0.95 方差保留 (选择前三个维度)")

### 3.2 LDA

#### Wine Data

#### MNIST

## 4. Define Classifiers

In [None]:
# Mahalanobis Classifier

class MahalanobisClassifier:
    """
    Classifier that assigns each sample to the class whose mean is nearest
    in Mahalanobis distance, using a separate covariance matrix per class.

    Attributes:
    - class_means: dict mapping class label -> mean feature vector.
    - class_cov_inv: dict mapping class label -> pseudo-inverse of that
      class's covariance matrix.
    """

    def __init__(self):
        self.class_means = {}
        self.class_cov_inv = {}

    def fit(self, X, y):
        """
        Estimate the mean and inverse covariance of every class.

        Parameters:
        - X: array-like of shape (n_samples, n_features).
        - y: array-like of shape (n_samples,), class label of each row.

        Returns:
        - self, so calls can be chained.
        """
        # Plain arrays so boolean masking and row iteration behave the same
        # for ndarray, list and DataFrame/Series inputs.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        # Start from a clean slate: a refit must not keep classes that were
        # only present in an earlier call.
        self.class_means = {}
        self.class_cov_inv = {}

        for cls in np.unique(y):
            X_class = X[y == cls]
            self.class_means[cls] = X_class.mean(axis=0)
            # atleast_2d: np.cov collapses to a 0-d value for one feature.
            cov_matrix = np.atleast_2d(np.cov(X_class, rowvar=False))
            # Pseudo-inverse instead of inv: a singular covariance (collinear
            # or constant features) no longer raises LinAlgError.
            self.class_cov_inv[cls] = np.linalg.pinv(cov_matrix)
        return self

    def predict(self, X):
        """
        Predict the class of each row of X.

        Parameters:
        - X: array-like of shape (n_samples, n_features).

        Returns:
        - numpy array of shape (n_samples,) with the predicted labels. Ties
          go to the class that was stored first during fit.
        """
        X = np.asarray(X, dtype=float)
        classes = list(self.class_means)

        # Squared distances are enough for ranking (sqrt is monotonic), so
        # the root is skipped; one column per class.
        sq_dists = np.empty((X.shape[0], len(classes)))
        for col, cls in enumerate(classes):
            diff = X - self.class_means[cls]
            sq_dists[:, col] = np.einsum(
                'ij,jk,ik->i', diff, self.class_cov_inv[cls], diff
            )
        return np.asarray(classes)[np.argmin(sq_dists, axis=1)]


In [None]:
# Linear Classifier

class LinearClassifier:
    """
    Thin wrapper around LogisticRegression exposing fit/predict.

    Attributes:
    - model: the underlying LogisticRegression instance.
    """

    def __init__(self, **kwargs):
        """
        Parameters:
        - **kwargs: forwarded unchanged to LogisticRegression, so settings
          such as max_iter can be chosen by the caller. With no arguments
          the estimator is built with its defaults, as before.
        """
        self.model = LogisticRegression(**kwargs)

    def fit(self, X, y):
        """Fit the wrapped model on (X, y) and return self for chaining."""
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for X."""
        return self.model.predict(X)

In [None]:
# Nearest Neighbor Classifier

class NearestNeighborClassifier:
    """
    Thin wrapper around KNeighborsClassifier exposing fit/predict.

    Attributes:
    - model: the underlying KNeighborsClassifier instance.
    """

    def __init__(self, n_neighbors=3, **kwargs):
        """
        Parameters:
        - n_neighbors: int, number of neighbors passed to the estimator
          (default 3).
        - **kwargs: any further keyword arguments, forwarded unchanged to
          KNeighborsClassifier.
        """
        self.model = KNeighborsClassifier(n_neighbors=n_neighbors, **kwargs)

    def fit(self, X, y):
        """Fit the wrapped model on (X, y) and return self for chaining."""
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for X."""
        return self.model.predict(X)


In [None]:
def compare_classifiers(X, y, classifiers: dict, test_size=0.2):
    '''
    Train every classifier on one shared train/test split and report accuracy.

    Parameters:
    - X: feature data.
    - y: labels.
    - classifiers: dict mapping a display name to an object with
      fit(X, y) and predict(X).
    - test_size: share of the data held out for testing (default 0.2).

    Returns:
    - dict mapping each classifier name to its test-set accuracy.

    Usage example:
    classifiers = {
        "Mahalanobis": MahalanobisClassifier(),
        "Linear": LinearClassifier(),
        "Nearest Neighbor": NearestNeighborClassifier()
    }
    compare_classifiers(X, y, classifiers)
    '''
    # Fixed random_state: every call uses the same split of a given dataset
    split = train_test_split(X, y, test_size=test_size, random_state=42)
    X_train, X_test, y_train, y_test = split

    results = {}
    for name in classifiers:
        model = classifiers[name]
        model.fit(X_train, y_train)
        acc = accuracy_score(y_test, model.predict(X_test))
        print(f"{name} 分类器的准确性: {acc:.4f}")
        results[name] = acc

    return results