In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneOut

In [4]:
if __name__ == '__main__':
    pd.set_option('display.width', 300)
    pd.set_option('display.max_columns', 300)
    
    # 读取数据
    data = pd.read_csv('100组分类预测.csv', header=None)
    n_columns = len(data.columns)
    columns = ['A', 'B', 'C', 'D', 'E', 'F', 'M', 'N', 'Q', 'R', 'S', 'G', 'A4', 'A5', 'A6', 'A7', 'A8', 'H', 'Y', 'Z', 'A1', 'A2', 'A3', 'J', 'T', 'U', 'V', 'W', 'X', 'K', 'L']
    new_columns = dict(zip(np.arange(n_columns), columns))
    data.rename(columns=new_columns, inplace=True)
    
    print(data.head(10))
    
    # 存储所有节点的精度结果
    all_accuracies = {}
    
    # 初始化留一交叉验证器
    loo = LeaveOneOut()
    
    # 对每个特征列（除最后一列）进行预测和评估
    for target_col in columns[:-1]:
        print(f"\n=== 预测目标: {target_col} ===")
        
        # 创建特征矩阵（排除目标列）
        features = [col for col in columns if col != target_col]
        x = pd.DataFrame()
        
        # 对特征列进行独热编码
        for col in features:
            t = pd.get_dummies(data[col], prefix=col)
            x = pd.concat((x, t), axis=1)
        
        # 准备目标变量 - 将其转换为二分类问题
        target_values = pd.Categorical(data[target_col])
        categories = target_values.categories
        
        # 检查是否只有一个类别
        if len(categories) < 2:
            print(f"警告: 特征 {target_col} 只有一个类别 ({categories[0]})，跳过此列")
            continue
        
        # 如果类别数量大于2，将其分为两类
        if len(categories) > 2:
            mid_point = len(categories) // 2
            category_groups = {
                cat: 0 if i < mid_point else 1
                for i, cat in enumerate(categories)
            }
            y = np.array([category_groups[val] for val in target_values])
        else:
            # 如果本身就是二分类，直接使用编码
            y = target_values.codes
        
        # 再次检查处理后的目标变量是否只有一个类别
        unique_classes = np.unique(y)
        if len(unique_classes) < 2:
            print(f"警告: 特征 {target_col} 处理后只有一个类别 ({unique_classes[0]})，跳过此列")
            continue
        
        # 存储每次留一验证的预测结果
        y_true = []
        y_pred = []
        
        # 执行留一交叉验证
        for train_index, test_index in loo.split(x):
            x_train, x_test = x.iloc[train_index], x.iloc[test_index]
            y_train, y_test = y[train_index], y[test_index]
            
            # 特征标准化 - 逻辑回归对特征尺度敏感
            scaler = StandardScaler()
            x_train_scaled = scaler.fit_transform(x_train)
            x_test_scaled = scaler.transform(x_test)
            
            # 训练逻辑回归模型
            clf = LogisticRegression(max_iter=1000, solver='liblinear')
            clf.fit(x_train_scaled, y_train)
            
            # 记录预测结果
            y_true.append(y_test[0])
            y_pred.append(clf.predict(x_test_scaled)[0])
        
        # 计算留一交叉验证的准确率
        loo_accuracy = metrics.accuracy_score(y_true, y_pred)
        
        print(f'{target_col} 留一交叉验证精确度: {loo_accuracy:.4f}')
        
        # 存储结果
        all_accuracies[target_col] = {
            'loo_accuracy': loo_accuracy
        }
    
    # 计算平均精度
    avg_loo_accuracy = np.mean([acc['loo_accuracy'] for acc in all_accuracies.values()])
    
    print("\n=== 所有节点的平均精度 ===")
    print(f"平均留一交叉验证精确度: {avg_loo_accuracy:.4f}")
   """ 
    # 输出每个节点的精度
    print("\n=== 各节点精度详情 ===")
    for col, acc in all_accuracies.items():
        print(f"{col}: 留一交叉验证 {acc['loo_accuracy']:.4f}")
    """

    A   B  C  D  E  F  M  N  Q  R  S  G  A4  A5  A6  A7  A8  H  Y  Z  A1  A2  A3  J  T  U  V  W  X  K  L
0   A   B  C  D  E  F  M  N  Q  R  S  G  A4  A5  A6  A7  A8  H  Y  Z  A1  A2  A3  J  T  U  V  W  X  K  L
1  20   4  1  1  1  0  0  0  0  0  0  0   0   0   0   1   1  0  0  0   1   0   1  1  1  0  0  0  0  1  0
2  35  12  1  1  0  0  1  0  1  1  0  0   1   0   1   1   0  0  1  1   1   1   1  1  1  0  1  1  1  1  0
3  20   8  1  1  2  1  1  0  1  2  1  0   0   0   0   1   1  0  0  0   1   0   1  1  1  0  0  1  1  1  0
4  20   4  1  0  1  0  1  0  1  2  1  0   0   0   0   0   2  0  0  0   0   0   1  1  1  0  0  1  0  1  0
5  35   8  1  0  1  0  1  1  1  2  1  1   1   1   0   1   0  0  0  1   1   0   1  1  1  1  0  1  1  1  0
6  55   4  0  1  2  0  1  0  0  2  1  0   1   1   1   0   2  0  0  1   1   1   1  1  0  0  0  0  0  0  0
7  35   4  1  0  1  0  1  0  1  0  0  0   0   0   0   1   0  0  1  1   1   0   1  1  1  0  0  0  1  1  0
8  55   8  1  1  1  0  1  1  1  2  0  0   1   0   0   1