# 基于分歧的半监督学习
1.使用有标签数据训练多个模型(模型可以是不同的)
2.使用每个模型去预测无标签的训练数据
3.筛选出模型之间预测产生分歧的无标签样本
4.采用某种策略为这些分歧样本分配标签，将这些伪标签加入训练集中，重新训练模型

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [5]:
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd

# 生成带有标签的数据集
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_classes=2, random_state=42)

# 将有标签的数据转换为DataFrame格式，方便操作
df = pd.DataFrame(X)
df['label'] = y

# 随机选择一部分数据去掉标签，模拟无标签数据
# 设定 80% 的数据有标签，20% 的数据无标签
mask = np.random.rand(len(df)) < 0.8
df_labeled = df[mask].copy()  # 有标签的数据
df_unlabeled = df[~mask].copy()  # 无标签的数据

# 对无标签数据集，将标签列设置为NaN
df_unlabeled['label'] = np.nan

# 输出结果
print("有标签的数据:")
print(df_labeled.head())

print("\n无标签的数据:")
print(df_unlabeled.head())


有标签的数据:
          0         1         2         3         4         5         6  \
0  1.242872 -2.846242  1.631471  0.616130  1.024352  3.776219  2.207521   
2  0.995002  4.472860  0.396552 -0.774943 -0.539313  1.983609  1.571684   
3 -6.064907 -0.861145 -0.663774  0.639216  1.399097  0.464887 -7.810023   
4 -3.663457 -2.540896 -0.362081 -1.018162  1.939464 -1.736997 -2.255150   
5  0.263550 -2.176242  1.606168 -1.508781 -0.672219  0.808489  6.028618   

          7         8         9  ...        11        12        13        14  \
0 -4.202171  0.464731 -3.293098  ... -2.365571  1.826469  0.942230 -0.011058   
2 -1.178277  1.175303  1.666403  ...  2.167063 -1.271422 -0.150584 -2.553335   
3 -0.833776  1.906510  1.299077  ...  0.287488  0.340744  0.145996  0.116981   
4  6.621085 -3.086259 -5.767685  ...  3.516086  2.774681  0.623417  0.288686   
5 -2.314422 -2.085546  4.464265  ... -3.827730 -5.926362 -1.178996 -1.122092   

         15        16        17         18        19  label 

In [7]:
X_labeled = df_labeled.iloc[:,:-1]
X_unlabeled = df_unlabeled.iloc[:,:-1]
y_labeled = df_labeled.iloc[:,-1]

In [8]:
# 假设有一部分有标签数据和无标签数据
# 已知标签的数据和无标签数据
X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.2)

# 定义多个模型
model1 = SVC(probability=True)
model2 = RandomForestClassifier()

# 用有标签数据训练初始模型
model1.fit(X_train, y_train)
model2.fit(X_train, y_train)

# 使用无标签数据生成伪标签
for _ in range(10):  # 设置迭代次数
    # 对无标签数据进行预测
    preds_model1 = model1.predict(X_unlabeled)
    preds_model2 = model2.predict(X_unlabeled)
    
    # 选择两者分歧大的样本进行标注
    high_confidence_idx = (preds_model1 == preds_model2)
    
    # 将这些伪标签加入训练集中，并继续训练
    model1.fit(X_unlabeled[high_confidence_idx], preds_model1[high_confidence_idx])
    model2.fit(X_unlabeled[high_confidence_idx], preds_model2[high_confidence_idx])

# 最终使用测试集评估性能
final_preds = model1.predict(X_test)
print("Accuracy:", accuracy_score(y_test, final_preds))


Accuracy: 0.79375
