# Page 1 CL Methods
本章主要介绍置信学习（Confident Learning）的计算过程

In [7]:
import numpy as np
import pandas as pd

# STEP 0 构建输入样本
- 总样本量为20（同一组10条样本重复两次；下方输出表仅列出前10条，后10条与之相同）
- 人工标签（label_origin）是0的样本有10条（第0–4条和第10–14条）
- 人工标签（label_origin）是1的样本有10条（第5–9条和第15–19条）

In [68]:
# One batch of ten samples, repeated once, yields the 20 rows used below.
# p_0 / p_1 hold each sample's predicted probability for class 0 / class 1.
p_0 = [0.9, 0.9, 0.5, 0.3, 0.3, 0.2, 0.2, 0.4, 0.5, 0.6] * 2
p_1 = [0.1, 0.1, 0.5, 0.7, 0.7, 0.9, 0.8, 0.7, 0.5, 0.4] * 2
# Human-assigned label of every sample: five 0s then five 1s, per batch.
label_origin = ([0] * 5 + [1] * 5) * 2

# Gather the three parallel lists into one DataFrame.
# NOTE: despite its name, the "label_origin==0" column stores the raw label
# value (0 or 1), not a boolean.
dict_data = dict(
    zip(
        ("j=0", "j=1", "label_origin==0"),
        (p_0, p_1, label_origin),
    )
)
df_data = pd.DataFrame(dict_data)
df_data

Unnamed: 0,j=0,j=1,label_origin==0
0,0.9,0.1,0
1,0.9,0.1,0
2,0.5,0.5,0
3,0.3,0.7,0
4,0.3,0.7,0
5,0.2,0.9,1
6,0.2,0.8,1
7,0.4,0.7,1
8,0.5,0.5,1
9,0.6,0.4,1


# STEP 1 计算标签阈值
人工标签x的阈值计算方法：在所有人工标签为x的样本上，对模型预测其属于类别x的概率取平均值

In [70]:
# The threshold of class x is the mean predicted probability of class x,
# taken over the samples whose human label is x.
given_labels = df_data["label_origin==0"]

# Class-0 probabilities of the samples labelled 0, and class-1 probabilities
# of the samples labelled 1.
list_0 = df_data.loc[given_labels == 0, "j=0"].tolist()
list_1 = df_data.loc[given_labels == 1, "j=1"].tolist()

average_p_0 = np.mean(list_0)
average_p_1 = np.mean(list_1)

print("The thresholds of [j==0] is {} and [j==1] is {}".format(average_p_0, average_p_1))

The thresholds of [j==0] is 0.5799999999999998 and [j==1] is 0.66


# STEP 2 计算联合统计分布矩阵
1. 初始化矩阵
2. 遍历所有样本
    - 如果标签a的概率大于标签b的概率，且标签a的概率大于标签a的概率阈值，则认为此样本真实标签为a（不满足条件的样本不计入矩阵）
    - 矩阵的行（pred_*）对应样本的人工标签，列（true_*）对应上一步估计出的真实标签
    - 如果该样本人工标签为a，则在矩阵C[pred_a, true_a]处加1
    - 如果该样本人工标签为b，则在矩阵C[pred_b, true_a]处加1

In [71]:
# Zero-initialised 2x2 integer count matrix with rows "pred_0"/"pred_1" and
# columns "true_0"/"true_1"; it is filled in by the counting loop that follows.
dict_co_count = {column_name: [0, 0] for column_name in ("true_0", "true_1")}

df_co_count = pd.DataFrame(dict_co_count, index=["pred_0", "pred_1"])
df_co_count

Unnamed: 0,true_0,true_1
pred_0,0,0
pred_1,0,0


In [72]:
# Fill the count matrix. A sample is counted only when one class is both the
# more probable one and above that class's threshold; that class selects the
# "true_*" column. The sample's human label selects the "pred_*" row.
for prob_0, prob_1, given_label in zip(p_0, p_1, label_origin):
    if prob_0 > prob_1 and prob_0 > average_p_0:
        confident_column = "true_0"
    elif prob_1 > prob_0 and prob_1 > average_p_1:
        confident_column = "true_1"
    else:
        # Tie, or below the threshold: the sample is left out of the matrix.
        continue

    if given_label == 0:
        given_row = "pred_0"
    elif given_label == 1:
        given_row = "pred_1"
    else:
        # Labels other than 0/1 are ignored.
        continue

    df_co_count.loc[given_row, confident_column] = df_co_count.at[given_row, confident_column] + 1
df_co_count

Unnamed: 0,true_0,true_1
pred_0,4,4
pred_1,2,6


# STEP 3 校准数据分布
由于在统计真实标签数量的时候，有“概率必须大于标签概率阈值”的限制条件，所以会有部分数据未被纳入统计，导致数据分布发生改变；
所以需要对联合统计分布矩阵中的数据进行重新校准：将矩阵的每一行按比例缩放，使该行之和等于对应人工标签的样本数，从而与原始分布相同

In [78]:
# Calibrate the count matrix: rescale every row so that it sums to the number
# of samples carrying that row's human label (row "pred_k" collects the
# samples whose human label is k).
#
# The per-label sample counts are derived from the data instead of being
# hard-coded, and each row uses the count of its own label.
num_label_origin_0 = sum(1 for label in label_origin if label == 0)
num_label_origin_1 = sum(1 for label in label_origin if label == 1)
row_targets = {"pred_0": num_label_origin_0, "pred_1": num_label_origin_1}

# Calibrated counts are fractional, so convert up front rather than writing
# floats into integer columns.
df_co_count = df_co_count.astype(float)

for index in df_co_count.index:
    # Read both cells before writing so the row total uses the raw counts.
    count_true_0 = df_co_count.at[index, "true_0"]
    count_true_1 = df_co_count.at[index, "true_1"]
    y_i_j = count_true_0 + count_true_1
    if y_i_j == 0:
        # Nothing was counted in this row; leave it at zero rather than
        # dividing by zero.
        continue
    df_co_count.loc[index, "true_0"] = row_targets[index] * count_true_0 / y_i_j
    df_co_count.loc[index, "true_1"] = row_targets[index] * count_true_1 / y_i_j

df_co_count

Unnamed: 0,true_0,true_1
pred_0,5.0,5.0
pred_1,2.5,7.5


# STEP 4 计算置信度联合概率分布矩阵
置信度联合统计矩阵中每个元素除以总元素和

In [79]:
# Zero-initialised 2x2 float matrix with the same row/column labels as the
# count matrix; it receives the normalised joint probabilities.
dict_co_prob = {column_name: [0., 0.] for column_name in ("true_0", "true_1")}
df_co_prob = pd.DataFrame(dict_co_prob, index=["pred_0", "pred_1"])
df_co_prob

Unnamed: 0,true_0,true_1
pred_0,0.0,0.0
pred_1,0.0,0.0


In [80]:
# Normalise the calibrated counts into a joint probability matrix by dividing
# every cell by the sum of all cells.
row_labels = ["pred_0", "pred_1"]
col_labels = ["true_0", "true_1"]

# Sum of the four cells, accumulated row by row.
total = sum(
    df_co_count.at[row_label, col_label]
    for row_label in row_labels
    for col_label in col_labels
)

print("分母是【{}】".format(total))

for row_label in row_labels:
    for col_label in col_labels:
        df_co_prob.loc[row_label, col_label] = df_co_count.at[row_label, col_label] / total
df_co_prob

分母是【20.0】


Unnamed: 0,true_0,true_1
pred_0,0.25,0.25
pred_1,0.125,0.375


# STEP 5 找出标签错误样本
- Method_1: 预测标签与人工标签不一致的样本
- Method_2: 不在联合统计矩阵对角线上的数据
- Method_3: Prune by Class (PBC)
- Method_4: Prune by Noise Rate (PBNR)
- Method_5: C+NR

In [89]:
import cleanlab

# Inputs for cleanlab:
#   s   - the given (possibly noisy) label of every sample
#   psx - n x m array of predicted probabilities, one row per sample
#         (the original note says these are obtained via cross-validation)
# NOTE(review): `cleanlab.pruning.get_noise_indices` appears to be the
# cleanlab 1.x API - confirm the installed version before re-running.
s = np.array(label_origin)
psx = np.array(list(zip(p_0, p_1)))

# Method 3: Prune by Class (PBC)
cl_pbc = cleanlab.pruning.get_noise_indices(
    s, psx, prune_method='prune_by_class', sorted_index_method='prob_given_label'
)
print("The Index of Error Samples are: {}".format(",".join(map(str, cl_pbc))))

# Method 4: Prune by Noise Rate (PBNR)
cl_pbnr = cleanlab.pruning.get_noise_indices(
    s, psx, prune_method='prune_by_noise_rate', sorted_index_method='prob_given_label'
)
print("The Index of Error Samples are: {}".format(",".join(map(str, cl_pbnr))))

# Method 5: C+NR (both pruning methods combined)
cl_both = cleanlab.pruning.get_noise_indices(
    s, psx, prune_method='both', sorted_index_method='prob_given_label'
)
print("The Index of Error Samples are: {}".format(",".join(map(str, cl_both))))

The Index of Error Samples are: 3,4,13,14,9,19
The Index of Error Samples are: 3,4,13,14,9,19
The Index of Error Samples are: 3,4,13,14,9,19
