In [3]:
import numpy as np
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [21]:
data = np.loadtxt(open("w2v_cleaned_mean_shuffle_5000.csv", "rb"), delimiter=",", skiprows=0)
X = data[:, : -1]  # 输入
y = data[:, -1]  # 标签
print("len(data)",len(data))
rng = np.random.RandomState(3)  # 随机数种子
indices = np.arange(len(data))
rng.shuffle(indices)  # 将索引打乱
train_indices = indices[400:]
print("train_indices", len(train_indices), train_indices)
test_indices = indices[:400]
print("test_indices", len(test_indices), test_indices)


lp_model = LabelSpreading(gamma=0.32, max_iter=30)# 下一步循环得到最好结果的gamma
lp_model.fit(X[train_indices], y[train_indices])
pred_entropies = stats.distributions.entropy(lp_model.label_distributions_.T)
print(pred_entropies,len(pred_entropies))
# 选择分类器最不确定的最多5位数字示例
uncertainty_index = np.argsort(pred_entropies)[::-1]
print(uncertainty_index,len(uncertainty_index))
uncertainty_index = uncertainty_index[
np.in1d(uncertainty_index, train_indices)][:1600]
print(uncertainty_index,len(uncertainty_index))

as_labeled_indices = uncertainty_index # 挑选出的1600个样本作为传播的起点
new_index = np.concatenate((as_labeled_indices, test_indices),axis=0)# 总共的2000样本索引
new_x = X[new_index]
new_y = y[new_index]
new_y[1600:] = -1
print("len(new_x)",len(new_x))
print("len(new_y)",len(new_y))

lp_model.fit(new_x, new_y)
predicted_labels = lp_model.transduction_[1600:]
print("predicted_labels", predicted_labels)
true_labels = y[test_indices]
print("true_labels",true_labels)

cm = confusion_matrix(true_labels, predicted_labels,
                          labels=lp_model.classes_)

print(classification_report(true_labels, predicted_labels, digits=4))
print("Confusion matrix")
print(cm)

len(data) 5596
train_indices 5196 [2180 5577  111 ... 1667 3321 1688]
test_indices 400 [3064 4688 4763 4648  893 4782 2082 4329 3033 4183 2165 4615 3579 3465
 4979 1793 4817 3528 4975  349 1989 4885 3406   21 2326  836 3706 3952
 2375  933 1182 2494 1352 3887 3491 4788  808 3455 2674 4787 1746 4297
 2315 5407 2019 4934 4614  779  279 4233  617 3914 4751 4089   17 5406
 3815 5191  884   94  350 4412 1053 2369  454 4080 5287 3229 4864 1964
 2818 3296 3172 3145 4178 3702 3937 1361 2897 3805 4133 3723 4018 2719
 1176 5411 1370 1094 1743 4261 3561 4264 1458 3677 4993 4126 4118  602
 4084 5144  569 1891  640 5188 2055 4064  666  531  862 4857  451 1166
   33 3171 3865 1203 4870 5516 1577  261  281 2475 4044 3592 3066 3021
 5319 5017 2872 1801 5296 2495 1757 4766 1617  158 2696  955 1822 4899
 3549 1300 3313 5110 1260 3113 2260 3503 1879  205  762 5002 4121 3829
 1624   23 2559   24    9   87 4732 2465 2378 4212 4225 3248 5129 4525
 2022 3095 2882 3716 5488 2802 1655 1117 4170 1876  626  899 

In [10]:
# 加了个循环
data = np.loadtxt(open("w2v_cleaned_mean_shuffle_5000.csv", "rb"), delimiter=",", skiprows=0)
X = data[:, : -1]  # 输入
y = data[:, -1]  # 标签
rng = np.random.RandomState(2)  # 随机数种子
indices = np.arange(len(data))
rng.shuffle(indices)  # 将索引打乱
train_indices = indices[400:]
test_indices = indices[:400]

num=100
gammas = np.logspace(-2, 3, num=num)
for i in range(num):
    lp_model = LabelSpreading(gamma=gammas[i], max_iter=30)
    lp_model.fit(X[train_indices], y[train_indices])
    pred_entropies = stats.distributions.entropy(lp_model.label_distributions_.T)
    # print(pred_entropies,len(pred_entropies))
    # 选择分类器最不确定的最多5位数字示例
    uncertainty_index = np.argsort(pred_entropies)[::-1]
    # print(uncertainty_index,len(uncertainty_index))
    uncertainty_index = uncertainty_index[
    np.in1d(uncertainty_index, train_indices)][:1600]


    as_labeled_indices = uncertainty_index # 挑选出的1600个样本作为传播的起点
    new_index = np.concatenate((as_labeled_indices, test_indices),axis=0)# 总共的2000样本索引
    new_x = X[new_index]
    new_y = y[new_index]
    new_y[1600:] = -1

    lp_model.fit(new_x, new_y)
    predicted_labels = lp_model.transduction_[1600:]
    true_labels = y[test_indices]

    cm = confusion_matrix(true_labels, predicted_labels,
                              labels=lp_model.classes_)

    print(i,gammas[i],accuracy_score(true_labels, predicted_labels))
    # print(classification_report(true_labels, predicted_labels, digits=4))
    # # print("Confusion matrix")
    # print(cm)


0 0.01 0.63
1 0.011233240329780276 0.6275
2 0.012618568830660204 0.63
3 0.014174741629268055 0.63
4 0.015922827933410922 0.625
5 0.01788649529057435 0.62
6 0.02009233002565047 0.62
7 0.022570197196339202 0.605
8 0.025353644939701114 0.61
9 0.02848035868435802 0.6125
10 0.03199267137797385 0.6225
11 0.03593813663804628 0.6425
12 0.040370172585965536 0.65
13 0.04534878508128582 0.6625
14 0.0509413801481638 0.685
15 0.05722367659350217 0.6975
16 0.06428073117284319 0.7075
17 0.07220809018385464 0.715
18 0.08111308307896872 0.735
19 0.09111627561154892 0.74
20 0.10235310218990264 0.745
21 0.11497569953977356 0.76
22 0.1291549665014884 0.765
23 0.14508287784959395 0.7725
24 0.16297508346206444 0.7725
25 0.18307382802953678 0.7775
26 0.20565123083486514 0.8
27 0.23101297000831592 0.795
28 0.25950242113997357 0.8075
29 0.2915053062825176 0.8025
30 0.32745491628777285 0.8075
31 0.36783797718286326 0.7975
32 0.41320124001153363 0.8025
33 0.464158883361278 0.79
34 0.5214008287999684 0.7875
35 0.

In [73]:
# 修改了model.fit里的条件
data = np.loadtxt(open("w2v_cleaned_mean_shuffle_5000.csv", "rb"), delimiter=",", skiprows=0)
X = data[:, : -1]  # 输入
y = data[:, -1]  # 标签
print("len(data)",len(data))
rng = np.random.RandomState(8)  # 随机数种子
indices = np.arange(len(data))
rng.shuffle(indices)  # 将索引打乱
train_indices = indices[400:]
print("train_indices", len(train_indices), train_indices)
test_indices = indices[:400]
# print("test_indices", len(test_indices), test_indices)

initial_labeled_points = 300
unlabeled_indices = train_indices[initial_labeled_points:]
X_train = X[train_indices]
y_train = y[train_indices]
y_train[initial_labeled_points:] = -1


lp_model = LabelSpreading(gamma=0.79, max_iter=30)# 下一步循环得到最好结果的gamma
lp_model.fit(X_train, y_train)
pred_entropies = stats.distributions.entropy(lp_model.label_distributions_.T)
print(pred_entropies,len(pred_entropies))
# 选择分类器最不确定的最多5位数字示例
uncertainty_index = np.argsort(pred_entropies)[::-1]
print(uncertainty_index,len(uncertainty_index))
uncertainty_index = uncertainty_index[
np.in1d(uncertainty_index, train_indices[300:])][:1300]
print(uncertainty_index,len(uncertainty_index))

as_labeled_indices = np.concatenate((train_indices[:300], uncertainty_index),axis=0) # 挑选出的1600个样本作为传播的起点
new_index = np.concatenate((as_labeled_indices, test_indices),axis=0)# 总共的2000样本索引
new_x = X[new_index]
new_y = y[new_index]
new_y[1600:] = -1
print("len(new_x)",len(new_x))
print("len(new_y)",len(new_y))

lp_model.fit(new_x, new_y)
predicted_labels = lp_model.transduction_[1600:]
# print("predicted_labels", predicted_labels)
true_labels = y[test_indices]
# print("true_labels",true_labels)

cm = confusion_matrix(true_labels, predicted_labels,
                          labels=lp_model.classes_)

print(classification_report(true_labels, predicted_labels, digits=4))
print("Confusion matrix")
print(cm)

len(data) 5596
train_indices 5196 [2307 2178 1502 ... 2033 1364 4547]
[3.67249800e-02 1.77262381e-10 1.05291602e-03 ... 5.59724013e-08
 7.13049128e-02 5.59724013e-08] 5196
[2854 2754 2306 ...  111   52  100] 5196
[2854 2754 2306 ... 4705  882 1391] 1300
len(new_x) 2000
len(new_y) 2000
              precision    recall  f1-score   support

         0.0     0.8534    0.8250    0.8390       120
         1.0     0.8654    0.8654    0.8654       104
         2.0     0.7615    0.7615    0.7615       109
         3.0     0.7042    0.7463    0.7246        67

    accuracy                         0.8050       400
   macro avg     0.7961    0.7995    0.7976       400
weighted avg     0.8065    0.8050    0.8056       400

Confusion matrix
[[99 11  8  2]
 [ 8 90  3  3]
 [ 8  2 83 16]
 [ 1  1 15 50]]


In [82]:
# 加了个循环
data = np.loadtxt(open("w2v_cleaned_mean_shuffle_5000.csv", "rb"), delimiter=",", skiprows=0)
X = data[:, : -1]  # 输入
y = data[:, -1]  # 标签
rng = np.random.RandomState(5)  # 随机数种子
indices = np.arange(len(data))
rng.shuffle(indices)  # 将索引打乱
train_indices = indices[400:]
test_indices = indices[:400]

initial_labeled_points = 300
unlabeled_indices = train_indices[initial_labeled_points:]
X_train = X[train_indices]
y_train = y[train_indices]
y_train[initial_labeled_points:] = -1

num=100
gammas = np.logspace(-2, 2, num=num)
np.seterr(divide='ignore',invalid='ignore')
for i in range(num):
    lp_model = LabelSpreading(gamma=gammas[i], max_iter=30)
    lp_model.fit(X_train, y_train)
    pred_entropies = stats.distributions.entropy(lp_model.label_distributions_.T)
    # print(pred_entropies,len(pred_entropies))
    # 选择分类器最不确定的最多5位数字示例
    uncertainty_index = np.argsort(pred_entropies)[::-1]
    # print(uncertainty_index,len(uncertainty_index))
    uncertainty_index = uncertainty_index[
    np.in1d(uncertainty_index, train_indices)][:1300]


    as_labeled_indices = np.concatenate((train_indices[:300], uncertainty_index),axis=0) # 挑选出的1600个样本作为传播的起点
    new_index = np.concatenate((as_labeled_indices, test_indices),axis=0)# 总共的2000样本索引
    new_x = X[new_index]
    new_y = y[new_index]
    new_y[1600:] = -1

    lp_model.fit(new_x, new_y)
    predicted_labels = lp_model.transduction_[1600:]
    true_labels = y[test_indices]

    cm = confusion_matrix(true_labels, predicted_labels,
                              labels=lp_model.classes_)
    SC = accuracy_score(true_labels, predicted_labels)
    print(i,gammas[i],SC)
    print(classification_report(true_labels, predicted_labels, digits=4))
    # print("Confusion matrix")
    print(cm)

0 0.01 0.39
              precision    recall  f1-score   support

         0.0     0.3510    0.9921    0.5185       127
         1.0     0.7317    0.2941    0.4196       102
         2.0     0.0000    0.0000    0.0000       113
         3.0     0.0000    0.0000    0.0000        58

    accuracy                         0.3900       400
   macro avg     0.2707    0.3216    0.2345       400
weighted avg     0.2980    0.3900    0.2716       400

[[126   1   0   0]
 [ 72  30   0   0]
 [108   5   0   0]
 [ 53   5   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


1 0.010974987654930561 0.3875
              precision    recall  f1-score   support

         0.0     0.3483    0.9764    0.5135       127
         1.0     0.7045    0.3039    0.4247       102
         2.0     0.0000    0.0000    0.0000       113
         3.0     0.0000    0.0000    0.0000        58

    accuracy                         0.3875       400
   macro avg     0.2632    0.3201    0.2345       400
weighted avg     0.2902    0.3875    0.2713       400

[[124   3   0   0]
 [ 71  31   0   0]
 [108   5   0   0]
 [ 53   5   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


2 0.012045035402587823 0.41
              precision    recall  f1-score   support

         0.0     0.3705    0.9685    0.5359       127
         1.0     0.6909    0.3725    0.4841       102
         2.0     0.2308    0.0265    0.0476       113
         3.0     0.0000    0.0000    0.0000        58

    accuracy                         0.4100       400
   macro avg     0.3230    0.3419    0.2669       400
weighted avg     0.3590    0.4100    0.3071       400

[[123   4   0   0]
 [ 64  38   0   0]
 [102   8   3   0]
 [ 43   5  10   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


3 0.013219411484660288 0.4775
              precision    recall  f1-score   support

         0.0     0.4216    0.9528    0.5845       127
         1.0     0.6875    0.4314    0.5301       102
         2.0     0.5306    0.2301    0.3210       113
         3.0     0.0000    0.0000    0.0000        58

    accuracy                         0.4775       400
   macro avg     0.4099    0.4036    0.3589       400
weighted avg     0.4591    0.4775    0.4115       400

[[121   6   0   0]
 [ 58  44   0   0]
 [ 79   8  26   0]
 [ 29   6  23   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


4 0.014508287784959394 0.53
              precision    recall  f1-score   support

         0.0     0.4821    0.9528    0.6402       127
         1.0     0.6849    0.4902    0.5714       102
         2.0     0.5395    0.3628    0.4339       113
         3.0     0.0000    0.0000    0.0000        58

    accuracy                         0.5300       400
   macro avg     0.4266    0.4514    0.4114       400
weighted avg     0.4801    0.5300    0.4715       400

[[121   6   0   0]
 [ 52  50   0   0]
 [ 61  11  41   0]
 [ 17   6  35   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


5 0.015922827933410922 0.545
              precision    recall  f1-score   support

         0.0     0.5069    0.8661    0.6395       127
         1.0     0.6304    0.5686    0.5979       102
         2.0     0.5495    0.4425    0.4902       113
         3.0     0.0000    0.0000    0.0000        58

    accuracy                         0.5450       400
   macro avg     0.4217    0.4693    0.4319       400
weighted avg     0.4769    0.5450    0.4940       400

[[110  16   1   0]
 [ 42  58   2   0]
 [ 51  12  50   0]
 [ 14   6  38   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


6 0.01747528400007684 0.5575
              precision    recall  f1-score   support

         0.0     0.5317    0.8583    0.6566       127
         1.0     0.6316    0.5882    0.6091       102
         2.0     0.5400    0.4779    0.5070       113
         3.0     0.0000    0.0000    0.0000        58

    accuracy                         0.5575       400
   macro avg     0.4258    0.4811    0.4432       400
weighted avg     0.4824    0.5575    0.5070       400

[[109  16   2   0]
 [ 39  60   3   0]
 [ 46  13  54   0]
 [ 11   6  41   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


7 0.019179102616724886 0.61
              precision    recall  f1-score   support

         0.0     0.6452    0.7874    0.7092       127
         1.0     0.6381    0.6569    0.6473       102
         2.0     0.5500    0.6814    0.6087       113
         3.0     0.0000    0.0000    0.0000        58

    accuracy                         0.6100       400
   macro avg     0.4583    0.5314    0.4913       400
weighted avg     0.5229    0.6100    0.5622       400

[[100  18   9   0]
 [ 27  67   8   0]
 [ 22  14  77   0]
 [  6   6  46   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


8 0.02104904144512021 0.615
              precision    recall  f1-score   support

         0.0     0.6621    0.7559    0.7059       127
         1.0     0.6389    0.6765    0.6571       102
         2.0     0.5479    0.7080    0.6178       113
         3.0     1.0000    0.0172    0.0339        58

    accuracy                         0.6150       400
   macro avg     0.7122    0.5394    0.5037       400
weighted avg     0.6729    0.6150    0.5711       400

[[96 19 12  0]
 [24 69  9  0]
 [19 14 80  0]
 [ 6  6 45  1]]
9 0.023101297000831605 0.64
              precision    recall  f1-score   support

         0.0     0.7111    0.7559    0.7328       127
         1.0     0.6486    0.7059    0.6761       102
         2.0     0.5608    0.7345    0.6360       113
         3.0     0.8333    0.0862    0.1563        58

    accuracy                         0.6400       400
   macro avg     0.6885    0.5706    0.5503       400
weighted avg     0.6704    0.6400    0.6074       400

[[96 19 12  0