In [1]:
import fasttext
import time

# Dataset locations, relative to the notebook's working directory.
train_data_path = '../data/train_fast.txt'  # training split
dev_data_path = '../data/dev_fast.txt'      # validation split, scored during autotune
test_data_path = '../data/test_fast.txt'    # held-out split for the final evaluation

# Options for the supervised fastText classifier with hyperparameter autotuning:
#   autotuneValidationFile - file each autotune trial is scored against
#   autotuneDuration       - time budget for the search, in seconds
#   wordNgrams             - 2 adds word-bigram features on top of unigrams
#   verbose                - 3 prints the parameters and score of every trial
train_options = {
    "autotuneValidationFile": dev_data_path,
    "autotuneDuration": 600,
    "wordNgrams": 2,
    "verbose": 3,
}
model = fasttext.train_supervised(input=train_data_path, **train_options)

# Evaluate on the test split.
# NOTE: model.test returns (number of examples, precision@1, recall@1) --
# the example count comes first, e.g. (10000, 0.9179, 0.9179).
result = model.test(test_data_path)
print(result)

# Save the model under a timestamped name so repeated runs do not overwrite
# each other; the stamp has the form YYYYmmdd_HHMMSS.
time_str = time.strftime("%Y%m%d_%H%M%S")
model_save_path = f"./toutiao_fasttext_{time_str}.bin"
model.save_model(model_save_path)  # binary .bin model file




Trial = 1
epoch = 5
lr = 0.1
dim = 100
minCount = 1
wordNgrams = 2
minn = 0
maxn = 0
bucket = 2000000
dsub = 2
loss = softmax


Progress:   0.5% Trials:    1 Best score:   unknown ETA:   0h 9m56s

currentScore = 0.9121
train took = 3.51959
Trial = 2
epoch = 2
lr = 0.169298
dim = 105
minCount = 1
wordNgrams = 2
minn = 0
maxn = 0
bucket = 10000000
dsub = 16
loss = softmax


Progress:   1.3% Trials:    2 Best score:  0.912100 ETA:   0h 9m51s

currentScore = 0.905
train took = 4.70766
Trial = 3
epoch = 1
lr = 0.463758
dim = 52
minCount = 1
wordNgrams = 2
minn = 0
maxn = 0
bucket = 1036295
dsub = 2
loss = softmax


Progress:   1.5% Trials:    3 Best score:  0.912100 ETA:   0h 9m50s

currentScore = 0.9008
train took = 0.874052
Trial = 4
epoch = 4
lr = 0.0527859
dim = 33
minCount = 1
wordNgrams = 2
minn = 3
maxn = 6
bucket = 547156
dsub = 4
loss = softmax


Progress:   1.9% Trials:    4 Best score:  0.912100 ETA:   0h 9m48s

currentScore = 0.8843
train took = 2.50526
Trial = 5
epoch = 8
lr = 0.0136315
dim = 653
minCount = 1
wordNgrams = 2
minn = 3
maxn = 6
bucket = 4235292
dsub = 8
loss = softmax


Progress:   6.4% Trials:    5 Best score:  0.912100 ETA:   0h 9m21s

currentScore = 0.8522
train took = 27.3547
Trial = 6
epoch = 9
lr = 0.164511
dim = 100
minCount = 1
wordNgrams = 2
minn = 0
maxn = 0
bucket = 156820
dsub = 4
loss = softmax


Progress:   7.5% Trials:    6 Best score:  0.912100 ETA:   0h 9m15s

currentScore = 0.9098
train took = 5.96295
Trial = 7
epoch = 2
lr = 0.888362
dim = 30
minCount = 1
wordNgrams = 2
minn = 0
maxn = 0
bucket = 10000000
dsub = 2
loss = softmax


Progress:   7.8% Trials:    7 Best score:  0.912100 ETA:   0h 9m13s

currentScore = 0.9123
train took = 1.87742
Trial = 8
epoch = 3
lr = 0.23899
dim = 8
minCount = 1
wordNgrams = 2
minn = 0
maxn = 0
bucket = 10000000
dsub = 4
loss = softmax


Progress:   8.0% Trials:    8 Best score:  0.912300 ETA:   0h 9m12s

currentScore = 0.9093
train took = 1.09927
Trial = 9
epoch = 1
lr = 2.61261
dim = 29
minCount = 1
wordNgrams = 2
minn = 0
maxn = 0
bucket = 2593702
dsub = 2
loss = softmax


Progress:   8.0% Trials:    9 Best score:  0.912300 ETA:   0h 9m11s

currentScore = 0.9023
train took = 0.810313
Trial = 10
epoch = 1
lr = 1.58274
dim = 3
minCount = 1
wordNgrams = 2
minn = 0
maxn = 0
bucket = 6772140
dsub = 4
loss = softmax


Progress:   8.2% Trials:   10 Best score:  0.912300 ETA:   0h 9m10s

currentScore = 0.8558
train took = 0.581685
Trial = 11
epoch = 15
lr = 0.0493535
dim = 117
minCount = 1
wordNgrams = 2
minn = 0
maxn = 0
bucket = 7955182
dsub = 2
loss = softmax


Progress:  10.7% Trials:   11 Best score:  0.912300 ETA:   0h 8m55s

currentScore = 0.9127
train took = 15.3505
Trial = 12
epoch = 100
lr = 0.0711559
dim = 48
minCount = 1
wordNgrams = 2
minn = 0
maxn = 0
bucket = 10000000
dsub = 2
loss = softmax


Progress:  18.5% Trials:   12 Best score:  0.912700 ETA:   0h 8m 9s

currentScore = 0.9132
train took = 46.4719
Trial = 13
epoch = 45
lr = 0.01
dim = 127
minCount = 1
wordNgrams = 2
minn = 0
maxn = 0
bucket = 10000000
dsub = 4
loss = softmax


Progress:  26.7% Trials:   13 Best score:  0.913200 ETA:   0h 7m19s

currentScore = 0.9105
train took = 49.322
Trial = 14
epoch = 100
lr = 0.160709
dim = 161
minCount = 1
wordNgrams = 2
minn = 0
maxn = 0
bucket = 10000000
dsub = 8
loss = softmax


Progress:  45.6% Trials:   14 Best score:  0.913200 ETA:   0h 5m26s

currentScore = 0.9142
train took = 113.212
Trial = 15
epoch = 74
lr = 0.0202303
dim = 77
minCount = 1
wordNgrams = 2
minn = 3
maxn = 6
bucket = 1586736
dsub = 2
loss = softmax


Progress:  55.3% Trials:   15 Best score:  0.914200 ETA:   0h 4m28s

currentScore = 0.9108
train took = 58.2797
Trial = 16
epoch = 100
lr = 0.124844
dim = 204
minCount = 1
wordNgrams = 2
minn = 0
maxn = 0
bucket = 10000000
dsub = 2
loss = softmax


Progress:  72.2% Trials:   16 Best score:  0.914200 ETA:   0h 2m46s

currentScore = 0.9134
train took = 101.935
Trial = 17
epoch = 100
lr = 0.204853
dim = 160
minCount = 1
wordNgrams = 2
minn = 0
maxn = 0
bucket = 5097515
dsub = 16
loss = softmax


Progress:  86.9% Trials:   17 Best score:  0.914200 ETA:   0h 1m18s

currentScore = 0.9132
train took = 88.0881
Trial = 18
epoch = 29
lr = 0.0931035
dim = 144
minCount = 1
wordNgrams = 2
minn = 0
maxn = 0
bucket = 5759911
dsub = 8
loss = softmax


Progress:  91.3% Trials:   18 Best score:  0.914200 ETA:   0h 0m52s

currentScore = 0.9128
train took = 25.8917
Trial = 19
epoch = 61
lr = 0.213772
dim = 165
minCount = 1
wordNgrams = 2
minn = 0
maxn = 0
bucket = 6326101
dsub = 4
loss = softmax


Progress: 100.0% Trials:   19 Best score:  0.914200 ETA:   0h 0m 0s
Training again with best arguments
Read 3M words
Number of words:  4760
Number of labels: 10


Best selected args = 0
epoch = 100
lr = 0.160709
dim = 161
minCount = 1
wordNgrams = 2
minn = 0
maxn = 0
bucket = 10000000
dsub = 8
loss = softmax


Progress: 100.0% words/sec/thread:   49980 lr:  0.000000 avg.loss:  0.016898 ETA:   0h 0m 0s


(10000, 0.9179, 0.9179)
