#第一部分：fasttext工具基本使用方法和简单文本分类

In [None]:
#使用fasttext工具进行文本分类的过程：
#第一步：获取数据
#第二步：训练集与验证集的划分
#第三步：训练模型
#第四步：使用模型进行预测并评估
#第五步：模型调优
#第六步：模型保存与重加载

import fasttext

In [None]:
#安装正确的版本组合
!pip install "numpy==1.23.5" "fasttext==0.9.2"

Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting fasttext==0.9.2
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext==0.9.2)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m141.2 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp311-cp311-linux_x86_64.whl size=4304105 sha256=6c294c9dbf7ae664f0

In [None]:
#第一步：获取数据
!wget https://dl.fbaipublicfiles.com/fasttext/data/cooking.stackexchange.tar.gz && tar xvzf cooking.stackexchange.tar.gz #web get
!rm -rf cooking.stackexchange.tar.gz


--2025-06-01 01:28:01--  https://dl.fbaipublicfiles.com/fasttext/data/cooking.stackexchange.tar.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 108.157.254.121, 108.157.254.102, 108.157.254.15, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|108.157.254.121|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 457609 (447K) [application/x-tar]
Saving to: ‘cooking.stackexchange.tar.gz’


2025-06-01 01:28:01 (48.4 MB/s) - ‘cooking.stackexchange.tar.gz’ saved [457609/457609]

cooking.stackexchange.id
cooking.stackexchange.txt
readme.txt


In [None]:
#第二步：训练集与验证集的划分
#fasttext的数据不适合用sklearn的train_test_split划分，直接用命令行

!wc cooking.stackexchange.txt #word count
!head -n 12404 cooking.stackexchange.txt > cooking.train
!tail -n 3000 cooking.stackexchange.txt > cooking.valid


  15404  169582 1401900 cooking.stackexchange.txt


In [None]:
#第三步：训练模型

model = fasttext.train_supervised(input="cooking.train", verbose=2,epoch=30,loss="ova",lr=0.2,wordNgrams=2)
#model = fasttext.train_supervised(input="cooking.train",autotuneValidationFile="cooking.valid",autotuneDuration=300)#自动调参
model.save_model("model_cooking.bin")
# 训练后检查模型
print(f"词汇量: {len(model.words)}")
print(f"标签数: {len(model.labels)}")

词汇量: 8833
标签数: 735


In [None]:
#第四步：使用模型进行预测并评估

predict = model.predict("Bananas are delicious but not suitable for dessert")
print(predict)

test = model.test("cooking.valid")#test输出3个结果
print(f"样本数量: {test[0]}")
print(f"精度: {test[1]*100:.2f}%")
print(f"召回率: {test[2]*100:.2f}%")

(('__label__flavor',), array([0.06955175]))
样本数量: 3000
精度: 60.73%
召回率: 26.26%


In [None]:
#第五步：模型调优（1）
!cat cooking.stackexchange.txt | tr '[:upper:]' '[:lower:]' | tr -d '?.,;:!()[]{}"`' > cleaned.txt
!wc cleaned.txt
!head -n 12404 cleaned.txt > cooking.train
!tail -n 3000 cleaned.txt > cooking.valid

  15404  169550 1386940 cleaned.txt


In [None]:
#第五步：模型调优（2）
#增加而训练epoch数量
#提高学习率
#增加n-gram特征
#尝试不同的损失函数，入hs，ova
#手动调优到达瓶颈，尝试autotuneValidationFile自动调优寻找超参数


#第二部分：训练词向量

In [None]:
!mkdir data
!wget -c http://mattmahoney.net/dc/enwik9.zip -P data
!unzip data/enwik9.zip -d data
# 获取wiki数据处理脚本在 fastText/wikifil.pl
!wget https://raw.githubusercontent.com/facebookresearch/fastText/master/wikifil.pl -P data

--2025-06-01 18:26:17--  http://mattmahoney.net/dc/enwik9.zip
Resolving mattmahoney.net (mattmahoney.net)... 34.198.1.81
Connecting to mattmahoney.net (mattmahoney.net)|34.198.1.81|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 322592222 (308M) [application/zip]
Saving to: ‘data/enwik9.zip’


2025-06-01 18:26:38 (15.8 MB/s) - ‘data/enwik9.zip’ saved [322592222/322592222]

Archive:  data/enwik9.zip
  inflating: data/enwik9             


In [None]:
import fasttext

In [None]:
#用官方wiki格式处理脚本处理数据
!perl data/wikifil.pl data/enwik9 > data/fil9

In [None]:
!head -c 80 data/fil9

 anarchism originated as a term of abuse first used against early working class 

In [None]:
#开始训练词向量（使用无监督式学习）
model = fasttext.train_unsupervised(input='data/fil9',dim=300,epoch=1,lr=0.1)

In [None]:
a = model.get_word_vector('the')
print(a.size())
print(a)

In [None]:
#模型效果测试
model.get_nearest_neighbors('sports')
model.get_nearest_neighbors('dog')

In [None]:
#模型保存和上传
model.save_model('model_wiki.bin')
model = fasttext.load_model('model_wiki.bin')
model.get_word_vector('the')#查看和保存前的向量是否一致

#词向量迁移（迁移学习，利用预训练模型）

In [None]:
#词向量迁移
#使用fasttext进行词向量模型迁移：
#第一步：下载预训练词向量模型
#第二步：加载bin文件获取词向量
#第三步：利用邻近词进行效果检验
import fasttext

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.bin.gz
!gzip -d cc.zh.300.bin.gz

--2025-06-01 19:41:12--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.35.7.50, 13.35.7.128, 13.35.7.82, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.35.7.50|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4478681770 (4.2G) [application/octet-stream]
Saving to: ‘cc.zh.300.bin.gz’


2025-06-01 19:43:27 (31.7 MB/s) - ‘cc.zh.300.bin.gz’ saved [4478681770/4478681770]

tar: This does not look like a tar archive
tar: Skipping to next header
tar: Archive contains ‘H\270[\276N\303\f<\025\313I>’ where numeric off_t value expected
tar: Archive contains ‘\177&\301\274;\266J=\2520\225<’ where numeric off_t value expected
tar: Archive contains ‘\2427\004=W\221\347;tt\231\274’ where numeric off_t value expected
tar: Exiting with failure status due to previous errors


In [None]:
model = fasttext.load_model('cc.zh.300.bin')




In [None]:
model.words[:100]
model.get_nearest_neighbors('apple')
model.get_word_vector('apple')
model.get_nearest_neighbors('成龙')

[(0.6737070679664612, '戆夫'),
 (0.645505964756012, '成龍'),
 (0.6165663599967957, '洪金宝'),
 (0.6154183745384216, '刘德华'),
 (0.6101147532463074, '李连杰'),
 (0.5993714332580566, '杨紫琼'),
 (0.5931445956230164, '望夫'),
 (0.5893030166625977, '谢霆锋'),
 (0.5883947610855103, '李小龙'),
 (0.5879837274551392, '唐季礼')]

#NLP 标准数据集（Glue_dataset）

In [None]:
#下载：https://github.com/nyu-mll/GLUE-baselines/tree/master
!python download_glue_data.py

Downloading and extracting CoLA...
	Completed!
Downloading and extracting SST...
	Completed!
Processing MRPC...
	Error downloading standard development IDs for MRPC. You will need to manually split your data.
Downloading and extracting QQP...
	Completed!
Downloading and extracting STS...
	Completed!
Downloading and extracting MNLI...
	Note (12/10/20): This script no longer downloads SNLI. You will need to manually download and format the data to use SNLI.
	Completed!
Downloading and extracting QNLI...
	Completed!
Downloading and extracting RTE...
	Completed!
Downloading and extracting WNLI...
	Completed!
Downloading and extracting diagnostic...
	Completed!


#加载预训练模型和微调

In [None]:
!pip install tqdm boto3 requests regex sentencepiece sacremoses

Collecting boto3
  Downloading boto3-1.38.27-py3-none-any.whl.metadata (6.6 kB)
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting botocore<1.39.0,>=1.38.27 (from boto3)
  Downloading botocore-1.38.27-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.14.0,>=0.13.0 (from boto3)
  Downloading s3transfer-0.13.0-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.38.27-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.9/139.9 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.38.27-py3-none-any.whl (13.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6

In [None]:
import torch
from transformers import BertTokenizer, BertModel, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')#加载分词器
model = BertModel.from_pretrained('bert-base-chinese')#加载不带头的预训练模型
classmodel = BertForSequenceClassification.from_pretrained('bert-base-chinese')#加载带分类头的预训练模型

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#使用分词器把字符串转为字典index
text = '唯有风暴能够击倒大树，荣誉属于环印城'
token = tokenizer.tokenize(text)
print(token)

#
indexed_tokens = tokenizer.encode(text)
print("indexed_tokens:", indexed_tokens)

token_tensor = torch.tensor([indexed_tokens])
print("token_tensor:",token_tensor)
token_tensor.size()
with torch.no_grad():
  encoder_layers = model(token_tensor)
  #print(encoder_layers)
#print(encoder_layers[0].size())

with torch.no_grad():
  prediction_scores = classmodel(token_tensor)
  print(prediction_scores)
print(prediction_scores[0].size())

['唯', '有', '风', '暴', '能', '够', '击', '倒', '大', '树', '，', '荣', '誉', '属', '于', '环', '印', '城']
indexed_tokens: [101, 1546, 3300, 7599, 3274, 5543, 1916, 1140, 948, 1920, 3409, 8024, 5783, 6289, 2247, 754, 4384, 1313, 1814, 102]
token_tensor: tensor([[ 101, 1546, 3300, 7599, 3274, 5543, 1916, 1140,  948, 1920, 3409, 8024,
         5783, 6289, 2247,  754, 4384, 1313, 1814,  102]])
SequenceClassifierOutput(loss=None, logits=tensor([[ 0.1902, -0.6381]]), hidden_states=None, attentions=None)
torch.Size([1, 2])


#迁移学习实践
预训练模型提取特征+自定义全连接层输出

In [None]:
#download source:https://zhuanlan.zhihu.com/p/400624790

In [1]:
"""
中文情感分析核心代码 - 最精简版本
仅包含必要组件、训练评估和一个简单示例
"""
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from tqdm import tqdm

# 设备配置
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")


# 使用Bert分词器和预训练模型
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
bert = BertModel.from_pretrained('bert-base-chinese').to(device)

# 分类器（连接模型的自定义全连接层）
class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()
        self.fc = nn.Linear(768, 2)

    def forward(self, x):
        return self.fc(x)


# 数据加载
def load_data(file_path, batch_size=16):
    # 读取数据
    df = pd.read_csv(file_path, sep='\t', header=None).drop([0])
    text_col, label_col = 0, 1

    # 生成批次
    def generate_batches():
        for i in range(0, len(df), batch_size):
            batch_df = df.iloc[i:i+batch_size]

            # 提取文本和标签
            texts = batch_df[text_col].astype(str).tolist()
            labels = batch_df[label_col].astype(int).tolist()

            # BERT处理
            encoded = tokenizer(
                texts,
                padding='max_length',
                truncation=True,
                max_length=128,
                return_tensors='pt'
            ).to(device)

            with torch.no_grad():
                outputs = bert(**encoded)
                features = outputs.last_hidden_state[:, 0, :]  # [CLS]标记

            batch_labels = torch.tensor(labels, dtype=torch.long).to(device)
            yield features, batch_labels

    return generate_batches, len(df)

# 训练和评估
def train_and_evaluate(train_path, valid_path):
    # 模型初始化
    model = Classifier().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # 数据加载
    train_gen, train_size = load_data(train_path)
    valid_gen, valid_size = load_data(valid_path)

    # 训练
    for epoch in range(4):  # 4个epoch
        print(f"Epoch {epoch+1}/4")

        # 训练阶段
        model.train()
        train_loss = train_correct = 0

        for features, labels in tqdm(train_gen(), desc="训练"):
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_correct += (outputs.argmax(1) == labels).sum().item()

        # 验证阶段
        model.eval()
        valid_loss = valid_correct = 0

        with torch.no_grad():
            for features, labels in tqdm(valid_gen(), desc="验证"):
                outputs = model(features)
                loss = criterion(outputs, labels)

                valid_loss += loss.item()
                valid_correct += (outputs.argmax(1) == labels).sum().item()

        # 计算指标
        train_acc = train_correct / train_size
        valid_acc = valid_correct / valid_size

        print(f"训练准确率: {train_acc:.4f}, 验证准确率: {valid_acc:.4f}")

    # 保存模型
    torch.save(model.state_dict(), "sentiment_model.pth")
    return model

# 预测函数
def predict(text, model):
    model.eval()

    # 处理文本
    encoded = tokenizer(
        [text],
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    ).to(device)

    with torch.no_grad():
        # BERT特征
        outputs = bert(**encoded)
        features = outputs.last_hidden_state[:, 0, :]

        # 分类
        logits = model(features)
        pred = torch.argmax(logits, dim=1).item()

    return "正面" if pred == 1 else "负面"

# 运行训练和示例
if __name__ == "__main__":
    # 训练模型
    model = train_and_evaluate('train.tsv', 'dev.tsv')

    # 简单示例
    sample_text = "房间很大，服务也很好，下次还会来"
    sentiment = predict(sample_text, model)
    print(f"\n示例: '{sample_text}'")
    print(f"情感预测: {sentiment}")

使用设备: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Epoch 1/4


训练: 185it [00:23,  7.95it/s]
验证: 63it [00:07,  8.02it/s]


训练准确率: 0.8334, 验证准确率: 0.8690
Epoch 2/4


训练: 185it [00:26,  7.03it/s]
验证: 63it [00:08,  7.72it/s]


训练准确率: 0.8730, 验证准确率: 0.8710
Epoch 3/4


训练: 185it [00:23,  7.98it/s]
验证: 63it [00:07,  8.17it/s]


训练准确率: 0.8818, 验证准确率: 0.8760
Epoch 4/4


训练: 185it [00:23,  7.98it/s]
验证: 63it [00:07,  7.99it/s]


训练准确率: 0.8905, 验证准确率: 0.8780

示例: '房间很大，服务也很好，下次还会来'
情感预测: 正面
