In [1]:
!pip install transformers
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m102.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2
Looking in i

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier
import tensorflow_hub as hub
import tensorflow as tf
from transformers import BertTokenizer

In [3]:
# Read the labelled text samples and preview the first rows. The rendered
# preview shows the columns `Unnamed: 0`, `text`, `label` and `score`.
DATA_PATH = "/content/sample_data/data.csv"
df = pd.read_csv(DATA_PATH)
df.head(20)

Unnamed: 0,text,label,score
0,趵突泉管好你自己,1,0.646242
1,防李清照的围挡哈哈哈哈,2,0.909193
2,大青岛,2,0.9877
3,扑棱鹅子,2,0.561283
4,轩轩的家乡,2,0.999437
5,小宋老师的姐妹等等,2,0.99213
6,年级第一我要当年级第一,2,0.991552
7,云梦,2,0.921046
8,没办法山东每个城市都很优秀很有特点你说气不气哈哈哈哈哈哈哈,2,0.614598
9,小宋是宋亚轩,2,0.930163


In [4]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable from run to run.
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)

In [5]:
# Chinese BERT encoder loaded from TF Hub; trainable=True allows its weights to
# be updated when the classifier is fitted.
BERT_HUB_URL = "https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/2"
bert_layer = hub.KerasLayer(BERT_HUB_URL, trainable=True)

# NOTE(review): the weights come from TF Hub while the vocabulary comes from the
# Hugging Face 'bert-base-chinese' checkpoint; presumably both share the same
# vocab file -- confirm, since mismatched token ids would silently hurt quality.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

In [6]:
def bert_encode(texts, tokenizer, max_len=512):
    """Turn raw texts into fixed-length BERT input arrays.

    Each text is tokenized, truncated so that the ``[CLS]`` and ``[SEP]``
    markers still fit, converted to ids and right-padded with 0 up to
    ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded row, including the two marker
            tokens. Must be at least 2.

    Returns:
        A tuple ``(token_ids, attention_masks, segment_ids)`` of int32 arrays,
        each of shape ``(len(texts), max_len)``. The mask is 1 on real tokens
        and 0 on padding; the segment ids are all 0.

    Raises:
        ValueError: If ``max_len`` is smaller than 2, because the two marker
            tokens would not fit and rows would come out with unequal lengths.
    """
    if max_len < 2:
        raise ValueError("max_len must be at least 2 to fit [CLS] and [SEP]")

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Keep two positions free for the [CLS] and [SEP] markers.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        all_tokens.append(tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    def _as_matrix(rows):
        # The explicit dtype and shape keep the result 2-D and integer-typed
        # even when `texts` is empty (a bare np.array([]) would be 1-D float).
        return np.array(rows, dtype=np.int32).reshape(len(rows), max_len)

    return _as_matrix(all_tokens), _as_matrix(all_masks), _as_matrix(all_segments)
    
# Encode both splits to the sequence length the model is built for.
train_input = bert_encode(train_df['text'].values, tokenizer, max_len=160)
test_input = bert_encode(test_df['text'].values, tokenizer, max_len=160)

# Build the label -> integer-code mapping once, on the training split, and
# reuse it for the test split. Factorizing each split independently numbers
# labels by order of first appearance within that split, so the same label
# could receive different codes in train and test and corrupt the evaluation.
train_labels, label_index = train_df['label'].factorize()
# Labels that never occur in the training split are encoded as -1.
test_labels = label_index.get_indexer(test_df['label'])

In [7]:
def create_model(max_len=160, learning_rate=0.001):
    """Build and compile the BERT-based classifier.

    The model feeds three integer inputs (word ids, mask, segment ids) through
    the module-level ``bert_layer``, takes the sequence output at position 0
    and maps it through a single sigmoid unit.

    Args:
        max_len: Sequence length of each of the three inputs. Must match the
            ``max_len`` used when encoding the texts.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``tf.keras`` model with binary cross-entropy loss and an
        accuracy metric.
    """
    def _int_input(name):
        # All three BERT inputs share the same shape and dtype.
        return tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name=name)

    input_word_ids = _int_input("input_word_ids")
    input_mask = _int_input("input_mask")
    segment_ids = _int_input("segment_ids")

    _pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Use the encoder output at the first position as the sentence vector.
    clf_output = sequence_output[:, 0, :]
    # NOTE(review): one sigmoid unit with binary cross-entropy assumes the
    # label codes are exactly 0/1 -- confirm the dataset has only two classes.
    out = tf.keras.layers.Dense(1, activation='sigmoid')(clf_output)

    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    # NOTE(review): the default of 1e-3 is kept from the original notebook;
    # fine-tuning a trainable BERT encoder is usually done with a much smaller
    # rate (e.g. 2e-5) -- consider lowering it.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

# Instantiate the classifier with its default settings and print its layer
# summary.
model = create_model()
model.summary()





Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 160)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 160)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 160)]        0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 768),        102267649   ['input_word_ids[0][0]',         
                                 (None, 160, 768)]                'input_mask[0][0]',         

In [8]:
# Fine-tune for a single epoch, setting aside 20% of the training data for
# validation.
train_history = model.fit(
    train_input,
    train_labels,
    validation_split=0.2,
    epochs=1,
    batch_size=32,
)



In [9]:
# First stage: flatten the BERT classifier's predictions to one score per sample.
train_pred = model.predict(train_input).ravel()
test_pred = model.predict(test_input).ravel()

# Present the scores as single-column matrices for the CatBoost calls below.
train_features = train_pred.reshape(-1, 1)
test_features = test_pred.reshape(-1, 1)

# Second stage: fit a CatBoost classifier on the first-stage training scores.
catboost_model = CatBoostClassifier(
    n_estimators=400,
    learning_rate=0.01,
    depth=8,
    verbose=False,
)
catboost_model.fit(train_features, train_labels)

train_pred_catboost = catboost_model.predict(train_features)
test_pred_catboost = catboost_model.predict(test_features)

# Weighted-F1 reporting, currently disabled:
#print("Train F1 score:", f1_score(train_labels, train_pred_catboost, average='weighted'))
#print("Test F1 score:", f1_score(test_labels, test_pred_catboost, average='weighted'))



In [10]:
from sklearn.metrics import accuracy_score

# Accuracy of the stacked predictions on the training split.
train_acc = accuracy_score(train_labels, train_pred_catboost)

# Re-score the held-out split with the CatBoost model and measure its accuracy.
test_scores = test_pred.reshape(-1, 1)
test_pred_catboost = catboost_model.predict(test_scores)
test_acc = accuracy_score(test_labels, test_pred_catboost)

print("Test accuracy:", test_acc)

Test accuracy: 0.7840420449116101
