In [2]:
import pandas as pd
import os, json

# df = pd.read_csv('selected-ann.csv')
df = pd.DataFrame(columns=['idd', 'content', 'score'])
for idd in range(600):
    filename = "result/enhanced/{}-1.json".format(idd)
    if os.path.exists(filename):
        content = json.load(open(filename, 'r', encoding='utf8'))['response']
        score = int(idd/120)
        # add to df
        df.loc[len(df.index)] = [idd, content, score]

stop_words = open('stopwords.txt', 'r', encoding='utf8').read().split('\n')
maxLen = 0
maxSentences = []
for index, row in df.iterrows():
    content = row['content']
    # 分句
    sentences = content.split('。')
    for stop_word in stop_words:
        index = 0
        for sentence in sentences:
            if stop_word in sentence:
                sentences[index] = sentence.replace(stop_word, '')
            index += 1
    df.iloc[index]['content'] = '。'.join(sentences)
    if len(sentences) > maxLen:
        maxLen = len(sentences)
        maxSentences = sentences

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[index]['content'] = '。'.join(sentences)


In [3]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('./bert-base-chinese')
bert_model = BertModel.from_pretrained('./bert-base-chinese')


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch
# 定义函数以获取BERT特征
def get_bert_features(sentences):
    MAX_SEQ_LEN = 33    
    # 2. 使用分词器预处理文本
    finalFeatures = [[0 for i in range(768)] for j in range(MAX_SEQ_LEN)]
    index = 0
    for text in sentences.split('。'):
        inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding='max_length')
        with torch.no_grad():
            # 3. 将文本传递给BERT模型
            outputs = bert_model(**inputs)
        # 取[CLS]标记的输出作为文本表示，返回一维数组
        features = outputs.last_hidden_state[:, 0, :].numpy()
        for i in range(0, len(features[0])):
            finalFeatures[index][i] = features[0][i]
        index += 1
    return finalFeatures
        

# 将df中的内容列应用到get_bert_features函数
df['bert_features'] = df['content'].apply(get_bert_features)

In [4]:
import numpy as np
# 将df中的bert_features列转换为普通的列表
bert_features = np.array(df['bert_features'].tolist())

# 把df保存到本地tsv，不要使用科学计数法
np.set_printoptions(suppress=True)
# 保存df到本地，JSON格式
df.to_json('selected-ann-bert-enhanced.json', orient='records', force_ascii=False, lines=True)

In [3]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Conv1D, GlobalMaxPooling1D, Embedding, Dropout, Bidirectional, Input
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from tensorflow.keras.callbacks import EarlyStopping
import ast

# read df from json
df = pd.read_json('selected-ann-bert.json', orient='records', lines=True)

# 将vector_sequence转换为适当的形式
X = np.stack(df['bert_features'].values)
y = df['score'].values
early_stopping = EarlyStopping(monitor='loss', patience=5, verbose=1, restore_best_weights=True)

def get_lstm_model(input_shape):
    model = Sequential()
    model.add(Bidirectional(LSTM(256, return_sequences=True), input_shape=input_shape))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=0.001))
    return model

def get_cnn_model(input_shape):
    model = Sequential()
    model.add(Conv1D(128, 3, activation='relu', input_shape=input_shape))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=0.001))
    return model

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def evaluate_model(model_func, model_name):
    trues = []
    preds = []
    for train_index, val_index in skf.split(X, y):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        model = model_func(X_train[0].shape)
        model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=1, callbacks=[early_stopping])
        predictions = model.predict(X_val).flatten()
        predictions = np.rint(predictions).astype(int)
        trues = np.concatenate((trues, y_val))
        preds = np.concatenate((preds, predictions))
        
    model.save(model_name + '.h5')
    return cohen_kappa_score(trues, preds, weights='quadratic')

lstm_qwk = evaluate_model(get_lstm_model, "lstm")
cnn_qwk = evaluate_model(get_cnn_model, "cnn")

print(f"Average QWK for LSTM: {lstm_qwk}")
print(f"Average QWK for CNN: {cnn_qwk}")


Train on 480 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 00061: early stopping
Train on 480 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100