In [1]:
import pandas as pd

df = pd.read_csv('selected-ann.csv')

stop_words = open('stopwords.txt', 'r', encoding='utf8').read().split('\n')
for index, row in df.iterrows():
    content = row['content']
    for stop_word in stop_words:
        content = content.replace(stop_word, '')
    df.iloc[index]['content'] = content

from gensim.models import KeyedVectors

word_vectors = KeyedVectors.load_word2vec_format('sgns.baidubaike.bigram-char', binary=False)  # 确保路径是正确的

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [2]:
import numpy as np
import jieba

MAX_SEQ_LEN = 500  # 假设我们设置的最大序列长度为100
VECTOR_DIM = 300  # 假设词向量的维度为300

def text_to_fixed_sequence(text, model, max_seq_len, vector_dim):
    words = list(jieba.cut(text))
    vectors = []
    
    for word in words:
        try:
            vec = model[word]
            vectors.append(vec)
        except KeyError:  # 如果词不在模型的词汇表中
            pass

    # 如果文本的长度小于max_seq_len，那么进行填充
    while len(vectors) < max_seq_len:
        vectors.append(np.zeros(vector_dim))

    # 如果文本的长度大于max_seq_len，那么进行裁剪
    return np.array(vectors[:max_seq_len])

# 将每个文本转化为等长的向量序列
df['vector_sequence'] = df['content'].apply(lambda x: text_to_fixed_sequence(x, word_vectors, MAX_SEQ_LEN, VECTOR_DIM))


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.486 seconds.
Prefix dict has been built successfully.


In [3]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Conv1D, GlobalMaxPooling1D, Embedding, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from tensorflow.keras.callbacks import EarlyStopping

# 将vector_sequence转换为适当的形式
X = np.stack(df['vector_sequence'].values)
y = df['score'].values
early_stopping = EarlyStopping(monitor='loss', patience=5, verbose=1, restore_best_weights=True)

def get_lstm_model(input_shape):
    model = Sequential()
    model.add(Bidirectional(LSTM(256, return_sequences=True), input_shape=input_shape))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=0.001))
    return model

def get_cnn_model(input_shape):
    model = Sequential()
    model.add(Conv1D(128, 3, activation='relu', input_shape=input_shape))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=0.001))
    return model

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def evaluate_model(model_func, model_name):
    trues = []
    preds = []
    for train_index, val_index in skf.split(X, y):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        model = model_func(X_train[0].shape)
        model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=1, callbacks=[early_stopping])

        predictions = model.predict(X_val).flatten()
        # 将预测结果四舍五入
        predictions = np.rint(predictions).astype(int)
        trues = np.concatenate((trues, y_val))
        preds = np.concatenate((preds, predictions))
    # save model to file
    model.save(model_name + '.h5')
    return cohen_kappa_score(trues, preds, weights='quadratic')

lstm_qwk = evaluate_model(get_lstm_model, "lstm")
cnn_qwk = evaluate_model(get_cnn_model, "cnn")

print(f"Average QWK for LSTM: {lstm_qwk}")
print(f"Average QWK for CNN: {cnn_qwk}")


Train on 480 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 00050: early stopping
Train on 480 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100