In [1]:
# Starts out as an empty mapping; nothing else in the visible notebook reads it.
# NOTE(review): presumably injected by the hosting platform -- confirm before removing.
mountedDB = dict()

## 欢迎进入 ModelWhale Notebook  

在这里你可以编写代码和文档  

### 关于文件目录  


**project**：project 目录是本项目的工作空间，可以把与项目运行有关的所有文件放在这里，目录中文件的增、删、改操作都会被保留  


**input**：input 目录是数据集的挂载位置，所有挂载进项目的数据集都在这里，未挂载数据集时 input 目录被隐藏  


**temp**：temp 目录是临时磁盘空间，训练或分析过程中产生的不必要文件可以存放在这里，目录中的文件不会保存  


In [2]:
import os
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, LSTM, Dense, Embedding, Dropout, Bidirectional, Layer
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re

# 自定义Attention层示例
class AttentionLayer(Layer):
    """Attention pooling over the time axis.

    Takes a sequence of shape ``(batch_size, timesteps, hidden_size)`` and
    returns a weighted sum over ``timesteps`` of shape
    ``(batch_size, hidden_size)``. The weight of each timestep is a softmax
    over ``sum(tanh(inputs @ W + b))``.

    If an input mask is supplied (for example from an ``Embedding`` created
    with ``mask_zero=True``), masked timesteps are excluded from the softmax
    so that padding does not receive any attention weight.
    """

    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)
        # Declare that this layer consumes the incoming mask itself.
        self.supports_masking = True

    def build(self, input_shape):
        """Create the projection matrix ``W`` and bias ``b``."""
        hidden_size = input_shape[-1]
        self.W = self.add_weight(
            name='att_weight',
            shape=(hidden_size, hidden_size),
            initializer='glorot_uniform',
            trainable=True
        )
        self.b = self.add_weight(
            name='att_bias',
            shape=(hidden_size,),
            initializer='zeros',
            trainable=True
        )
        super(AttentionLayer, self).build(input_shape)

    def call(self, inputs, mask=None):
        """Return the attention-weighted sum of ``inputs`` over axis 1.

        Args:
            inputs: tensor of shape (batch_size, timesteps, hidden_size).
            mask: optional boolean tensor of shape (batch_size, timesteps);
                positions that are False are ignored by the softmax.
        """
        projected = tf.nn.tanh(tf.tensordot(inputs, self.W, axes=[2, 0]) + self.b)
        # One scalar logit per timestep: (batch_size, timesteps)
        logits = tf.reduce_sum(projected, axis=2)
        if mask is not None:
            # Push masked positions to the most negative finite value so that
            # softmax assigns them (effectively) zero probability without
            # producing NaN, even when an entire row is masked.
            lowest = tf.constant(logits.dtype.min, dtype=logits.dtype)
            logits = tf.where(tf.cast(mask, tf.bool), logits, lowest)
        scores = tf.nn.softmax(logits, axis=1)
        # Broadcast the weights over the hidden dimension and pool.
        scores = tf.expand_dims(scores, axis=2)
        return tf.reduce_sum(inputs * scores, axis=1)

    def compute_mask(self, inputs, mask=None):
        """The time axis is reduced away, so no mask is passed downstream."""
        return None

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

# 预处理函数，可根据需要进行扩展
def preprocess_text(text):
    """Normalise one piece of text: lowercase it and strip punctuation.

    Every character that is neither a word character (``\\w``) nor whitespace
    (``\\s``) is removed.

    Args:
        text: the raw text. ``None`` and NaN (as pandas produces for empty
            CSV cells) are treated as an empty string; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned string.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    return re.sub(r'[^\w\s]', '', text.lower())

def load_data(filepath, input_shape=20, encoding='gbk'):
    """Read the labelled corpus and turn it into model-ready arrays.

    The CSV must contain an ``evaluation`` column (text) and a ``label``
    column. Text is cleaned with ``preprocess_text`` and encoded character by
    character. The character and label dictionaries are written to
    ``word_dict.pk`` and ``label_dict.pk`` in the current working directory.

    Args:
        filepath: path of the CSV file.
        input_shape: sequence length every sample is padded/truncated to.
        encoding: text encoding of the CSV file.

    Returns:
        A tuple ``(x_data, y_data, vocab_size, label_size, output_dict)``:
        padded index sequences, one-hot labels, number of distinct
        characters, number of distinct labels, and an index -> label mapping.
    """
    df = pd.read_csv(filepath, encoding=encoding)

    # Empty CSV cells are read as NaN; make them empty strings before cleaning
    # so the text pipeline only ever sees str values.
    df['evaluation'] = df['evaluation'].fillna('').astype(str).apply(preprocess_text)

    labels = df['label'].unique().tolist()

    # Character-level vocabulary. It is sorted because set iteration order for
    # strings changes between interpreter runs, which would otherwise give a
    # different character -> index mapping on every run.
    vocabulary = sorted(set(''.join(df['evaluation'].unique())))

    # Index 0 is reserved for padding, so characters start at 1.
    word_dict = {word: i + 1 for i, word in enumerate(vocabulary)}
    label_dict = {label: i for i, label in enumerate(labels)}
    output_dict = {i: label for label, i in label_dict.items()}
    vocab_size = len(word_dict)
    label_size = len(label_dict)

    # Persist the dictionaries so they can be reloaded later.
    with open('word_dict.pk', 'wb') as f:
        pickle.dump(word_dict, f)
    with open('label_dict.pk', 'wb') as f:
        pickle.dump(label_dict, f)

    # Text -> index sequences, padded at the end with 0.
    x_data = [[word_dict.get(ch, 0) for ch in text] for text in df['evaluation']]
    x_data = pad_sequences(x_data, maxlen=input_shape, padding='post', value=0)

    # Labels -> one-hot vectors.
    y_data = [label_dict[label] for label in df['label']]
    y_data = tf.one_hot(y_data, depth=label_size).numpy()

    return x_data, y_data, vocab_size, label_size, output_dict

def load_pretrained_embedding(embedding_path, word_dict, vocab_size, emb_dim=300):
    """Build an embedding matrix from a text file of pretrained vectors.

    Each usable line of the file is a token followed by exactly ``emb_dim``
    whitespace-separated numbers. Lines with any other number of fields
    (such as a header line, or a vector of a different dimension) are skipped.

    Args:
        embedding_path: path of the UTF-8 encoded vector file.
        word_dict: mapping from token to row index.
        vocab_size: largest valid row index.
        emb_dim: dimensionality of the vectors.

    Returns:
        A ``(vocab_size + 1, emb_dim)`` array. Rows for tokens that do not
        appear in the file stay all-zero.
    """
    embedding_matrix = np.zeros((vocab_size + 1, emb_dim))

    with open(embedding_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.strip().split()
            # Require an exact match: a longer line would not fit into a row
            # of the matrix and a shorter one is incomplete.
            if len(values) != emb_dim + 1:
                continue
            index = word_dict.get(values[0])
            if index is None or not 0 <= index <= vocab_size:
                continue
            # Parse the numbers only for tokens that are actually used.
            embedding_matrix[index] = np.asarray(values[1:], dtype='float32')
    return embedding_matrix

def create_model(
    input_shape, vocab_size, label_size, emb_dim=300, n_units=128,
    pretrained_weights=None
):
    """Build and compile the Embedding -> BiLSTM -> Attention classifier.

    Args:
        input_shape: length of every input sequence.
        vocab_size: number of tokens; the embedding has ``vocab_size + 1`` rows.
        label_size: number of output classes.
        emb_dim: embedding dimensionality.
        n_units: number of units in each LSTM direction.
        pretrained_weights: optional embedding matrix. When given, the
            embedding is initialised from it and frozen; otherwise the
            embedding is created without initial weights and is trainable.

    Returns:
        The compiled model (its summary is printed as a side effect).
    """
    # Settings shared by both embedding variants.
    embedding_kwargs = dict(
        input_dim=vocab_size + 1,
        output_dim=emb_dim,
        input_length=input_shape,
        mask_zero=True,
    )
    if pretrained_weights is not None:
        # Start from the pretrained vectors and keep them fixed.
        embedding_kwargs.update(weights=[pretrained_weights], trainable=False)

    inputs = tf.keras.Input(shape=(input_shape,), name='input_layer')

    hidden = Embedding(**embedding_kwargs)(inputs)
    hidden = Bidirectional(LSTM(n_units, return_sequences=True))(hidden)
    hidden = Dropout(0.5)(hidden)
    # Pool the sequence into a single vector with attention.
    hidden = AttentionLayer()(hidden)
    hidden = Dropout(0.5)(hidden)
    outputs = Dense(label_size, activation='softmax')(hidden)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    model.summary()
    return model

def train_model(
    data_path,
    input_shape=100,
    embedding_path=None,
    emb_dim=300,
    n_units=128,
    batch_size=64,
    epochs=50,
    model_save_path='best_model.h5'
):
    """Load the data, train the classifier and report test accuracy.

    Ten percent of the data is held out as a test set; a further ten percent
    of the remaining training data is used for validation during fitting.

    Args:
        data_path: CSV file passed to ``load_data``.
        input_shape: padded sequence length.
        embedding_path: optional pretrained vector file; ignored when falsy
            or when the path does not exist.
        emb_dim: embedding dimensionality.
        n_units: LSTM units per direction.
        batch_size: training batch size.
        epochs: maximum number of epochs.
        model_save_path: where the best checkpoint is written.

    Returns:
        The trained model.
    """
    features, targets, vocab_size, label_size, _ = load_data(
        data_path, input_shape=input_shape
    )
    train_x, test_x, train_y, test_y = train_test_split(
        features, targets, test_size=0.1, random_state=42
    )

    # Optional pretrained embedding matrix.
    embedding_matrix = None
    has_embedding_file = bool(embedding_path) and os.path.exists(embedding_path)
    if has_embedding_file:
        with open('word_dict.pk', 'rb') as dict_file:
            char_index = pickle.load(dict_file)
        embedding_matrix = load_pretrained_embedding(
            embedding_path, char_index, vocab_size, emb_dim=emb_dim
        )

    model = create_model(
        input_shape=input_shape,
        vocab_size=vocab_size,
        label_size=label_size,
        emb_dim=emb_dim,
        n_units=n_units,
        pretrained_weights=embedding_matrix,
    )

    # All three callbacks watch the validation loss.
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
        ModelCheckpoint(model_save_path, monitor='val_loss', save_best_only=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-5),
    ]

    model.fit(
        train_x,
        train_y,
        validation_split=0.1,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks,
        verbose=1,
    )

    # Evaluate on the held-out test split.
    probabilities = model.predict(test_x)
    predicted_ids = np.argmax(probabilities, axis=1)
    expected_ids = np.argmax(test_y, axis=1)
    print("测试集准确率:", accuracy_score(expected_ids, predicted_ids))

    return model

if __name__ == '__main__':
    data_path = '/home/mw/project/datasets.csv'
    # No pretrained vector file is available, so pass the None object.
    # (The string 'None' would be treated as a file name and only be ignored
    # as long as no file with that name exists in the working directory.)
    emb_path = None
    trained_model = train_model(
        data_path=data_path,
        input_shape=180,
        embedding_path=emb_path,
        emb_dim=300,
        n_units=128,
        batch_size=64,
        epochs=50,
        model_save_path='best_model.h5'
    )

2025-03-06 11:55:57.371091: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-06 11:55:57.372641: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-06 11:55:57.375978: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-06 11:55:57.385064: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741262157.400799      67 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741262157.40

Epoch 1/50


2025-03-06 11:56:06.250908: E tensorflow/core/util/util.cc:131] oneDNN supports DT_BOOL only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.


[1m1519/1519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.9507 - loss: 0.1586



[1m1519/1519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1762s[0m 1s/step - accuracy: 0.9507 - loss: 0.1586 - val_accuracy: 0.9772 - val_loss: 0.0660 - learning_rate: 0.0010
Epoch 2/50
[1m1519/1519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1676s[0m 1s/step - accuracy: 0.9835 - loss: 0.0397 - val_accuracy: 0.9734 - val_loss: 0.0842 - learning_rate: 5.0000e-04
Epoch 13/50
[1m1519/1519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1693s[0m 1s/step - accuracy: 0.9843 - loss: 0.0342 - val_accuracy: 0.9754 - val_loss: 0.0902 - learning_rate: 5.0000e-04
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 163ms/step
测试集准确率: 0.9755833333333334


测试

In [12]:
# Import the necessary modules
import pickle
import numpy as np
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences


# Local import so this cell can normalise its input on its own.
import re

# Load the dictionaries written during training.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('word_dict.pk', 'rb') as f:
    word_dictionary = pickle.load(f)
with open('label_dict.pk', 'rb') as f:
    # label_dict.pk stores label -> index
    output_dictionary = pickle.load(f)

input_shape = 180
# Change this sentence to try other inputs.
sent = "真不错"

# Apply the same cleaning as the training data (lowercase, strip punctuation);
# otherwise punctuation would be reported as an out-of-vocabulary character.
cleaned_sent = re.sub(r'[^\w\s]', '', sent.lower())

try:
    # Only the vocabulary lookup is guarded, so that a KeyError raised by any
    # later step is not misreported as an unknown character.
    x = [[word_dictionary[word] for word in cleaned_sent]]
except KeyError as err:
    print("您输入的句子有汉字不在词汇表中，请重新输入！")
    print("不在词汇表中的单词为：%s." % err)
else:
    x = pad_sequences(maxlen=input_shape, sequences=x, padding='post', value=0)

    # Load the model.
    # NOTE(review): the training cell saves to 'best_model.h5'; confirm that
    # this path points at the model you intend to evaluate.
    model_save_path = '/home/mw/project/corpus_model.h5'
    lstm_model = load_model(model_save_path)

    # Predict and map the winning class index back to its label.
    y_predict = lstm_model.predict(x)
    label_dict = {v: k for k, v in output_dictionary.items()}
    print('输入语句: %s' % sent)
    print('情感预测结果: %s' % label_dict[int(np.argmax(y_predict))])




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step
输入语句: 真不错
情感预测结果: 积极
