In [10]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.regularizers import l2
from sklearn.metrics import accuracy_score


# def load_data(file_path):
#     df = pd.read_csv(file_path)
#     # 对标签进行编码
#     df['label'] = df['label'].map({'positive': 1, 'negative': 0})
#     return df

def load_data(file_path):
    df = pd.read_csv(file_path, header=None)
    # 对标签进行编码
    df[df.columns[0]] = df[df.columns[0]].map({'positive': 1, 'negative': 0})
    return df


def preprocess_data(df, w2v_model):
    # 使用word2vec模型将文本转换为词向量
    X = df.drop(df.columns[0], axis=1).apply(lambda x: [w2v_model.wv[word] for word in x if word in w2v_model.wv],
                                             axis=1)

    # 使用零填充使每个样本的长度相同
    X = pad_sequences(X, padding='post')

    return X



In [25]:
def train_model(X_train, y_train, X_val, y_val, activation_func):
    model = Sequential()
    # model.add(Embedding(input_dim=vocabulary_size,
    #                 output_dim=embedding_matrix.shape[1],
    #                 input_length=max_length,
    #                 weights=[embedding_matrix],
    #                 trainable=False))
    model.add(Dense(128, activation=activation_func, kernel_regularizer=l2(0.01)))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='softmax'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32)

    return model


In [26]:
w2v_model = Word2Vec.load('w2v.model')

# 加载数据
train_df = load_data('data2/train.csv')
val_df = load_data('data2/val.csv')

# 预处理数据
X_train = preprocess_data(train_df, w2v_model)
y_train = train_df[train_df.columns[0]]
X_val = preprocess_data(val_df, w2v_model)
y_val = val_df[val_df.columns[0]]

# 训练并保存模型
activation_funcs = ['relu', 'sigmoid', 'tanh']
model_names = ['nn_relu.model', 'nn_sigmoid.model', 'nn_tanh.model']

for activation_func, model_name in zip(activation_funcs, model_names):
    model = train_model(X_train, y_train, X_val, y_val, activation_func)
    model.save(model_name)


TypeError: unhashable type: 'list'

In [27]:
# 第一步：加载所需的库
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.layers import Activation


In [28]:
# 第二步：加载数据
def load_data(file_name):
    df = pd.read_csv(file_name, header=None)
    df.columns = ['label'] + list(range(df.shape[1] - 1))
    df['text'] = df[df.columns[1:]].apply(
        lambda x: ' '.join(x.dropna().astype(str)),
        axis=1
    )
    df = df[['label', 'text']]
    df['text'] = df['text'].apply(lambda x: x.split())
    return df


In [29]:
load_data("data2/train.csv")

Unnamed: 0,label,text
0,positive,"[I, went, through, two, others, before, findin..."
1,positive,"[I, was, doubtful, that, anything, could, surv..."
2,positive,"[She, watched, the, accompanying, DVD, video, ..."
3,negative,"[I, use, the, black, but, it's, too, creamy, t..."
4,positive,"[That, would, be, the, only, improvement, I, w..."
...,...,...
639995,positive,"[It, works, well, but, my, personal, preferenc..."
639996,positive,"[The, fact, that, it, has, a, long, handle, ma..."
639997,negative,"[The, battles, feel, like, your, running, thro..."
639998,positive,"[No, it, didn't, last, forever, but, consideri..."


In [30]:
from tinycss2 import tokenizer

# 第三步：加载word2vec模型并准备嵌入矩阵
# def load_word2vec_model(model_name):
#     model = Word2Vec.load(model_name)
#     word_vectors = model.wv
#     vocabulary_size = len(word_vectors.vocab) + 1
#     embedding_matrix = np.zeros((vocabulary_size, word_vectors.vector_size))
#     for word, i in tokenizer.word_index.items():
#         if word in word_vectors:
#             embedding_matrix[i] = word_vectors[word]
#     return embedding_matrix, vocabulary_size
from gensim.models import Word2Vec
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


def load_word2vec_model(model_name):
    model = Word2Vec.load(model_name)
    word_vectors = model.wv
    vocabulary_size = len(word_vectors.key_to_index) + 1
    embedding_matrix = np.zeros((vocabulary_size, word_vectors.vector_size))
    for word, i in tokenizer.word_index.items():
        if word in word_vectors.key_to_index:
            embedding_matrix[i] = word_vectors.get_vector(word)
    return embedding_matrix, vocabulary_size


In [31]:
# 第四步：定义神经网络模型
def define_model(hidden_activation, vocabulary_size, embedding_matrix):
    model = Sequential()
    # model.add(Embedding(vocabulary_size, embedding_matrix.shape[1],
    #                     weights=[embedding_matrix], trainable=False, input_length=max_length))
    model.add(Embedding(input_dim=vocabulary_size,
                        output_dim=embedding_matrix.shape[1],
                        input_length=max_length,
                        weights=[embedding_matrix],
                        trainable=False))

    model.add(Dense(64, activation=hidden_activation, kernel_regularizer=l2(0.01)))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='softmax'))
    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [34]:
# 第五步：加载数据，训练模型并保存
from keras.utils import to_categorical

train_data = load_data('data2/train.csv')
val_data = load_data('data2/val.csv')

# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(train_data['text'])
# # max_length = max([len(s.split()) for s in train_data['text']])
# max_length = max([len(s) for s in train_data['text']])
#
# embedding_matrix, vocabulary_size = load_word2vec_model('w2v.model')
#
# X_train = pad_sequences(tokenizer.texts_to_sequences(train_data['text']), maxlen=max_length, padding='post')
# X_val = pad_sequences(tokenizer.texts_to_sequences(val_data['text']), maxlen=max_length, padding='post')
#
# le = LabelEncoder()
# y_train = le.fit_transform(train_data['label'])
# y_val = le.transform(val_data['label'])


# Tokenize the data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])
max_length = max([len(s) for s in train_data['text']])

embedding_matrix, vocabulary_size = load_word2vec_model('w2v.model')

X_train = pad_sequences(tokenizer.texts_to_sequences(train_data['text']), maxlen=max_length, padding='post')
X_val = pad_sequences(tokenizer.texts_to_sequences(val_data['text']), maxlen=max_length, padding='post')

# Convert labels to one-hot encoding
le = LabelEncoder()
y_train = le.fit_transform(train_data['label'])
y_val = le.transform(val_data['label'])

y_train = to_categorical(y_train)
y_val = to_categorical(y_val)

# Now create and train your model

for activation in ['relu', 'sigmoid', 'tanh']:
    model = define_model(activation, vocabulary_size, embedding_matrix)
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=64)
    model.save('nn_' + activation + '.model')




Epoch 1/10


ValueError: in user code:

    File "/Users/huangjiabao/.local/share/virtualenvs/PythonCoder-NX0uUe41/lib/python3.9/site-packages/keras/src/engine/training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "/Users/huangjiabao/.local/share/virtualenvs/PythonCoder-NX0uUe41/lib/python3.9/site-packages/keras/src/engine/training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/huangjiabao/.local/share/virtualenvs/PythonCoder-NX0uUe41/lib/python3.9/site-packages/keras/src/engine/training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "/Users/huangjiabao/.local/share/virtualenvs/PythonCoder-NX0uUe41/lib/python3.9/site-packages/keras/src/engine/training.py", line 1081, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/Users/huangjiabao/.local/share/virtualenvs/PythonCoder-NX0uUe41/lib/python3.9/site-packages/keras/src/engine/training.py", line 1139, in compute_loss
        return self.compiled_loss(
    File "/Users/huangjiabao/.local/share/virtualenvs/PythonCoder-NX0uUe41/lib/python3.9/site-packages/keras/src/engine/compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/Users/huangjiabao/.local/share/virtualenvs/PythonCoder-NX0uUe41/lib/python3.9/site-packages/keras/src/losses.py", line 142, in __call__
        losses = call_fn(y_true, y_pred)
    File "/Users/huangjiabao/.local/share/virtualenvs/PythonCoder-NX0uUe41/lib/python3.9/site-packages/keras/src/losses.py", line 268, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/Users/huangjiabao/.local/share/virtualenvs/PythonCoder-NX0uUe41/lib/python3.9/site-packages/keras/src/losses.py", line 2122, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/Users/huangjiabao/.local/share/virtualenvs/PythonCoder-NX0uUe41/lib/python3.9/site-packages/keras/src/backend.py", line 5560, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (64, 2) and (64, 25) are incompatible


In [33]:
# from keras.utils import to_categorical
#
# # Load datasets
# train_data = load_data('data2/train.csv')
# val_data = load_data('data2/val.csv')
#
# # Transform labels into one-hot encoded vectors
# y_train = to_categorical(train_data['label'].map({'positive': 1, 'negative': 0}))
# y_val = to_categorical(val_data['label'].map({'positive': 1, 'negative': 0}))
#
# # Define a tokenizer and fit it on training data
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(train_data['text'])
#
# # Get maximum text length for padding
# max_length = max([len(s) for s in train_data['text']])
#
# # Load the Word2Vec model and get the embedding matrix
# embedding_matrix, vocabulary_size = load_word2vec_model('w2v.model')
#
# # Prepare the input data
# X_train = pad_sequences(tokenizer.texts_to_sequences(train_data['text']), maxlen=max_length, padding='post')
# X_val = pad_sequences(tokenizer.texts_to_sequences(val_data['text']), maxlen=max_length, padding='post')
#
# # Create and train the models
# for activation in ['relu', 'sigmoid', 'tanh']:
#     model = define_model(activation, vocabulary_size, embedding_matrix)
#     model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=64)
#     model.save('nn_' + activation + '.model')
