项目地址: https://github.com/Edward1Chou/SentimentAnalysis

# 加载数据
1. 通过pandas加载数据
2. 通过jieba分词
3. 通过gensim训练word2vec模型
4. 创建字典

In [1]:
import pandas as pd
import numpy as np
import jieba

# 从csv中载入数据
def load_data():
    # 通过pandas来处理csv文件中的数据
    neg = pd.read_csv('data/neg.csv', header=None, index_col=None)
    # 设置error_bad_lines=Flase 那么将丢弃读取错误的行
    pos = pd.read_csv('data/pos.csv', header=None, index_col=None, error_bad_lines=False)
    neu = pd.read_csv('data/neutral.csv', header=None, index_col=None)
    
    # 将上面3组数据合并在1个数组中
    # combined长度=上述三者之和
    combined = np.concatenate((pos[0], neu[0], neg[0]))
    # 生成数据集对应的one-hot编码
    y = np.concatenate((2*np.ones(len(pos), dtype=int), 
                       np.ones(len(neu), dtype=int), 
                       np.zeros(len(neg), dtype=int)))
    return combined, y

In [2]:
# 通过结巴分词 将句子进行分词
def tokenizer(text):
    text = [jieba.lcut(document.replace('\n', '')) for document in text]
    return text

In [3]:
# 测试函数
# combined, y = load_data()
# combined = tokenizer(combined)
# print(combined[0])

In [4]:
# 创建字典
from gensim.corpora.dictionary import Dictionary
from gensim.models.word2vec import Word2Vec
import multiprocessing

# 词汇数量
vocab_dim = 100
# 设置出现频率的最低点
n_exposures = 10
# 根据上下文判断的窗口大小
window_size = 7
# 迭代次数
n_iterations = 10
# 工作类型
cpu_count = multiprocessing.cpu_count()

def create_dictionaries(model:Word2Vec=None, combined=None):
    if (model==None) or (len(combined)==0):
        print("没有提供数据")
        return
    
    gensim_dict = Dictionary()
    # document->bag of words(bow)
    # gensim删除了vocab属性 应该使用index_to_key
    # allow_update 通过添加新的单词更新内部corpora的统计数据
    gensim_dict.doc2bow(model.wv.index_to_key, allow_update=True)
    
    # 建立单词与词的转移表
    word2index = {v:k+1 for k, v in gensim_dict.items()}
    # 建立单词与向量的转移表
    word2vector = {word:model.wv[word] for word in word2index.keys()}
    
    # 将combined中的单词转换成对应的index
    def parse_dataset(combined):
        data = []
        for sentence in combined:
            new_text = []
            for word in sentence:
                try:
                    new_text.append(word2index[word])
                except:
                    new_text.append(0)
            data.append(new_text)
        return data
        
    combined = parse_dataset(combined)
    
    return word2index, word2vector, combined
    
# input: combined 是分词的集合
# ouput: index_dict word->index
# word_vectors word->vectors
# combined 记录每个句子中单词的index值
def word2vec_train(combined):
    # 设置模型的训练参数
    model = Word2Vec(vector_size=vocab_dim,
                      min_count = n_exposures,
                      window=window_size,
                      workers=cpu_count)
    # 训练模型
    model.build_vocab(combined)
    model.train(combined, total_examples=model.corpus_count, epochs=n_iterations)
    # 保存模型
    model.save('./model/Word2Vec_model.pkl')
    # 通过创建映射关系
    index_dict, word_vectors, combined = create_dictionaries(model=model, combined=combined)
    
    return index_dict, word_vectors, combined

In [15]:
# 加载数据
sentences, y = load_data()
combined = tokenizer(sentences)

# 训练word2Vec模型
# word2vec_train(combined)

# 加载模型
model = Word2Vec.load('./model/Word2Vec_model.pkl')

# print(model.wv.index_to_key)
index_dict, word_vectors, combined = create_dictionaries(model, combined)



  pos = pd.read_csv('data/pos.csv', header=None, index_col=None, error_bad_lines=False)
Skipping line 2607: expected 1 fields, saw 9
Skipping line 3143: expected 1 fields, saw 2
Skipping line 3173: expected 1 fields, saw 8



## 得到数据集

In [3]:
# 通过pytorch 建立模型
# https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html

# lstm使用 https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#torch.nn.LSTM
import torch
import torch.utils.data as data
from torch.nn.utils.rnn import pad_sequence

device=torch.device("cuda")

def get_data(index_dict, word_vectors, combined, y):
    # 所有单词的索引数量
    n_symbols = len(index_dict) + 1
    
    # 初始化词向量
    embedding_weights = torch.zeros((n_symbols, vocab_dim))
    for word, index in index_dict.items():
        embedding_weights[index, :] = torch.from_numpy(word_vectors[word])
    
    # 将数据集转换为tensor形式
    print(combined)
    combined = pad_sequence([torch.tensor(doc, dtype=torch.long, device=device) for doc in combined], batch_first=True)
    y = torch.tensor(y, dtype=torch.long, device=device)
    
    # 将数据集拆分成train/valid
    test_rate = 0.2
    test_size = int(len(combined) * test_rate)
    train_size = len(combined) - test_size
    datasets = data.random_split(data.TensorDataset(combined, y), [train_size, test_size])
    train_ds = datasets[0]
    valid_ds = datasets[1]
    return n_symbols, embedding_weights, train_ds, valid_ds

## 建立模型

In [8]:
import torch.nn as nn
import torch.nn.functional as F

# 建立一个模型
class MyModel(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        # 关于batch_first使用: https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#torch.nn.LSTM
        # 这里是根据第一个维度来进行批量操作
        self.embbed = nn.Embedding(n_symbols, vocab_dim).from_pretrained(embedding_weights)
        self.lstm = nn.LSTM(input_size=vocab_dim*input_size,hidden_size=50, num_layers=1, batch_first=True)
        self.lin1 = nn.Linear(50, 50)
#         self.lin2 = nn.Linear(50, 50)
        self.lin3 = nn.Linear(50, 3)
    def forward(self, x):
        x = self.embbed(x).view(x.shape[0], -1)
        x, _ = self.lstm(x)
        x = F.relu(self.lin1(x))
#         x = F.relu(self.lin2(x))
        x = self.lin3(x)
        # softmax + log
        x = F.softmax(x, dim=1)
        return x
    


In [52]:
import torch.optim as optim

input_size = 1804



def train_lstm(n_symbols, embedding_weights, train_dataset, valid_dataset, is_print=False):
    if is_print:
        print('Defining a Simple Torch Model...')
    
    # https://pytorch.org/tutorials/beginner/basics/buildmodel_tutorial.html
#     model = nn.Sequential(
#             nn.Embedding(n_symbols, vocab_dim).from_pretrained(embedding_weights),
#             nn.LSTM(vocab_dim, 50, batch_first=True),
#             nn.Linear(50, 50),
#             nn.Linear(50, 50),
#             nn.Linear(50, 50),
#             nn.Softmax(3),
#     )
    myModel = MyModel(1804)
    myModel.to(device)
    
    optimizer = optim.Adam(myModel.parameters())
    loss_func = nn.CrossEntropyLoss()
    loader = data.DataLoader(train_dataset, shuffle=True, batch_size=32)
    
    epochs = 200
    for epoch in range(epochs):
        myModel.train()
        
        for X_batch, y_batch in loader:
        
            y_pred = myModel(X_batch)
            loss = loss_func(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
        if epoch % 100 == 0:
            myModel.eval()
            with torch.no_grad():
                valid_loader = data.DataLoader(valid_dataset, shuffle=True, batch_size=32)
                losses = torch.tensor([loss_func(myModel(x_valid), y_valid) for x_valid, y_valid in valid_loader ])
                loss = torch.mean(losses)
                if is_print:
                    print(epoch, loss)
    # 保存模型
    PATH = "./model/lstm.model"
    torch.save(myModel.state_dict(), PATH)
            

# 可以确定一件事 index_dict中没有0
n_symbols, embedding_weights, train_ds, valid_ds = get_data(index_dict, word_vectors, combined, y)

train_lstm(n_symbols, embedding_weights, train_ds, valid_ds)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



KeyboardInterrupt: 

计数数据集

https://blog.csdn.net/m0_37694033/article/details/121977987

https://blog.csdn.net/buluxianfeng/article/details/125731803?spm=1001.2101.3001.6650.1&utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7ECTRLIST%7ERate-1-125731803-blog-121977987.235%5Ev30%5Epc_relevant_default_base3&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7ECTRLIST%7ERate-1-125731803-blog-121977987.235%5Ev30%5Epc_relevant_default_base3&utm_relevant_index=2

## 预测部分

In [53]:


def input_transform(string):
    words = jieba.lcut(string)
    words = np.array(words).reshape(1, -1)
    model = Word2Vec.load('./model/Word2Vec_model.pkl')
    _, _, combined = create_dictionaries(model, words)
    return combined

def lstm_predict(string):
    combined = input_transform(string)
#     print(combined)
    
    combined[0] = np.append(combined[0], np.zeros(input_size - len(combined[0])))
    
    combined = pad_sequence([torch.tensor(doc, dtype=torch.long, device=device) for doc in combined], batch_first=True)
#     y = torch.tensor(y, dtype=torch.long, device=device)
    
    model = MyModel(1804)
    model.load_state_dict(torch.load(PATH, map_location=device))
    model.to(device)
    
    result = model(combined)
    _, res = torch.max(result, 1)
    if res == 0:
        return "negative"
    elif res == 1:
        return "neutral"
    elif res == 2:
        return "positive"

# for sentence in sentences:
#     lstm_predict(sentence)

# print(sentences[0])
# print(y[0])

sentences = ["太差劲了", "好极了", "声音质量很差，摄像头效果也很差", "音质很好，电池不错，设计大方，还是物有所值。"]

for sentence in sentences:
    print(sentence, ' ', lstm_predict(sentence))
    

load model...
太差劲了   negative
load model...
好极了   positive
load model...
声音质量很差，摄像头效果也很差   negative
load model...
音质很好，电池不错，设计大方，还是物有所值。   positive


模型的输入是1804，但是测试集的长度小于1804