项目地址: https://github.com/Edward1Chou/SentimentAnalysis

# 读取数据

In [2]:
import pandas as pd
import numpy as np
import jieba

# 从csv中载入数据
def load_data():
    # 通过pandas来处理csv文件中的数据
    neg = pd.read_csv('data/neg.csv', header=None, index_col=None)
    # 设置error_bad_lines=Flase 那么将丢弃读取错误的行
    pos = pd.read_csv('data/pos.csv', header=None, index_col=None, error_bad_lines=False)
    neu = pd.read_csv('data/neutral.csv', header=None, index_col=None)
    
    # 将上面3组数据合并在1个数组中
    # combined长度=上述三者之和
    combined = np.concatenate((pos[0], neu[0], neg[0]))
    # 生成数据集对应的one-hot编码
    y = np.concatenate((2*np.ones(len(pos), dtype=int), 
                       np.ones(len(neu), dtype=int), 
                       np.zeros(len(neg), dtype=int)))
    return combined, y

In [3]:
# 通过结巴分词 将句子进行分词
def tokenizer(text):
    text = [jieba.lcut(document.replace('\n', '')) for document in text]
    return text

In [4]:
# 测试函数
# combined, y = load_data()
# combined = tokenizer(combined)
# print(combined[0])

In [5]:
# 创建字典
from gensim.corpora.dictionary import Dictionary
from gensim.models.word2vec import Word2Vec
import multiprocessing

# 词汇数量
vocab_dim = 100
# 设置出现频率的最低点
n_exposures = 10
# 根据上下文判断的窗口大小
window_size = 7
# 迭代次数
n_iterations = 10
# 工作类型
cpu_count = multiprocessing.cpu_count()

def create_dictionaries(model:Word2Vec=None, combined=None):
    if (model==None) or (combined==None):
        print("没有提供数据")
        return
    
    gensim_dict = Dictionary()
    # document->bag of words(bow)
    # gensim删除了vocab属性 应该使用index_to_key
    # allow_update 通过添加新的单词更新内部corpora的统计数据
    gensim_dict.doc2bow(model.wv.index_to_key, allow_update=True)
    
    # 建立单词与词的转移表
    word2index = {v:k+1 for k, v in gensim_dict.items()}
    # 建立单词与向量的转移表
    word2vector = {word:model.wv[word] for word in word2index.keys()}
    
    # 将combined中的单词转换成对应的index
    def parse_dataset(combined):
        data = []
        for sentence in combined:
            new_text = []
            for word in sentence:
                try:
                    new_text.append(word2index[word])
                except:
                    new_text.append(0)
            data.append(new_text)
        return data
        
    combined = parse_dataset(combined)
    
    return word2index, word2vector, combined
    
# input: combined 是分词的集合
# ouput: index_dict word->index
# word_vectors word->vectors
# combined 记录每个句子中单词的index值
def word2vec_train(combined):
    # 设置模型的训练参数
    model = Word2Vec(vector_size=vocab_dim,
                      min_count = n_exposures,
                      window=window_size,
                      workers=cpu_count)
    # 训练模型
    model.build_vocab(combined)
    model.train(combined, total_examples=model.corpus_count, epochs=n_iterations)
    # 保存模型
    model.save('./model/Word2Vec_model.pkl')
    # 通过创建映射关系
    index_dict, word_vectors, combined = create_dictionaries(model=model, combined=combined)
    
    return index_dict, word_vectors, combined

In [6]:
# 测试函数
# word2vec_train(combined)

combined, y = load_data()
combined = tokenizer(combined)
model = Word2Vec.load('./model/Word2Vec_model.pkl')
# print(model.wv.index_to_key)
index_dict, word_vectors, combined = create_dictionaries(model, combined)



  pos = pd.read_csv('data/pos.csv', header=None, index_col=None, error_bad_lines=False)
Skipping line 2607: expected 1 fields, saw 9
Skipping line 3143: expected 1 fields, saw 2
Skipping line 3173: expected 1 fields, saw 8

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Fucloud\AppData\Local\Temp\jieba.cache
Loading model cost 0.449 seconds.
Prefix dict has been built successfully.


In [7]:
# 通过pytorch 建立模型
# https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html


# lstm使用 https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#torch.nn.LSTM
import torch
import torch.utils.data as data
from torch.nn.utils.rnn import pad_sequence

def get_data(index_dict, word_vectors, combined, y):
    # 所有单词的索引数量
    n_symbols = len(index_dict) + 1
    
    # 初始化词向量
    embedding_weights = torch.zeros((n_symbols, vocab_dim))
    for word, index in index_dict.items():
        embedding_weights[index, :] = torch.from_numpy(word_vectors[word])
    
    # 将数据集转换为tensor形式
    combined = pad_sequence([torch.tensor(doc, dtype=torch.long) for doc in combined], batch_first=True)
    y = torch.tensor(y, dtype=torch.long)
    
    # 将数据集拆分成train/valid
    test_rate = 0.2
    test_size = int(len(combined) * test_rate)
    train_size = len(combined) - test_size
    datasets = data.random_split(data.TensorDataset(combined, y), [train_size, test_size])
    train_ds = datasets[0]
    valid_ds = datasets[1]
    return n_symbols, embedding_weights, train_ds, valid_ds



In [8]:
import torch.nn as nn
import torch.nn.functional as F

# 建立一个模型
class AirModel(nn.Module):
    def __init__(self):
        super().__init__()
        # 关于batch_first使用: https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#torch.nn.LSTM
        # 这里是根据第一个维度来进行批量操作
        self.embbed = nn.Embedding(n_symbols, vocab_dim).from_pretrained(embedding_weights)
        self.lstm = nn.LSTM(input_size=vocab_dim*1804,hidden_size=50, num_layers=1, batch_first=True)
        self.lin1 = nn.Linear(50, 50)
        self.lin2 = nn.Linear(50, 50)
        self.lin3 = nn.Linear(50, 3)
    def forward(self, x):
        x = self.embbed(x).view(32, -1)
        x, _ = self.lstm(x)
        x = F.relu(self.lin1(x))
        x = F.relu(self.lin2(x))
        x = self.lin3(x)
        # softmax + log
        x = F.softmax(x, dim=1)
        return x

In [None]:
import torch.optim as optim

def train_lstm(n_symbols, embedding_weights, train_dataset, valid_dataset):
    print('Defining a Simple Torch Model...')
    
    # https://pytorch.org/tutorials/beginner/basics/buildmodel_tutorial.html
#     model = nn.Sequential(
#             nn.Embedding(n_symbols, vocab_dim).from_pretrained(embedding_weights),
#             nn.LSTM(vocab_dim, 50, batch_first=True),
#             nn.Linear(50, 50),
#             nn.Linear(50, 50),
#             nn.Linear(50, 50),
#             nn.Softmax(3),
#     )
    model = AirModel()

    optimizer = optim.Adam(model.parameters())
    loss_func = nn.CrossEntropyLoss()
    loader = data.DataLoader(train_dataset, shuffle=True, batch_size=32)
    
    epochs = 200
    for epoch in range(epochs):
        model.train()
            
        
        for X_batch, y_batch in loader:
            y_pred = model(X_batch)
            loss = loss_func(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            
        
        if epoch % 100 == 0:
            model.eval()
            with torch.no_grad:
                valid_loader = data.DataLoader(valid_dataset, shuffle=True, batch_size=32)
                losses = [loss(model(x_valid), y_valid) for x_valid, y_valid in valid_loader ]
                loss = np.mean(losses)
                print(epoch, loss)
            print(epoch)
            

# 可以确定一件事 index_dict中没有0

n_symbols, embedding_weights, train_ds, valid_ds = get_data(index_dict, word_vectors, combined, y)

train_lstm(n_symbols, embedding_weights, train_ds, valid_ds)


# embedding = nn.Embedding(num_embeddings=n_symbols, 
#                          embedding_dim=vocab_dim).from_pretrained(embedding_weight)
# print(embedding.weight.shape)
            
            
    

Defining a Simple Torch Model...
32
