项目地址: https://github.com/Edward1Chou/SentimentAnalysis

# 读取数据

In [4]:
import pandas as pd
import numpy as np
import jieba

# 从csv中载入数据
def load_data(data_dir='data'):
    """Load the three sentiment CSV files and build the matching labels.

    Reads ``pos.csv``, ``neutral.csv`` and ``neg.csv`` (all without a header
    row) from *data_dir* and takes column 0 of each file as the text.

    Args:
        data_dir: Directory that holds the three CSV files. Defaults to
            ``'data'``, the location that used to be hard-coded.

    Returns:
        A ``(combined, y)`` tuple. ``combined`` is a 1-D array holding the
        positive, neutral and negative texts, in that order. ``y`` is an
        integer array of the same length with label ``1`` for positive,
        ``0`` for neutral and ``-1`` for negative rows.
    """
    neg = pd.read_csv(f'{data_dir}/neg.csv', header=None, index_col=None)
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
    # `on_bad_lines='warn'` is its replacement: malformed rows are dropped and
    # a warning is emitted for each of them (requires pandas >= 1.3).
    pos = pd.read_csv(f'{data_dir}/pos.csv', header=None, index_col=None,
                      on_bad_lines='warn')
    neu = pd.read_csv(f'{data_dir}/neutral.csv', header=None, index_col=None)

    # Concatenate the text column of the three frames; the result length is
    # the sum of the three row counts.
    combined = np.concatenate((pos[0], neu[0], neg[0]))
    # One integer class label per row (not a one-hot encoding), laid out in
    # the same pos / neutral / neg order as `combined`.
    y = np.concatenate((
        np.ones(len(pos), dtype=int),
        np.zeros(len(neu), dtype=int),
        np.full(len(neg), -1, dtype=int),
    ))
    return combined, y

In [5]:
# 通过结巴分词 将句子进行分词
def tokenizer(text):
    """Segment every document in *text* with jieba.

    Newline characters are removed from each document before it is handed
    to ``jieba.lcut``.

    Args:
        text: Iterable of document strings.

    Returns:
        A list containing one ``jieba.lcut`` result per document, in input
        order.
    """
    segmented = []
    for document in text:
        single_line = document.replace('\n', '')
        segmented.append(jieba.lcut(single_line))
    return segmented

In [97]:
# 测试函数
# combined, y = load_data()
# combined = tokenizer(combined)
# print(combined[0])

In [98]:
# 创建字典
from gensim.corpora.dictionary import Dictionary
from gensim.models.word2vec import Word2Vec
import multiprocessing

# Dimensionality of each word vector (passed to Word2Vec as `vector_size`)
vocab_dim = 100
# Minimum word frequency threshold (passed to Word2Vec as `min_count`)
n_exposures = 10
# Context window size (passed to Word2Vec as `window`)
window_size = 7
# Number of training epochs (passed to `model.train` as `epochs`)
n_iterations = 10
# Worker count for Word2Vec (passed as `workers`); taken from the CPU count
cpu_count = multiprocessing.cpu_count()

def create_dictionaries(model: "Word2Vec" = None, combined=None):
    """Build word->index / word->vector lookups and index-encode a corpus.

    Args:
        model: A trained gensim ``Word2Vec`` model.
        combined: Tokenized corpus, i.e. an iterable of sentences where each
            sentence is an iterable of words.

    Returns:
        ``None`` (after printing a message) when either argument is missing.
        Otherwise a ``(word2index, word2vector, combined)`` tuple:

        * ``word2index`` maps each word in the model vocabulary to an index
          starting at 1,
        * ``word2vector`` maps each of those words to its vector from
          ``model.wv``,
        * ``combined`` is the corpus with every word replaced by its index;
          words missing from ``word2index`` are encoded as ``0``.
    """
    # Identity checks: `== None` on a NumPy array compares element-wise and
    # then fails with an ambiguous-truth-value error inside the `if`.
    if model is None or combined is None:
        print("没有提供数据")
        return None

    gensim_dict = Dictionary()
    # gensim 4 removed the `vocab` attribute; `index_to_key` lists the words.
    # `allow_update=True` makes doc2bow register the words in the dictionary.
    gensim_dict.doc2bow(model.wv.index_to_key, allow_update=True)

    # Shift every dictionary id by one so that 0 stays free for unknown words.
    word2index = {word: idx + 1 for idx, word in gensim_dict.items()}
    word2vector = {word: model.wv[word] for word in word2index}

    def parse_dataset(sentences):
        """Replace each word by its index, using 0 for words not in the map."""
        return [
            [word2index.get(word, 0) for word in sentence]
            for sentence in sentences
        ]

    return word2index, word2vector, parse_dataset(combined)
    
def word2vec_train(combined, model_path='./model/Word2Vec_model.pkl'):
    """Train a Word2Vec model on a tokenized corpus and build its lookups.

    Args:
        combined: Tokenized corpus (a collection of sentences, each a list of
            words). It is iterated more than once, so it must not be a
            one-shot generator.
        model_path: Where the trained model is saved. Defaults to the path
            that used to be hard-coded.

    Returns:
        The ``(index_dict, word_vectors, combined)`` tuple produced by
        ``create_dictionaries``: word -> index, word -> vector, and the
        corpus with each word replaced by its index.
    """
    import os  # local import: the file's top-level imports do not include os

    # Create the target directory before training so that a missing folder
    # does not make `model.save` fail after the whole training run.
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    # Configure the model from the module-level hyper-parameters.
    model = Word2Vec(vector_size=vocab_dim,
                     min_count=n_exposures,
                     window=window_size,
                     workers=cpu_count)
    # Build the vocabulary, then train on the same corpus.
    model.build_vocab(combined)
    model.train(combined, total_examples=model.corpus_count, epochs=n_iterations)
    model.save(model_path)

    # Derive the lookup tables and the index-encoded corpus from the model.
    index_dict, word_vectors, encoded = create_dictionaries(model=model, combined=combined)

    return index_dict, word_vectors, encoded

In [100]:
# 测试函数
# word2vec_train(combined)

# model = Word2Vec.load('./model/Word2Vec_model.pkl')
# print(model.wv.index_to_key)
# create_dictionaries(model, combined)

In [3]:
# 通过pytorch 建立模型
# https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html


# lstm使用 https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#torch.nn.LSTM
import torch
from torch import nn

# Placeholder network: an empty nn.Sequential container; no layers are
# passed to it here.
model = nn.Sequential(
    
)