# 数据预处理

In [70]:
import pandas as pd
import os
import glob
import os
import time
import jieba
import jieba.posseg as pseg
import jieba.analyse
import numpy as np
import torch

## 读取数据并预处理

In [71]:
# use glob to get all the csv files 
# in the folder
path = r"./data/source_data/"
csv_files = glob.glob(os.path.join(path, "*.csv"))
df_all = pd.DataFrame({"弹幕":[],"类别":[]})
print(df_all.columns)
df = 0
for f in csv_files:
    df = pd.read_csv(f) 
    df = df.iloc[:,-2:]
    df.columns = ["弹幕", "类别"]   
    df_all = pd.concat([df_all, df],axis=0)
    # print(df_all["弹幕"][0])  

df_all.dropna(inplace=True) 
df_all["类别"] = df_all["类别"].apply(lambda x: int(x))
df_all.drop_duplicates(subset=["弹幕","类别"],keep='first',inplace=True)
df_all

Index(['弹幕', '类别'], dtype='object')


Unnamed: 0,弹幕,类别
0,你代言都和他接同一家？？？？,2
1,《我 很 内 向》,1
2,不管是不是笋 秀芬看到即爽到,0
3,你好爱他,0
5,西蒙他老婆的战歌,0
...,...,...
1190,啊啊啊鹿晗,0
1192,啊啊啊啊啊啊啊啊，鹿鹿子,3
1194,就尼玛离谱，鹿晗怎么能这么好看,3
1195,完颜团不是吹的,2


In [72]:
# labels = ["高兴","难过","愤怒","惊讶","负样本"]
# for i in range(len(labels)):
#     df_all["类别"][df_all["类别"]==i] = labels[i]
# df_all

## 分词

In [73]:
"""
    加载本地字典：
    【1】自定义字典
    【2】停用词字典
"""
local_dic_name = './data/userdict.txt'
local_stopwords_name = './data/stopwords_dic.txt'
jieba.load_userdict(local_dic_name)
jieba.load_userdict(local_stopwords_name)

In [74]:
dataset_x = []
dataset_y = []
for i in range(len(df_all)):
    str_t = str(df_all.iloc[i]["弹幕"])
    label = int(df_all.iloc[i]["类别"])
    word_list = jieba.lcut(str_t)
    dataset_x.append(word_list)
    dataset_y.append(label)

In [75]:
dataset_x[:5],dataset_y[:5]

([['你', '代言', '都', '和', '他', '接同', '一家', '？', '？', '？', '？'],
  ['《', '我', ' ', ' ', ' ', '很', ' ', ' ', ' ', '内', ' ', ' ', ' ', '向', '》'],
  ['不管', '是不是', '笋', ' ', '秀芬', '看到', '即爽', '到'],
  ['你好', '爱', '他'],
  ['西蒙', '他', '老婆', '的', '战歌']],
 [2, 1, 0, 0, 0])

## 去除停用词(暂时不使用)

In [76]:
# """
#     函数功能：创建停用词list
#     参数：
#         filepath：停用词典地址
#     返回：
#          停用词list
# """
# def stopwordslist(local_stopwords_name):
#     stopwords = [line.strip() for line in open(local_stopwords_name, 'r', encoding='utf-8').readlines()]
#     return stopwords

# stopwords = stopwordslist(local_stopwords_name)

# def word_filter(result):
#     stopwords = stopwordslist(local_stopwords_name)
#     body = ''
#     for w in result:
#         if w.flag != 'x' and w.flag != 'r' and w.flag != 'ul' \
#                 and w.flag != 'uj' and w.flag != 'y' and w.flag != 'q'\
#                 and w.flag != 'd' and w.flag != 'm' and w.flag != 'eng':
#             if w.word not in stopwords:
#                 body += w.word + '\n'
#     # 提取关键词
#     tag = jieba.analyse.extract_tags(body, 5)
#     # print(tag)
#     # 生成关键词比重词典
#     # key = jieba.analyse.textrank(body, topK=100, withWeight=True)
#     # keywords = dict()
#     # for i in key:
#     #    keywords[i[0]] = i[1]
#     # print(keywords)
#     return body


# for i in range(len(dataset)):
#     dataset[i][0] = word_filter(dataset[i][0])

# dataset[:5]

## 根据词频OneHot编码（暂时不用）

In [77]:
# # 将原始训练和测试文本转化为特征向量
# from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
# # count_vec=CountVectorizer() #创建词袋数据结构
# count_vec=TfidfVectorizer() #根据词频-逆文档频率
# dataset_count_x = count_vec.fit_transform(dataset_x) 
# dataset_count_x= dataset_count_x.toarray()

## Word2Vec

In [78]:
from gensim.models.word2vec import Word2Vec
# 训练word to vector 的 word embedding
model = Word2Vec(dataset_x,     # 上文处理过的全部语料
                 min_count=1,  # 词频阈值 词出现的频率 小于这个频率的词 将不予保存
                 workers=12, # worker是线程数
                 window=5  # 窗口大小 表示当前词与预测词在一个句子中的最大距离是多少
                 )
model.save('./models/Word2vec_v1')  # 保存模型

In [79]:
import gensim
model = gensim.models.Word2Vec.load("./models/Word2vec_v1")#加载

In [80]:
model.wv.get_vector("狂喜")

array([-0.01764946, -0.00614291, -0.27418613, -0.2186066 ,  0.06597638,
       -0.19655891,  0.01743046,  0.20372109,  0.31656262,  0.03259294,
        0.00404844, -0.13648203, -0.12413061,  0.18599075,  0.02434096,
       -0.00740953, -0.17991793, -0.00444531,  0.1427604 ,  0.1395671 ,
        0.04938675, -0.05333138,  0.06338228,  0.14239158, -0.02861923,
        0.06134237,  0.14270352,  0.08790725, -0.03099502,  0.09653117,
        0.05480843, -0.1184715 ,  0.00252008,  0.02443198, -0.16614774,
       -0.0884072 , -0.24593821, -0.05610939,  0.03413791,  0.02590739,
        0.09474245, -0.1458126 ,  0.01417803,  0.12829913,  0.01174895,
        0.12766293,  0.23352446,  0.12166523, -0.06767468,  0.11412152,
        0.09006815, -0.08509684,  0.17986457, -0.11777854, -0.11041456,
       -0.08263613,  0.17084052, -0.14683399, -0.02112032, -0.08350345,
       -0.16239052,  0.06931621,  0.22068495, -0.1578224 , -0.16823825,
        0.1736363 ,  0.04730841,  0.06706356,  0.02345077,  0.09

In [81]:
dataset_vector_x = []
for sentence in dataset_x:
    vector_x = []
    for word in sentence:
        vector_x.append(model.wv.get_vector(word))
    dataset_vector_x.append(vector_x)

In [82]:
len(dataset_vector_x[0]),len(dataset_vector_x[1])

(11, 15)

## 标签OneHot编码

In [83]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
dataset_y = np.array(dataset_y).reshape(-1,1)
enc.fit(dataset_y)
dataset_onehot_y = enc.transform(dataset_y).toarray()
dataset_onehot_y.shape

(48908, 5)

## 保存数据

In [85]:
torch.save(dataset_vector_x,"dataset/dataset_x")
torch.save(dataset_onehot_y,"dataset/dataset_y")