In [1]:
# --*-- encoding:utf-8 --*--
import pandas as pd
import numpy as np
import jieba
import re

In [2]:
# Load the negative and positive review corpora from their spreadsheets.
def loadfile(neg_path='./neg1.xls', pos_path='./pos1.xls'):
    """Read both review spreadsheets and return their first columns.

    Parameters
    ----------
    neg_path : str
        Path of the Excel file holding the negative samples.
    pos_path : str
        Path of the Excel file holding the positive samples.

    Returns
    -------
    (neg, pos) : tuple of numpy.ndarray
        Column 0 of each sheet, in file order, negative samples first.
    """
    # The former `index=None` keyword was dropped: it is not a `read_excel`
    # parameter, and pandas versions that no longer accept arbitrary keyword
    # arguments reject it with a TypeError.
    neg = pd.read_excel(neg_path, header=None)
    pos = pd.read_excel(pos_path, header=None)
    # Only the first column (label 0, because header=None) is used.
    return np.array(neg[0]), np.array(pos[0])

In [3]:
# Load the negative and positive samples into the notebook namespace.
neg, pos = loadfile()

In [4]:
# Build the stop-word set.
def getstopword(stopwordPath):
    """Collect the whitespace-stripped lines of ``stopwordPath`` into a set.

    ``stopwordPath`` is iterated line by line (the caller passes an open
    file object); duplicates collapse because the result is a set.
    """
    return {entry.strip() for entry in stopwordPath}

In [5]:
# Segment each sentence with jieba and remove stop words and numeric tokens.
def wordsege(text, stopword_path='./stopwords(ch).txt'):
    """Tokenise every document in ``text`` and filter the tokens.

    Parameters
    ----------
    text : iterable of str
        Raw sentences; each one is stripped and passed to ``jieba.cut``.
    stopword_path : str
        File with one stop word per line.

    Returns
    -------
    list of list of str
        One token list per document that still has tokens after filtering.
        Documents whose token list ends up empty are omitted, so the result
        can be shorter than ``text``.
    """
    # `with` closes the handle even if reading the stop words raises.
    # NOTE(review): the file is opened with the platform default encoding, as
    # before; if the stop-word file is UTF-8 and this runs where the default
    # differs, an explicit encoding is needed — confirm against the data file.
    with open(stopword_path, 'r') as stopword_file:
        stoplist = getstopword(stopword_file)

    # Compiled once instead of per token. `match` anchors only at the start,
    # so any token that *begins* with an optionally signed number is dropped.
    number_prefix = re.compile(r'-?\d+\.?\d*')

    text_list = []
    for document in text:
        fenci = [
            item
            for item in jieba.cut(document.strip())
            if item not in stoplist
            and number_prefix.match(item) is None
            and len(item.strip()) > 0
        ]
        # Skip documents with no surviving tokens; callers derive the labels
        # from the length of the returned list, so labels stay aligned.
        if fenci:
            text_list.append(fenci)
    return text_list

In [6]:
def tokenizer(neg, pos):
    """Segment both corpora and build the combined samples plus labels.

    Parameters
    ----------
    neg, pos : iterable of str
        Raw negative / positive sentences.

    Returns
    -------
    combined : numpy.ndarray
        1-D object array of token lists, positive documents first, then
        negative documents.
    y : numpy.ndarray of int
        Labels aligned with ``combined``: 1 for positive, 0 for negative.
    """
    neg_sege = wordsege(neg)
    pos_sege = wordsege(pos)

    # Fill an object array element by element instead of calling
    # np.concatenate on lists of lists: token lists have different lengths,
    # and implicit ragged-to-array conversion raises on recent NumPy (and
    # would silently yield a 2-D array if all lengths happened to match).
    documents = pos_sege + neg_sege
    combined = np.empty(len(documents), dtype=object)
    for idx, tokens in enumerate(documents):
        combined[idx] = tokens

    # Labels are generated from the segmented lengths, so documents dropped
    # by wordsege do not shift the alignment.
    y = np.concatenate((np.ones(len(pos_sege), dtype=int), np.zeros(len(neg_sege), dtype=int)))
    return combined, y

In [7]:
# Segment both corpora; `combined` holds the token lists and `y` the labels.
combined,y = tokenizer(neg, pos)

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\chenx\AppData\Local\Temp\jieba.cache
Loading model cost 1.945 seconds.
Prefix dict has been built succesfully.


In [8]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function
# lives in `sklearn.model_selection`. Fall back to the old module only on
# installations that predate `model_selection`.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(combined, y, test_size=0.3, random_state=42)



In [9]:
from gensim.models import Word2Vec

# Train 128-dimensional word vectors on every segmented document.
# gensim 4 renamed the `size` keyword to `vector_size`; an unknown keyword
# raises TypeError, so try the current name first and fall back to the
# legacy one on older gensim.
try:
    model = Word2Vec(combined, vector_size=128, window=5, min_count=5, workers=4)
except TypeError:
    model = Word2Vec(combined, size=128, window=5, min_count=5, workers=4)



In [10]:
# vec_size is the dimensionality of the word vectors themselves.
def transform_to_matrix(x, padding_size=256, vec_size=128, w2v=None):
    """Turn token lists into fixed-size matrices of word vectors.

    Parameters
    ----------
    x : iterable of list of str
        Tokenised sentences.
    padding_size : int
        Number of rows per sentence; longer sentences are truncated and
        shorter ones are padded with zero rows.
    vec_size : int
        Length of the zero row used for padding and unknown words; it should
        equal the dimensionality of the word vectors.
    w2v : optional
        Word-vector source. Either an object exposing the vectors through a
        ``wv`` attribute or any mapping from token to a vector with
        ``tolist()``. Defaults to the notebook-level ``model``.

    Returns
    -------
    list
        One ``padding_size`` x ``vec_size`` nested list per sentence.
    """
    if w2v is None:
        w2v = model
    # Look vectors up through `.wv` when it exists: indexing the model object
    # directly is not supported by current gensim, and the previous bare
    # `except:` turned that error into silent all-zero output.
    vectors = getattr(w2v, 'wv', w2v)

    res = []
    for sen in x:
        matrix = []
        for i in range(padding_size):
            row = None
            # Positions past the end of the sentence are padding.
            if i < len(sen):
                try:
                    row = vectors[sen[i]].tolist()
                except KeyError:
                    # Word is not in the vocabulary: use a zero row instead.
                    row = None
            if row is None:
                row = [0] * vec_size
            matrix.append(row)
        res.append(matrix)
    return res

In [11]:
# Replace each token list with its matrix of word vectors.
# NOTE(review): x_train / x_test are rebound in place, so this cell must not
# be re-run without first re-running the train/test split.
x_train = transform_to_matrix(x_train)
x_test = transform_to_matrix(x_test)

  


In [13]:
# Convert to NumPy arrays for easier handling
x_train = np.array(x_train)
x_test = np.array(x_test)

# Check the array shapes
print(x_train.shape)
print(x_test.shape)

(5467, 256, 128)
(2344, 256, 128)


In [24]:
# Echo x_train as the cell output for a visual check of the values.
x_train

array([[[-0.73754823, -0.67077446,  0.52392203, ..., -0.29133576,
          0.53854764, -0.32955688],
        [-0.49326193, -0.37904742,  0.36180976, ...,  0.46447107,
          0.00639839,  0.29083756],
        [-0.83388954,  0.23402041,  0.43273127, ...,  0.14682986,
          0.11302818,  0.83645195],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]],

       [[-0.48475784,  0.8911072 , -0.70936328, ...,  0.17108437,
         -0.63081181, -1.88813722],
        [-0.12395851,  0.31090653,  0.05899351, ...,  0.07256895,
          0.03908725,  0.11025846],
        [-0.30392766, -0.04698357,  0.26532772, ...,  0.72936624,
         -0.09928361, -0.07671563],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  

In [14]:
# Convert to NumPy arrays for easier handling
y_train = np.array(y_train)
y_test = np.array(y_test)

# Check the array shapes
print(y_train.shape)
print(y_test.shape)

(5467,)
(2344,)


In [15]:
# Persist the training features to disk.
np.save('./x1_train.npy', x_train)

In [16]:
# Reload the training features from the saved .npy file.
x_train = np.load('./x1_train.npy')

In [17]:
# Confirm the reloaded array's shape.
print(x_train.shape)

(5467, 256, 128)


In [18]:
# Persist the test features to disk.
np.save('./x1_test.npy', x_test)

In [19]:
# Reload the test features from the saved .npy file.
x_test = np.load('./x1_test.npy')

In [20]:
# Confirm the reloaded array's shape.
print(x_test.shape)

(2344, 256, 128)


In [38]:
# Insert singleton axes after the sample axis and at the end, giving
# (samples, 1, padding_size, vec_size, 1).
# The ndim guard makes the cell safe to re-run: reshaping an array that has
# already been expanded would read the wrong axes and raise a ValueError.
if x_train.ndim == 3:
    x_train = x_train.reshape(x_train.shape[0], 1, x_train.shape[1], x_train.shape[2], 1)
if x_test.ndim == 3:
    x_test = x_test.reshape(x_test.shape[0], 1, x_test.shape[1], x_test.shape[2], 1)
# Printing x_test shows the difference from before: extra levels of bracket nesting.
print(x_train.shape)
print(x_test.shape)

(5467, 1, 256, 128, 1)
(2344, 1, 256, 128, 1)


In [21]:
# Persist the training labels to disk.
np.save('./y1_train.npy', y_train)

In [22]:
# Persist the test labels to disk.
np.save('./y1_test.npy', y_test)