# 数据处理模块

## 目录

* 数据集加载
* 构建word2id并去除低频词
* 构建共现矩阵
* 生成训练集
* 保存结果

In [1]:
from torch.utils import data
import os
import numpy as np
import pickle

In [2]:
min_count = 50

In [3]:
# 数据集加载
data = open("./data/text8.txt").read()
data = data.split()
# 构建word2id并去除低频词
word2freq = {}
for word in data:
    if word2freq.get(word)!=None:
        word2freq[word] += 1
    else:
        word2freq[word] = 1
word2id = {}
for word in word2freq:
    if word2freq[word]<min_count:
        continue
    else:
        if word2id.get(word)==None:
            word2id[word]=len(word2id)
print (len(word2id))
word2id

18497


{'stiff': 1248,
 'luc': 11715,
 'dualism': 10442,
 'hijacking': 8086,
 'scholarship': 10443,
 'sur': 8087,
 'motors': 10445,
 'prescribed': 16480,
 'daoud': 2425,
 'biographer': 315,
 'mars': 7364,
 'indication': 1530,
 'garments': 12947,
 'criterion': 11718,
 'rubin': 5875,
 'pill': 9178,
 'inflict': 9179,
 'abbreviated': 18460,
 'islamist': 10444,
 'mcgill': 15253,
 'associates': 1249,
 'sensitivity': 17971,
 'realities': 16345,
 'fairies': 2386,
 'large': 3628,
 'fletcher': 17290,
 'oh': 15254,
 'resort': 9180,
 'historiography': 9181,
 'playing': 12949,
 'bowls': 6997,
 'jeff': 14144,
 'unpublished': 15665,
 'crow': 4744,
 'dialects': 1250,
 'wycliffe': 1650,
 'genghis': 0,
 'responded': 10448,
 'elephants': 18,
 'builder': 1251,
 'facts': 9198,
 'mere': 1252,
 'guys': 4746,
 'asia': 12950,
 'histories': 8089,
 'skeletons': 1,
 'motivated': 3373,
 'queens': 4747,
 'twenties': 14704,
 'fanny': 13462,
 'wilkes': 4802,
 'spouse': 9202,
 'group': 1253,
 'superstar': 10450,
 'crusade': 

In [4]:
# 构建共现矩阵
vocab_size = len(word2id)
comat = np.zeros((vocab_size,vocab_size))
print(comat.shape)

(18497, 18497)


In [5]:
window_size = 2

In [6]:
for i in range(len(data)):
    if i%1000000==0:
        print (i,len(data))
    if word2id.get(data[i])==None:
        continue
    w_index = word2id[data[i]]
    for j in range(max(0,i-window_size),min(len(data),i+window_size+1)):
        if word2id.get(data[j]) == None or i==j:
            continue
        u_index = word2id[data[j]]
        comat[w_index][u_index]+=1
comat

0 17005207
1000000 17005207
2000000 17005207
3000000 17005207
4000000 17005207
5000000 17005207
6000000 17005207
7000000 17005207
8000000 17005207
9000000 17005207
10000000 17005207
11000000 17005207
12000000 17005207
13000000 17005207
14000000 17005207
15000000 17005207
16000000 17005207
17000000 17005207


array([[ 2.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  2., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  6.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0., 12.]])

In [7]:
coocs = np.transpose(np.nonzero(comat))
coocs

array([[    0,     0],
       [    0,    79],
       [    0,   462],
       ...,
       [18496, 18486],
       [18496, 18487],
       [18496, 18496]], dtype=int64)

In [9]:
# 生成训练集
labels = []
for i in range(len(coocs)):
    if i%1000000==0:
        print (i,len(coocs))
    labels.append(comat[coocs[i][0]][coocs[i][1]])
labels = np.array(labels)
print (labels.shape)

0 9190921
1000000 9190921
2000000 9190921
3000000 9190921
4000000 9190921
5000000 9190921
6000000 9190921
7000000 9190921
8000000 9190921
9000000 9190921
(9190921,)


In [10]:
labels

array([ 2.,  2.,  1., ...,  1.,  2., 12.])

In [12]:
# 保存结果
np.save("./data/data.npy",coocs)
np.save("./data/label.npy",labels)
pickle.dump(word2id,open("./data/word2id","wb"))