In [2]:
import numpy as np
import pickle
from collections import defaultdict
import sys,re 
import pandas as pd

In [8]:
#加载数据分成10份
def build_data_cv(data_folder,cv=10,clean_string=True):
    revs = []
    pos_file = data_folder[0]
    neg_file = data_folder[1]
    vocab = defaultdict(float)
    with open(pos_file,"rb") as f:
        for line in f:
            rev = []
            rev.append(line.strip())
            if clean_string:
                orig_rev = clean_str(" ".join('%s'%id for id in rev))
            else:
                orig_rev = " ".join(rev).lower()
            words = set(orig_rev.split())
            for word in words:
                vocab[word] += 1
            datum = {"y":1,"text":orig_rev,"num_words":len(orig_rev.split()),"split":np.random.randint(0,cv)}
            revs.append(datum)
    with open(neg_file,"rb") as f:
        for line in f:
            rev = []
            rev.append(line.strip())
            if clean_string:
                orig_rev = clean_str(" ".join('%s'%id for id in rev))
            else:
                orig_rev = " ".join(rev).lower()
            words = set(orig_rev.split())
            for word in words:
                vocab[word] += 1
            datum = {"y":0,"text":orig_rev,"num_words":len(orig_rev.split()),"split":np.random.randint(0,cv)}
            revs.append(datum)
    return revs,vocab

#获得词向量矩阵
def get_W(word_vecs,k=300):
    vocab_size = len(word_vecs)
    word_idx_map = dict()
    W = np.zeros(shape=(vocab_size+1,k),dtype='float32')
    W[0] = np.zeros(k,dtype='float32')
    i = 1
    for word in word_vecs:
        W[i] = word_vecs[word]
        word_idx_map[word] = i
        i += 1
    return W,word_idx_map

#word2vec 加载300维词向量
def load_bin_vec(fname,vocab):
    word_vecs = {}
    with open(fname,"rb",encoding='utf-8') as f:
        header = f.readline()
        vocab_size,layer1_size = map(int,header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        for line in xrange(vocab_size):
            word = []
            while True:
                ch = f.read(1)
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            if word in vocab:
                word_vecs[word] = np.fromstring(f.read(binary_len),dtype='float32')
            else:
                f.read(binary_len)
    return word_vecs

#未知词的词向量
def add_unknown_words(word_vecs,vocab,min_df=1,k=300):
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25,0.25,k)
            
def clean_str(string,TREC=False):
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)     
    string = re.sub(r"\'s", " \'s", string) 
    string = re.sub(r"\'ve", " \'ve", string) 
    string = re.sub(r"n\'t", " n\'t", string) 
    string = re.sub(r"\'re", " \'re", string) 
    string = re.sub(r"\'d", " \'d", string) 
    string = re.sub(r"\'ll", " \'ll", string) 
    string = re.sub(r",", " , ", string) 
    string = re.sub(r"!", " ! ", string) 
    string = re.sub(r"\(", " \( ", string) 
    string = re.sub(r"\)", " \) ", string) 
    string = re.sub(r"\?", " \? ", string) 
    string = re.sub(r"\s{2,}", " ", string)    
    return string.strip() if TREC else string.strip().lower()

def clean_str_sst(string):
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)   
    string = re.sub(r"\s{2,}", " ", string)    
    return string.strip().lower()

if __name__=="__main__":
    w2v_file = sys.argv[1]
    data_folder = ["rt-polarity.pos","rt-polarity.neg"]
    print("loading data....")
    revs, vocab = build_data_cv(data_folder, cv=10, clean_string=True)
    max_l = np.max(pd.DataFrame(revs)["num_words"])
    print("data laoded!")
    print("number of sentences:",str(len(revs)))
    print("vocab size:",str(len(vocab)))
    print("max sentence length:",str(max_l))
    print("loading word2vec vectors...")
    w2v = load_bin_vec(w2v_file,vocab)
    print("word2vec loaded!")
    print("num words already in word2vec:",str(len(w2v)))
    add_unknown_words(w2v,vocab)
    W,word_idx_map = get_W(w2v)
    rand_vecs = {}
    add_unknown_words(rand_vecs,vocab)
    W2,_ = get_W(rand_vecs)
    pickle.dump([revs,W,W2,word_idx_map,vocab],open("mr.p","wb"))
    print("dataset created!")

loading data....


ValueError: binary mode doesn't take an encoding argument