In [17]:
import pandas as pd
import numpy as np
# NOTE(review): absolute path tied to one machine; consider a configurable data directory.
test__file = '/home/ubuntu/wikipedia/DATA/OUTFILE/predicted_enwiki-20161201-pages-meta-history11.xml-p003046514p003201200.7z.tsv'
# The file is parsed as tab-separated values.
data = pd.read_csv(test__file, sep='\t')

# Class DATA:
The data class needs to provide the following functions:

* **`buildCorpora()`** extracts the unique words from user-specified columns

In [18]:
import string
def buildCorpora(data, text_cols):
    """Collect the set of unique words found in the given text columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text columns.
    text_cols : iterable of str
        Names of the columns to scan. An empty iterable yields an empty set.

    Returns
    -------
    set of str
        Union of the word sets returned by ``_extractUniqueWordsFromColumn``
        for every requested column.

    Raises
    ------
    AssertionError
        If any requested column is missing from ``data``.
    """
    # Materialise once so a generator argument survives both passes below.
    text_cols = list(text_cols)

    # ``assert [..]`` only checks that the list is non-empty, so the membership
    # results must be reduced explicitly for the check to mean anything.
    missing = [col for col in text_cols if col not in data.columns]
    assert not missing, 'columns not found in data: %s' % missing

    corpora = set()
    for col in text_cols:
        corpora |= _extractUniqueWordsFromColumn(data, col)
    return corpora
        
def _extractUniqueWordsFromColumn(data, text_col):
    _data = data[text_col]
    unique_words = set()
    punc_table = str.maketrans({key: None for key in string.punctuation})
    
    for item in _data.items():
        text = item[1]
        if(type(text) is not str):
            continue
        
        # remove punctuations
        text = text.translate(punc_table)
        for word in text.split():
            if len(word) > 50:
                # eg. "http ... "
                pass
            elif word not in unique_words:
                unique_words.add(word)
            else:
                pass
            
    return unique_words        
        
    
    

### TEST: Will Multiprocessing Help Extracting Unique Words?
Conclusion: extracting both columns takes well under a second, so multiprocessing would not bring a noticeable speed-up.

In [8]:
%%time
# Time the single-process extraction on the two text columns.
words1 = _extractUniqueWordsFromColumn(data, 'Added')
words2 = _extractUniqueWordsFromColumn(data, 'Deleted')

CPU times: user 364 ms, sys: 0 ns, total: 364 ms
Wall time: 361 ms


### TEST: buildCorpora
Apart from a few unnecessary tokens that the new version excludes, the result matches the ground-truth corpus well.

In [9]:
# Read the reference corpus: one word per line, newline stripped, de-duplicated.
with open('test__corpus.txt') as f:
    words_groundtruth = {line.strip('\n') for line in f}

In [19]:
# Build the vocabulary from both text columns.
words = buildCorpora(data, ['Added', 'Deleted'])

In [12]:
# Ground-truth words that the new corpus lacks; the trailing ';' suppresses the cell output.
words_groundtruth.difference(words);

* **`getWordVectors()`** extracts word vectors using Facebook's fastText library (implemented below as `computeVectors()` and `loadVectors()`)

In [20]:
import os
import subprocess 
import pandas as pd
import numpy as np

def computeVectors(source_file, mode='', identifier='', ft_dir='.', bin_dir='.'):
    """Write the fastText input file and print the commands that vectorise it.

    The fastText command is only printed, never executed: the vectors are
    meant to be produced by hand on another machine and copied back, after
    which ``loadVectors`` can read ``__vectors_<identifier>.txt``.

    Parameters
    ----------
    source_file : set or pandas.Series
        A ``set`` of words when ``mode == 'word'``; a ``Series`` of
        single-line texts when ``mode == 'sentence'``.
    mode : {'word', 'sentence'}
        Selects ``print-word-vectors`` or ``print-sentence-vectors``.
    identifier : str
        Suffix used in the generated source and target file names.
    ft_dir : str
        Directory that contains the ``fastText/fasttext`` executable.
    bin_dir : str
        Directory that contains ``wiki.en.bin``.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If a directory does not exist, ``mode`` is invalid, or
        ``source_file`` has the wrong type for the mode.
    """
    assert os.path.exists(ft_dir)
    # The original line read ``assert os,path.exists(bin_dir)``; the comma made
    # the path check the assertion *message*, so bin_dir was never validated.
    assert os.path.exists(bin_dir)
    assert mode in ['word', 'sentence']

    ft_dir = os.path.join(ft_dir, 'fastText', 'fasttext')
    bin_dir = os.path.join(bin_dir, 'wiki.en.bin')
    target_dir = '__vectors_%s.txt' % identifier
    func = 'print-%s-vectors' % (mode)

    # Persist the input. Modes are compared with ``==``: identity (``is``)
    # on string literals is an implementation detail.
    if mode == 'word':
        source_dir = '__source_file_%s.txt' % identifier
        assert isinstance(source_file, set)
        with open(source_dir, 'w') as handle:
            handle.writelines("%s\n" % item for item in source_file)
    else:
        source_dir = '__source_file_%s.csv' % identifier
        assert isinstance(source_file, pd.Series)
        # header=False keeps line i of the file equal to row i of the series;
        # newer pandas versions would otherwise prepend a header line.
        # NOTE(review): CSV quoting still applies to cells containing commas or
        # quotes, and empty cells are written as "" -- confirm this is
        # acceptable input for fastText.
        source_file.to_csv(source_dir, index=False, header=False)

    # Command that computes the vectors (printed for manual execution).
    command = "sh -c \'%s %s %s < %s > %s\' " % (ft_dir,
                                                func,
                                                bin_dir,
                                                source_dir,
                                                target_dir)

    print('Copy files to UNC server')
    print('scp -i ~/alexguo.pem',
          'ubuntu@ec2-52-87-42-119.compute-1.amazonaws.com:/home/ubuntu/wikipedia/Visualization/%s' % source_dir,
          '.')
    print('\n Execute the files')
    print(command)
    print('\n Copy them back')
    print('scp -i ~/alexguo.pem',
          '%s' % target_dir,
          'ubuntu@ec2-52-87-42-119.compute-1.amazonaws.com:/home/ubuntu/%s' % target_dir)
    print('\n Excute in AWS')
    print('sudo mv ~/%s .' % target_dir)

    print('Please Use UNC Server')

    # Intentionally not run locally (previously: subprocess.call(command, shell=True)
    # followed by loading the result); call loadVectors(target file) once the
    # vectors have been copied back.
    
    
def loadVectors(file, mode, dim=300):
    """Load fastText vectors from a text file into a dictionary.

    Parameters
    ----------
    file : str
        Path of the vector file; the last ``dim`` whitespace-separated
        fields of each line are taken as the vector.
    mode : {'word', 'sentence'}
        ``'word'``: the first field of each line is the key.
        ``'sentence'``: the 0-based line number is the key.
    dim : int
        Number of vector components per line.

    Returns
    -------
    dict
        Maps key -> ``numpy.ndarray``. In sentence mode a line whose first
        component is ``nan`` or ``-nan`` (or a blank line) maps to a zero
        vector of length ``dim``.

    Raises
    ------
    AssertionError
        If ``mode`` is neither ``'word'`` nor ``'sentence'``.
    """
    assert mode in ['word', 'sentence']
    lookup_dict = {}

    with open(file) as f:
        if mode == 'word':
            for line in f:
                fields = line.split()
                if not fields:
                    # Blank line: there is no key to store.
                    continue
                lookup_dict[fields[0]] = np.array([float(v) for v in fields[-dim:]])
        else:
            for idx, line in enumerate(f):
                values = line.split()[-dim:]
                # The original tested ``in ['-nan' or 'nan']``, which is just
                # ['-nan'], so plain 'nan' rows slipped through as NaN vectors.
                # Blank lines get zeros too so line numbers stay aligned.
                if not values or values[0] in ('-nan', 'nan'):
                    lookup_dict[idx] = np.zeros([dim])
                else:
                    lookup_dict[idx] = np.array([float(v) for v in values])

    return lookup_dict

### TEST: get/loadWordVectors
It seems that the official release of `wiki.en.bin` has changed. This was confirmed by running fasttext on the command line: the word vector for 'a' is different from the ground truth.

In [21]:
%%time
# Load the reference word vectors, then write the word list to disk and print
# the manual fastText commands (computeVectors does not run fastText itself).
wordvecs_groundtruth = loadVectors('test__wordvecs.txt', mode='word')
computeVectors(words, mode='word')

Copy files to UNC server
scp -i ~/alexguo.pem ubuntu@ec2-52-87-42-119.compute-1.amazonaws.com:/home/ubuntu/wikipedia/Visualization/__source_file_.txt .

 Execute the files
sh -c './fastText/fasttext print-word-vectors ./wiki.en.bin < __source_file_.txt > __vectors_.txt' 

 Copy them back
scp -i ~/alexguo.pem __vectors_.txt ubuntu@ec2-52-87-42-119.compute-1.amazonaws.com:/home/ubuntu/__vectors_.txt

 Excute in AWS
sudo mv ~/__vectors_.txt .
Please Use UNC Server
CPU times: user 3.46 s, sys: 88 ms, total: 3.55 s
Wall time: 3.49 s


In [22]:
# NOTE(review): computeVectors(words, mode='word') names its target '__vectors_.txt'
# (empty identifier), whereas this reads '__vectors.txt' -- confirm this is not a stale file.
wordvecs = loadVectors('__vectors.txt', mode='word')

In [12]:
# Compare the first ten components of the vector for 'a' in both lookups.
print('Ground Truth:')
print(wordvecs_groundtruth.get('a')[:10])
print('\n\nPredicted')
print(wordvecs.get('a')[:10])

Ground Truth:
[ 0.11559    0.30192   -0.11465    0.01001   -0.032187  -0.10755    0.060674
 -0.10477    0.17488    0.0081116]


Predicted
[-0.02581     0.023828   -0.0094851   0.034731    0.017378    0.00047618
 -0.0075925  -0.068494    0.041394   -0.0015672 ]


* **`_computeSentenceVectorsUncombined()`** prepares the per-column sentence-vector input for Facebook's fastText library

In [23]:
def _computeSentenceVectorsUncombined(data, text_cols):
    assert [col in data.columns for col in text_cols]
    
    for col in text_cols:
        texts = data[col]
        # replace nan with ''
        texts_cleaned = texts.apply(lambda x: "" if type(x) is not str else x)
        # remove empty lines within one row
        texts_cleaned = texts_cleaned.apply(lambda x: x.replace('\n', ''))
        computeVectors(texts_cleaned, mode='sentence', identifier=col)

### TEST: _computeSentenceVectorsUncombined
Passed

In [24]:
# _computeSentenceVectorsUncombined has no return statement, so ca and cd are None.
ca = _computeSentenceVectorsUncombined(data, ['Added'])
cd = _computeSentenceVectorsUncombined(data, ['Deleted'])
# These files are the targets of the printed fastText commands; computeVectors
# does not create them, so they must already be present.
sentenceVec_a = loadVectors('__vectors_Added.txt', mode='sentence')
sentenceVec_d = loadVectors('__vectors_Deleted.txt', mode='sentence')

Copy files to UNC server
scp -i ~/alexguo.pem ubuntu@ec2-52-87-42-119.compute-1.amazonaws.com:/home/ubuntu/wikipedia/Visualization/__source_file_Added.csv .

 Execute the files
sh -c './fastText/fasttext print-sentence-vectors ./wiki.en.bin < __source_file_Added.csv > __vectors_Added.txt' 

 Copy them back
scp -i ~/alexguo.pem __vectors_Added.txt ubuntu@ec2-52-87-42-119.compute-1.amazonaws.com:/home/ubuntu/__vectors_Added.txt

 Excute in AWS
sudo mv ~/__vectors_Added.txt .
Please Use UNC Server
Copy files to UNC server
scp -i ~/alexguo.pem ubuntu@ec2-52-87-42-119.compute-1.amazonaws.com:/home/ubuntu/wikipedia/Visualization/__source_file_Deleted.csv .

 Execute the files
sh -c './fastText/fasttext print-sentence-vectors ./wiki.en.bin < __source_file_Deleted.csv > __vectors_Deleted.txt' 

 Copy them back
scp -i ~/alexguo.pem __vectors_Deleted.txt ubuntu@ec2-52-87-42-119.compute-1.amazonaws.com:/home/ubuntu/__vectors_Deleted.txt

 Excute in AWS
sudo mv ~/__vectors_Deleted.txt .
Please Use

In [55]:
# Look up the 'Deleted' entry stored under key 3; the trailing ';' suppresses the output.
sentenceVec_d.get(3);

* **`tsneDimReduction()`** dimensionality reduction using TSNE

In [25]:
from MulticoreTSNE import MulticoreTSNE as TSNE
def tsneDimReduction(data, new_dim=2, n_jobs=16):
    """Reduce ``data`` to ``new_dim`` dimensions with multicore t-SNE.

    Parameters
    ----------
    data : array-like
        Input passed unchanged to ``TSNE.fit_transform``.
    new_dim : int
        Number of output components.
    n_jobs : int
        Number of worker threads handed to ``TSNE``; defaults to the
        previously hard-coded value of 16.

    Returns
    -------
    The value returned by ``TSNE.fit_transform``; callers in this notebook
    index it as ``result[:, 0]`` and ``result[:, 1]``.
    """
    model = TSNE(n_jobs=n_jobs, n_components=new_dim)
    return model.fit_transform(data)

def to_csv(data, filedir, drop_text=True):
    """Write ``data`` to ``filedir`` as CSV and print the scp command to fetch it.

    When ``drop_text`` is exactly ``True`` the ``'text'`` column is dropped
    from a copy before writing; ``data`` itself is left untouched.
    """
    frame = data.drop('text', axis=1) if drop_text is True else data
    frame.to_csv(filedir)

    print('scp -i ~/Desktop/alexguo.pem ubuntu@ec2-52-87-42-119.compute-1.amazonaws.com:/home/ubuntu/wikipedia/Visualization/%s ~/Documents/UNC/Research'%filedir)

### TEST: tsneDimReduction
passed

In [15]:
import pandas as pd
# NOTE(review): `sentenceVec` is not defined in any cell shown here (only
# sentenceVec_a / sentenceVec_d are) -- this cell may rely on leftover kernel state.
dictDF = pd.DataFrame.from_dict(sentenceVec, orient='index')
sentenceVec_TSNE = tsneDimReduction(dictDF.values)
# The two t-SNE components are written back into `data` in place.
data['sentenceVecDim1'] = sentenceVec_TSNE[:,0]
data['sentenceVecDim2'] = sentenceVec_TSNE[:,1]
to_csv(data, filedir='test__sentenceVec.csv', drop_text=True)

In [22]:
# Same call as the last line of the previous cell; rewrites the CSV and prints the scp command.
to_csv(data, filedir='test__sentenceVec.csv', drop_text=True)

scp -i ~/Desktop/alexguo.pem ubuntu@ec2-52-87-42-119.compute-1.amazonaws.com:/home/ubuntu/wikipedia/Visualization/test__sentenceVec.csv ~/Documents/UNC/Research


* **`computeSentenceVectors()`** combines the Added and Deleted sentence vectors into cumulative per-title vectors and reduces them with t-SNE

In [None]:
def computeSentenceVectors(data, dim=300):
    """Accumulate Added-minus-Deleted sentence vectors per title and embed them.

    For every row the delta ``Added - Deleted`` is computed from the vectors
    in ``__vectors_Added.txt`` and ``__vectors_Deleted.txt``. Consecutive
    rows sharing a title are summed cumulatively in two variants:

    * plain: the first row of a title starts at its own delta;
    * centered: the first row of a title starts at the zero vector.

    Both matrices are reduced with ``tsneDimReduction`` and stored in
    ``data`` as four new columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'title'`` column; row *i* must correspond to line
        *i* of both vector files. Modified in place.
    dim : int
        Dimensionality of the sentence vectors.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the columns ``sentenceVecDim1``,
        ``sentenceVecDim2``, ``sentenceVecCenteredDimCenter1`` and
        ``sentenceVecCenteredDimCenter2`` added.

    Raises
    ------
    AssertionError
        If a vector file or the ``'title'`` column is missing.
    KeyError
        If a vector file has fewer lines than ``data`` has rows.
    """
    # The original assert wrapped the checks in a list (always truthy) and
    # tested '_vectors_*' instead of the '__vectors_*' files loaded below.
    vector_files = {col: '__vectors_%s.txt' % col for col in ('Added', 'Deleted')}
    missing = [path for path in vector_files.values() if not os.path.exists(path)]
    assert not missing, 'sentence-vector files not found: %s' % missing
    assert 'title' in data.columns

    # Forward dim so the loaded vectors match the matrices allocated below.
    sentenceVec_Added = loadVectors(vector_files['Added'], mode='sentence', dim=dim)
    sentenceVec_Deleted = loadVectors(vector_files['Deleted'], mode='sentence', dim=dim)

    n_rows = data.shape[0]
    sentenceVectors = np.zeros([n_rows, dim])
    sentenceVectorsCentered = np.zeros([n_rows, dim])

    # Rows are addressed by position (not index label): the vector files are
    # keyed by line number and the matrices are positional.
    title_prev = None
    for pos, title in enumerate(data['title']):
        delta_vector = sentenceVec_Added[pos] - sentenceVec_Deleted[pos]

        # Titles are compared by value; ``is`` only tests object identity.
        # ``pos > 0`` keeps the first row from ever reading row -1.
        if pos > 0 and title == title_prev:
            sentenceVectors[pos, :] = sentenceVectors[pos - 1, :] + delta_vector
            sentenceVectorsCentered[pos, :] = sentenceVectorsCentered[pos - 1, :] + delta_vector
        else:
            sentenceVectors[pos, :] = delta_vector
            sentenceVectorsCentered[pos, :] = np.zeros([dim])

        title_prev = title

    sentenceVectors_TSNE = tsneDimReduction(sentenceVectors)
    sentenceVectorsCentered_TSNE = tsneDimReduction(sentenceVectorsCentered)

    data['sentenceVecDim1'] = sentenceVectors_TSNE[:, 0]
    data['sentenceVecDim2'] = sentenceVectors_TSNE[:, 1]
    data['sentenceVecCenteredDimCenter1'] = sentenceVectorsCentered_TSNE[:, 0]
    data['sentenceVecCenteredDimCenter2'] = sentenceVectorsCentered_TSNE[:, 1]

    return data
            
            
            

### TEST: computeSentenceVectors

In [40]:
# computeSentenceVectors adds its columns to `data` and returns it, so newdata is the same object as data.
newdata = computeSentenceVectors(data)

In [29]:
# Write the result without the 'text' column and print the scp command to fetch it.
to_csv(newdata,'test__sentenceVec.csv', drop_text=True)

scp -i ~/Desktop/alexguo.pem ubuntu@ec2-52-87-42-119.compute-1.amazonaws.com:/home/ubuntu/wikipedia/Visualization/test__sentenceVec.csv ~/Documents/UNC/Research
