## Import the world 

In [4]:
import pandas as pd
import numpy as np
from scipy import sparse
import os
import re
import sys
import json
from tqdm import tqdm
from pathlib import Path
qlvldir = "/home/aardvark/code/typetokenQLVL/"
sys.path.append(qlvldir)

from qlvl.conf import ConfigLoader
from qlvl import Vocab, TypeTokenMatrix
from qlvl import ItemFreqHandler, ColFreqHandler, TokenHandler
from qlvl import compute_association, compute_cosine, compute_distance, compute_simrank
from qlvl.basics.mxcalc import compute_token_weights, compute_token_vectors
from qlvl.models.typetoken import build_tc_weight_matrix

### Paths

In [5]:
#coldir = "/home/semmetrix/collmtx-stefano/"
rootdir = "/home/artem/"
output_path = rootdir + "output/"
github_dir = "/home/artem/github"

### Parameter settings
Create an object conf to tune settings. During this initialization, conf has already read the default settings file (in `/home/aardvark/code/typetokenQLVL/qlvl/config.ini`) consisting default parameter settings.

In [6]:
conf = ConfigLoader()
settings = conf.settings
fnames = "{}/home/artem/artem_master/dry/j.tsv".format(rootdir)
# always print values to check before you use
settings['line-machine'] = '([^\t]+)\t([^\t]+)\t([^\t])[^\t]*'
settings['line-format'] = 'word,lemma,pos'
settings['token'] = 'lemma/pos/fid/lid'
print(settings['line-machine'])
print(settings['line-format'])
print(settings['type'], settings['colloc'], settings['token'])
settings['file-encoding'] = 'latin1'
settings['outfile-encoding'] = 'utf-8'
print(settings['file-encoding'])
print(settings['outfile-encoding'])
# output_path = settings['output-path']
settings['output-path'] = output_path
print(output_path)
settings['corpus-path'] = '/home/aardvark/corp/en/COCA'
print(settings['corpus-path'])

([^	]+)	([^	]+)	([^	])[^	]*
word,lemma,pos
lemma/pos lemma/pos lemma/pos/fid/lid
latin1
utf-8
/home/artem/output/
/home/aardvark/corp/en/COCA


## Token-level workflow
The type of distributional semantic model used by QLVL on the token level is an adaptation of the approach by Schütze (1998). The model constructs a token vector by averaging over the type vectors of the context words around the target token. As a first step, we have to collect information about the context words that co-occur with specific occurrences (tokens) of a word. This token-by-context information will be stored in a boolean matrix which will then be the input for the averaging over context type vectors.

### Load matrices

In [8]:
freq_fname = "/home/artem/output/COCA.cmplt.10_10.wcmx.freq.pac".format(rootdir)
freqMTX = TypeTokenMatrix.load(freq_fname)
node_vocab = Vocab(freqMTX.sum(axis=1))

In [9]:
print(node_vocab[:10])
len(node_vocab)
target = ['dry/j']

[(',/y', 538315457),('./y', 475155730),('the/a', 471685656) ... ('"/y', 156266380),('in/i', 150189851),('to/t', 136618730)]


### Custom functions

We could select the context words by part-of-speech

In [10]:
def register_cws(tokens, params, init, tokenlist):
    colname1 = '_cws.' + '.'.join(params)
    colname2 = '_count.' + '.'.join(params)
    def count_cws(tokens, row):
        if row in tokens.row_items:
            return [c for c in tokens.col_items if not tokens[row, c] == 0]
        else:
            return []
    if colname1 in init.columns:
        return init
    else:
        df = pd.DataFrame(index = tokenlist) if len(init) == 0 else init
        df[colname1] = [';'.join(count_cws(tokens, r)) for r in df.index]
        df[colname2] = [len(count_cws(tokens, r)) for r in df.index]
        return df

In [11]:
def select_by_ppmi(freqMTX, target, context_words, node_vocab, full):
    subMTX = freqMTX.submatrix(row = target, col = context_words).drop(axis = 1, n_nonzero = 0)
    p = compute_association(subMTX, nfreq=node_vocab, cfreq=node_vocab, meas = 'pmi')
    pdf = p.dataframe.transpose()
    pdf.columns = ['pmi']
    pdf['raw_freq'] = [full[x] for x in pdf.index]
    pdf['raw_co'] = [subMTX[target[0], x] for x in pdf.index]
    pdf['cw'] = pdf.index
    pdf.to_csv("{}/{}.ppmi.tsv".format(output_path, lemma), sep = '\t', index=False)
    print(pdf.head())
    print("Stored with", len(p.col_items), "elements.")
    return p.multiply(p > 0).drop(axis = 1, n_nonzero = 0)

In [12]:
def apply_window(w, maxw, original):
    boolean = original.matrix.toarray() != 0
    if w < maxw:
        threshold = abs(original.matrix.toarray()) < (w+1)
        both = boolean & threshold
        boolean_tokens = sparse.csr_matrix(both.astype(np.int))
    else:
        boolean_tokens = sparse.csr_matrix(boolean.astype(np.int))
    return TypeTokenMatrix(boolean_tokens, original.row_items, original.col_items).drop(axis = 1, n_nonzero = 0)

In [13]:
def restrict_foc(pos, w, foc_cws, weighted):
    foc_cws2 = [x for x in foc_cws if x.rsplit('/', 1)[1] in ['n', 'j', 'v']] if pos == 'nav' else list(foc_cws)
    return list(foc_cws2) if w == 'no_ppmi' else [x for x in foc_cws2 if x in weighted]

In [14]:
def compute_tokweights(original, weights):
    subtwMTX = weights.submatrix(col = original.col_items)
    return compute_token_weights(original, subtwMTX)

In [15]:
def finalize_cloud(token_matrix, soc_matrix, soc_cols):
    ppmiMTX = socMTX.submatrix(row=tokweights.col_items, col=soc_cols)
    tokvecs = compute_token_vectors(tokweights, ppmiMTX)
    return compute_distance(tokvecs)  

In [16]:
def collect_tokens(query, fnames, foc, sentence_boundary, selection, multicore = True):
    if sentence_boundary:
        settings['single-boundary-machine'] = '^</sentence>$'
    else:
        settings['single-boundary-machine'] = '^</artikel>$'
    tokhan = TokenHandler(query, settings=settings)
    tokens = tokhan.retrieve_tokens(col_vocab = foc, multicore = multicore)
    remaining = [x for x in selection if x in tokens.row_items]
    return tokens.submatrix(row = remaining).drop(axis = 1, n_nonzero = 0)
    fnames=None

### Generate foc list

In [17]:
full_name = "/home/artem/output/COCA.foc.nodefreq"
full = Vocab.load(full_name)
full.encoding = 'utf-8'
full

[('be/v', 16767681),('and/c', 11709841),('of/i', 11167127) ... ('paglum/n', 11),('vovan/n', 11),('grafspee/n', 11)]

In [18]:
foc = full[full.match('item', '^[^/]+/[njrvidp]$')]
foc = foc[foc.freq > 10]
len(foc)

165282

In [19]:
socs_fname = ("{}COCA.type.vocab".format(output_path))
ssocs = Vocab.load(socs_fname, encoding=settings["file-encoding"])
ssocs.encoding = "utf-8"
len(ssocs)

2086194

#### Select files and tokens based on concordance

In [20]:
targets = ['dry/j']
lemma = targets[0].split('/')[0]
selection_file = '/home/artem/artem_master/dry/j.tsv'
with open(selection_file, 'r') as s:
    selection = [l.strip().split('\t')[0] for l in s.readlines()[1:]]
selection_file = [x.split('/')[2]+'.conll' for x in selection]
selection = ["/".join([x.split("/")[0], "j", x.split("/")[2], str(int(x.split("/")[3])+1)]) for x in selection]
selection

['dry/j/wlp_fic_1998/2510335',
 'dry/j/wlp_mag_2001/193487',
 'dry/j/wlp_news_2000/2299529',
 'dry/j/wlp_mag_2005/2395753',
 'dry/j/wlp_spok_2005/5054386',
 'dry/j/wlp_fic_1997/1707803',
 'dry/j/wlp_mag_2002/759439',
 'dry/j/wlp_fic_1991/4121017',
 'dry/j/wlp_mag_1990/4035548',
 'dry/j/wlp_news_1996/1870643',
 'dry/j/wlp_fic_1995/2705707',
 'dry/j/wlp_acad_1997/4698397',
 'dry/j/wlp_mag_2004/2595711',
 'dry/j/wlp_mag_2010/3102117',
 'dry/j/wlp_acad_1994/4043938',
 'dry/j/wlp_fic_2002/519949',
 'dry/j/wlp_mag_1998/770974',
 'dry/j/wlp_news_1991/3456153',
 'dry/j/wlp_news_1992/3924109',
 'dry/j/wlp_fic_2000/2020377',
 'dry/j/wlp_mag_1999/1052827',
 'dry/j/wlp_fic_2009/1877358',
 'dry/j/wlp_news_1996/2532840',
 'dry/j/wlp_fic_2007/1965267',
 'dry/j/wlp_fic_2006/4154720',
 'dry/j/wlp_spok_1993/4997011',
 'dry/j/wlp_fic_2008/4043745',
 'dry/j/wlp_mag_1990/287186',
 'dry/j/wlp_spok_2008/3467789',
 'dry/j/wlp_mag_2002/3441446',
 'dry/j/wlp_mag_2008/3804165',
 'dry/j/wlp_news_2011/3426529',
 '

In [64]:
with open('/home/artem/artem_master/dry/j.tsv', 'r') as m:
    numbers = [l.strip().split('\t')[0] for l in m.readlines()[1:]]
tokid=[x.split('/')[3] for x in numbers]
print(tokid[:10])
print(len(tokid))
k = np.unique(np.array(tokid)) 
print(len(k))

['2510334', '193486', '2299528', '2395752', '5054385', '1707802', '759438', '4121016', '4035547', '1870642']
300
300


#### Tokens within sentence boundaries

In [25]:
bound = 'bound'
query = full[target]
files_list = '/home/artem/artem_master/dry/j.tsv'

In [26]:
tokens_bound = collect_tokens(query, fnames, foc, (bound == 'bound'), selection)

Scanning tokens of queries in corpus...
Starting subprocess 35931
Starting subprocess 35933
Starting subprocess 35932
Starting subprocess 35934
Starting subprocess 35930
Starting subprocess 35929
Starting subprocess 35928
Starting subprocess 35935
Starting subprocess 35936
Starting subprocess 35938
Starting subprocess 35939
Starting subprocess 35937
Starting subprocess 35941
Starting subprocess 35944
Starting subprocess 35942
Starting subprocess 35943
Starting subprocess 35945
Starting subprocess 35946
Starting subprocess 35940
Starting subprocess 35947
Starting subprocess 35949
Starting subprocess 35950
Starting subprocess 35948


HBox(children=(IntProgress(value=0, description='  proc(35939)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(35944)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(35930)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(35928)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(35938)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(35946)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(35933)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(35945)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(35949)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(35936)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(35931)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(35950)', max=15), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(35947)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(35943)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(35937)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(35942)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(35948)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(35941)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(35934)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(35932)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(35935)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(35940)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(35929)', max=16), HTML(value='')))

























************************************
function    = retrieve_tokens
  time      = 1.658e+03 sec
************************************



In [27]:
tokens_bound

[300, 1758]                  a/r  abandon/v  abandoned/j  abby/n  abound/v  about/i  about/r  ...
dry/j/wlp_fic_1998/2510335   NaN  NaN        NaN          NaN     NaN       NaN      NaN      ...
dry/j/wlp_mag_2001/193487    NaN  NaN        NaN          NaN     NaN       NaN      NaN      ...
dry/j/wlp_news_2000/2299529  NaN  NaN        NaN          NaN     NaN       NaN      NaN      ...
dry/j/wlp_mag_2005/2395753   NaN  NaN        NaN          NaN     NaN       NaN      NaN      ...
dry/j/wlp_spok_2005/5054386  NaN  NaN        NaN          NaN     NaN       NaN      NaN      ...
dry/j/wlp_fic_1997/1707803   NaN  NaN        NaN          NaN     NaN       NaN      NaN      ...
dry/j/wlp_mag_2002/759439    NaN  NaN        NaN          NaN     NaN       NaN      NaN      ...
...                          ...  ...        ...          ...     ...       ...      ...      ...

#### Tokens regardless of sentence boundaries

In [28]:
bound = 'nobound'

In [29]:
tokens_nobound = collect_tokens(query, fnames, foc, (bound == 'nobound'), selection)

Scanning tokens of queries in corpus...
Starting subprocess 38365
Starting subprocess 38364
Starting subprocess 38366
Starting subprocess 38368
Starting subprocess 38367
Starting subprocess 38363
Starting subprocess 38369
Starting subprocess 38372
Starting subprocess 38370
Starting subprocess 38373
Starting subprocess 38378
Starting subprocess 38377
Starting subprocess 38379
Starting subprocess 38374
Starting subprocess 38380
Starting subprocess 38376
Starting subprocess 38375
Starting subprocess 38371
Starting subprocess 38382
Starting subprocess 38381
Starting subprocess 38383
Starting subprocess 38384
Starting subprocess 38385


HBox(children=(IntProgress(value=0, description='  proc(38377)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(38363)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(38368)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(38375)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(38380)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(38383)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(38379)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(38371)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(38374)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(38369)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(38373)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(38384)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(38364)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(38367)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(38381)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(38365)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(38378)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(38376)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(38372)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(38370)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(38366)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(38382)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(38385)', max=15), HTML(value='')))

























************************************
function    = retrieve_tokens
  time      = 1.664e+03 sec
************************************



In [30]:
tokens_nobound

[300, 1758]                  a/r  abandon/v  abandoned/j  abby/n  abound/v  about/i  about/r  ...
dry/j/wlp_fic_1998/2510335   NaN  NaN        NaN          NaN     NaN       NaN      NaN      ...
dry/j/wlp_mag_2001/193487    NaN  NaN        NaN          NaN     NaN       NaN      NaN      ...
dry/j/wlp_news_2000/2299529  NaN  NaN        NaN          NaN     NaN       NaN      NaN      ...
dry/j/wlp_mag_2005/2395753   NaN  NaN        NaN          NaN     NaN       NaN      NaN      ...
dry/j/wlp_spok_2005/5054386  NaN  NaN        NaN          NaN     NaN       NaN      NaN      ...
dry/j/wlp_fic_1997/1707803   NaN  NaN        NaN          NaN     NaN       NaN      NaN      ...
dry/j/wlp_mag_2002/759439    NaN  NaN        NaN          NaN     NaN       NaN      NaN      ...
...                          ...  ...        ...          ...     ...       ...      ...      ...

### Vary along parameters

##### PPMI weighting

I use the 10-10 window ppmi frequencies, but could also use 4-4 (it's just... yet another parameter!)

In [28]:
ppmi = select_by_ppmi(freqMTX, target, tokens_nobound.col_items, node_vocab, full)
ppmi # for weighting and selection of foc_words

HBox(children=(IntProgress(value=0, max=1758), HTML(value='')))



************************************
function    = compute_association
  time      = 0.1154 sec
************************************

                  pmi  raw_freq  raw_co           cw
a/r          0.256693    147218     194          a/r
abandon/v   -0.047088     16412      16    abandon/v
abandoned/j  0.979789      3680      10  abandoned/j
abby/n       0.394993      2644       4       abby/n
abound/v     0.995243      2533       7     abound/v
Stored with 1758 elements.


[1, 1156]  a/r       abandoned/j  abby/n      abound/v    about/r     above/r     abs/n      ...
dry/j      0.256693  0.97978854   0.39499322  0.99524295  0.37369668  0.38923192  1.2994517  ...

In [29]:
tokens_registration = pd.DataFrame()
models_registration = pd.DataFrame()

In [30]:
boundaries = ['bound'] # could also be ['bound', 'nobound']
focbows = [10, 5]
focpos = ['nav', 'all']
focweight = ['ppmi_weight', 'ppmi_sel', 'no_ppmi'] # we'll use the ppmi values of the cw10 for weighting any time we weight (**ppmi**)
socbows = [4, 10]
socpos = ['nav', 'all']
min_feats = [1] # could have more values
soclengths = [5000, 10000, 'foc']

In [31]:
foc_combinations = [(pos, w, thres) for pos in focpos for w in focweight for thres in min_feats]
soc_combinations = [(pos, win, length) for pos in socpos for win in socbows for length in soclengths]
print(foc_combinations)
print(soc_combinations)

[('nav', 'ppmi_weight', 1), ('nav', 'ppmi_sel', 1), ('nav', 'no_ppmi', 1), ('all', 'ppmi_weight', 1), ('all', 'ppmi_sel', 1), ('all', 'no_ppmi', 1)]
[('nav', 4, 5000), ('nav', 4, 10000), ('nav', 4, 'foc'), ('nav', 10, 5000), ('nav', 10, 10000), ('nav', 10, 'foc'), ('all', 4, 5000), ('all', 4, 10000), ('all', 4, 'foc'), ('all', 10, 5000), ('all', 10, 10000), ('all', 10, 'foc')]


In [32]:
socMTX = freqMTX.submatrix(row=tokens_nobound.col_items, col=foc.get_item_list())
ppmi = compute_association(socMTX, nfreq=node_vocab, cfreq=node_vocab, meas='ppmi')

HBox(children=(IntProgress(value=0, max=41878453), HTML(value='')))



************************************
function    = compute_association
  time      = 337.1 sec
************************************



In [33]:
settings['left-span']

10

In [34]:
len(tokens_registration)

0

In [85]:
for bound in boundaries:
    tokens = tokens_bound if bound == 'bound' else tokens_nobound
    for ctxt in focbows:
        # apply context window - tokens => new_tokens; foc_cws are the remaining cws
        after_window = apply_window(ctxt, settings['left-span'], tokens)
        foc_1 = after_window.col_items
        for f in foc_combinations:
            fpos, w, t = f
            foc_final = restrict_foc(fpos, w, foc_1, ppmi.col_items)
            tokens_foc_final = after_window.submatrix(col = foc_final).drop(axis = 0, n_nonzero = (t-1))
            tokweights = compute_tokweights(tokens_foc_final, ppmi) if w == 'ppmi_weight' else tokens_foc_final
            # Register first order values
            tokens_registration = register_cws(tokweights, [lemma, 'foc' + str(ctxt), fpos, w, bound], tokens_registration,selection)
            for soc in soc_combinations:
                spos, win, length = soc
                socs = ssocs if spos == 'nav' else foc
                soc_cols = list(tokweights.col_items) if type(length) == str else socs[:length].get_item_list()
                socMTX = ppmi# not perfectly safe
                tokdists = finalize_cloud(tokweights, socMTX, soc_cols)
                modelname = '.'.join([lemma,
                                      "foc" + str(ctxt), fpos, w, bound, 'min'+str(t),
                                      "soc" + str(win), spos, str(length)])
                parameters = {'_model' : modelname, 'type' : lemma,
                              'foc_window' : '{}_{}'.format(ctxt, ctxt), 'foc_part_of_speech' : fpos,
                              'foc_weighting' : w, 'foc_minimum_features': t,
                              'foc_sent_bound' : bound, 'foc_length' : len(tokens_foc_final.col_items),
                              'soc_part_of_speech' : spos, 'soc_window' : '{}_{}'.format(win, win),
                              'soc_length' : length, 'soc_length_num' : len(soc_cols), 
                              'tokens' : str(int(len(tokdists.row_items)))}
                models_registration = models_registration.append(parameters, ignore_index = True)
                tokdists.save("{}/{}/{}.ttmx.dist".format(output_path, lemma, modelname))  

  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.09513 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc10.nav.ppmi_weight.bound.min1.soc4.nav.5000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.1094 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc10.nav.ppmi_weight.bound.min1.soc4.nav.10000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.02238 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc10.nav.ppmi_weight.bound.min1.soc4.nav.foc.ttmx.dist.pac
  Operation: 'token-feature weight matrix

  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.01853 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc10.nav.no_ppmi.bound.min1.soc4.nav.foc.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.05566 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc10.nav.no_ppmi.bound.min1.soc10.nav.5000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.109 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc10.nav.no_ppmi.bound.min1.soc10.nav.10000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc m

  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.11 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc10.all.ppmi_sel.bound.min1.soc10.nav.10000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.02137 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc10.all.ppmi_sel.bound.min1.soc10.nav.foc.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.05627 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc10.all.ppmi_sel.bound.min1.soc4.all.5000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc

  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.05443 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.nav.ppmi_weight.bound.min1.soc4.all.5000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.1202 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.nav.ppmi_weight.bound.min1.soc4.all.10000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.01083 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.nav.ppmi_weight.bound.min1.soc4.all.foc.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X

  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.01109 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.nav.no_ppmi.bound.min1.soc4.all.foc.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.05554 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.nav.no_ppmi.bound.min1.soc10.all.5000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.1213 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.nav.no_ppmi.bound.min1.soc10.all.10000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc mat

  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.1199 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.all.ppmi_sel.bound.min1.soc10.all.10000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.01267 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.all.ppmi_sel.bound.min1.soc10.all.foc.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.05493 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.all.no_ppmi.bound.min1.soc4.nav.5000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc m

In [86]:
len(models_registration.drop_duplicates())

144

In [88]:
models_registration = models_registration.drop_duplicates()
models_registration.to_csv("{}/{}/{}.models.tsv".format(github_dir, lemma, lemma), sep="\t", index = False)

In [89]:
len(tokens_registration)

300

In [90]:
tokens_registration.to_csv("{}/{}/{}.cws.tsv".format(github_dir, lemma, lemma), sep='\t', index_label = '_id')

### Test in one

In [31]:
boundaries = ['nobound', 'bound']
focbows = [5]
focpos = ['nav']
focweight = ['ppmi_weight'] # we'll use the ppmi values of the cw10 for weighting any time we weight (**ppmi**)
socbows = [4]
socpos = ['all', 'nav']
min_feats = [1, 2]
soclengths = ['foc']

In [32]:
foc_combinations = [(pos, w, thres) for pos in focpos for w in focweight for thres in min_feats]
soc_combinations = [(pos, win, length) for pos in socpos for win in socbows for length in soclengths]
print(foc_combinations)
print(soc_combinations)

[('nav', 'ppmi_weight', 1), ('nav', 'ppmi_weight', 2)]
[('all', 4, 'foc'), ('nav', 4, 'foc')]


In [33]:
settings['left-span']

10

In [34]:
socMTX = freqMTX.submatrix(row=tokens_nobound.col_items, col=foc.get_item_list())
ppmi = compute_association(socMTX, nfreq=node_vocab, cfreq=node_vocab, meas='ppmi')

HBox(children=(IntProgress(value=0, max=41878453), HTML(value='')))



************************************
function    = compute_association
  time      = 318.6 sec
************************************



In [35]:
lemma

'dry'

In [41]:
lemma = ('dry')
tokens_registration = pd.DataFrame()
models_registration = pd.DataFrame()

boundaries = ['bound', 'nobound'] # could also be ['bound', 'nobound']
focbows = [10, 5]
focpos = ['nav', 'all']
focweight = ['ppmi_weight', 'ppmi_sel', 'no_ppmi'] # we'll use the ppmi values of the cw10 for weighting any time we weight (**ppmi**)
socbows = [4, 10]
socpos = ['nav', 'all']
min_feats = [1] # could have more values
soclengths = ["5000", "10000", 'foc']

foc_combinations = [(pos, w, thres) for pos in focpos for w in focweight for thres in min_feats]
soc_combinations = [(pos, win, length) for pos in socpos for win in socbows for length in soclengths]

for bound in boundaries:
    tokens = collect_tokens(query, files_list, foc, ('bound','nobound'), selection)
    ppmi = select_by_ppmi(freqMTX, target, tokens.col_items, node_vocab, full)
    socMTX = freqMTX.submatrix(row=tokens.col_items, col=foc.get_item_list())

    for ctxt in focbows:
        # apply context window - tokens => new_tokens; foc_cws are the remaining cws
        after_window = apply_window(ctxt, settings['left-span'], tokens)
        foc_1 = after_window.col_items
        for f in foc_combinations:
            fpos, w, t = f
            foc_final = restrict_foc(fpos, w, foc_1, ppmi.col_items)
            tokens_foc_final = after_window.submatrix(col = foc_final).drop(axis = 0, n_nonzero = (t-1))
            tokweights = compute_tokweights(tokens_foc_final, ppmi) if w == 'ppmi_weight' else tokens_foc_final
            # Register first order values
            tokens_registration = register_cws(tokweights, [lemma, 'foc' + str(ctxt), fpos, w, bound], tokens_registration,selection)
            
            for soc in soc_combinations:
                spos, win, length = soc
                socs = ssocs if spos == 'nav' else foc
                soc_cols = socs[:int(length)].get_item_list() if length.isdigit() else list(tokweights.col_items)
                socMTX = ppmi # not perfectly safe
                tokdists = finalize_cloud(tokweights, socMTX, soc_cols)
                modelname = '.'.join([lemma,
                                      "foc" + str(ctxt), fpos, w, bound, 'min'+str(t),
                                      "soc" + str(win), spos, str(length)])
                parameters = {'_model' : modelname, 'type' : lemma,
                              'foc_window' : '{}_{}'.format(ctxt, ctxt), 'foc_part_of_speech' : fpos,
                              'foc_weighting' : w, 'foc_minimum_features': t,
                              'foc_sent_bound' : bound, 'foc_length' : len(tokens_foc_final.col_items),
                              'soc_part_of_speech' : spos, 'soc_window' : '{}_{}'.format(win, win),
                              'soc_length' : length, 'soc_length_num' : len(soc_cols), 
                              'tokens' : str(int(len(tokdists.row_items)))}
                models_registration = models_registration.append(parameters, ignore_index = True)
                tokdists.save("{}/{}/{}.ttmx.dist".format(output_path, lemma, modelname))
models_registration = models_registration.drop_duplicates()
models_registration.to_csv("{}/{}/{}.models.tsv".format(github_dir, lemma, lemma), sep="\t", index = False)

Scanning tokens of queries in corpus...
Starting subprocess 56790
Starting subprocess 56791
Starting subprocess 56792
Starting subprocess 56793
Starting subprocess 56794
Starting subprocess 56795
Starting subprocess 56802
Starting subprocess 56789
Starting subprocess 56801
Starting subprocess 56800
Starting subprocess 56804
Starting subprocess 56803
Starting subprocess 56805
Starting subprocess 56796
Starting subprocess 56798
Starting subprocess 56807
Starting subprocess 56810
Starting subprocess 56806
Starting subprocess 56811
Starting subprocess 56808
Starting subprocess 56799
Starting subprocess 56797
Starting subprocess 56809


HBox(children=(IntProgress(value=0, description='  proc(56792)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(56791)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(56800)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(56797)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(56807)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(56811)', max=15), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(56802)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(56794)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(56793)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(56810)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(56789)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(56805)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(56806)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(56795)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(56801)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(56804)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(56796)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(56790)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(56799)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(56809)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(56803)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(56798)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(56808)', max=16), HTML(value='')))

























************************************
function    = retrieve_tokens
  time      = 1.661e+03 sec
************************************



HBox(children=(IntProgress(value=0, max=1758), HTML(value='')))



************************************
function    = compute_association
  time      = 0.0867 sec
************************************

                  pmi  raw_freq  raw_co           cw
a/r          0.256693    147218     194          a/r
abandon/v   -0.047088     16412      16    abandon/v
abandoned/j  0.979789      3680      10  abandoned/j
abby/n       0.394993      2644       4       abby/n
abound/v     0.995243      2533       7     abound/v
Stored with 1758 elements.
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.0577 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc10.nav.ppmi_weight.bound.min1.soc4.nav.5000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.1508 sec
************************************


Savin

Stored in file:
  /home/artem/output//dry/dry.foc10.nav.no_ppmi.bound.min1.soc4.nav.5000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.1471 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc10.nav.no_ppmi.bound.min1.soc4.nav.10000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.02231 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc10.nav.no_ppmi.bound.min1.soc4.nav.foc.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.05737 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc10.n

Stored in file:
  /home/artem/output//dry/dry.foc10.all.ppmi_sel.bound.min1.soc4.nav.foc.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.05591 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc10.all.ppmi_sel.bound.min1.soc10.nav.5000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.148 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc10.all.ppmi_sel.bound.min1.soc10.nav.10000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.01684 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc

Stored in file:
  /home/artem/output//dry/dry.foc5.nav.ppmi_weight.bound.min1.soc10.nav.10000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.009781 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.nav.ppmi_weight.bound.min1.soc10.nav.foc.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.05563 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.nav.ppmi_weight.bound.min1.soc4.all.5000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.1192 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry

Stored in file:
  /home/artem/output//dry/dry.foc5.nav.no_ppmi.bound.min1.soc4.all.5000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.1505 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.nav.no_ppmi.bound.min1.soc4.all.10000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.01289 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.nav.no_ppmi.bound.min1.soc4.all.foc.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.05746 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.nav.n

Stored in file:
  /home/artem/output//dry/dry.foc5.all.ppmi_sel.bound.min1.soc4.all.foc.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.05664 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.all.ppmi_sel.bound.min1.soc10.all.5000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.1211 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.all.ppmi_sel.bound.min1.soc10.all.10000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.01085 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.

HBox(children=(IntProgress(value=0, description='  proc(58978)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(58986)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(58995)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(58981)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(58984)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(58980)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(58979)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(58988)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(58987)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(58975)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(58990)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(58977)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(58976)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(58985)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(58974)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(58996)', max=15), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(58991)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(58983)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(58989)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(58982)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(58992)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(58994)', max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, description='  proc(58993)', max=16), HTML(value='')))

























************************************
function    = retrieve_tokens
  time      = 1.621e+03 sec
************************************



HBox(children=(IntProgress(value=0, max=1758), HTML(value='')))



************************************
function    = compute_association
  time      = 0.08694 sec
************************************

                  pmi  raw_freq  raw_co           cw
a/r          0.256693    147218     194          a/r
abandon/v   -0.047088     16412      16    abandon/v
abandoned/j  0.979789      3680      10  abandoned/j
abby/n       0.394993      2644       4       abby/n
abound/v     0.995243      2533       7     abound/v
Stored with 1758 elements.
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.05737 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc10.nav.ppmi_weight.nobound.min1.soc4.nav.5000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.1509 sec
************************************


S


************************************
function    = compute_distance
  time      = 0.05706 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc10.nav.no_ppmi.nobound.min1.soc4.nav.5000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.1465 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc10.nav.no_ppmi.nobound.min1.soc4.nav.10000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.02177 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc10.nav.no_ppmi.nobound.min1.soc4.nav.foc.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function 


************************************
function    = compute_distance
  time      = 0.01725 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc10.all.ppmi_sel.nobound.min1.soc4.nav.foc.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.05625 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc10.all.ppmi_sel.nobound.min1.soc10.nav.5000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.147 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc10.all.ppmi_sel.nobound.min1.soc10.nav.10000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
funct

  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.1186 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.nav.ppmi_weight.nobound.min1.soc10.nav.10000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.009624 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.nav.ppmi_weight.nobound.min1.soc10.nav.foc.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.05558 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.nav.ppmi_weight.nobound.min1.soc4.all.5000.ttmx.dist.pac
  Operation: 'token-feature weight 

  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.05778 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.nav.no_ppmi.nobound.min1.soc4.all.5000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.1482 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.nav.no_ppmi.nobound.min1.soc4.all.10000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.01298 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.nav.no_ppmi.nobound.min1.soc4.all.foc.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc

Stored in file:
  /home/artem/output//dry/dry.foc5.all.ppmi_sel.nobound.min1.soc4.all.10000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.01074 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.all.ppmi_sel.nobound.min1.soc4.all.foc.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.05669 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.all.ppmi_sel.nobound.min1.soc10.all.5000.ttmx.dist.pac
  Operation: 'token-feature weight matrix' X 'socc matrix'...

************************************
function    = compute_distance
  time      = 0.1206 sec
************************************


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.

In [37]:
"home/artem/output/COCA.cmplt.10_10.wcmx.freq.pac".encode("utf-8")

b'home/artem/output/COCA.cmplt.10_10.wcmx.freq.pac'


Saving matrix...
Stored in file:
  /home/artem/output//dry/dry.foc5.all.no_ppmi.nobound.min1.soc10.all.foc.ttmx.dist.pac
