In [1]:
import numpy as np
from scipy.sparse import lil_matrix, dok_matrix, csr_matrix
from collections import defaultdict, OrderedDict
from random import randint, choice
from pprint import pprint as pp
from array import array
from itertools import product, combinations
from time import perf_counter
from copy import copy
from time import sleep

# Setup (Metrics)

In [18]:
import psutil
import os

def mem():
    """ Use psutil to record memory snapshot. """
    pid = os.getpid()
    p = psutil.Process(pid)
    rss, vms = p.memory_info()
    return vms

class Stats:
    """ Context manager for reporting memory change and time cost. """
    def __enter__(self):
        self.m1 = mem()
        self.t1 = perf_counter()
        
    def __exit__(self, type, value, traceback):
        self.t2 = perf_counter()
        self.m2 = mem()
        print('\nChange in memory: ', end='')
        print('{:.4g} MB'.format((self.m2 - self.m1) / 1024 / 1024))
        print('Time cost (s)   : ', end='')
        print('{:.4g} s\n'.format(self.t2 - self.t1))
    
# Demo!
with Stats():
    x = [0]*100000000  # 100M
    
with Stats():
    del x


Change in memory: 762.9 MB
Time cost (s)   : 0.5349 s


Change in memory: -762.9 MB
Time cost (s)   : 0.2561 s



# Create the list of context blocks

In [3]:
words = [_.strip() for _ in open('/usr/share/dict/words', 'r')]
print('Number of unique words: %d\n' % len(words))
words = words[:61000]  # Truncate the list to be more realistic

Number of unique words: 235886



### Create our own hash for bidirectional lookups

In [4]:
# Give word, get index
# This is the opposite of `words`: give index, get word
wordsd = OrderedDict(zip(words, range(len(words))))

In [5]:
# Test
x = words[1234]
print(x)
print(wordsd[x])
words[32751]

acetonization
1234


'Ceratitis'

## Dataset creation

In [6]:
def make_context_blocks(num_blocks=100000, word_count=(5, 20)):
    context_blocks = []
    for i in range(num_blocks):
        block_size = choice(range(*word_count))
        #
        # Pretty important that `sorted` is called here. This makes 
        # combinations stable later.
        #
        block = sorted(set(choice(words) for i in range(block_size)))
        context_blocks.append(block)
    return context_blocks
             
with Stats():
    context_blocks = make_context_blocks()
    
print('Sample blocks:')
for b in context_blocks[:5]:
    print(' - ', '/'.join(b))


Change in memory: 19.41 MB
Time cost (s)   : 2.272 s

Sample blocks:
 -  Achagua/Alberene/Apatela/Bokharan/Briarean/Camacan/Dob/adultness/ambisporangiate/asepticism/avital/beastly/belduque/boutylka/calvarium/chandleress/commensal/despumate/dishful
 -  antalgol/asbestine/avast/boycotter/championlike/coenobioid/disregarder
 -  acryl/agranulocytosis/amerism/anthropomorphotheist/antiperistatically/antisuffrage/auricularian/banilad/bimotors/boronatrocalcite/bullishly/burseed/cacumination/chordacentrous/disturbance/dollship/ectodactylism
 -  accommodateness/alguazil/antifundamentalist/antitrades/asterial/bannerlike/beautification/blemishment/brachypyramid/brimstony/coapostate/collisive/culmigenous/detruncate/dihydrotachysterol
 -  Celsius/Chlorococcum/adamantoid/alkalinuria/anticardiac/aulacocarpous/brown/calefactive/campylodrome/chamberdeacon/characteristical/chirurgery/commutative/conceitedly/countercompony/crustosis/electrocardiography


### Build a version of context_blocks that is only arrays of arrays

This changes the context blocks, i.e. the list of lists of 5-40 strings, into a two-dimensional array of integers. Each integer is an index into the `idx` hash that was built earlier.

The array **is preallocated** for both rows and columns.  Currently we're using a default of 100 for columns.  This easily covers the 5-40 band, obviously.  We use "-1" as default, and this is used to know which entries are valid words and which are not.

In [9]:
def make_cb_array(context_blocks, max_words_per_block=100):
    # Note: values are initialized to -1.  This is to keep track of 
    # which entries are valid. These will be >=0, and will index into
    # the `words` list.
    context_blocks_array = np.zeros(
        (len(context_blocks), max_words_per_block), 
        dtype='i4') - 1
    for i, block in enumerate(context_blocks):
        for j, word in enumerate(block):
            # wordsd is a reverse lookup. You give the word, it tells
            # you the index in the "words" array.
            context_blocks_array[i, j] = wordsd[word]
    return context_blocks_array

context_blocks_array = make_cb_array(context_blocks)

In [10]:
# Demo
print(context_blocks_array[500])
for i in context_blocks_array[500]:
    if i>-1:
        print(words[i], end=', ')

[13888 36134 46814  4577  7246 12049 13914 15509 15514 18894 23418 25032
 57721 59631 60784    -1    -1    -1    -1    -1    -1    -1    -1    -1
    -1    -1    -1    -1    -1    -1    -1    -1    -1    -1    -1    -1
    -1    -1    -1    -1    -1    -1    -1    -1    -1    -1    -1    -1
    -1    -1    -1    -1    -1    -1    -1    -1    -1    -1    -1    -1
    -1    -1    -1    -1    -1    -1    -1    -1    -1    -1    -1    -1
    -1    -1    -1    -1    -1    -1    -1    -1    -1    -1    -1    -1
    -1    -1    -1    -1    -1    -1    -1    -1    -1    -1    -1    -1
    -1    -1    -1    -1]
Artocarpus, Churrigueresque, Cuphea, al, amplexation, apt, arustle, attractionally, attractor, base, blate, botchka, drawout, edibility, eloper, 

# Naive `dict` method.  Dicts inside Dicts

(Also, this is all working with strings.  See further down for using naive dicts but with integers everywhere.)

In [13]:
def method_dict(context_blocks):
    """
    Given a list of blocks (each containing 5-40 words), build a dict that 
    itself contain dicts. the inner dict has a count of the number of associations
    between the outer key and the inner key.
    """
    d = defaultdict(lambda: defaultdict(int))
    for block in context_blocks:
        for w1, w2 in combinations(block, 2):
            d[w1][w2] += 1
    return d

with Stats():
    d = method_dict(context_blocks)
    
associations = sum(len(w2s) for w1, w2s in d.items())
print('associations: ', associations)


Change in memory: 316.6 MB
Time cost (s)   : 4.523 s

associations:  7542089


Using `setdefault` all over the place is slower, but really not by much.

In [14]:
def method_dict2(context_blocks):
    """
    Given a list of blocks (each containing 5-15 words), build a dict that 
    itself contain dicts. the inner dict has a count of the number of associations
    between the outer key and the inner key.
    """
    d = {}
    for block in context_blocks:
        for w1, w2 in combinations(block, 2):
            d.setdefault(w1, {})
            d[w1].setdefault(w2, 0)
            d[w1][w2] += 1
    return d

with Stats():
    d2 = method_dict(context_blocks)
    
associations = sum(len(w2s) for w1, w2s in d2.items())
print('associations: ', associations)


Change in memory: -8 MB
Time cost (s)   : 4.862 s

associations:  7542089


In [15]:
# Show a sample of the resulting dict.
for i, (w1, w2s) in enumerate(d.items()):
    if i > 5:
        break
    print(w1)
    for j, w2 in enumerate(w2s):
        if j > 5:
            break
        print(' '*8, '{:20} {:10}'.format(w2, d[w1][w2]))

celloid
         didascalic                    1
         considerably                  1
         comprehensible                1
         dipropyl                      1
         chikara                       1
         eloquentness                  1
Bosporian
         caraipi                       1
         amygdal                       1
         blockhouse                    1
         Dehaites                      2
         consociational                1
         diverting                     1
dissonant
         downwith                      1
         doctoral                      1
         divulger                      1
         duplicate                     1
         dystome                       1
         effluviography                1
bierbalk
         disposure                     1
         derelictness                  1
         celiagra                      1
         compone                       1
         divisor                       1
         cycloparaff

# Using a counter

In [19]:
from collections import Counter

def method_counter(context_blocks):
    c = Counter()
    for block in context_blocks:
        c.update(combinations(block, 2))
    return c

with Stats():
    cnt = method_counter(context_blocks)
print('Associations:',len(cnt))
print()


Change in memory: 1038 MB
Time cost (s)   : 3.578 s

Associations: 7542089



In [20]:
for iter, ((w1, w2), c) in enumerate(cnt.items()):
    print('{:15}{:15}{:4}'.format(w1, w2, c))
    if iter>5:
        break

boule          cephalagra        1
bagattini      bhagavata         1
bacteriophage  befiddle          1
chidden        cyclospondylous   1
Chrysopsis     chrysenic         1
Cordaites      clairaudiently    1
afortiori      diction           1


# Cython (naive) - Also using dicts

In [21]:
%load_ext cython

In [22]:
%%cython -a

import numpy as np

def method_cython1(list context_blocks):
    """
    Given a list of blocks (each containing 5-40 words), build a dict that 
    itself contain dicts. the inner dict has a count of the number of associations
    between the outer key and the inner key.
    """
    #cdef int n = int(100e6)
    #cdef unsigned int[:] w1 = np.zeros(n, dtype='u4') - 1
    #cdef unsigned int[:] w2 = np.zeros(n, dtype='u4') - 1
    cdef int end = 0, i, j, blen
    cdef list block
    cdef dict out = {}, inner
    cdef str w1, w2
    for block in context_blocks:
        blen = len(block)
        for i in range(blen):
            w1 = block[i]
            inner = out.get(w1) or {}
            for j in range(i+1, blen):
                w2 = block[j]
                if not w2 in inner:
                    inner[w2] = 0
                inner[w2] += 1
            out[w1] = inner
    return out

In [23]:
with Stats():
    d = method_cython1(context_blocks)
    
associations = sum(len(w2s) for w1, w2s in d.items())
print('associations: ', associations)


Change in memory: -156.8 MB
Time cost (s)   : 2.37 s

associations:  7542089


# Numpy

A quick demo of how to use the integer version of the context blocks.

In [29]:
# Take on particular block
a = context_blocks_array[500]
# Take only the assigned words from the block (drop "-1"s)
b = a[a>-1]
print('Words in this block:\n\n',b, end='\n'*2)
x = np.zeros(200, dtype='i4')
x[5:5+len(b)] = b
print('Pair combinations of these words:', end='\n'*2)
for _ in list(combinations(b, 2)):
    print(_, end=",")

Words in this block:

 [13888 36134 46814  4577  7246 12049 13914 15509 15514 18894 23418 25032
 57721 59631 60784]

Pair combinations of these words:

(13888, 36134),(13888, 46814),(13888, 4577),(13888, 7246),(13888, 12049),(13888, 13914),(13888, 15509),(13888, 15514),(13888, 18894),(13888, 23418),(13888, 25032),(13888, 57721),(13888, 59631),(13888, 60784),(36134, 46814),(36134, 4577),(36134, 7246),(36134, 12049),(36134, 13914),(36134, 15509),(36134, 15514),(36134, 18894),(36134, 23418),(36134, 25032),(36134, 57721),(36134, 59631),(36134, 60784),(46814, 4577),(46814, 7246),(46814, 12049),(46814, 13914),(46814, 15509),(46814, 15514),(46814, 18894),(46814, 23418),(46814, 25032),(46814, 57721),(46814, 59631),(46814, 60784),(4577, 7246),(4577, 12049),(4577, 13914),(4577, 15509),(4577, 15514),(4577, 18894),(4577, 23418),(4577, 25032),(4577, 57721),(4577, 59631),(4577, 60784),(7246, 12049),(7246, 13914),(7246, 15509),(7246, 15514),(7246, 18894),(7246, 23418),(7246, 25032),(7246, 57721),(724

### Tools for the numpy work: faster combinations, and `lru_cache`

In [31]:
from scipy.misc import comb
from itertools import chain
from functools import lru_cache

# The basic strategy is to build INDICES of 
# combinations, and then use Numpy's clever
# index assignment to generate the actual 
# combinations arrays.

@lru_cache()
def comb_index(n, k):
    count = comb(n, k, exact=True)
    index = np.fromiter(chain.from_iterable(combinations(range(n), k)), 
                        'i4', count=count*k)
    return index.reshape(-1, k)

def combb(data):
    idx = comb_index(len(data), 2)
    return data[idx]

# It turns out that 2-combinations are efficiently produced via an upper
# triangluar array. Other than that, same as before, we first calculate
# the INDICES array, and then pass that into our data to build the
# actual list of combinations.

@lru_cache()
def comb_index_triu(n, k):
    return np.array(np.triu_indices(n, 1)).T
    
def combtriu(data):
    idx = comb_index_triu(len(data), 2)
    return data[idx]

print('Compare the first few elements of each combinations function:', end='\n\n')
print(combb(b[:3]))
print(combtriu(b[:3]))

Compare the first few elements of each combinations function:

[[13888 36134]
 [13888 46814]
 [36134 46814]]
[[13888 36134]
 [13888 46814]
 [36134 46814]]


## Basic numpy method.  All arrays, uses fast combinations functions.

In [35]:
import numpy as np
from scipy.misc import comb

def method_numpy1(context_blocks_array, max_words_per_block=40):
    """
    We create one, very long array (many rows) with 2 columns.  Every time
    we add a co-occurence, we simply use a new row to record the two words.
    There are some clever tricks inside the sub methods, mostly about how 
    to work with the combinations efficiently, but basically this pretty 
    much just records every co-occurence in a pretty dumb way.
    
    It turns out this is also quite fast.
    
    Note that we DON'T sum the counts here.  This means that the output 
    array will have duplicated pairs. IOW there will be multiple rows
    with the same two entries.  Afterwards, you will have to sum the
    duplicates to determine the co-occurence counts.
    """
    # Pre-allocation of array: WORST CASE
    p = comb(max_words_per_block, 2)
    n = int(len(context_blocks_array) * p)
    print('Worst-case pre-allocation is {:,} entries.'.format(n))
    co = np.zeros((n, 2), dtype='i4') - 1
    end = 0  # Keep track of position in the allocation array

    for block in context_blocks_array:
        # Combinations of words in this block. (m, 2) array
        new_entries = combtriu(block[block>-1])  
        # Copy the new associations directly in
        co[end:end+len(new_entries), :] = new_entries
        # Move the "current position" marker
        end += len(new_entries)
    
    # Return an array of the correct size (truncate)
    print('Actual count turned out to be {:,} entries.'.format(end+1))
    return co[:end, :]

### Performance Test

In [36]:
try:
    del co
except:
    pass

with Stats():
    co = method_numpy1(context_blocks_array)
    
associations = len(co)
print('associations: {:,}'.format(associations))

Worst-case pre-allocation is 78,000,000 entries.
Actual count turned out to be 7,557,317 entries.

Change in memory: 595.1 MB
Time cost (s)   : 1.318 s

associations: 7,557,316


### How to use the output?  Use slicing.

In [50]:
# Demo of use
def top_cooccurences(co, word, most_common_count=10):
    """ 
        co: one big array (n x 2).  Each entry is an individual co-occurence.
        word: A word that you want to find the co-occurences for.
        most_common_count: The number of most common co-occurences to return.
        
    You give a word, this function returns the 
    other words most strongly associated with
    it, along with the counts.
    """
    ix = wordsd[word]
    # Find most common pair with "capivi"
    entries_above = co[co[:,0]==ix]
    entries_below = co[co[:,1]==ix]

    single_array = np.concatenate((entries_above[:,1], entries_below[:,0]), axis=0)  
    idx, counts = np.unique(single_array, return_counts=True)
    
    other_words = [words[idx[_]] for _ in range(most_common_count)]
    return other_words, counts[:most_common_count]

In [51]:
top_cooccurences(co, 'capivi', 3)

(['abator', 'Abie', 'abortient'], array([1, 1, 1]))

To find the most common associations in the entire result, you would have to build a sparse array to count them.

**Note that the act of building the sparse array will also count duplicate entries automatically. It's doing some of our work for us basically.**

In [180]:
def make_sparse(co):
    return csr_matrix(
            (np.ones(co.shape[0], dtype='u4'), (co[:,0], co[:,1])),
            dtype='u4')

m = make_sparse(co)
m.shape

(60985, 61000)

Now we can query the top counts across the entire array quite easily.

In [59]:
# All entries with a cooccurence > 2
# The two arrays returned are indexes for each dimension.
m[m>2].nonzero()

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24]))

If you're **only interested in rows**, you could also sum the array across the columns and see what comes up.

In [67]:
sums = m.sum(axis=1)
max_count_index = sums.argmax()
max_count_value = sums[max_count_index, 0]
print('Word with the biggest count is {} with {}.'.format(max_count_index, max_count_value))
print('(That word is {})'.format(words[max_count_index]))

Word with the biggest count is 15186 with 462.
(That word is Atlantean)


What if you want to find the top 10 **ROWS**?

In [175]:
def top_rows(m: "sparse array", count=10):
    sums = m.sum(axis=1).ravel()
    #print(sums.shape, sums)
    indices = np.argsort(sums, 1)
    #print(indices.shape, indices)
    indices = indices[0, -count:]
    #print(indices.shape, indices)
    #print(indices[0, -2])
    for ix in range(-1, -count-1, -1):
        i = indices[0, ix]
        print('{:20} : {:<}'.format(words[i], sums[0,i]))
        
# Demo
print('The top 2:')
print('==========')
top_rows(m, 2)
print()
print('The top 10:')
print('===========')
top_rows(m, 10)

The top 2:
Atlantean            : 462
Alaska               : 444

The top 10:
Atlantean            : 462
Alaska               : 444
Drepanaspis          : 440
Aghorapanthi         : 440
Acrisius             : 436
acrobatic            : 434
Cahokia              : 430
Catalan              : 429
Ciceronically        : 428
Asteroxylon          : 412


# Very large test

In [176]:
with Stats():
    new_cb = make_context_blocks(num_blocks=int(2e5), word_count=(5,40))
    
with Stats():
    new_cba = make_cb_array(new_cb)

print(len(new_cb), len(new_cba))


Change in memory: 33.63 MB
Time cost (s)   : 9.059 s


Change in memory: 152.6 MB
Time cost (s)   : 2.071 s

200000 200000


try:
    del co2
except:
    pass

with Stats():
    co2 = method_numpy1(new_cba)
    
print('Associations  : ','{0:,}'.format(len(co2)))
print('Size of result: {:,.2f} MB'.format(co2.nbytes/1024/1024))
    

### Check out the top 5 rows

In [188]:
with Stats():
    m = make_sparse(co2)
    
print('Size of sparse matrix: {:,.2f} MB'.format(m.data.nbytes/1024/1024))
print()
print('Top 5 rows (words):')
print()
    
with Stats():
    top_rows(m, 5)


Change in memory: 0 MB
Time cost (s)   : 4.389 s

Size of sparse matrix: 211.78 MB

Top 5 rows (words):

Byrsonima            : 2588
Amy                  : 2563
Anemopsis            : 2562
Agyieus              : 2543
Buyides              : 2493

Change in memory: 0 MB
Time cost (s)   : 0.2612 s



# What about `dict` but with ints and our numpy tools?

The results are pretty bad, surprisingly so.  Needs more investigation to figure out why.

In [202]:
def method_dict_int(context_blocks_array):
    """
    Given a list of blocks (each containing 5-40 words), build a dict that 
    itself contain dicts. the inner dict has a count of the number of associations
    between the outer key and the inner key.
    
    THIS ONE USES INTEGERS EVERYWHERE.
    """
    d = defaultdict(lambda: defaultdict(int))
    for block in context_blocks_array:
        for w1, w2 in combtriu(block[block>-1]):
            d[w1][w2] += 1
    return d

with Stats():
    d = method_dict_int(context_blocks_array)
    
associations = sum(sum(w2s.values()) for w1, w2s in d.items())
print('associations: ', associations)
print(len(context_blocks_array))


Change in memory: 238.1 MB
Time cost (s)   : 16.47 s

associations:  7557316
100000


# Sparse

In [203]:
import numpy as np
from scipy.misc import comb

def method_sparse(context_blocks_array, max_words_per_block=40, max_section_length=int(1e7)):
    """
    Series of sparse matrix constructions.
    
    max_section_length is a setting.  Tweak to trade-off CPU vs RAM.
    """    
    # Max combinations possible in each block
    p = comb(max_words_per_block, 2)     
    
    # Buffers 
    ones = np.ones(max_section_length, dtype='u2')
    co = np.zeros((max_section_length, 2), dtype='u2')
    end = 0  # Keep track of position in the allocation array 
    
    # The max number of unique words.  Might need to go up.
    # Sets num rows and cols for the output sparse matrix
    ns = 2**16  # (65536) 
    # Output. Stores co-occurrence totals between word pairs.
    # The datatype determines the max count possible, and also the 
    # memory cost of the sparse matrix.  'u2' is quite aggressively
    # small. u4 shouldn't be much worse.
    m = csr_matrix((ns, ns), dtype='u2')  # 
    
    for block in context_blocks_array:
        # Combinations of words in this block.
        new_entries = combtriu(block[block>-1])  
        #new_entries = combb(block[block>-1]) 
        # Copy the new associations directly in
        co[end:end+len(new_entries), :] = new_entries
        # Move the "current position" marker
        end += len(new_entries)
        # Buffer might be full
        full = end > max_section_length - p  # Account for next iteration fill-up, worst case
        if full:
            m += csr_matrix((ones[:end], (co[:end, 0], co[:end, 1])), (ns, ns))
            end = 0 # Reset back to start
    
    if end > 0:
        m += csr_matrix((ones[:end], (co[:end, 0], co[:end, 1])), (ns, ns))
    return m    
    
try:
    del m
except:
    pass

print('Length of context_blocks_array:',len(context_blocks_array))
with Stats():
    m = method_sparse(context_blocks_array[:100000], max_section_length=int(1e7))
    
print('Total co-occurences: {:,}'.format(m.sum()))

Length of context_blocks_array: 100000

Change in memory: 228.7 MB
Time cost (s)   : 1.336 s

Total co-occurences: 7,557,316


### Using the sparse array

In [205]:
with Stats():
    top_rows(m)

Atlantean            : 462
Alaska               : 444
Aghorapanthi         : 440
Drepanaspis          : 440
Acrisius             : 436
acrobatic            : 434
Cahokia              : 430
Catalan              : 429
Ciceronically        : 428
Asteroxylon          : 412

Change in memory: 26.73 MB
Time cost (s)   : 0.0465 s



## Try out the big one

In [206]:
try:
    del m
except:
    pass

print('Length of context_blocks_array:',len(new_cba))
with Stats():
    m = method_sparse(new_cba, max_section_length=int(1e6))
print('Total co-occurences: {:,}'.format(m.sum()))
print('Size of sparse matrix: {:,.2f} MB'.format(m.data.nbytes/1024/1024))

Length of context_blocks_array: 200000

Change in memory: 283.2 MB
Time cost (s)   : 12.41 s

Total co-occurences: 56,363,093
Size of sparse matrix: 105.89 MB


### Memory is great but it's a bit on the slow size.

We can increase the buffer size, reducing the number of times a sparse matrix has to be built internally.  Let's search for the optimum.

In [207]:
try:
    del m
    #sleep(0)
except:
    pass

print('Length of context_blocks_array:',len(new_cba))
for i in range(1,11):
    size = int(i*1e7)
    print('*********************')
    print('Buffer size: {:,}'.format(size))
    print('*********************')
    with Stats():
        m = method_sparse(new_cba, max_section_length=size)
    print('Total co-occurences: {:,}'.format(m.sum()))
    print('Size of sparse matrix: {:,.2f} MB'.format(m.data.nbytes/1024/1024))

Length of context_blocks_array: 200000
*********************
Buffer size: 10,000,000
*********************

Change in memory: 644.1 MB
Time cost (s)   : 6.552 s

Total co-occurences: 56,363,093
Size of sparse matrix: 105.89 MB
*********************
Buffer size: 20,000,000
*********************

Change in memory: 260.3 MB
Time cost (s)   : 6.401 s

Total co-occurences: 56,363,093
Size of sparse matrix: 105.89 MB
*********************
Buffer size: 30,000,000
*********************

Change in memory: 343.7 MB
Time cost (s)   : 6.302 s

Total co-occurences: 56,363,093
Size of sparse matrix: 105.89 MB
*********************
Buffer size: 40,000,000
*********************

Change in memory: -0.2773 MB
Time cost (s)   : 6.904 s

Total co-occurences: 56,363,093
Size of sparse matrix: 105.89 MB
*********************
Buffer size: 50,000,000
*********************

Change in memory: -376.3 MB
Time cost (s)   : 7.216 s

Total co-occurences: 56,363,093
Size of sparse matrix: 105.89 MB
******************

Looks like we get our best timings with a buffer length of 3e7.

# Sparse + Cython

In [245]:
%%cython -a

cimport numpy as np
import numpy as np
from scipy.misc import comb
from itertools import chain
from functools import lru_cache
from scipy.sparse import csr_matrix

@lru_cache()
def comb_index_triu(n, k):
    return np.array(np.triu_indices(n, 1)).T
    
def combtriu(data):
    idx = comb_index_triu(len(data), 2)
    return data[idx]

def method_sparse_cy1(
            int[:, :] context_blocks_array, 
            int max_words_per_block=40, 
            int max_section_length=int(1e7)):
    """
    Series of sparse matrix constructions.
    
    max_section_length is a setting.  Tweak to trade-off CPU vs RAM.
    """    
    # Max combinations possible in each block
    cdef int p = comb(max_words_per_block, 2)     
    
    # Buffers 
    cdef np.ndarray ones = np.ones(max_section_length, dtype='u2')
    cdef np.ndarray co = np.zeros((max_section_length, 2), dtype='u2')
    cdef np.ndarray new_entries
    cdef long end = 0  # Keep track of position in the allocation array 
    
    # The max number of unique words.  Might need to go up.
    # Sets num rows and cols for the output sparse matrix
    cdef long ns = 2**16  # (65536) 
    # Output. Stores co-occurrence totals between word pairs.
    # The datatype determines the max count possible, and also the 
    # memory cost of the sparse matrix.  'u2' is quite aggressively
    # small. u4 shouldn't be much worse.
    m = csr_matrix((ns, ns), dtype='u2')  # 

    cdef int i, j, k, nn, n = context_blocks_array.shape[0]
    cdef int[:] block
    cdef int[:,:] pbuffer = np.zeros((p, 2), dtype='u2')
    
    for i in range(n):
        
        j = 0
        while j < max_words_per_block:
            if context_blocks_array[i, j] == -1:
                break 
            j += 1

        block = context_blocks_array[i, :j]
        
        for k in range()
        
        
        
        print("***")
    
    #for block in context_blocks_array:
        # Combinations of words in this block.
        new_entries = combtriu(block)  
        #new_entries = combb(block[block>-1]) 
        # Copy the new associations directly in
        co[end:end+len(new_entries), :] = new_entries
        # Move the "current position" marker
        end += len(new_entries)
        # Buffer might be full
        full = end > max_section_length - p  # Account for next iteration fill-up, worst case
        if full:
            m += csr_matrix((ones[:end], (co[:end, 0], co[:end, 1])), (ns, ns))
            end = 0 # Reset back to start
    
    if end > 0:
        m += csr_matrix((ones[:end], (co[:end, 0], co[:end, 1])), (ns, ns))
    return m    

In [246]:
try:
    del m
except:
    pass

print('Length of context_blocks_array:',len(context_blocks_array))
print(context_blocks_array.shape, context_blocks_array.dtype)
with Stats():
    #m = method_sparse_cy1(context_blocks_array, max_section_length=int(1e7))
    m = method_sparse_cy1(context_blocks_array)
    
print('Total co-occurences: {:,}'.format(m.sum()))
print('Size of sparse matrix: {:,.2f} MB'.format(m.data.nbytes/1024/1024))

Length of context_blocks_array: 100000
(100000, 100) int32
***

Change in memory: 0 MB
Time cost (s)   : 0.04149 s



TypeError: only integer arrays with one element can be converted to an index

In [226]:
try:
    del m
except:
    pass

print('Length of context_blocks_array:',len(new_cba))
with Stats():
    m = method_sparse_cy1(new_cba, max_section_length=int(1e7))
    
print('Total co-occurences: {:,}'.format(m.sum()))
print('Size of sparse matrix: {:,.2f} MB'.format(m.data.nbytes/1024/1024))

Length of context_blocks_array: 200000

Change in memory: 288.7 MB
Time cost (s)   : 6.028 s

Total co-occurences: 56,363,093
Size of sparse matrix: 105.89 MB
