In [1]:
import numpy as np
import os
import pandas as pd
import threading

from scipy.sparse import load_npz
from typing import Literal, Tuple, Union

GeographyYear = Literal[2010, 2020]

# Testing Interpolation from 2010 Blocks to 2020 Blocks

## Methods

### Helpers

Helpers are ported from [tileserve.py](./tileserve.py) for the purposes of testing interpolation methods.

In [2]:
class LRUCache:
    """Small least-recently-used cache.

    Each entry is kept as a dict holding the cached value under 'data' and
    the tick of its most recent access under 'lastuse'. Ticks come from
    `usecount`, a counter that advances on every insert and every get.
    """

    def __init__(self, max_entries):
        self.max_entries = max_entries
        self.entries = {}
        self.usecount = 0

    def has(self, key):
        """Return True when `key` is cached. Does not count as a use."""
        return key in self.entries

    def get(self, key):
        """Return the value stored under `key` and mark it as just used.

        Raises KeyError when `key` is not cached.
        """
        self.use(key)
        return self.entries[key]['data']

    def use(self, key):
        """Advance the use counter and stamp `key` with the new tick."""
        self.usecount += 1
        self.entries[key]['lastuse'] = self.usecount

    def insert(self, key, val):
        """Store `val` under `key`; evict the stalest entry if over capacity."""
        self.entries[key] = {'data': val}
        self.use(key)
        if len(self.entries) <= self.max_entries:
            return
        # Ticks are unique, so the minimum 'lastuse' identifies exactly one entry.
        stalest_key = min(self.entries, key=lambda k: self.entries[k]['lastuse'])
        del self.entries[stalest_key]

# Notebook-wide in-memory cache of loaded columns, capped at 128 entries.
column_cache = LRUCache(128)

def memory_map(binary_file_path: str, dtype: np.dtype = np.float32) -> np.memmap:
    """Open a raw binary file as a read-only NumPy memory map.

    Args:
        binary_file_path: Path of the file to map.
        dtype: Element type used to interpret the bytes (float32 by default).

    Returns:
        A read-only `np.memmap` over the whole file.
    """
    read_only_view = np.memmap(binary_file_path, mode="r", dtype=dtype)
    return read_only_view

### Load Methods

In [3]:
def try_load_block_data_2010(dataset: str, column: str) -> Tuple[bool, Union[np.memmap, str]]:
    """Load one column of a dataset on 2010 block geography as a float32 memmap.

    Lookup order: the in-memory `column_cache`, then `<column>.float32` in the
    dataset's cache directory, then `<column>.numpy`, which is converted to a
    raw float32 file before being mapped.

    Args:
        dataset: Name of the dataset directory.
        column: Name of the column within that dataset.

    Returns:
        `(True, memmap)` on success, or `(False, message)` when the dataset
        directory or the column cannot be found.
    """
    (cached, cache_key, filename_prefix, filename) = _try_load_from_cache(dataset, column, 2010)

    # A string in the first slot is an error message from the cache lookup.
    if isinstance(cached, str):
        return False, cached

    if cached is not None:
        return True, cached

    if not os.path.exists(filename):
        numpy_filename = f"{filename_prefix}.numpy"
        if not os.path.exists(numpy_filename):
            return False, f"No column named {column} in dataset {dataset}."

        data = np.load(numpy_filename).astype(np.float32)
        # Temp name is unique per process and thread so concurrent builders
        # never write to the same temporary file.
        tmp_filename = f"{filename}.tmp.{os.getpid()}.{threading.current_thread().ident}"

        data.tofile(tmp_filename)

        # os.replace overwrites an existing destination on every platform;
        # os.rename raises on Windows if another worker already created it.
        os.replace(tmp_filename, filename)

    data = memory_map(filename)
    column_cache.insert(cache_key, data)

    return True, data

def try_load_block_data_2020(dataset: str, column: str) -> Tuple[bool, Union[np.memmap, str]]:
    """Load one column of a dataset on 2020 block geography as a float32 memmap.

    Lookup order: the in-memory `column_cache`, then `<column>.2020.float32`
    in the dataset's cache directory. When neither exists, the 2010 column is
    loaded, multiplied by the sparse matrix stored in
    `./crosswalk_matrix_2010_2020.npz`, and the product is written to disk as
    raw float32 before being mapped.

    Args:
        dataset: Name of the dataset directory.
        column: Name of the column within that dataset.

    Returns:
        `(True, memmap)` on success, or `(False, message)` when the dataset
        directory or the 2010 source column cannot be found.
    """
    (cached, cache_key, _, filename) = _try_load_from_cache(dataset, column, 2020)

    # A string in the first slot is an error message from the cache lookup.
    if isinstance(cached, str):
        return False, cached

    if cached is not None:
        return True, cached

    if not os.path.exists(filename):
        (success, data_2010) = try_load_block_data_2010(dataset, column)

        if not success:
            return success, data_2010

        crosswalk_matrix = load_npz("./crosswalk_matrix_2010_2020.npz")
        # The file is read back as float32, so force that dtype here: a
        # float64 crosswalk matrix would otherwise write 8-byte values that
        # get misread as twice as many float32 values.
        data_2020 = np.asarray(crosswalk_matrix.dot(data_2010), dtype=np.float32)
        # Temp name is unique per process and thread so concurrent builders
        # never write to the same temporary file.
        tmp_filename = f"{filename}.2020.tmp.{os.getpid()}.{threading.current_thread().ident}"

        data_2020.tofile(tmp_filename)

        # os.replace overwrites an existing destination on every platform;
        # os.rename raises on Windows if another worker already created it.
        os.replace(tmp_filename, filename)

    data = memory_map(filename)
    column_cache.insert(cache_key, data)
    return True, data

def _try_load_from_cache(dataset: str, column: str, year: GeographyYear = 2010) -> Tuple[Union[None, np.memmap, str], str, str, str]:
    """Look a column up in `column_cache` and work out its on-disk file names.

    Args:
        dataset: Name of the dataset directory under `columncache/`.
        column: Name of the column within that dataset.
        year: Block geography year; 2020 adds a `.2020` infix to the file name.

    Returns:
        A 4-tuple `(result, cache_key, filename_prefix, filename)`:
        - cache hit: `result` is the cached value, the three strings are empty;
        - dataset directory missing: `result` is an error message string, the
          three strings are empty;
        - cache miss: `result` is None and the strings carry the cache key,
          the `<dataset_dir>/<column>` prefix and the `.float32` file path.
    """
    cache_key = f'{dataset}.{year}.{column}'

    if column_cache.has(cache_key):
        return column_cache.get(cache_key), "", "", ""

    cache_dir = "columncache"
    dataset_dir = f"{cache_dir}/{dataset}"

    if not os.path.exists(dataset_dir):
        return f"No such dataset: {dataset}", "", "", ""

    filename_prefix = f"{dataset_dir}/{column}"
    filename = f"{filename_prefix}{'.2020' if year == 2020 else ''}.float32"

    # Return a tuple, matching the annotation and the other return paths.
    return None, cache_key, filename_prefix, filename

In [4]:
# (dataset, column) pairs; they are star-unpacked into the loader calls.
# NOTE(review): dataset names suggest 1990/2000/2010 census counts tabulated on
# 2010 blocks, with P0010001 as the total-population column — confirm.
full_population_count_1990 = ("census1990_block2010", "P0010001")
full_population_count_2000 = ("census2000_block2010", "P0010001")
full_population_count_2010 = ("census2010_block2010", "P0010001")

In [5]:
# Load each column on 2010 geography; every result is a (success, memmap-or-error-message) pair.
full_population_count_1990_2010 = try_load_block_data_2010(*full_population_count_1990)
full_population_count_2000_2010 = try_load_block_data_2010(*full_population_count_2000)
full_population_count_2010_2010 = try_load_block_data_2010(*full_population_count_2010)

In [6]:
# Show (success flag, array shape); on failure the second item is the error message.
(
    full_population_count_1990_2010[0],
    full_population_count_1990_2010[1].shape
    if full_population_count_1990_2010[0]
    else full_population_count_1990_2010[1],
)

(True, (11078298,))

In [7]:
# Show (success flag, array shape); on failure the second item is the error message.
(
    full_population_count_2000_2010[0],
    full_population_count_2000_2010[1].shape
    if full_population_count_2000_2010[0]
    else full_population_count_2000_2010[1],
)

(True, (11078298,))

In [8]:
# Show (success flag, array shape); on failure the second item is the error message.
(
    full_population_count_2010_2010[0],
    full_population_count_2010_2010[1].shape
    if full_population_count_2010_2010[0]
    else full_population_count_2010_2010[1],
)

(True, (11078298,))

In [9]:
# Load the same columns through the 2020 loader; every result is a (success, memmap-or-error-message) pair.
full_population_count_1990_2020 = try_load_block_data_2020(*full_population_count_1990)
full_population_count_2000_2020 = try_load_block_data_2020(*full_population_count_2000)
full_population_count_2010_2020 = try_load_block_data_2020(*full_population_count_2010)

In [10]:
# Show (success flag, array shape); on failure the second item is the error message.
(
    full_population_count_1990_2020[0],
    full_population_count_1990_2020[1].shape
    if full_population_count_1990_2020[0]
    else full_population_count_1990_2020[1],
)

(True, (8174956,))

In [11]:
# Show (success flag, array shape); on failure the second item is the error message.
(
    full_population_count_2000_2020[0],
    full_population_count_2000_2020[1].shape
    if full_population_count_2000_2020[0]
    else full_population_count_2000_2020[1],
)

(True, (8174956,))

In [12]:
# Show (success flag, array shape); on failure the second item is the error message.
(
    full_population_count_2010_2020[0],
    full_population_count_2010_2020[1].shape
    if full_population_count_2010_2020[0]
    else full_population_count_2010_2020[1],
)

(True, (8174956,))

In [16]:
# First 50 values of the 2010 column as returned by try_load_block_data_2010.
full_population_count_2010_2010[1][:50]

memmap([  0.,  61.,   0.,   0.,  75.,   0.,   1.,   0.,  23.,   0.,   1.,
          0.,   0.,   2.,   2.,  70.,  56.,  26.,   0.,  17., 171.,   0.,
          0.,   0.,   9.,   0., 152.,   3.,  29.,   0.,   6.,   0.,   0.,
          8.,  65., 180.,   2.,   4.,   0.,   5.,   6.,  35.,   0.,   1.,
         20.,  80., 203.,   0.,   0.,  34.], dtype=float32)

In [17]:
# First 50 values of the same column as returned by try_load_block_data_2020.
full_population_count_2010_2020[1][:50]

memmap([0.0000000e+00, 2.1607796e+01, 3.9392204e+01, 3.4178200e+01,
        2.2912872e+01, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00,
        0.0000000e+00, 2.2000411e+00, 4.0967807e-02, 1.7589912e+00,
        0.0000000e+00, 1.6941471e+01, 0.0000000e+00, 1.7100000e+02,
        4.1967457e+01, 0.0000000e+00, 7.0000000e+01, 5.6000000e+01,
        2.6000000e+01, 3.0000000e+00, 1.5200000e+02, 2.9000000e+01,
        0.0000000e+00, 9.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        6.0000000e+00, 0.0000000e+00, 0.0000000e+00, 2.4518759e+02,
        2.8000000e+01, 3.6000000e+01, 1.0000000e+02, 8.0000000e+00,
        2.8100000e+02, 1.5000000e+01, 3.4000000e+01, 2.9000000e+01,
        3.9000000e+01, 2.0000000e+01, 1.0000000e+00, 1.8124110e+00,
        0.0000000e+00, 0.0000000e+00, 4.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 1.1000000e+01], dtype=float32)