In [1]:
from sys import version

# Record the interpreter build, followed by a marker line, at the top of the notebook.
print(version, "HEY", sep="\n")

3.8.2 (default, Mar 11 2020, 00:28:52) 
[Clang 11.0.0 (clang-1100.0.33.17)]
HEY


In [2]:
from app import AppContext, Word as WordModel
from core import LoadConfig
import numpy as np

from time import time

from time_series import TimeSeries

In [10]:
# Shared application context built from the main YAML config; the helpers
# defined below reach the database through ``ctx.db``.
ctx = AppContext(LoadConfig("configs/main.yaml"))


async def _download_slice(word: str, from_: int, to_: int):
    """Yield the stored records of ``word`` whose ``date`` lies in ``[from_, to_]``.

    Both bounds go through ``TimeSeries._normalize``; a falsy result turns the
    lower bound into ``0`` and the upper bound into ``+inf``. The range check
    is inclusive on both ends and is applied to each record yielded by
    ``ctx.db.find``.
    """
    lower = TimeSeries._normalize(from_) or 0
    upper = TimeSeries._normalize(to_) or float("+inf")

    query = WordModel(word=word)
    async for record in ctx.db.find(query):
        if not (upper >= record.date >= lower):
            continue
        yield record


async def download_word(alpha, from_: int, to_: int):
    """Return a ``TimeSeries`` named ``alpha`` built from the dates of its records.

    Only records yielded by ``_download_slice`` for the given bounds are used;
    their ``date`` values are packed into a NumPy array in iteration order.
    """
    dates = [record.date async for record in _download_slice(alpha, from_, to_)]
    return TimeSeries(alpha, np.array(dates))


async def _download(word: str, from_: int, to_: int):
    """Return ``(date, from_id, post_id)`` tuples for the records of ``word``.

    Records come from ``_download_slice`` with the same bounds, and the list
    keeps the order in which they are yielded.
    """
    return [
        (record.date, record.from_id, record.post_id)
        async for record in _download_slice(word, from_, to_)
    ]

async def download_pair(alpha, beta, from_: int, to_: int):
    """Return a ``TimeSeries`` of the records shared by words ``alpha`` and ``beta``.

    A record is identified by its ``(date, from_id, post_id)`` tuple as
    produced by ``_download``. Every tuple of ``alpha`` that also occurs for
    ``beta`` contributes its date, in ``alpha``'s order and with ``alpha``'s
    duplicates kept. The resulting series is named ``"{alpha}&{beta}"``.
    """
    word_alpha_p = await _download(alpha, from_, to_)
    word_beta_p = await _download(beta, from_, to_)

    # Testing membership against the beta *list* costs
    # O(len(alpha) * len(beta)). A set answers the same question in O(1) per
    # lookup. Filtering alpha (rather than intersecting two sets) keeps the
    # original ordering and duplicates.
    try:
        beta_lookup = set(word_beta_p)
    except TypeError:
        # A tuple holds an unhashable field: fall back to the linear scan so
        # the result stays the same as before.
        beta_lookup = word_beta_p

    DATE_INDEX = 0  # position of ``date`` inside the tuples built by _download
    word_ts = np.array(tuple(
        word_info[DATE_INDEX]
        for word_info in word_alpha_p
        if word_info in beta_lookup
    ))
    return TimeSeries(f"{alpha}&{beta}", word_ts)

[2020-04-18 16:53:10.160345] [  INFO   [0m] /[31m55384[0m/ LoadConfig [90mcore/config/__init__.py:13 LoadConfig[0m: Load config source=configs/main.yaml


In [11]:
async def word_counter():
    """Yield ``(word, count)`` pairs from the ``Word`` collection, most frequent first.

    The aggregation groups all documents by their ``word`` field, counts the
    documents in each group and sorts the groups by that count, descending.
    """
    pipeline = [
        {"$match": {}},
        {"$group": {"_id": "$word", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
    ]
    async for row in ctx.db.db.Word.aggregate(pipeline):
        yield row["_id"], row["count"]


In [12]:
if False:
    import pymongo

    await ctx.db.db.Word.create_index([('word', pymongo.HASHED)])
    await ctx.db.db.Word.create_index([('date', pymongo.ASCENDING)])


In [13]:
async def do_dist_search(source_word: str,
                         min_items: int,
                         from_, to_, period,
                         check_info):
    """Measure how far every sufficiently frequent word is from ``source_word``.

    Args:
        source_word: word whose time series is the reference.
        min_items: minimum size a candidate must reach, both in its total
            count from ``word_counter`` and in the length of its series for
            the requested range.
        from_, to_: bounds passed to the download helpers and used as the
            start/stop of the slice applied to every series.
        period: step of that slice.
        check_info: predicate called with the name of a skipped word; when it
            returns a truthy value the reason for skipping is printed.

    Returns:
        A dict mapping each accepted word to a dict with the keys
        ``dist_words``, ``dist_source_core`` and ``dist_candidate_core``,
        where "core" is the series returned by ``download_pair`` for the
        source word and the candidate.
    """
    grid = slice(from_, to_, period)
    results = {}

    source_ts = await download_word(source_word, from_, to_)
    async for name, count in word_counter():
        # Cheap rejection first: the overall count is known without a download.
        if count < min_items:
            if check_info(name):
                print("skipped by count", name, count)
            continue

        candidate = await download_word(name, from_, to_)

        # The word may be frequent overall but rare inside the requested range.
        if len(candidate) < min_items:
            if check_info(name):
                print("skipped by slice", name, count, candidate)

            del candidate
            continue

        # Series of the records shared by the source word and the candidate.
        core = await download_pair(source_word, name, from_, to_)

        # NOTE(review): the source grid looks loop-invariant; consider
        # hoisting it once TimeSeries slicing is confirmed side-effect free.
        source_grid = source_ts[grid]
        candidate_grid = candidate[grid]
        core_grid = core[grid]

        print("Successfully added!", name, count, candidate)

        entry = {
            'dist_words': source_grid.dist(candidate_grid),
            'dist_source_core': source_grid.dist(core_grid),
            'dist_candidate_core': candidate_grid.dist(core_grid),
        }
        results[name] = entry
        print(name, entry)

    return results

In [21]:
import datetime
from_ = datetime.datetime(2019, 12, 1)
to_ = datetime.datetime(2020, 2, 1)
grid_period = datetime.timedelta(days=1)

dists = await do_dist_search("новый", 100, from_, to_, grid_period, lambda name: name == "год")

Successfully added! весь 39085 <TimeSeries of `весь` [2019-11-30T18:07:04Z - 2020-01-31T15:45:01Z] 1741 items, ∑=2582.00>
весь {'dist_words': 223.6023255693017, 'dist_source_core': 262.35472170326955, 'dist_candidate_core': 239.06484475974295}
Successfully added! все 35295 <TimeSeries of `все` [2019-11-30T20:13:34Z - 2020-01-31T16:47:16Z] 1505 items, ∑=2208.00>
все {'dist_words': 209.6854787532985, 'dist_source_core': 284.31144894288025, 'dist_candidate_core': 210.24509506763766}
Successfully added! день 34080 <TimeSeries of `день` [2019-11-30T17:42:42Z - 2020-01-31T15:47:20Z] 1029 items, ∑=1340.00>
день {'dist_words': 316.83749778080244, 'dist_source_core': 353.4572109888268, 'dist_candidate_core': 127.51470503436065}
Successfully added! мой 30660 <TimeSeries of `мой` [2019-12-01T01:19:28Z - 2020-01-31T16:47:16Z] 1290 items, ∑=1951.00>
мой {'dist_words': 211.2889017435606, 'dist_source_core': 324.54737712697664, 'dist_candidate_core': 207.27276714513172}
Successfully added! который 30

{'весь': {'dist_words': 223.6023255693017,
  'dist_source_core': 262.35472170326955,
  'dist_candidate_core': 239.06484475974295},
 'все': {'dist_words': 209.6854787532985,
  'dist_source_core': 284.31144894288025,
  'dist_candidate_core': 210.24509506763766},
 'день': {'dist_words': 316.83749778080244,
  'dist_source_core': 353.4572109888268,
  'dist_candidate_core': 127.51470503436065},
 'мой': {'dist_words': 211.2889017435606,
  'dist_source_core': 324.54737712697664,
  'dist_candidate_core': 207.27276714513172},
 'который': {'dist_words': 272.95604041676745,
  'dist_source_core': 313.5554177493988,
  'dist_candidate_core': 230.69026854204319},
 'свой': {'dist_words': 286.8135282722905,
  'dist_source_core': 325.45660232971153,
  'dist_candidate_core': 228.4162866347319},
 'человек': {'dist_words': 302.25816779700097,
  'dist_source_core': 324.73373708316785,
  'dist_candidate_core': 243.71294590152573},
 'так': {'dist_words': 290.09136491802025,
  'dist_source_core': 328.3823990411

In [22]:
# `dists` is already bound by the cell that awaits `do_dist_search`. Reading it
# back through `Out[21]` depends on the output cache holding an entry for that
# exact execution count, which is not the case after Restart & Run All, so the
# name is used directly. Show how many words were scored as a sanity check.
len(dists)

In [23]:
# Combined score per word: the Euclidean norm of its individual distances.
# A previously stored 'full' value is excluded from the norm, so re-running
# this cell gives the same numbers instead of compounding the old result.
for key, item in dists.items():
    parts = tuple(value for name, value in item.items() if name != 'full')
    item['full'] = np.sum(np.array(parts) ** 2) ** 0.5

In [27]:
# Rank the words by the distance between the source word and the shared core,
# closest first.
_dists = [(name, scores['dist_source_core']) for name, scores in dists.items()]
_dists.sort(key=lambda pair: pair[1])
_dists

[('новый', 0.0),
 ('год', 86.55056325639943),
 ('весь', 262.35472170326955),
 ('все', 284.31144894288025),
 ('друг', 308.88023569014575),
 ('много', 311.6167517961767),
 ('который', 313.5554177493988),
 ('себя', 322.7708165246666),
 ('мой', 324.54737712697664),
 ('человек', 324.73373708316785),
 ('свой', 325.45660232971153),
 ('так', 328.38239904111794),
 ('очень', 332.76117561999325),
 ('тот', 336.87089515124336),
 ('жизнь', 340.02352859765455),
 ('только', 340.79172525165575),
 ('становиться', 342.85565475867537),
 ('время', 345.44029874929186),
 ('самый', 346.5198407017988),
 ('хороший', 347.522661131616),
 ('каждый', 348.9183285526858),
 ('кто', 349.38803642941184),
 ('мочь', 351.4896869041822),
 ('один', 351.83945202322036),
 ('такой', 351.88492437159056),
 ('хотеть', 352.797392280612),
 ('если', 353.05948507298314),
 ('день', 353.4572109888268),
 ('вот', 356.27236771885634),
 ('просто', 356.6763238567988),
 ('чтобы', 357.1890255872932),
 ('уже', 357.53461370893865),
 ('другой', 3