In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

sys.path.extend(
    [
        'src/poultry',
    ]
)

In [274]:
import string
from itertools import chain
from collections import deque

import numpy as np
from scipy import spatial

In [224]:
tweet = 'a some text! я'

In [225]:
class Vectorizer:

    def __init__(self, features=string.ascii_lowercase + string.digits, length=3):
        self.all_features = set(features)
        self.feature_map = {f: i for i, f in enumerate(features, start=1)}
        self.length = length

    def features(self, text):
        text = text.lower()
    
        features = filter(None, (self.feature_map.get(f) for f in text))
        features = chain(features, [0] * (self.length - 1))

        window = deque([0] * (self.length - 1), self.length)
        for current in features:
            window.append(current)
            yield sum(f * len(self.all_features) ** i for i, f in enumerate(reversed(window)))
            
    def __call__(self, text):
        return np.fromiter(self.features(text), int)

In [226]:
vectorizer = Vectorizer()

In [227]:
vectorizer.feature_map[tweet[0]]

1

In [229]:
# vectorizer.feature_map[tweet[0]], vectorizer.feature_map[tweet[1]]

In [230]:
vectorizer(tweet)

array([    1,    55,  1995, 25177, 19913, 17048,  7205, 26124,  7364,
       31824, 25920])

In [369]:
class Collection:
    
    def __init__(self, vectorizer):
        self.vectorizer = vectorizer
        self.df = np.zeros(len(vectorizer.feature_map) ** vectorizer.length)
        
    
    def append(self, text):        
        features = self.vectorizer(text)
        self.df[features] += 1

        return self._idf(features)

        
    def __getitem__(self, key):
        features = self.vectorizer(key)
        return self._idf(features)

    def _idf(self, features):
#         seen_features = features[np.where(self.df[features] > 0)]
        result = np.zeros(self.df.shape[0])
        result[features] = 1 / self.df[features]
        return result
        


In [370]:
collection = Collection(vectorizer)

In [371]:
collection.append('a tweet')

array([ 0.,  1.,  0., ...,  0.,  0.,  0.])

In [372]:
collection.append('another tweet')

array([ 0. ,  0.5,  0. , ...,  0. ,  0. ,  0. ])

In [373]:
collection['a tweet']

array([ 0. ,  0.5,  0. , ...,  0. ,  0. ,  0. ])

In [374]:
spatial.distance.cosine(collection['a tweet'], collection['another tweet'])

0.73986700914276393

In [377]:
collection.append('some other tweet')

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [379]:
(
    spatial.distance.cosine(collection['a tweet'], collection['another tweet']),
    spatial.distance.cosine(collection['a tweet'], collection['some other tweet']),
)

(0.78610451518429436, 0.88128211556804148)

In [382]:
collection.append('totally different piece')

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [384]:
(
    spatial.distance.cosine(collection['a tweet'], collection['another tweet']),
    spatial.distance.cosine(collection['a tweet'], collection['some other tweet']),
    spatial.distance.cosine(collection['a tweet'], collection['totally different piece']),
)

(0.78610451518429436, 0.88128211556804148, 1.0)

In [334]:
another_tweet

array([ 0. ,  0.5,  0. , ...,  0. ,  0. ,  0. ])

In [195]:
np.argwhere(collection.df).shape

(18, 1)