In [1]:
import pandas as pd
import numpy as np
import autocomplete
from fast_autocomplete import AutoComplete
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from scipy import sparse
from pyaspeller import YandexSpeller
from spellchecker import SpellChecker
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
import tqdm
from scipy.sparse.linalg import spsolve

In [2]:
search_history = pd.read_csv('search_history.csv')
print(len(search_history['wbuser_id'].unique()))
search_history = search_history.drop(search_history[search_history.cnt == 0].index)
search_history.dropna(inplace = True)
grouped = search_history[['wbuser_id', 'UQ']].groupby('wbuser_id', as_index=False).agg(' '.join) 

8172206


In [3]:
fixed = []
for i in grouped['UQ']:
    tmp = []
    for j in i.split(' '):
        if len(j) > 1:
            tmp.append(j)
    tmp = list(set(tmp))
    fixed.append(' '.join(tmp))
grouped['fixed'] = fixed

In [4]:
query_popularity = pd.read_csv('query_popularity.csv')
query_popularity.dropna(inplace = True)

In [5]:
queries_initial = query_popularity['query']
dictionary = set()
queries = []
for query in queries_initial:
    for word in query.split(' '):
        if len(word)>1:
            dictionary.add(word)
print(len(dictionary))
dictionary = list(dictionary)

129358


In [6]:
vectorizer = CountVectorizer(lowercase = False)
vectorizer.fit_transform(dictionary)
for i in (set.difference(set(dictionary),set(vectorizer.get_feature_names_out()))):
    dictionary.remove(i)
print(len(dictionary))

110444


In [7]:
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny")
ans = []
for i in range(110):
    encoded_input = tokenizer(dictionary[i*1000:(i+1)*1000], padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = torch.squeeze(model(**encoded_input)[1])
    ans.append(model_output)
encoded_input = tokenizer(dictionary[110000:], padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    model_output = torch.squeeze(model(**encoded_input)[1])
ans.append(model_output)   
    
queries = torch.cat(ans)

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
from operator import itemgetter

import hug
import tqdm


class TrieNode:
    __slots__ = ('value', 'end_of_word', 'children', 'weight')

    def __init__(self, value: str, end_of_word=False):
        self.value = value
        self.end_of_word = end_of_word
        self.children = {}
        self.weight = -1

    def add(self, word_part: str, *, weight: int=-1) -> None:
        if len(word_part) == 0:
            self.end_of_word = True
            self.weight = weight
            return

        first_char = word_part[0]
        node = self.children.setdefault(first_char, TrieNode(first_char))
        node.add(word_part[1:], weight=weight)

    def find_all(self, word_part: str, path: str=""):
        if self.end_of_word:
            yield path + self.value, self.weight

        if len(word_part) > 0:
            char = word_part[0]
            node = self.children.get(char)

            if node is not None:
                yield from node.find_all(word_part[1:], path + self.value)
        else:
            for node in self.children.values():
                yield from node.find_all("", path + self.value)


with open('words.txt',encoding="utf-8") as f:
    words = f.readlines()

root = TrieNode("")

print('Loading words')
for word in tqdm.tqdm(query_popularity['query']):
    root.add(word.rstrip('\n'), weight=1)

del words


@hug.get('/autocomplete')
def autocomplete(string: str, hug_timer = 0):
    split_words = string.split()
    last_word = split_words[-1]
    prefix = ' '.join(split_words[:-1])

    suggestions = root.find_all(last_word)

    full_suggestions = []

    for suggestion in suggestions:
        full_suggestions.append((
            '{}{}'.format((prefix + ' ') if prefix else '', suggestion[0]),
            suggestion[1],
        ))

    sorted_suggestions = sorted(
        full_suggestions,
        key=itemgetter(1),
        reverse=True,
    )

    return {
        'words': list(map(itemgetter(0), sorted_suggestions)),
        'time_taken': hug_timer,
    }

Loading words


100%|███████████████████████████████████████████████████████████████████████| 336987/336987 [00:18<00:00, 18300.89it/s]


In [None]:
autocomplete('тап')

In [15]:
class ALS():
    def __init__(self,users, queries, dictionary):
        
        #self.queries = queries
        self.users = users
        self.dictionary = dictionary
        svd = TruncatedSVD(n_components=15, n_iter=7, random_state=42)
        self.queries = svd.fit_transform(queries)
        self.vectorizer = CountVectorizer(lowercase = False)
        self.encoded_dict = self.vectorizer.fit_transform(self.dictionary)
        self.interaction_matrix = self.vectorizer.transform(self.users['fixed'])
        self.interaction_matrix[self.interaction_matrix>0] = 1
        self.users_interest = sparse.random(len(self.users),self.queries.shape[1], density = 0.1)
    
    def implicit_als(self, a=40, it=10, l=0.1):
        conf = self.users_interest.toarray() * a #+ sparse.csr_matrix(np.ones(self.users_interest.shape))
        u_s, i_s = self.users_interest.shape
        self.X = self.users_interest
        self.Y = self.queries
        Y_I = sparse.eye(i_s)
        I = sparse.eye(self.queries.shape[1])
        lI = l * I
        yTy = self.Y.T.dot(self.Y)

        for ip in range(it):
            for u in tqdm.tqdm(range(u_s)):
                p_u_el = conf[u, :].copy()
                p_u_el[p_u_el != 0] = 1.0
                CuI = sparse.diags(conf[u, :], 0)
                Cu = CuI + Y_I
                yT_CuI_y=sparse.csr_matrix(self.Y).dot(CuI).dot(sparse.csr_matrix(self.Y.T))
                yT_Cu_pu=sparse.csr_matrix(self.Y).dot(Cu).dot(sparse.csr_matrix(p_u_el).T)
                print('huy')
                self.X[u] = spsolve(yTy + yT_CuI_y + lI, yT_Cu_pu)
        return

            

In [16]:
als = ALS(grouped[0:10000],queries[:1000,:],dictionary[:1000])

In [17]:
als.implicit_als()

  0%|                                                                                        | 0/10000 [00:00<?, ?it/s]

huy





ValueError: operands could not be broadcast together with remapped shapes [original->remapped]: (15,15) and requested shape (1000,1000)