# Основа модели

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from scipy import sparse
from pyaspeller import YandexSpeller
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
import tqdm
from scipy.sparse.linalg import spsolve
from scipy import spatial
from operator import itemgetter

## Загрузка и предобработка данных

In [17]:
search_history = pd.read_csv('search_history.csv')
print(len(search_history['wbuser_id'].unique()))
search_history = search_history.drop(search_history[search_history.cnt == 0].index)
search_history.dropna(inplace = True)
grouped = search_history[['wbuser_id', 'UQ']].groupby('wbuser_id', as_index=False).agg(' '.join) 

8172206


In [18]:
search_history

Unnamed: 0,wbuser_id,UQ,cnt,locale,weekday,time
0,37bc0ce12ffabce1b1882e66d461ed0e,тапочки женские домашние,1933,Ru,0,10:48:53
1,4636a6706e6736d818816d8657565aa2,чехол для бейджика,1513,Ru,0,10:48:53
2,708f4040baf99acfc9496563edff1b1a,GUESS,4,Ru,0,10:48:53
4,70311ec9008a31f743c164e6f1198c86,фототфон,92272,Ru,0,10:48:53
5,3d5e0b035ee04de0801692081278ef1a,7024,93,Ru,0,10:48:53
...,...,...,...,...,...,...
83919232,8b939f9c7b2c24003477d4408ce908fa,термос для еды,6996,Ru,6,23:30:40
83919233,70311ec9008a31f743c164e6f1198c86,фаллоимитатор вибратор,3288,Ru,6,23:30:40
83919235,c0d75daa762272829028e43a62ff7c75,весы кухонные электронные,1407,Ru,6,23:30:40
83919236,70311ec9008a31f743c164e6f1198c86,термобелье мужское,3233,Ru,6,23:30:40


In [None]:
fixed = []
for i in grouped['UQ']:
    tmp = []
    for j in i.split(' '):
        if len(j) > 1:
            tmp.append(j)
    tmp = list(set(tmp))
    fixed.append(' '.join(tmp))
grouped['fixed'] = fixed

In [38]:
query_popularity = pd.read_csv('query_popularity.csv')
query_popularity.dropna(inplace = True)

In [39]:
queries_initial = query_popularity['query']
dictionary = set()
queries = []
for query in queries_initial:
    for word in query.split(' '):
        if len(word)>1:
            dictionary.add(word)
print(len(dictionary))
dictionary = list(dictionary)

129358


In [41]:
vectorizer = CountVectorizer(lowercase = False)
vectorizer.fit_transform(dictionary)
for i in (set.difference(set(dictionary),set(vectorizer.get_feature_names()))):
    dictionary.remove(i)
print(len(dictionary))

110444


In [None]:
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny")
ans = []
for i in range(110):
    encoded_input = tokenizer(dictionary[i*1000:(i+1)*1000], padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = torch.squeeze(model(**encoded_input)[1])
    ans.append(model_output)
encoded_input = tokenizer(dictionary[110000:], padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    model_output = torch.squeeze(model(**encoded_input)[1])
ans.append(model_output)   
    
queries = torch.cat(ans)

## Код модели и ее инициализация, работает долго

In [None]:
class TrieNode:
    __slots__ = ('value', 'end_of_word', 'children', 'weight')

    def __init__(self, value: str, end_of_word=False):
        self.value = value
        self.end_of_word = end_of_word
        self.children = {}
        self.weight = -1

    def add(self, word_part: str, *, weight: int=-1) -> None:
        if len(word_part) == 0:
            self.end_of_word = True
            self.weight = weight
            return

        first_char = word_part[0]
        node = self.children.setdefault(first_char, TrieNode(first_char))
        node.add(word_part[1:], weight=weight)

    def find_all(self, word_part: str, path: str=""):
        if self.end_of_word:
            yield path + self.value, self.weight

        if len(word_part) > 0:
            char = word_part[0]
            node = self.children.get(char)

            if node is not None:
                yield from node.find_all(word_part[1:], path + self.value)
        else:
            for node in self.children.values():
                yield from node.find_all("", path + self.value)
                
    def autocomplete(self, string):
        split_words = string.split()
        last_word = split_words[-1]
        prefix = ' '.join(split_words[:-1])

        suggestions = self.find_all(last_word)

        full_suggestions = []

        for suggestion in suggestions:
            full_suggestions.append((
                '{}{}'.format((prefix + ' ') if prefix else '', suggestion[0]),
                suggestion[1],
            ))

        sorted_suggestions = sorted(
            full_suggestions,
            key=itemgetter(1),
            reverse=True,
        )

        return {
            'words': list(map(itemgetter(0), sorted_suggestions))
        }

root = TrieNode("")

for word in tqdm.tqdm(query_popularity['query']):
    root.add(word.rstrip('\n'), weight=1)

In [None]:
class ALS():
    def __init__(self,users, queries, dictionary, trie):
        
        #self.queries = queries
        #self.cos = torch.nn.CosineSimilarity(dim=1, eps=1e-08)
        self.trie = trie
        self.tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")
        self.model = AutoModel.from_pretrained("cointegrated/rubert-tiny")  
        self.users = users
        self.dictionary = dictionary
        self.FEATURE_NUM = 15
        self.svd = TruncatedSVD(n_components=15, n_iter=7, random_state=42)
        self.queries = self.svd.fit_transform(queries)
        self.vectorizer = CountVectorizer(lowercase = False)
        self.encoded_dict = self.vectorizer.fit_transform(self.dictionary)
        self.interaction_matrix = self.vectorizer.transform(self.users['fixed'])
        self.interaction_matrix[self.interaction_matrix>0] = 1
        self.users_interest = sparse.random(len(self.users),self.queries.shape[1], density = 0.1)
    
    def implicit_als(self, a=40, it=10, l=0.1):
        conf = self.interaction_matrix * a
        u_s, i_s = self.interaction_matrix.shape
        self.X = sparse.csr_matrix(np.random.normal(size=(u_s, self.FEATURE_NUM)))
        self.Y = np.zeros((i_s, self.FEATURE_NUM)) 
        for i in range(self.Y.shape[0]):
            self.Y[i] = self.queries[i]
        self.Y = sparse.csr_matrix(np.nan_to_num(self.Y))
        Y_I = sparse.eye(i_s)
        I = sparse.eye(self.FEATURE_NUM)
        lI = l * I
        yTy = self.Y.T.dot(self.Y)
        for u in tqdm.trange(u_s):
            u_row = conf[u, :].toarray()
            p_u = u_row.copy()
            p_u[p_u != 0] = 1.0
            CuI = sparse.diags(u_row, [0])
            Cu = CuI + Y_I
            yT_CuI_y = self.Y.T.dot(CuI).dot(self.Y)
            yT_Cu_pu = self.Y.T.dot(Cu).dot(p_u.T)
            for ip in range(it):
                self.X[u] = spsolve(yTy + yT_CuI_y + lI, yT_Cu_pu)
        return
    def predict(self, prefix, idx):
        self.current_user = self.X[idx]
        lenter = len(prefix.split(' '))
        results = set([x.split(' ')[lenter - 1] for x in self.trie.autocomplete(prefix)['words']])
        self.bert_interpretation = self.tokenizer(list(results), padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            model_output = torch.squeeze(self.model(**self.bert_interpretation)[1])
        self.bert_interpretation = self.svd.transform(model_output)
        distance = {}
        for vec, dec in zip(self.bert_interpretation,results) :
            distance[dec] = spatial.distance.cosine(vec, self.current_user.toarray())
        sort_sug = sorted(distance.items(), key=lambda x: x[1])[:10]
        return [x[0] for x in sort_sug]

In [None]:
als = ALS(grouped[:10000],queries,dictionary, root)

## Обучение модельки... вообще быстро, но если увеличивать it, то долговато

In [None]:
als.implicit_als(it=25)

## Сохранение всей модели

In [None]:
import joblib
joblib.dump(als, 'wildals_10000.jbl')

## Загрузка модели и инференс, Тестирование

In [None]:
import joblib
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from scipy import sparse
from pyaspeller import YandexSpeller
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
import tqdm
from scipy.sparse.linalg import spsolve
from scipy import spatial
from operator import itemgetter

class TrieNode:
    __slots__ = ('value', 'end_of_word', 'children', 'weight')

    def __init__(self, value: str, end_of_word=False):
        self.value = value
        self.end_of_word = end_of_word
        self.children = {}
        self.weight = -1

    def add(self, word_part: str, *, weight: int=-1) -> None:
        if len(word_part) == 0:
            self.end_of_word = True
            self.weight = weight
            return

        first_char = word_part[0]
        node = self.children.setdefault(first_char, TrieNode(first_char))
        node.add(word_part[1:], weight=weight)

    def find_all(self, word_part: str, path: str=""):
        if self.end_of_word:
            yield path + self.value, self.weight

        if len(word_part) > 0:
            char = word_part[0]
            node = self.children.get(char)

            if node is not None:
                yield from node.find_all(word_part[1:], path + self.value)
        else:
            for node in self.children.values():
                yield from node.find_all("", path + self.value)
                
    def autocomplete(self, string):
        split_words = string.split()
        last_word = split_words[-1]
        prefix = ' '.join(split_words[:-1])

        suggestions = self.find_all(last_word)

        full_suggestions = []

        for suggestion in suggestions:
            full_suggestions.append((
                '{}{}'.format((prefix + ' ') if prefix else '', suggestion[0]),
                suggestion[1],
            ))

        sorted_suggestions = sorted(
            full_suggestions,
            key=itemgetter(1),
            reverse=True,
        )

        return {
            'words': list(map(itemgetter(0), sorted_suggestions))
        }

class ALS():
    def __init__(self,users, queries, dictionary, trie):
        
        #self.queries = queries
        #self.cos = torch.nn.CosineSimilarity(dim=1, eps=1e-08)
        self.trie = trie
        self.tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")
        self.model = AutoModel.from_pretrained("cointegrated/rubert-tiny")  
        self.users = users
        self.dictionary = dictionary
        self.FEATURE_NUM = 15
        self.svd = TruncatedSVD(n_components=15, n_iter=7, random_state=42)
        self.queries = self.svd.fit_transform(queries)
        self.vectorizer = CountVectorizer(lowercase = False)
        self.encoded_dict = self.vectorizer.fit_transform(self.dictionary)
        self.interaction_matrix = self.vectorizer.transform(self.users['fixed'])
        self.interaction_matrix[self.interaction_matrix>0] = 1
        self.users_interest = sparse.random(len(self.users),self.queries.shape[1], density = 0.1)
    
    def implicit_als(self, a=40, it=10, l=0.1):
        conf = self.interaction_matrix * a
        u_s, i_s = self.interaction_matrix.shape
        self.X = sparse.csr_matrix(np.random.normal(size=(u_s, self.FEATURE_NUM)))
        self.Y = np.zeros((i_s, self.FEATURE_NUM)) 
        for i in range(self.Y.shape[0]):
            self.Y[i] = self.queries[i]
        self.Y = sparse.csr_matrix(np.nan_to_num(self.Y))
        Y_I = sparse.eye(i_s)
        I = sparse.eye(self.FEATURE_NUM)
        lI = l * I
        yTy = self.Y.T.dot(self.Y)
        for u in tqdm.trange(u_s):
            u_row = conf[u, :].toarray()
            p_u = u_row.copy()
            p_u[p_u != 0] = 1.0
            CuI = sparse.diags(u_row, [0])
            Cu = CuI + Y_I
            yT_CuI_y = self.Y.T.dot(CuI).dot(self.Y)
            yT_Cu_pu = self.Y.T.dot(Cu).dot(p_u.T)
            for ip in range(it):
                self.X[u] = spsolve(yTy + yT_CuI_y + lI, yT_Cu_pu)
        return
    def predict(self, prefix, idx):
        self.current_user = self.X[idx]
        lenter = len(prefix.split(' '))
        results = set([x.split(' ')[lenter - 1] for x in self.trie.autocomplete(prefix)['words']])
        self.bert_interpretation = self.tokenizer(list(results), padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            model_output = torch.squeeze(self.model(**self.bert_interpretation)[1])
        self.bert_interpretation = self.svd.transform(model_output)
        distance = {}
        for vec, dec in zip(self.bert_interpretation,results) :
            distance[dec] = spatial.distance.cosine(vec, self.current_user.toarray())
        sort_sug = sorted(distance.items(), key=lambda x: x[1])[:10]
        return [x[0] for x in sort_sug]


als = joblib.load('wildals_10000.jbl')

In [71]:
%%time
prefix = 'сап'
iid = 125

als.predict(prefix, iid)

Wall time: 24 ms


['сапфир',
 'сапфирин',
 'сапропель',
 'сапковский',
 'сапгир',
 'сапольски',
 'сапсан',
 'сапоги',
 'сапоки',
 'сапожки']

In [72]:
%%time
prefix = 'сап'
iid = 12
print(als.users.loc[iid].wbuser_id)
als.predict(prefix, iid)

00001d2da396e9514a77ccfec8182cb0
Wall time: 22 ms


['сапоки',
 'сапольски',
 'сапоги',
 'сапфир',
 'сапои',
 'сапрги',
 'сапожки',
 'сапфирин',
 'с',
 'сапаги']

In [68]:
search_history[search_history.wbuser_id=='00001d2da396e9514a77ccfec8182cb0']

Unnamed: 0,wbuser_id,UQ,cnt,locale,weekday,time
21524176,00001d2da396e9514a77ccfec8182cb0,сникерсы женские,625,Ru,1,19:38:39
21692711,00001d2da396e9514a77ccfec8182cb0,сникерсы женские,625,Ru,1,19:40:37
21700937,00001d2da396e9514a77ccfec8182cb0,сникерсы женские,625,Ru,1,19:40:42
21904578,00001d2da396e9514a77ccfec8182cb0,сникерсы женские,625,Ru,1,19:42:55
22024524,00001d2da396e9514a77ccfec8182cb0,сникерсы женские,625,Ru,1,19:44:09
55323944,00001d2da396e9514a77ccfec8182cb0,ни сы книга,63,Ru,4,14:37:08
77075043,00001d2da396e9514a77ccfec8182cb0,костюм брюки и рубашка,10092,Ru,6,16:39:16
81946063,00001d2da396e9514a77ccfec8182cb0,костюм брюки и рубашка,10092,Ru,6,18:52:51
