## Description
This feature is built on a Transformer model. End to end, a request is handled as follows:
- The user's text description is taken as input
- Text preprocessing takes place
- Text tokenization
- Inference using Transformer
- Calculating the cosine similarity between the request embedding and the game-description embeddings, i.e. finding the nearest neighbors in latent space
- Sorting by similarity and displaying the top recommended games matching the user's description

In [299]:
import html
import re
import sqlite3

import numpy as np
import pandas as pd
import torch
import tqdm

from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

In [256]:
class CFG:
    """Filesystem locations shared by the cells of this notebook."""

    # Pickle file that holds the precomputed description embeddings.
    PATH_TO_EMBEDDINGS = "./data/boardgames_embeddings.pickle"
    # SQLite database file with the board-game tables.
    PATH_TO_DB = "./data/board_games.sqlite"

In [2]:
# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
pretrained_name = "albert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(pretrained_name)
model = AutoModel.from_pretrained(pretrained_name)

Downloading pytorch_model.bin: 100%|███████| 47.4M/47.4M [00:16<00:00, 2.89MB/s]
Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.bias', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.LayerNorm.weight', 'predictions.dense.weight', 'predictions.LayerNorm.bias', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Show the tokenizer's special tokens next to their ids.
tokenizer.all_special_tokens, tokenizer.all_special_ids

(['[CLS]', '[SEP]', '<unk>', '<pad>', '[MASK]'], [2, 3, 1, 0, 4])

In [184]:
def clean_text(text: str) -> str:
    """Strip markup and links from ``text`` and collapse whitespace.

    Steps, in order: replace tag-like ``<...>`` spans with a space, replace
    link-like tokens with a space, unescape HTML entities, then squeeze every
    run of whitespace into a single space. Leading and trailing whitespace is
    collapsed to one space but not stripped.

    Args:
        text: Raw text. ``None`` or an empty string is accepted and yields
            ``''`` (e.g. a NULL description produced by a LEFT JOIN).

    Returns:
        The cleaned text.
    """
    if not text:
        # re.sub raises TypeError on None; treat "no text" as empty text.
        return ''
    text = re.sub(r'<[^\n\r<>]+>', ' ', text)  # remove tags
    text = re.sub(r'(http[s]?://)?[\w\.]+\.[a-z]{2,3}[\S]+', ' ', text)  # remove links
    # Entities are decoded after tag removal, so escaped markup such as
    # "&lt;b&gt;" ends up as literal "<b>" in the result.
    text = html.unescape(text)  # convert html symbols
    return re.sub(r'[\s]+', ' ', text)

def get_text_embedding(text, tokenizer, model):
    """Embed ``text`` as the model's hidden state at the first token position.

    The text is passed through ``clean_text``, tokenized with truncation
    enabled, and run through ``model`` in a single forward pass.

    Args:
        text: Raw text to embed.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors='pt', truncation=True)``; its
            result is unpacked as keyword arguments for ``model``.
        model: Callable whose output exposes ``last_hidden_state``.

    Returns:
        ``last_hidden_state[0][0]`` (first sequence, first token position)
        converted to a plain Python list.
    """
    encoded_input = tokenizer(clean_text(text), return_tensors='pt', truncation=True)
    # Inference only: disabling autograd avoids building a graph that is never
    # used, which saves memory and time on every call.
    with torch.no_grad():
        output = model(**encoded_input)
    return output.last_hidden_state[0][0].tolist()

# Quick check: the entity is unescaped, the tag is dropped and whitespace is collapsed.
clean_text("It's&quot; really</br>     great")

'It\'s" really great'

In [394]:
# Load every (category_id, category) pair, then display the row for `category`.
category = 'Expansion for Base-game'
conn = sqlite3.connect(CFG.PATH_TO_DB)
try:
    cursor = conn.cursor()
    cursor.execute('''
                SELECT category_id, category
                FROM categories
                ''')
    categories = cursor.fetchall()
finally:
    # Release the database handle even if the query fails.
    conn.close()
# Look the row up by name: the query has no ORDER BY, so a fixed position in
# `categories` is not a reliable way to find it. Displays nothing if absent.
next((row for row in categories if row[1] == category), None)

(6, 'Expansion for Base-game')

In [231]:
# NOTE(review): this tuple is not referenced by the query below, which
# hard-codes category 6 — confirm whether the other ids were meant to be excluded too.
category_id = 6, 28, 30, 47, 58

# Fetch (boardgame_id, description) pairs for the games selected by the subquery.
conn = sqlite3.connect(CFG.PATH_TO_DB)
try:
    cursor = conn.cursor()
    # NOTE(review): the subquery keeps every game that has at least one category
    # row other than 6, so a game tagged with 6 *and* another category is still
    # selected — confirm this is the intended filter.
    # Because of the LEFT JOIN, a game without a `descriptions` row comes back
    # as (None, None).
    cursor.execute('''
                SELECT d.boardgame_id, d.description
                FROM    (SELECT DISTINCT boardgame_id
                         FROM boardgame_categories
                         WHERE (category_id <> 6)) t
                LEFT JOIN descriptions d ON d.boardgame_id = t.boardgame_id
                ''')
    data = cursor.fetchall()
finally:
    # Release the database handle even if the query fails.
    conn.close()
len(data)

130973

In [252]:
%%time
embeddings = {}
for game in data:
    embeddings[game[0]] = get_text_embedding(clean_text(game[1]), tokenizer, model)
# pd.DataFrame(embeddings).T.reset_index(names='boardgame_id').to_pickle(CFG.PATH_TO_EMBEDDINGS)

CPU times: user 1d 23h 54min 44s, sys: 7h 13min 58s, total: 2d 7h 8min 43s
Wall time: 23h 46min 33s


In [383]:
def get_games_for_request(text_request: str, tokenizer, model, PATH_TO_EMBEDDINGS, k=5):
    """Return the ids of the ``k`` games whose embeddings best match a request.

    Args:
        text_request: Free-text description of the desired game.
        tokenizer: Tokenizer forwarded to ``get_text_embedding``.
        model: Model forwarded to ``get_text_embedding``.
        PATH_TO_EMBEDDINGS: Path of a pickled DataFrame that has a
            ``boardgame_id`` column; every column after the first is used as
            an embedding component.
        k: Maximum number of ids to return.

    Returns:
        Array with up to ``k`` ``boardgame_id`` values, ordered from most to
        least similar (cosine similarity) to the request embedding.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    df = pd.read_pickle(PATH_TO_EMBEDDINGS)
    request_embedding = get_text_embedding(clean_text(text_request), tokenizer, model)
    # Score all games in one vectorized call instead of one sklearn call per row.
    game_vectors = df.iloc[:, 1:].to_numpy(dtype=float)
    df['similarity'] = cosine_similarity(game_vectors, [request_embedding])[:, 0]
    df = df.sort_values(by='similarity', ascending=False)
    return df.boardgame_id.values[:k]

In [386]:
%%time
# Example query: displays the boardgame ids returned for a free-text request
# (k is left at its default).
text_request = 'some of the types of monopoly games'

get_games_for_request(text_request=text_request, 
                      tokenizer=tokenizer, 
                      model=model, 
                      PATH_TO_EMBEDDINGS=CFG.PATH_TO_EMBEDDINGS)

CPU times: user 42.8 s, sys: 2.96 s, total: 45.8 s
Wall time: 47 s


array([233589, 162226,  94045,  27861, 233694])