In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.neighbors import NearestNeighbors

In [19]:
# Load the raw purchase log from the CSV file into a DataFrame for a first look.
df = pd.read_csv('data/data.csv')


In [29]:
# Price of 'Творог' per merchant: keep the first row seen for each merchant
# and show only the cost and merchant columns.
(
    df.loc[df['ProductName'] == 'Творог']
    .drop_duplicates(subset=['MerchantName'])
    .loc[:, ['ProductCost', 'MerchantName']]
)

Unnamed: 0,ProductCost,MerchantName
1,40,Пятёрочка
20,60,Магнит
201,40,Перекресток


In [None]:
# Reload the raw purchase log so that re-running this cell always starts from the file.
df = pd.read_csv('data/data.csv')

# Normalize only the non-numeric (text) columns: remove every whitespace character and
# lower-case the value, so a multi-word product name becomes a single token for the
# whitespace-splitting tokenizer below. Numeric columns keep their dtype, which keeps
# integer comparisons such as `df.UserId == 0` working; casting the whole frame to
# strings would make every such mask all-False.
text_columns = [
    column for column in df.columns
    if not pd.api.types.is_numeric_dtype(df[column])
]
for column in text_columns:
    df[column] = (
        df[column]
        .astype(str)  # missing values become the literal 'nan', exactly as str() gives
        .str.replace(r'\s', '', regex=True)
        .str.lower()
    )

# filters='' switches off the tokenizer's character filtering, so product names are
# kept verbatim as vocabulary entries.
texts = df.ProductName.tolist()
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
tokenizer.fit_on_texts(texts)

# Inverse vocabulary: integer id -> product token.
rev_word_index = {index: word for word, index in tokenizer.word_index.items()}

In [None]:
# Spot check: show the integer id the fitted tokenizer assigned to this product token
# (written without spaces and in lower case). Raises KeyError if the token is not in
# the vocabulary.
tokenizer.word_index['пополнениевиртуальногосчёта']

In [None]:
import pickle
from pathlib import Path

# Persist the fitted tokenizer so the same vocabulary can be reloaded later.
tokenizer_path = Path('objects/tokenizer.pickle')

# Create the target directory first; opening the file for writing raises
# FileNotFoundError when the directory is missing.
tokenizer_path.parent.mkdir(parents=True, exist_ok=True)

with tokenizer_path.open('wb') as f:
    pickle.dump(tokenizer, f)

In [None]:
# Number of distinct (user, check) pairs in the purchase log.
total_length = df.groupby(['UserId', 'CheckId']).ngroups

# Every sample is one check cut to exactly this many items; shorter checks are dropped.
max_check_items = 9

# Build one token-id sequence per check in a single grouped pass. Grouping does not
# rely on ids being contiguous, zero-based or of a particular dtype, and it avoids
# re-filtering the whole frame for every (user, check) pair. Groups are visited in
# sorted key order, i.e. user by user and check by check.
data = []
for _, check_items in df.groupby(['UserId', 'CheckId']):
    # Map every product in the check to its vocabulary id (KeyError on unknown names).
    token_ids = [
        tokenizer.word_index[item_name.lower()]
        for item_name in check_items.ProductName
    ]

    # Keep only checks that are long enough, truncated to a fixed length.
    if len(token_ids) >= max_check_items:
        data.append(token_ids[:max_check_items])

In [None]:
def train():
    """Fit a dense next-item classifier on the check sequences in ``data``.

    Reads the module-level ``data``, ``max_check_items`` and ``tokenizer``.
    Each row of ``data`` holds ``max_check_items`` token ids: the first
    ``max_check_items - 1`` ids are the input features and the last id is the
    target, one-hot encoded over the tokenizer vocabulary.

    Returns:
        The Keras ``History`` object returned by ``model.fit`` (the fitted
        model should be reachable through ``history.model`` -- verify for the
        installed Keras version).

    Raises:
        ValueError: If ``data`` contains no samples.
    """
    if len(data) == 0:
        raise ValueError('train() needs at least one sample in `data`')

    # Keras Tokenizer ids start at 1 (0 is reserved), so the largest id equals
    # len(word_index). One extra class is needed for that id to be a valid
    # one-hot position and a valid output unit.
    num_classes = len(tokenizer.word_index) + 1

    samples = np.asarray(data, dtype='float32').reshape(-1, max_check_items)
    train_x = samples[:, :max_check_items - 1]
    train_y = tf.keras.utils.to_categorical(
        samples[:, -1].reshape(-1, 1),
        num_classes=num_classes,
    )

    # Functional model: raw token ids -> three ReLU layers -> softmax over the vocabulary.
    inputs = tf.keras.layers.Input(shape=(max_check_items - 1,))
    dense1 = tf.keras.layers.Dense(256, activation=tf.nn.relu)(inputs)
    dense2 = tf.keras.layers.Dense(128, activation=tf.nn.relu)(dense1)
    dense3 = tf.keras.layers.Dense(64, activation=tf.nn.relu)(dense2)

    outputs = tf.keras.layers.Dense(num_classes, activation=tf.nn.softmax)(dense3)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)

    model.compile(
        loss=tf.keras.losses.CategoricalCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        metrics=['accuracy'],
    )

    history = model.fit(
        train_x,
        train_y,
        batch_size=128,
        epochs=25,
        verbose=True,
    )

    return history


In [None]:
# How many nearest checks to retrieve, and which check in `data` is the query.
neighbors_length = 3
data_index = 0

# Brute-force cosine nearest-neighbour index over the token-id rows of `data`.
nnm = NearestNeighbors(
    n_neighbors=neighbors_length,
    metric='cosine',
    algorithm='brute',
)
nnm.fit(data)

# A single query row goes in, so both results have one row of `neighbors_length` entries.
distance, indices = nnm.kneighbors([data[data_index]])

# Pick the matching rows of `data` using the only row of `indices`.
neighbors = np.asarray(data)[indices[0]]


In [None]:
# Select every purchase row that belongs to one user.
# NOTE(review): the comparison assumes UserId holds integers; if the column has been
# cast to strings upstream the mask is all False -- confirm the dtype.
user_id = 0

checks = df.loc[df['UserId'] == user_id]