In [31]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import pandas as pd
import numpy as np
import random
from random import choice
from string import ascii_lowercase

from collections import defaultdict

import datetime
import pickle

In [32]:
# NOTE(review): Shop is not referenced directly in this cell; presumably the import
# is required so that unpickling can resolve Shop instances -- confirm before removing.
from assistant import Shop

SHOPS_IMPRINTS = '../StaticData/shop_imprints.pickle_bk'

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load imprint files that were produced by a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open(SHOPS_IMPRINTS, 'rb') as imprints_file:
    shops_imprints = pickle.load(imprints_file)

# shop name -> imprint vector; a later duplicate name overwrites an earlier one.
shop_vecs = {}
for shop_imprint, shop in shops_imprints:
    shop_vecs[shop.name] = shop_imprint

print(len(shop_vecs))

149


In [33]:
class HistoricalPlacesGenerator(object):
    """Generator of synthetic per-user shop-visit histories.

    On construction a pool of user profiles is created: each profile is a random
    e-mail plus a few search tags sampled from ``SHOPS_CSV``. Every generated
    row picks one profile at random and emits one day of visits to the shops
    that those tags map to.
    """
    SHOPS_CSV = '../StaticData/shops.csv'

    @staticmethod
    def generate_string():
        """Return a random lowercase ASCII string of 10 to 20 characters."""
        n = random.randint(10, 20)
        return "".join(choice(ascii_lowercase) for _ in range(n))

    @staticmethod
    def generate_email():
        """Return a random e-mail address on one of three fixed domains."""
        return HistoricalPlacesGenerator.generate_string() + "@" + \
            random.choice(["mail.ru", "yandex.ru", "google.com"])

    @staticmethod
    def generate_date(start_date=datetime.datetime(2019, 8, 1), random_duration=60):
        """Return ``start_date`` shifted forward by 0..``random_duration`` whole days."""
        return start_date + datetime.timedelta(days=random.randint(0, random_duration))

    @staticmethod
    def generate_times(date, n):
        """Return ``n`` sorted random timestamps that all fall on ``date``.

        Args:
            date: ``datetime.date`` or ``datetime.datetime``; only its calendar
                day is used.
            n: number of timestamps to draw.

        Returns:
            Ascending list of ``datetime.datetime`` with minute resolution.
        """
        if isinstance(date, datetime.datetime):
            date = date.date()
        midnight = datetime.datetime.combine(date, datetime.time.min)
        minutes_per_day = 60 * 24
        # The upper bound is exclusive of minute 1440: that value would roll over
        # to 00:00 of the next day and show up as an out-of-order "00:00" at the
        # end of the formatted list.
        return sorted(
            midnight + datetime.timedelta(minutes=random.randint(0, minutes_per_day - 1))
            for _ in range(n))

    @staticmethod
    def get_events_count(date):
        """Return a random number of visits for ``date``.

        Monday-Friday yields 9..27 visits, Saturday/Sunday yields 21..48.
        The weekday is taken from ``date`` itself, not from the current day.
        """
        weekno = date.weekday()
        if weekno < 5:
            count = random.randint(9, 27)
        else:
            count = random.randint(21, 48)
        return count

    def __init__(self, nprofiles=1000):
        """Read the shop catalogue and create ``nprofiles`` random user profiles."""
        self.character_to_shop = {}
        self.profiles = self._generate_profiles(nprofiles)

    def _generate_profiles(self, nprofiles, diff_chars=3):
        """Build the tag -> shop lookup and return ``nprofiles`` random profiles.

        Reads the ``name`` and ``searchTags`` columns of ``SHOPS_CSV``. Tags are
        '|'-separated; a row whose ``searchTags`` is not a string (pandas yields
        NaN for an empty cell) is indexed under its own name instead. When two
        shops share a tag, the later row wins.

        Each profile is ``{'email': str, 'profile': [tag, ...]}`` with
        ``diff_chars`` distinct tags.
        """
        df = pd.read_csv(self.SHOPS_CSV)[['name', 'searchTags']]
        self.character_to_shop = {}
        for _, row in df.iterrows():
            if not isinstance(row['searchTags'], str):
                searchTags = [row['name']]
            else:
                searchTags = row['searchTags'].split('|')
            for tag in searchTags:
                self.character_to_shop[tag] = row['name']
        characters = list(self.character_to_shop.keys())
        return [
            {'email': HistoricalPlacesGenerator.generate_email(),
             'profile': random.sample(characters, diff_chars)
            } for _ in range(nprofiles)]

    def _choose_profile(self):
        """Return one of the pre-generated profiles, chosen uniformly."""
        return random.choice(self.profiles)

    def _generate_places(self):
        """Generate one day of visits for a random profile.

        Returns:
            ``(profile, date, places, times)`` where ``places`` and ``times``
            have equal length and ``times`` is sorted ascending.
        """
        profile = self._choose_profile()

        def generate_place():
            char = random.choice(profile['profile'])
            return self.character_to_shop[char]

        date = HistoricalPlacesGenerator.generate_date()
        places = [generate_place() for _ in range(HistoricalPlacesGenerator.get_events_count(date))]
        times = HistoricalPlacesGenerator.generate_times(date, len(places))
        return profile, date, places, times

    def generate_row(self):
        """Return one dataset row as a dict with string-formatted date and times."""
        profile, date, places, times = self._generate_places()
        return {
            'email': profile['email'],
            'date': date.strftime("%Y-%m-%d"),
            'places': places,
            'times': [time.strftime("%H:%M") for time in times]
        }

    def generate_dataset(self, nrows=10):
        """Return a DataFrame of ``nrows`` independently generated rows."""
        rows = [self.generate_row() for _ in range(nrows)]
        return pd.DataFrame(rows)


# Seed the stdlib RNG so the synthetic dataset is identical on every
# Restart Kernel -> Run All (the generator draws everything from `random`).
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

historical_places_generator = HistoricalPlacesGenerator()
df = historical_places_generator.generate_dataset(nrows=10000)

df.head()

Unnamed: 0,date,email,places,times
0,2019-08-26,tebqnkrqnngumcmgijq@yandex.ru,"[Accessorize, NIKE, Accessorize, NIKE, NIKE, M...","[01:06, 01:47, 01:54, 02:10, 03:16, 05:33, 07:..."
1,2019-09-03,quweykzbwo@mail.ru,"[Mlesna, Mlesna, MANGO, Vans, Vans, MANGO, Van...","[00:20, 00:45, 00:50, 01:44, 02:12, 02:16, 03:..."
2,2019-08-11,cozjjnfuafmasidri@mail.ru,"[MANGO, MANGO, MANGO, Calvin Klein Jeans, Calv...","[00:01, 00:26, 00:42, 01:12, 01:24, 01:39, 01:..."
3,2019-08-09,nqyadzvauvachmqrqab@google.com,"[Brown Art, THE BODY SHOP, THE BODY SHOP, THE ...","[01:24, 02:47, 03:43, 05:09, 06:22, 07:34, 09:..."
4,2019-08-23,jfxqtsdawd@yandex.ru,"[Yves Rocher, Crocs, Crocs, Yves Rocher, Yves ...","[00:34, 00:55, 01:32, 01:48, 02:42, 03:09, 03:..."


In [34]:
class PlacesIndexesMapping(object):
    """Two-way lookup between place names and dense integer indexes.

    Indexes are handed out in order of first appearance while scanning the
    ``places`` column of the given frame, starting at 0.

    Attributes (all public, read by callers):
        n_places: number of distinct places seen.
        place_to_idx: place name -> index.
        idx_to_place: index -> place name.
    """

    # shop_vecs: name -> vector
    def __init__(self, df, shop_vecs):
        self._shop_vecs = shop_vecs
        self.place_to_idx = {}
        self.idx_to_place = {}
        self.n_places = 0
        self._make_mapping(df)

    def _make_mapping(self, df):
        """Register every not-yet-seen place found in ``df['places']``."""
        for visited in df['places']:
            for name in visited:
                if name in self.place_to_idx:
                    continue
                fresh_idx = self.n_places
                self.place_to_idx[name] = fresh_idx
                self.idx_to_place[fresh_idx] = name
                self.n_places = fresh_idx + 1

# Build the place <-> index mapping over the generated dataset and display how
# many distinct places actually occur in it.
indexer = PlacesIndexesMapping(df, shop_vecs)
indexer.n_places

143

In [35]:
class DataInjestor(object):
    """Turns the visits DataFrame into per-user sequences ready for sampling.

    For every distinct e-mail the rows are ordered by ``date`` and their
    ``places`` lists are concatenated into one history. Each history is kept in
    two parallel encodings: place indexes and shop vectors.
    """

    def __init__(self, df, shop_vecs):
        """
        Args:
            df: frame with ``email``, ``date`` and ``places`` columns.
            shop_vecs: mapping place name -> vector; must contain every place
                that occurs in ``df``.
        """
        self.indexer = PlacesIndexesMapping(df, shop_vecs)
        # Keep the mapping that was passed in, so the instance does not silently
        # depend on a module-level `shop_vecs` variable.
        self._shop_vecs = shop_vecs
        self._places_lines_idxs = []
        self._places_lines_vecs = []
        self._prepare_places(df)

    def _prepare_places(self, df):
        """Fill the per-user index and vector histories from ``df``."""
        df_sorted = df.sort_values(by='date')
        histories = defaultdict(list)
        for _, row in df_sorted.iterrows():
            histories[row['email']].extend(row['places'])
        for places in histories.values():
            self._places_lines_idxs.append(
                [self.indexer.place_to_idx[place] for place in places])
            self._places_lines_vecs.append(
                [self._shop_vecs[place] for place in places])

    @staticmethod
    def _sample_window(lines, min_len, max_len):
        """Pick a random history and return a contiguous window of it.

        A history of at most ``max_len`` items is returned whole (the stored
        list itself, not a copy). Otherwise a random start is chosen so that at
        least ``max_len`` items remain, and ``min_len``..``max_len`` of them
        are returned.
        """
        history = random.choice(lines)
        if len(history) <= max_len:
            return history
        start = random.randint(0, len(history) - max_len)
        length = random.randint(min_len, max_len)
        return history[start:start + length]

    def get_size(self):
        """Return the number of distinct places known to the indexer."""
        return self.indexer.n_places

    def sample(self):
        """Return a random window (40..50 items) of one user's place indexes."""
        return self._sample_window(self._places_lines_idxs, 40, 50)

    def sample_vec(self):
        """Return a random window (30..40 items) of one user's shop vectors."""
        return self._sample_window(self._places_lines_vecs, 30, 40)

    def convert_idx_to_place(self, idx):
        """Return the place name registered under ``idx``."""
        return self.indexer.idx_to_place[idx]


# Build the per-user histories and print the length of one randomly sampled
# vector window as a quick sanity check.
injestor = DataInjestor(df, shop_vecs)
print(len(injestor.sample_vec()))

36


In [62]:
def inputTensor(vecs, vec_size=300):
    """Return every vector except the last as a ``(vec_size, 1, seq_len)`` tensor.

    Args:
        vecs: sequence of equally sized vectors (one per visited place).
        vec_size: length of each vector; defaults to 300.

    Note:
        ``reshape`` only re-labels the dimensions, it does not transpose: the
        row-major element order of the ``(seq_len, vec_size)`` data is kept.
        A consumer recovers the per-step layout with
        ``t.view(t.size(2), 1, -1)``, giving ``(seq_len, 1, vec_size)``.
    """
    return torch.tensor(vecs[:-1]).reshape(vec_size, 1, -1)

def targetTensor(vecs, vec_size=300):
    """Return every vector except the first as a ``(vec_size, 1, seq_len)`` tensor.

    Args:
        vecs: sequence of equally sized vectors (one per visited place).
        vec_size: length of each vector; defaults to 300.

    Note:
        ``reshape`` only re-labels the dimensions, it does not transpose: the
        row-major element order of the ``(seq_len, vec_size)`` data is kept.
        A consumer recovers the per-step layout with
        ``t.reshape(t.size(2), 1, -1)``, giving ``(seq_len, 1, vec_size)``.
    """
    return torch.tensor(vecs[1:]).reshape(vec_size, 1, -1)

In [63]:
import torch
import torch.nn as nn

class RNN(nn.Module):
    """Single-layer LSTM followed by a linear projection applied at every step.

    Input is expected in the LSTM's default layout ``(seq_len, batch, input_size)``;
    the result has shape ``(seq_len, batch, output_size)``.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        # Attribute names are part of the saved state_dict keys; keep them stable.
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=1)
        self.hidden2out = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, inp):
        """Run the sequence through the LSTM and project each hidden state."""
        per_step_hidden = self.lstm(inp)[0]  # final (h_n, c_n) state is discarded
        return self.hidden2out(per_step_hidden)

In [64]:
def randomTrainingExample():
    """Draw one random vector history and return its ``(input, target)`` tensors.

    The target is the same history shifted one step ahead of the input.
    """
    history = injestor.sample_vec()
    return inputTensor(history), targetTensor(history)

In [65]:
def train(rnn, optimizer, criterion, src_tensor, tgt_tensor):
    """Perform one optimisation step on a single sequence.

    Both tensors arrive as ``(vec_size, 1, seq_len)`` and are re-laid-out to
    ``(seq_len, 1, vec_size)`` before use. The loss is the square root of
    ``criterion(output, target)``.

    Returns:
        ``(output, loss_value)`` - the model output for the sequence and the
        loss as a Python float.
    """
    rnn.zero_grad()

    steps = src_tensor.size(2)
    output = rnn(src_tensor.view(steps, 1, -1))
    expected = tgt_tensor.reshape(tgt_tensor.size(2), 1, -1)
    root_loss = torch.sqrt(criterion(output, expected))

    optimizer.zero_grad()
    root_loss.backward()
    optimizer.step()

    return output, root_loss.item()

In [67]:
# Seed both RNGs so weight initialisation (torch) and example sampling (random)
# are repeatable on Restart Kernel -> Run All.
random.seed(0)
torch.manual_seed(0)

# 300-dimensional shop vectors in and out, 32 hidden units.
rnn = RNN(300, 32, 300)

learning_rate = 1e-3
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

n_iters = 10000
print_every = 500
plot_every = 500
all_losses = []
total_loss = 0


# The loop variable is named `step`, not `iter`: a top-level `iter` would shadow
# the builtin iter() for every cell executed afterwards.
for step in range(1, n_iters + 1):
    output, loss = train(rnn, optimizer, criterion, *randomTrainingExample())
    total_loss += loss

    if step % print_every == 0:
        # This is the loss of the latest single example, so it is noisy;
        # the per-window average is what gets stored in all_losses below.
        print('%.4f' % (loss))

    if step % plot_every == 0:
        all_losses.append(total_loss / plot_every)
        total_loss = 0

0.2022
0.1901
0.2014
0.0715
0.2101
0.0210
0.2436
0.0887
0.2039
0.2124
0.1972
0.1048
0.2061
0.0071
0.2015
0.0084
0.1055
0.1986
0.0986
0.1044


In [None]:
def find_closest(vec, shops=None):
    """Return the shop whose vector is nearest to ``vec``.

    Distance is the squared Euclidean distance.

    Args:
        vec: query vector (supports ``-`` and ``**`` against the shop vectors,
            e.g. a numpy array).
        shops: optional mapping shop name -> vector. When omitted, the
            notebook-level ``shop_vecs`` mapping is used.

    Returns:
        ``(distance, shop_name)`` of the nearest shop. Exact distance ties are
        broken by the smaller shop name.

    Raises:
        ValueError: if there are no shops to compare against.
    """
    if shops is None:
        shops = shop_vecs
    # min() picks the genuinely closest entry; the previous sorted(...)[1]
    # returned the second-closest one and failed with fewer than two shops.
    return min(
        (((vec - shop_vec) ** 2).sum(), shop) for shop, shop_vec in shops.items())

def predict(src_tensor):
    """Run the trained model on one sequence and map its output to a shop.

    Args:
        src_tensor: tensor laid out as ``(vec_size, 1, seq_len)``; it is
            re-viewed to ``(seq_len, 1, vec_size)`` before the forward pass.

    Returns:
        Whatever ``find_closest`` returns for the selected output vector.
    """
    rnn.eval()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = rnn(src_tensor.view(src_tensor.size(2), 1, -1))
    # NOTE(review): output[0, 0] is the model output for the FIRST time step only.
    # If the goal is the next place after the whole history, output[-1, 0] is
    # probably what is wanted -- confirm intent before changing.
    return find_closest(output[0, 0].cpu().numpy())

# Smoke-test inference: draw one random history window, drop its last visit
# (inputTensor) and display the shop that predict() maps the model output to.
vec = inputTensor(injestor.sample_vec())
predict(vec)

In [None]:
# Dump the model: only the learned weights (state_dict) are written, not the
# module object, so loading requires constructing an RNN instance first.
torch.save(rnn.state_dict(), '../StaticData/model.torch')