In [1]:
from pathlib import Path
from collections import Counter
import datetime
import json

import pandas as pd
import numpy as np
import scipy as sp
import implicit
from gensim.models import word2vec
from tqdm.auto import trange, tqdm
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import joblib

import feature_utils

In [2]:
data_root = Path('data/')
converted_data_path = data_root / 'data_converted'
features_dir = Path('user_features/')
features_root = Path('feature_transformers/')
embeddings_dir = Path('embeddings')

n_urls = 199683
n_users = 415317

domains_numerator = feature_utils.CatNumerator.load('feature_transformers/url_host.json')

In [3]:
def split_line(line):
    return list(map(int, line.split()))

# Histories preparation

In [None]:
users_histories = [[] for _ in range(n_users)]

def _add_history(x):
    users_histories[x['user_id'].values[0]] = list(x['url_host'])

for p in tqdm(list(converted_data_path.iterdir())):
    ds_part = pd.read_parquet(p, columns=['user_id', 'url_host', 'date', ]) \
        .sort_values('date', kind='stable')
    ds_part.groupby('user_id').apply(_add_history)

In [None]:
with open('users_histories.txt', 'wt') as hists_file:
    for h in users_histories:
        hists_file.write(' '.join(map(str, list(h))) + '\n')

In [None]:
%%time
with open('users_histories.txt') as hists_file, joblib.Parallel(batch_size=1024, n_jobs=-1) as pool:
    users_histories = pool(joblib.delayed(split_line)(line) for line in hists_file)

In [None]:
unique_urls, interaction_counters = [], []
with open('users_histories.txt') as hists_file:
    for line in hists_file:
        history = split_line(line)
        urls, counts = np.unique(history, return_counts=True)
        unique_urls.append(urls)
        interaction_counters.append(counts)
unique_urls = np.array(unique_urls, dtype=object)
interaction_counters = np.array(interaction_counters, dtype=object)

In [None]:
np.savez_compressed(
    'interactions_counters.npz', 
    unique_urls=unique_urls, interaction_counters=interaction_counters, 
    allow_pickle=True
)

In [4]:
interactions_counters_file = np.load('interactions/interactions_with_counters.npz', allow_pickle=True)
interactions_sets = interactions_counters_file['unique_urls']
interactions_counters = interactions_counters_file['interaction_counters']

In [6]:
np.save('interactions/interactions.npy', interactions_sets)
np.save('interactions/interaction_counters.npy', interactions_counters)

In [10]:
interactions_shares = np.array(
    [cs / cs.sum() for cs in interactions_counters],
    dtype=object
)

In [18]:
np.save('interactions/interaction_shares.npy', interactions_shares)

# URL normalization

In [4]:
def remove_short(url):
    parts = url.split('.')
    if len(parts) > 2 and len(parts[0]) == 1:
        return '.'.join(parts[1:])

    
def remove_turbo(url):
    if not url.endswith('turbopages.org') and not url.endswith('ampproject.org'):
        return '.'.join(
            p for p in url.split('.')
            if p != 'amp'
        )
    parts = url.split('.')
    if len(parts) == 2:
        return url
    return '.'.join(
        (p or '-') 
        for p in parts[0].split('-')
    ).replace('.-.', '-')


def collapse_ips(url):
    if url.startswith('192.168.'):
        return '192.168.0.1'
    return url


normalizers = [remove_short, remove_turbo, collapse_ips]

In [5]:
known_domains = set(domains_numerator.cats)


def normalize_url(url):
    for norm in normalizers:
        new_url = norm(url)
        if new_url in known_domains:
            url = new_url
    return url

In [6]:
one_part_pass_set = {'localhost', 'chrome-extension'}


def remove_one_part(url):
    return (len(url.split('.')) > 1) or (url in one_part_pass_set)


filters = [remove_one_part]

In [7]:
def check_url(url):
    return all(f(url) for f in filters)


def map_url(url):
    return domains_numerator.transform(normalize_url(url)) if check_url(url) else -1


url_id_mapping = np.array(list(map(map_url, domains_numerator.cats)))

In [8]:
def limit_l2(url):
    return '.'.join(url.split('.')[-2:])

l2_domains = sorted({limit_l2(url) for url in domains_numerator.cats})
l2_domains_indexes = {url: i for i, url in enumerate(l2_domains)}
l2_domain_remapping = np.array([l2_domains_indexes[limit_l2(url)] for url in domains_numerator.cats] + [-1])

url_id_mapping = l2_domain_remapping[url_id_mapping]

# Matrix factorizaion

## Loading data

In [9]:
users, domains, counts, dates = [], [], [], []
for part in feature_utils.read_dir(converted_data_path):
    users.append(part['user_id'].values)
    domains.append(part['url_host'].values)
    counts.append(part['request_cnt'].values)
    dates.append(part['date'].values)
users = np.concatenate(users)
domains = np.concatenate(domains)
counts = np.concatenate(counts)
dates = np.concatenate(dates)

  0%|          | 0/10 [00:00<?, ?it/s]

In [10]:
domains = url_id_mapping[domains]

retain_mask = domains != -1

domains = domains[retain_mask]
users = users[retain_mask]
counts = counts[retain_mask]
dates = dates[retain_mask]

## Training

In [11]:
all_interactions_matrix = sp.sparse.csr_matrix((counts, (users, domains)), shape=(n_users, n_urls))

In [12]:
params = {
    'factors': 256, 
    'iterations': 16, 
    'regularization': 50, 
    'alpha': 100
}

In [14]:
np.random.seed(42)
als = implicit.als.AlternatingLeastSquares(calculate_training_loss=True, random_state=42, **params)
als.fit(all_interactions_matrix)

  0%|          | 0/16 [00:00<?, ?it/s]

## Saving

In [15]:
users_embeddings = als.user_factors.to_numpy()
urls_embeddings = als.item_factors.to_numpy()

timestamp = datetime.datetime.now()
out_dir_path = embeddings_dir / f'als_l2_{timestamp:%m_%d_%H_%M}'
%mkdir {out_dir_path}

np.save(out_dir_path / 'users.npy', users_embeddings)
np.save(out_dir_path / 'urls.npy', urls_embeddings)
with open(out_dir_path / 'params.json', 'wt') as params_file:
    json.dump(params, params_file)

if 'url_id_mapping' in dir():
    np.save(out_dir_path / 'url_mapping.npy', url_id_mapping)

# Word2Vec

## Loading data

In [10]:
histories = []
with open('users_histories.txt') as hists_file:
    for line in hists_file:
        history = split_line(line)
        history = url_id_mapping[history]
        history = history[history >= 0].tolist()
        histories.append(history)

## Training

In [11]:
timestamp = datetime.datetime.now()
model_dir = embeddings_dir / f'w2v_256_l2_{timestamp:%m_%d_%H_%M}'
%mkdir {model_dir}

w2v_model = word2vec.Word2Vec(histories, workers=8, min_count=1, vector_size=256)
w2v_model.save(f'{model_dir}/model')

In [None]:
# w2v_model = word2vec.Word2Vec.load('embeddings/w2v_256')

## Sanity check

In [12]:
url = 'habr.com'
url_id = domains_numerator.transform(url)

for sim_id, _ in w2v_model.wv.most_similar(url_id):
    print(domains_numerator.inv_transform(sim_id))

wikipet-ru.turbopages.org
massaget.kz
normalnijhod.narod.ru
lodki-motors.ru
pugachevsky-site.ru
elenadektereva.ru
itznanie.ru
vbassejn.ru
zvezdagukovo.ru
septik27-ru.turbopages.org


In [14]:
url = 'habr.com'
url_id = l2_domains.index(url)

for sim_id, _ in w2v_model.wv.most_similar(url_id):
    print(l2_domains[sim_id])

moeobrazovanie.ru
russianblogs.com
losst.ru
itisgood.ru
linuxconfig.org
unity3d.ru
pythonru.com
python-scripts.com
xn----dtbqbibcfgew1b.xn--p1ai
gorndelo.ru


## User vectorization

In [None]:
url_counters = np.zeros(n_urls, np.int32)
for history in histories:
    np.add.at(url_counters, history, 1)

In [None]:
url_weights = (np.minimum(url_counters, 100) / 100) ** .5

In [None]:
dim = 256

def vectorize_user(user_history, weight=None, min_count=-1):
    history_vectors = np.array(list(map(w2v_model.wv.get_vector, user_history)))
    if min_count >= 0:
        count_mask = url_counters[user_history] >= min_count
        history_vectors = history_vectors[count_mask]
    if len(history_vectors) == 0:
        return np.zeros(dim)
    if weight == 'norm':
        history_weights = url_weights[user_history].reshape((-1, 1))
        return (history_vectors * history_weights).mean(axis=0)
    elif weight == 'weight':
        history_weights = url_weights[user_history].reshape((-1, 1))
        return (history_vectors * history_weights).sum(axis=0) / history_weights.sum()
    else:
        assert weight is None
    return history_vectors.mean(axis=0)

with joblib.Parallel(n_jobs=-1, batch_size=1024) as pool:
    users_embeddings = pool(joblib.delayed(vectorize_user)(history) for history in tqdm(histories))

  0%|          | 0/415317 [00:00<?, ?it/s]

In [18]:
users_embeddings = np.stack(users_embeddings)

## Saving

In [32]:
urls_embeddings = np.stack([model.wv.get_vector(i) for i in range(n_urls)])
# urls_embeddings = np.stack([w2v_model.wv.get_vector(i) for i in range(len(l2_domains))])

np.save(f'{model_dir}/urls.npy', urls_embeddings)
np.save(f'{model_dir}/users.npy', users_embeddings)
if 'url_id_mapping' in dir():
    np.save(model_dir / 'url_mapping.npy', url_id_mapping)