In [1]:
import numpy as np
from gensim.models import Word2Vec
import nltk
import pickle
import faiss
import pandas as pd
import json

In [2]:
with open('../data/sequences.npy', 'rb') as f:
    urls = np.load(f, allow_pickle=True)

In [None]:
model = Word2Vec(sentences=urls, vector_size=100, window=5, min_count=1, workers=4)
model.save('../data/urls_vectorization.model')

urls_vocab = list(model.wv.index_to_key) 
with open('../data/urls_vocab.pkl', 'wb') as f:
    pickle.dump(urls_vocab, f)

In [2]:
urls_vectors = np.load('../data/urls_vectorization.model.wv.vectors.npy')
urls_vectors = urls_vectors.astype(np.float32)

with open('../data/urls_vocab.pkl', 'rb') as f:
    urls_vocab = pickle.load(f)

In [2]:
clusters_number = 128

In [None]:
kmeans = faiss.Kmeans(d=urls_vectors.shape[1], k=clusters_number, nredo=20, niter=500, gpu=True)
kmeans.train(urls_vectors)

clusters = kmeans.index.search(urls_vectors, 1)[1].astype(int).flatten()
np.save('../data/clusters.npy', clusters)

In [None]:
clusters_urls = [[] for _ in range(128)] 
for cluster, url in zip(clusters, urls_vocab):
    clusters_urls[cluster].append(url)

In [3]:
with open('../data/clustered_urls.pkl', 'rb') as f:
    clustered_urls = pickle.load(f)

In [3]:
clusters = np.load('../data/clusters.npy')

with open('../data/urls_vocab.pkl', 'rb') as f:
    urls_vocab = pickle.load(f)

urls_clusters = {}
for url, cluster in zip(urls_vocab, clusters):
    urls_clusters[url] = cluster

In [4]:
data = pd.read_table('../data/train')

In [5]:
X = data.drop(columns=['DEF', 'tokens'])

In [None]:
X['urls_hashed'] = X['urls_hashed'].astype(str)
X['urls_hashed'].fillna('')

urls_users = {}

for row in range(len(X['urls_hashed'])):
    values = X['urls_hashed'][row].split(' ')
    if len(values) > 1:
        for i in range(0, len(values), 2):
            if values[i] in urls_users:
                urls_users[values[i]][row] = int(values[i + 1])
            else:
                urls_users[values[i]] = {row: int(values[i + 1])}

    if row % 10000 == 0:
        print(f'Processed {row} rows')

In [None]:
count = 0
clusters_users = [{} for _ in range(clusters_number)] 
for url in urls_users:
    if url in urls_clusters:
        cluster = int(urls_clusters[url])
        for user in urls_users[url]:
            clusters_users[cluster][user] = urls_users[url][user]
        
    count += 1
    if count % 10000 == 0:
        print(f'Processed {count} urls')

In [9]:
with open('../data/clusters_users.json', 'w') as f:
    json.dump(clusters_users, f)

In [3]:
with open('../data/clusters_users.json', 'r') as f:
    clusters_users = json.load(f)

In [None]:
X = data.drop(columns=['DEF', 'tokens', 'urls_hashed'])
for i in range(clusters_number):
    X[f'url_cat_{i}'] = 0

In [None]:
for col_number in range(clusters_number):
    col = f'url_cat_{col_number}'
    count = 0
    for row in range(len(X[col])):
        if row in clusters_users[col_number]:
            X.loc[row, col] = clusters_users[col_number][row] 

        count += 1
        if count % 10000 == 0:
            print(f'Processed {count} users for columnt {col_number}')

In [13]:
print(X['url_cat_2'].value_counts())

0      241506
1       40914
2       13595
3        2961
4        1970
        ...  
127         1
241         1
68          1
91          1
254         1
Name: url_cat_2, Length: 110, dtype: int64


In [14]:
X.to_csv('../data/processed_train.csv', index=False)