In [1]:
import gzip
import bz2
import pickle
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk import ngrams
from pymystem3 import Mystem
import re
from collections import Counter

In [2]:
from joblib import Parallel, delayed

In [3]:
import os

In [4]:
from tqdm import tqdm

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np

from torch.autograd import Variable

In [7]:
def fixurl(url):
    res = url
    if res.startswith('http://'):
        res = res[7:]
        
    if res.startswith('https://'):
        res = res[8:]
        
    if res.startswith('www.'):
        res = res[4:]
        
    if res.endswith('/'):
        res = res[:-1]
        
    return res

In [8]:
url2id = dict()
with open('./url.data', 'r') as fin:
    for line in fin:
        idx, url = line.strip().split('\t')
    
# with these fixes len(url2id) = 582092 < 582167
        url = fixurl(url)
        
#         if url in url2id:
#             print(url)
    
        url2id[url] = idx

In [9]:
len(url2id)

582094

In [10]:
pattern = re.compile('\d+|[^\W\d]+')
id2title = dict()
with open('./titles.txt', 'r') as fin:
    for line in fin:
#         print(line)
        splits = line.strip().lower().split('\t')
        if len(splits) == 1:
            id2title[splits[0]] = ''
        else:
            id2title[splits[0]] = '#' + '#'.join(pattern.findall(splits[1])) + '#'

In [11]:
len(id2title)

582167

In [12]:
d3g = {}
with open('3gramms-total.dict', 'rb') as fin:
    d3g = pickle.load(fin)

In [13]:
keys = list(map(lambda x: x[0], d3g.most_common()[0:30000]))

In [14]:
cv = CountVectorizer(vocabulary=keys, analyzer='char', ngram_range=(3,3))

In [15]:
class SiameseNet(nn.Module):
    def __init__(self):
        super(SiameseNet, self).__init__()
        
        self.qn = nn.Sequential(
            nn.Linear(30000, 300),
            nn.Tanh(),
            nn.Linear(300, 300),
            nn.Tanh(),
            nn.Linear(300, 128),
        )
        
        self.dn = nn.Sequential(
            nn.Linear(30000, 300),
            nn.Tanh(),
            nn.Linear(300, 300),
            nn.Tanh(),
            nn.Linear(300, 128),
        )
        
        self.cos = nn.CosineSimilarity(dim=1)
    
    def forward(self, X1, X2):
        return self.cos(self.qn(X1), self.dn(X2))

In [16]:
model = SiameseNet()
softmax = nn.Softmax(dim=0)
loss = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.0001)

In [17]:
# y = Variable(torch.FloatTensor([1, 0, 0, 0, 0]), requires_grad=False)

In [18]:
def processB(lines):
    pattern = re.compile('\d+|[^\W\d]+')
    
    result_x1 = np.ndarray((0,30000))
    result_x2 = np.ndarray((0,30000))
    result_y = []
    
    for line in lines:
        splits = line.lower().strip().split('\t')
        text = '#' + '#'.join(pattern.findall(splits[0])) + '#'
        positive = set(splits[2].split(','))
        negative = set(splits[1].split(',')).difference(positive)
        
        for pos in positive:
            x1 = cv.transform([text for i in range(5)]).toarray()
            
            ds = [pos]
            
            if len(negative) >= 4:
                ds += list(np.random.permutation(list(negative))[0:4])
            else:
                ds += list(negative)
                ds += list(map(str, np.random.randint(582167, size=4-len(negative))))
            
            x2 = cv.transform([id2title[doc] for doc in ds]).toarray()
            
            result_x1 = np.append(result_x1, x1, axis=0)
            result_x2 = np.append(result_x2, x2, axis=0)
            result_y += [1, 0, 0, 0, 0]
    return result_x1, result_x2, result_y

In [19]:
#parallels work of creating batch
def get_batch(lines, sz):
    n_threads = 8
    per = sz // n_threads
    
    n_batches = len(lines) // sz
    
    for i in tqdm(range(n_batches)):
        res = Parallel(n_jobs=n_threads)(delayed(processB)(lines[i*sz + j*per : i*sz+(j+1)*per]) for j in range(n_threads))

#         return res

        res_x1 = np.vstack([res[j][0] for j in range(len(res))])
        res_x2 = np.vstack([res[j][1] for j in range(len(res))])
        res_y = np.hstack([res[j][2] for j in range(len(res))])
        
        yield res_x1, res_x2, res_y

In [20]:
# for fname in os.listdir('./clicks/filtered/'):
#     lines = []
#     with open('./clicks/filtered/' + fname, 'r') as fin:
#         lines = fin.readlines()
    
#     for line in tqdm(lines):
#         splits = line.lower().strip().split('\t')
#         text = splits[0]
#         positive = set(splits[2].split(','))
#         negative = set(splits[1].split(',')).difference(positive)
        
#         for pos in positive:
#             x1 = Variable(torch.FloatTensor(cv.transform([text for i in range(5)]).toarray()))
            
#             ds = [pos]
#             ds += list(map(str, np.random.randint(582167, size=4)))
            
#             x2 = Variable(torch.FloatTensor(cv.transform([id2title[doc] for doc in ds]).toarray()))
            
#             out = model(x1, x2)
#             sft = softmax(out)
#             l = loss(sft, y)
            
#             optimizer.zero_grad()
#             l.backward()
#             optimizer.step()

In [None]:
files = os.listdir('./clicks/filtered/')
for fname, cnt in zip(files, range(len(files))):
    lines = []
    with open('./clicks/filtered/' + fname, 'r') as fin:
        lines = fin.readlines()
    
    for x1, x2, y in get_batch(lines, 64):
            x1 = Variable(torch.FloatTensor(x1))
            x2 = Variable(torch.FloatTensor(x2))
            y = Variable(torch.FloatTensor(y), requires_grad=False)
            
            out = model(x1, x2)
            sft = softmax(out)
            l = loss(sft, y)
            
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
    
    torch.save(model.state_dict(), './model_30_' + str(cnt) + '.state')

100%|██████████| 2858/2858 [2:31:04<00:00,  3.17s/it]  
100%|██████████| 2387/2387 [2:04:55<00:00,  3.14s/it]  
100%|██████████| 2409/2409 [2:17:48<00:00,  3.43s/it]  
100%|██████████| 2614/2614 [2:33:15<00:00,  3.52s/it]  
100%|██████████| 1245/1245 [1:12:35<00:00,  3.50s/it]
100%|██████████| 1625/1625 [1:28:54<00:00,  3.28s/it]
100%|██████████| 923/923 [49:37<00:00,  3.23s/it]
100%|██████████| 1269/1269 [1:11:01<00:00,  3.36s/it]
100%|██████████| 2469/2469 [2:16:22<00:00,  3.31s/it]  
100%|██████████| 1460/1460 [1:28:20<00:00,  3.63s/it]
100%|██████████| 1902/1902 [1:49:26<00:00,  3.45s/it]
100%|██████████| 1874/1874 [1:46:45<00:00,  3.42s/it]
100%|██████████| 1802/1802 [1:38:21<00:00,  3.27s/it]
 56%|█████▌    | 1740/3134 [1:39:24<1:19:38,  3.43s/it]

In [23]:
torch.save(model.state_dict(), './model2.state')

In [23]:
# torch.load('./model0.state')

In [25]:
# softmax(model(x1, x2))

In [28]:
pattern = re.compile('\d+|[^\W\d]+')

In [30]:
text = '13 причин почему'.lower()

In [31]:
text = '#' + '#'.join(pattern.findall(text)) + '#'

In [33]:
x1 = Variable(torch.FloatTensor(cv.transform([text for i in range(5)]).toarray()))
            
ds = ['113720', '543557', '443015', '461894', '461895']
            
x2 = Variable(torch.FloatTensor(cv.transform([id2title[doc] for doc in ds]).toarray()))
            
out = model(x1, x2)
sft = softmax(out)

In [34]:
out

tensor([ 0.0464,  0.1038,  0.0555,  0.0449,  0.0674])

In [35]:
softmax(out)

tensor([ 0.1965,  0.2082,  0.1983,  0.1962,  0.2007])

In [76]:
processB(lines[0:4])[0]

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [None]:
for x1, x2, y in get_batch(lines, 64):
    x1 = Variable(torch.FloatTensor(x1))
    x2 = Variable(torch.FloatTensor(x2))
    y = Variable(torch.FloatTensor(y), requires_grad=False)
    break

In [None]:
for res in get_batch(lines, 64):
    break