In [1]:
import gzip
import bz2
import pickle
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk import ngrams
from pymystem3 import Mystem
import re
from collections import Counter

In [2]:
from joblib import Parallel, delayed

In [3]:
import os

In [4]:
from tqdm import tqdm

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
import numpy as np

In [5]:
# Compute per-document term frequencies (TF) for titles and texts

In [6]:
def process(idx, title, text):
    """Lemmatize one document and store token counts for its text and title.

    Writes two pickled ``Counter`` objects: ``./data/<idx>.text`` and
    ``./data/<idx>.title``. Each output is produced only when it does not
    already exist, so a run that was interrupted between the two writes is
    completed on the next call, and existing files are never overwritten.

    Args:
        idx: document id (string), used as the output file stem.
        title: raw document title.
        text: raw document body.

    Returns:
        None.
    """
    for suffix, raw in (('.text', text), ('.title', title)):
        path = './data/' + idx + suffix
        if os.path.exists(path):
            # Already produced by an earlier run; keep it untouched.
            continue

        # Lower-case, lemmatize with the shared module-level `lemmatizer`,
        # then tokenize with the module-level `pattern`.
        lemmas = ' '.join(lemmatizer.lemmatize(raw.lower()))
        counts = Counter(pattern.findall(lemmas))
        with open(path, 'wb') as fout:
            pickle.dump(counts, fout)

In [31]:
# stemmer = SnowballStemmer('russian', ignore_stopwords=True)
# Number of documents read so far by the ingestion loop
cnt = 0
# Batch size and joblib worker count used by the ingestion loop
n_threads = 32
# Shared Mystem lemmatizer referenced by process()
lemmatizer = Mystem()
# Tokenizer: a run of digits, or a run of word characters that are not digits.
# Raw string so that \d and \W reach the regex engine without relying on
# Python's deprecated handling of invalid string escape sequences.
pattern = re.compile(r'\d+|[^\W\d]+')

In [None]:
# Stream the tab-separated corpus (id, title, text per line) and hand the
# documents to process() in batches of n_threads.
with gzip.open('./docs.tsv.gz', 'rb') as f:
    params = []
    for line in f:
        idx, title, text = line.decode('utf-8').split('\t')
        params.append((idx, title, text))
        cnt += 1

        if len(params) == n_threads:
            Parallel(n_jobs=n_threads)(delayed(process)(*args) for args in params)
            params = []

    # Flush the trailing batch: without this the last (up to n_threads)
    # documents of the file would never be processed.
    if params:
        Parallel(n_jobs=n_threads)(delayed(process)(*args) for args in params)
        params = []

In [22]:
# Load the per-document title and text lengths

In [24]:
# Tokenizer: a run of digits, or a run of word characters that are not digits.
# Raw string avoids the invalid-escape-sequence warning for \d and \W.
pattern = re.compile(r'\d+|[^\W\d]+')

In [25]:
# Key -> length mappings, filled from the lengths.* files in the cells below
title_lengths = {}
text_lengths = {}

In [40]:
# Fill title_lengths from the space-separated text file 'lengths.title':
# field 0 is used as the key and field 2 is parsed as the integer length
# (field 1 is not used here).
# NOTE(review): a later cell pickles title_lengths to './lengths.title', i.e. the
# same path, so on a re-run this cell would be reading a binary pickle — confirm.
with open('lengths.title', 'r') as fin:
    for line in fin:
        spl = line.strip().split(' ')
        title_lengths[spl[0]] = int(spl[2])

In [47]:
# Mean title length over all loaded entries
sum(title_lengths.values()) / len(title_lengths)

8.3285758210273

In [48]:
# Fill text_lengths from the space-separated text file 'lengths.text':
# field 0 is used as the key and field 2 is parsed as the integer length
# (field 1 is not used here).
# NOTE(review): a later cell pickles text_lengths to './lengths.text', i.e. the
# same path, so on a re-run this cell would be reading a binary pickle — confirm.
with open('lengths.text', 'r') as fin:
    for line in fin:
        spl = line.strip().split(' ')
        text_lengths[spl[0]] = int(spl[2])

In [49]:
# Mean text length over all loaded entries. The denominator is the number of
# text entries (it previously used len(title_lengths), which is only correct
# when both mappings happen to have the same size).
sum(text_lengths.values()) / len(text_lengths)

9717.212767126959

In [50]:
# Persist title_lengths as a pickle.
# NOTE(review): this overwrites the plain-text 'lengths.title' that was parsed
# above, so the parsing cell cannot be re-run afterwards — confirm this is intended.
with open('./lengths.title', 'wb') as fout:
    pickle.dump(title_lengths, fout)

In [51]:
# Persist text_lengths as a pickle.
# NOTE(review): this overwrites the plain-text 'lengths.text' that was parsed
# above, so the parsing cell cannot be re-run afterwards — confirm this is intended.
with open('./lengths.text', 'wb') as fout:
    pickle.dump(text_lengths, fout)

In [4]:
# Remove numeric tokens with five or more digits

In [20]:
# All entries of ./data are split into n_threads contiguous chunks
files = os.listdir('./data')
n_threads = 8
# Chunk size per worker; the +1 rounds up so n_threads chunks cover every file
per = len(files) // n_threads + 1

In [22]:
# Matches a token that starts with at least five digits (used with .match()).
# Raw string avoids the invalid-escape-sequence warning for \d.
dpattern = re.compile(r'\d{5,}')

In [24]:
def process1(idx, fnames):
    """Drop long-number tokens from pickled count files, rewriting them in place.

    Args:
        idx: worker index, only used in progress messages.
        fnames: names of files inside ``./data`` to rewrite.

    Each file is loaded, entries whose key matches the module-level
    ``dpattern`` at its start are removed, and the remainder is pickled back
    to the same path as a plain ``dict``.
    """
    for cnt, fname in enumerate(fnames):
        if (cnt + 1) % 15000 == 0:
            print(idx, 'finished', cnt)

        path = './data/' + fname
        with open(path, 'rb') as fin:
            counts = pickle.load(fin)

        kept = {token: n for token, n in counts.items() if not dpattern.match(token)}
        with open(path, 'wb') as fout:
            pickle.dump(kept, fout)

In [None]:
# Run process1 over n_threads contiguous slices of `files`, one slice per joblib job
Parallel(n_jobs=n_threads)(delayed(process1)(j, files[j*per : (j+1)*per]) for j in range(n_threads))

In [6]:
# Filter stopwords and stem the remaining tokens

In [12]:
# Russian Snowball stemmer; ignore_stopwords=True is passed so that stopwords
# are left unstemmed by the stemmer itself.
stemmer = SnowballStemmer('russian', ignore_stopwords=True)
# Stored as a set: `stop` is only used for `in` membership tests, which are run
# once per token over the whole corpus, so constant-time lookup matters.
stop = set(stopwords.words('russian'))

In [16]:
def process2(idx, fnames):
    """Remove stopwords and stem the keys of pickled count files in place.

    Args:
        idx: worker index, only used in progress messages.
        fnames: names of files inside ``./data`` to rewrite.

    Each file is loaded, keys found in the module-level ``stop`` collection
    are dropped, the remaining keys are stemmed with the module-level
    ``stemmer``, and the result is pickled back to the same path as a plain
    ``dict``. Counts of different words that share a stem are summed
    (building the dict directly from (stem, count) pairs would keep only the
    last pair per stem and silently lose the others).

    Note: the files are overwritten, so running this twice stems the stems.
    """
    for cnt, fname in enumerate(fnames):
        if (cnt + 1) % 15000 == 0:
            print(idx, 'finished', cnt)

        with open('./data/' + fname, 'rb') as fin:
            dc = pickle.load(fin)

        ndc = Counter()
        for word, count in dc.items():
            if word in stop:
                continue
            ndc[stemmer.stem(word)] += count

        with open('./data/' + fname, 'wb') as fout:
            # Stored as a plain dict, as before.
            pickle.dump(dict(ndc), fout)

In [None]:
# Run process2 over n_threads contiguous slices of `files`, one slice per joblib job
Parallel(n_jobs=n_threads)(delayed(process2)(j, files[j*per : (j+1)*per]) for j in range(n_threads))

In [None]:
! echo 'hello' > log.txt

In [None]:
# Fix queries typed in the wrong keyboard layout

In [6]:
_eng_chars = u"~!@#$%^&qwertyuiop[]asdfghjkl;'zxcvbnm,./QWERTYUIOP{}ASDFGHJKL:\"|ZXCVBNM<>?"
_rus_chars = u"ё!\"№;%:?йцукенгшщзхъфывапролджэячсмитьбю.ЙЦУКЕНГШЩЗХЪФЫВАПРОЛДЖЭ/ЯЧСМИТЬБЮ,"
_trans_table = dict(zip(_eng_chars, _rus_chars))
 
def fix_layout(s):
    return u''.join([_trans_table.get(c, c) for c in s])

In [13]:
# Partial vocabulary: keys of the first 60000 entries returned by
# os.listdir('./data') (not the whole corpus), collected into the set `ds`.
ds = set()
for f in tqdm(os.listdir('./data')[0:60000]):
    with open('./data/' + f, 'rb') as fin:
        ds.update(pickle.load(fin).keys())

100%|██████████| 60000/60000 [00:25<00:00, 2399.78it/s]


In [32]:
# Tokens consisting only of ASCII letters are candidates for layout fixing
enw = re.compile('^[a-zA-Z]+$')


def fix(s):
    """Return the layout-converted form of `s` when that looks like a known word.

    Tokens that are not purely ASCII letters are returned unchanged. Otherwise
    the token is converted with fix_layout(); the converted form is returned
    when its lemmatized and stemmed form is in the vocabulary set `ds`, or
    when the converted form itself is in `stop`. In every other case the
    original token is returned.
    """
    if not enw.match(s):
        return s

    converted = fix_layout(s)
    stem = stemmer.stem(lemmatizer.lemmatize(converted)[0])
    if stem in ds or converted in stop:
        return converted

    return s

In [33]:
# Output file for the layout-fixed queries; it stays open across cells and is
# closed by the separate fout.close() cell further down.
fout = open('./queries.fix', 'w')

In [34]:
# Rewrite every query: tokenize with `pattern`, pass each token through fix(),
# and write "<id>\t<space-joined tokens>" to the already-open `fout`.
with open('./queries.tsv', 'r') as fin:
    for line in fin:
        # Each row must hold exactly one tab: id, then the query text
        idx, line = line.split('\t')
        words = pattern.findall(line)
        res = ' '.join([fix(word) for word in words])
        fout.write(idx + '\t' + res + '\n')

In [35]:
fout.close()

In [38]:
# Build the corpus-wide word-count dictionary

In [21]:
def processd(idx, fnames):
    """Sum the pickled count files of one chunk and store the total.

    Args:
        idx: worker index; also the stem of the output file ``<idx>.dict``.
        fnames: names of files inside ``./data`` to aggregate.

    The result is a ``Counter`` pickled to ``<idx>.dict`` in the working
    directory. Counts are first gathered in a small buffer that is folded
    into the running total every 1000 files.
    """
    total = Counter()
    pending = Counter()
    for pos, fname in enumerate(fnames, start=1):
        if pos % 1000 == 0:
            total += pending
            pending.clear()

        if pos % 5000 == 0:
            print(idx, 'finished', pos - 1)

        with open('./data/' + fname, 'rb') as fin:
            pending += Counter(pickle.load(fin))

    total += pending
    with open(str(idx) + '.dict', 'wb') as fout:
        pickle.dump(total, fout)

In [22]:
# All entries of ./data are split into n_threads contiguous chunks
files = os.listdir('./data')
# Also the number of '<i>.dict' partial results that processd will write
n_threads = 30
# Chunk size per worker; the +1 rounds up so n_threads chunks cover every file
per = len(files) // n_threads + 1

In [None]:
# Run processd over n_threads contiguous slices of `files`; job j writes '<j>.dict'
Parallel(n_jobs=n_threads)(delayed(processd)(j, files[j*per : (j+1)*per]) for j in range(n_threads))

In [1]:
# Merge the per-worker '<i>.dict' counters written by processd into one Counter.
# The literal 30 must match the n_threads value used for the processd run.
df = Counter()
for i in range(30):
    with open(str(i) + '.dict', 'rb') as fin:
        df += pickle.load(fin)

In [None]:
# Drop the empty-string key before saving. A default is passed so the cell
# does not raise KeyError when '' is absent (for example when it is re-run
# after the key has already been removed).
df.pop('', None)
with open('./total.dict', 'wb') as fout:
    pickle.dump(df, fout)

In [2]:
# Term frequencies (TF) and document frequencies (DF) restricted to query words

In [13]:
# Collect the set of stems of all query words from 'queries.final'.
# NOTE(review): the layout-fixing step above writes './queries.fix'; how
# 'queries.final' is produced is not visible in this notebook — confirm.
qw = set()
with open('queries.final', 'r') as fin:
    for line in fin:
        # Each row must hold exactly one tab: id, then the query text
        idx, line = line.split('\t')
        words = pattern.findall(line)
        for word in words:
            # First element returned by the lemmatizer for the token, then stemmed
            w = stemmer.stem(lemmatizer.lemmatize(word)[0])
            qw.add(w)

In [14]:
len(qw)

8922

In [15]:
# Corpus-wide word counts saved earlier to 'total.dict'
totalf = dict()
with open('total.dict', 'rb') as fin:
    totalf = pickle.load(fin)

In [16]:
# Keep only the query stems that are present in the corpus-wide dictionary
qw2 = {w for w in qw if w in totalf}
qw = qw2

In [17]:
len(qw)

8895

In [18]:
# Names of the per-document title / text count files in ./data.
# Pure-Python replacement for `!ls ./data | grep ...`: it does not depend on
# IPython shell capture or on external `ls`/`grep`, and keeps the same
# substring filter. sorted() gives a deterministic order.
titlefiles = sorted(name for name in os.listdir('./data') if '.title' in name)
textfiles = sorted(name for name in os.listdir('./data') if '.text' in name)

In [20]:
def processF(idx, fnames):
    """Copy count files to ``./frequences`` keeping only query-word entries.

    Args:
        idx: worker index, only used in progress messages.
        fnames: names of files inside ``./data`` to filter.

    For every file that does not yet exist under ``./frequences``, the pickled
    mapping is loaded from ``./data``, reduced to the keys contained in the
    module-level set ``qw``, and pickled under the same name as a plain dict.
    """
    for cnt, fname in enumerate(fnames):
        if (cnt + 1) % 10000 == 0:
            print(idx, 'finished', cnt)

        target = './frequences/' + fname
        if os.path.exists(target):
            # Produced by an earlier run; skip.
            continue

        with open('./data/' + fname, 'rb') as fin:
            counts = pickle.load(fin)

        kept = {term: n for term, n in counts.items() if term in qw}
        with open(target, 'wb') as fout:
            pickle.dump(kept, fout)

In [21]:
n_threads = 8
# Chunk size derived from titlefiles; the cells below reuse it for textfiles too.
# NOTE(review): this assumes both lists have the same length — if textfiles were
# longer than n_threads * per, its tail would not be processed. Confirm.
per = len(titlefiles) // n_threads + 1

In [None]:
# Filter the title count files down to query words, one slice per joblib job
Parallel(n_jobs=n_threads)(delayed(processF)(j, titlefiles[j*per : (j+1)*per]) for j in range(n_threads))

In [23]:
# Filter the text count files down to query words, one slice per joblib job
Parallel(n_jobs=n_threads)(delayed(processF)(j, textfiles[j*per : (j+1)*per]) for j in range(n_threads))

3 finished 9999
6 finished 9999
2 finished 9999
0 finished 9999
7 finished 9999
5 finished 9999
1 finished 9999
4 finished 9999
6 finished 19999
7 finished 19999
2 finished 19999
0 finished 19999
3 finished 19999
1 finished 19999
5 finished 19999
4 finished 19999
6 finished 29999
7 finished 29999
2 finished 29999
0 finished 29999
1 finished 29999
3 finished 29999
5 finished 29999
7 finished 39999
4 finished 29999
6 finished 39999
2 finished 39999
0 finished 39999
1 finished 39999
5 finished 39999
3 finished 39999
6 finished 49999
7 finished 49999
4 finished 39999
2 finished 49999
3 finished 49999
0 finished 49999
5 finished 49999
1 finished 49999
6 finished 59999
7 finished 59999
2 finished 59999
4 finished 49999
3 finished 59999
0 finished 59999
5 finished 59999
1 finished 59999
2 finished 69999
6 finished 69999
7 finished 69999
4 finished 59999
0 finished 69999
3 finished 69999
5 finished 69999
1 finished 69999
4 finished 69999


[None, None, None, None, None, None, None, None]

In [6]:
# Document-frequency accumulators for the title and text zones
title_df, text_df = Counter(), Counter()

In [7]:
def processDF(idx, fnames):
    """Count, for each key, in how many of the given files it appears.

    Args:
        idx: worker index, only used in progress messages.
        fnames: names of pickled mappings inside ``./frequences``.

    Returns:
        Counter mapping each key to the number of files that contain it.
    """
    doc_freq = Counter()
    for pos, fname in enumerate(fnames, start=1):
        if pos % 10000 == 0:
            print(idx, 'finished', pos)

        with open('./frequences/' + fname, 'rb') as fin:
            # Only the keys matter: each file contributes 1 per key.
            doc_freq.update(pickle.load(fin).keys())

    return doc_freq

In [8]:
n_threads = 8
# Chunk size derived from textfiles; the cells below reuse it for titlefiles too.
# NOTE(review): this assumes both lists have the same length — confirm.
per = len(textfiles) // n_threads + 1

In [9]:
# One partial document-frequency Counter per joblib job, computed over the title files
title_dfs = Parallel(n_jobs=n_threads)(delayed(processDF)(j, titlefiles[j*per : (j+1)*per]) for j in range(n_threads))

1 finished 10000
0 finished 10000
2 finished 10000
3 finished 10000
4 finished 10000
7 finished 10000
6 finished 10000
5 finished 10000
2 finished 20000
1 finished 20000
0 finished 20000
3 finished 20000
7 finished 20000
4 finished 20000
6 finished 20000
5 finished 20000
2 finished 30000
1 finished 30000
0 finished 30000
3 finished 30000
7 finished 30000
4 finished 30000
6 finished 30000
5 finished 30000
2 finished 40000
1 finished 40000
3 finished 40000
0 finished 40000
7 finished 40000
4 finished 40000
6 finished 40000
5 finished 40000
2 finished 50000
1 finished 50000
3 finished 50000
0 finished 50000
7 finished 50000
4 finished 50000
6 finished 50000
5 finished 50000
2 finished 60000
1 finished 60000
3 finished 60000
0 finished 60000
7 finished 60000
4 finished 60000
6 finished 60000
5 finished 60000
1 finished 70000
2 finished 70000
0 finished 70000
3 finished 70000
7 finished 70000
5 finished 70000
4 finished 70000
6 finished 70000


In [10]:
# Fold the per-job partial counters into title_df.
# Not idempotent: re-running this cell adds the partial counts again.
for df in title_dfs:
    title_df += df

In [11]:
with open('./title.df', 'wb') as fout:
    pickle.dump(title_df, fout)

In [13]:
# One partial document-frequency Counter per joblib job, computed over the text files
text_dfs = Parallel(n_jobs=n_threads)(delayed(processDF)(j, textfiles[j*per : (j+1)*per]) for j in range(n_threads))

2 finished 10000
6 finished 10000
0 finished 10000
2 finished 20000
6 finished 20000
4 finished 10000
0 finished 20000
3 finished 10000
7 finished 10000
5 finished 10000
1 finished 10000
4 finished 20000
7 finished 20000
3 finished 20000
1 finished 20000
5 finished 20000
6 finished 30000
2 finished 30000
0 finished 30000
7 finished 30000
1 finished 30000
3 finished 30000
5 finished 30000
7 finished 40000
4 finished 30000
6 finished 40000
2 finished 40000
0 finished 40000
1 finished 40000
5 finished 40000
3 finished 40000
6 finished 50000
7 finished 50000
4 finished 40000
2 finished 50000
3 finished 50000
0 finished 50000
5 finished 50000
1 finished 50000
6 finished 60000
7 finished 60000
2 finished 60000
4 finished 50000
3 finished 60000
0 finished 60000
5 finished 60000
1 finished 60000
2 finished 70000
6 finished 70000
7 finished 70000
4 finished 60000
0 finished 70000
3 finished 70000
5 finished 70000
1 finished 70000
4 finished 70000


In [14]:
# Fold the per-job partial counters into text_df.
# Not idempotent: re-running this cell adds the partial counts again.
for df in text_dfs:
    text_df += df

In [15]:
with open('./text.df', 'wb') as fout:
    pickle.dump(text_df, fout)

In [57]:
# Document frequency over whole documents, regardless of zone (title or text)

In [24]:
def processTDF(idx, fnames):
    """Count, for each key, in how many documents it appears in title or text.

    Args:
        idx: worker index, only used in progress messages.
        fnames: file names whose part before the first '.' is the document id.

    For every document id both ``./frequences/<id>.title`` and
    ``./frequences/<id>.text`` are read; a key counts once per document even
    when it occurs in both zones.

    Returns:
        Counter mapping each key to its number of documents.
    """
    df = Counter()
    for pos, fname in enumerate(fnames, start=1):
        if pos % 10000 == 0:
            print(idx, 'finished', pos)

        doc_idx = fname.split('.')[0]

        # Union of the keys of both zones for this document
        terms = set()
        for zone in ('.title', '.text'):
            with open('./frequences/' + doc_idx + zone, 'rb') as fin:
                terms.update(pickle.load(fin).keys())

        df.update(terms)

    return df

In [25]:
n_threads = 8
# Chunk size per worker; the +1 rounds up so n_threads chunks cover every text file
per = len(textfiles) // n_threads + 1

In [26]:
# One partial zone-independent document-frequency Counter per joblib job.
# textfiles only supplies the document ids; processTDF opens both zone files itself.
total_dfs = Parallel(n_jobs=n_threads)(delayed(processTDF)(j, textfiles[j*per : (j+1)*per]) for j in range(n_threads))

3 finished 10000
6 finished 10000
2 finished 10000
0 finished 10000
7 finished 10000
5 finished 10000
1 finished 10000
4 finished 10000
6 finished 20000
7 finished 20000
2 finished 20000
0 finished 20000
3 finished 20000
1 finished 20000
5 finished 20000
4 finished 20000
6 finished 30000
7 finished 30000
2 finished 30000
0 finished 30000
1 finished 30000
3 finished 30000
5 finished 30000
4 finished 30000
7 finished 40000
6 finished 40000
2 finished 40000
0 finished 40000
1 finished 40000
5 finished 40000
3 finished 40000
6 finished 50000
7 finished 50000
4 finished 40000
2 finished 50000
3 finished 50000
0 finished 50000
5 finished 50000
1 finished 50000
6 finished 60000
7 finished 60000
2 finished 60000
4 finished 50000
3 finished 60000
0 finished 60000
5 finished 60000
1 finished 60000
2 finished 70000
6 finished 70000
7 finished 70000
4 finished 60000
0 finished 70000
3 finished 70000
5 finished 70000
1 finished 70000
4 finished 70000


In [27]:
total_df = Counter()

In [28]:
# Fold the per-job partial counters into total_df.
# Not idempotent: re-running this cell adds the partial counts again.
for df in total_dfs:
    total_df += df

In [31]:
with open('./total.df', 'wb') as fout:
    pickle.dump(total_df, fout)

In [29]:
len(total_df)

8895

In [6]:
# Build the character 3-gram dictionary for DSSM

In [9]:
# Tokenizer: a run of digits, or a run of word characters that are not digits.
# Raw string avoids the invalid-escape-sequence warning for \d and \W.
pattern = re.compile(r'\d+|[^\W\d]+')

In [58]:
d3 = Counter()

In [59]:
# Count character 3-grams over all titles in 'titles.txt' into d3.
cnt = 0
with open('titles.txt', 'r') as fin:
    for line in fin:
        cnt += 1
        
        if cnt % 20000 == 0:
            print('processed', cnt)
        
        # The title is the second tab-separated field; a row without a tab
        # would raise IndexError here.
        title = line.split('\t')[1].lower()
        # Join the tokens with '#' and wrap the result in '#', so every word
        # is delimited by boundary markers: "#word1#word2#"
        title = '#' + '#'.join(pattern.findall(title)) + '#'
        # Drop 3-grams whose middle character is '#', i.e. those spanning two words
        gg = filter(lambda x: x[1] != '#', ngrams(list(title), 3))
        grams3 = map(lambda x: ''.join(x), gg)
        d3.update(grams3)

processed 20000
processed 40000
processed 60000
processed 80000
processed 100000
processed 120000
processed 140000
processed 160000
processed 180000
processed 200000
processed 220000
processed 240000
processed 260000
processed 280000
processed 300000
processed 320000
processed 340000
processed 360000
processed 380000
processed 400000
processed 420000
processed 440000
processed 460000
processed 480000
processed 500000
processed 520000
processed 540000
processed 560000
processed 580000


In [60]:
len(d3)

42365

In [43]:
# Ten most frequent title 3-grams
d3.most_common(10)

[('ть#', 186925),
 ('#по', 168149),
 ('#пр', 152950),
 ('#на', 139423),
 ('#ка', 138145),
 ('на#', 130092),
 ('ия#', 126663),
 ('#в#', 125282),
 ('ие#', 118564),
 ('ени', 118020)]

In [65]:
# Sanity check: no 3-gram should contain a space
[gram for gram in d3.keys() if ' ' in gram]

[]

In [44]:
with open('3gramms.dict', 'wb') as fout:
    pickle.dump(d3, fout)

In [6]:
def process3G(idx, files):
    """Count character 3-grams over the query part of bz2-compressed click logs.

    Args:
        idx: worker index, only used in progress messages.
        files: names of files inside ``./clicks/2017/``.

    For every line, the text before the first '@' is lower-cased, tokenized
    with the module-level ``pattern``, joined and wrapped with '#', and its
    3-grams are counted, skipping those whose middle character is '#'.

    Returns:
        Counter of 3-gram strings.
    """
    d3g = Counter()
    for cnt, fname in enumerate(files):
        with bz2.BZ2File('./clicks/2017/' + fname, 'r') as fin:
            for raw in fin:
                query = raw.decode('utf-8').split('@')[0].lower()
                marked = '#' + '#'.join(pattern.findall(query)) + '#'
                d3g.update(
                    ''.join(gram)
                    for gram in ngrams(list(marked), 3)
                    if gram[1] != '#'
                )
        if (cnt + 1) % 10 == 0:
            print(idx, 'processed', cnt)

    return d3g

In [16]:
# Click-log files, split into n_threads contiguous chunks
files = os.listdir('clicks/2017/')
n_threads = 32
# Chunk size per worker; the +1 rounds up so n_threads chunks cover every file
per = len(files) // n_threads + 1

In [17]:
# One partial 3-gram Counter per joblib job, computed over the click logs
d3gs = Parallel(n_jobs=n_threads)(delayed(process3G)(j, files[j*per : (j+1)*per]) for j in range(n_threads))

26 processed 9
13 processed 9
5 processed 9
21 processed 9
24 processed 9
18 processed 9
8 processed 9
27 processed 9
11 processed 9
19 processed 9
16 processed 9
10 processed 9
15 processed 9
23 processed 9
22 processed 9
2 processed 9
9 processed 9
20 processed 9
0 processed 9
30 processed 9
3 processed 9
25 processed 9
7 processed 9
14 processed 9
26 processed 19
29 processed 9
1 processed 9
28 processed 9
10 processed 19
6 processed 9
17 processed 9
31 processed 9
24 processed 19
12 processed 9
4 processed 9
13 processed 19
8 processed 19
23 processed 19
21 processed 19
16 processed 19
27 processed 19
5 processed 19
15 processed 19
3 processed 19
7 processed 19
13 processed 29
20 processed 19
11 processed 19
14 processed 19
19 processed 19
2 processed 19
0 processed 19
30 processed 19
22 processed 19
25 processed 19
17 processed 19
29 processed 19
23 processed 29
24 processed 29
1 processed 19
18 processed 19
28 processed 19
6 processed 19
4 processed 19
21 processed 29
15 processe

In [18]:
# Merge the per-job 3-gram counters into a single Counter
d3g = Counter()
for d in d3gs:
    d3g += d

In [20]:
# Add the 3-gram counts stored in '3gramms.dict' to the click-log counts.
# Not idempotent: re-running this cell adds those counts a second time.
with open('3gramms.dict', 'rb') as fin:
    d3g += pickle.load(fin)

In [23]:
len(d3g)

162791

In [24]:
with open('3gramms-total.dict', 'wb') as fout:
    pickle.dump(d3g, fout)

In [36]:
d3g.most_common()[85000]

('ыщё', 10)

In [None]:
d3g

In [6]:
# Reload the combined 3-gram counts saved to '3gramms-total.dict'
d3g = {}
with open('3gramms-total.dict', 'rb') as fin:
    d3g = pickle.load(fin)

In [6]:
# Prepare the dataset for DSSM

In [6]:
def fixurl(url):
    """Normalise a URL string for use as a lookup key.

    In order, a leading 'http://', then a leading 'https://', then a leading
    'www.' is removed when present, and finally a single trailing '/' is
    dropped. The checks are sequential, so each one sees the result of the
    previous removal.

    Args:
        url: URL string.

    Returns:
        The normalised string.
    """
    for prefix in ('http://', 'https://', 'www.'):
        if url.startswith(prefix):
            url = url[len(prefix):]

    return url[:-1] if url.endswith('/') else url

In [7]:
# Map normalised URL -> document id, read from tab-separated (id, url) rows.
url2id = dict()
with open('./url.data', 'r') as fin:
    for line in fin:
        idx, url = line.strip().split('\t')

        # Normalising URLs merges some entries: the author observed
        # len(url2id) = 582092 < 582167 with these fixes applied.
        # A later row overwrites an earlier one that normalises to the same URL.
        url = fixurl(url)

        # Debug aid for listing colliding URLs:
        #         if url in url2id:
        #             print(url)

        url2id[url] = idx

In [8]:
len(url2id)

582094

In [9]:
# Map document id -> lower-cased title, read from tab-separated rows.
id2title = {}
with open('./titles.txt', 'r') as fin:
    for line in fin:
        splits = line.strip().lower().split('\t')
        # A row with a single field has no title: store an empty string
        id2title[splits[0]] = '' if len(splits) == 1 else splits[1]

In [10]:
len(id2title)

582167

In [17]:
def processCL(idx, files):
    """Group click-log rows by query and write the groups that have clicks.

    Args:
        idx: worker index, used in progress messages and output file names.
        files: names of bz2 files inside ``./clicks/2017/``.

    Each log row is tab-separated; its first three fields are the query (the
    part before the first '@' is used), a comma-separated list of shown URLs
    and a comma-separated list of clicked URLs. URLs are normalised with
    fixurl() and translated to ids through the module-level ``url2id``; URLs
    missing from ``url2id`` are ignored.

    Consecutive rows with the same query are merged. When the query changes,
    or a file ends, the group is emitted as
    ``query<TAB>shown_ids<TAB>clicked_ids`` provided it has at least one
    clicked id. Emitted samples are written, newline-joined, to
    ``./clicks/filtered/<idx>_<cnt>.txt`` after every 10th file and once more
    at the end for any remainder.
    """
    samples = []
    shown_ids = set()
    clicked_ids = set()
    current_query = ''

    def to_ids(field):
        # Normalise every URL of a comma-separated field; keep ids of known ones.
        cleaned = (fixurl(raw_url) for raw_url in field.split(','))
        return [url2id[u] for u in cleaned if u in url2id]

    def sample_line():
        # Serialise the group accumulated for the current query.
        return '\t'.join([current_query, ','.join(shown_ids), ','.join(clicked_ids)])

    for cnt, fname in enumerate(files):
        with bz2.BZ2File('./clicks/2017/' + fname) as fin:
            for raw in fin:
                fields = raw.decode('utf-8').strip().split('\t')
                q, showed, clicked = fields[:3]
                q = q.split('@')[0]

                if q != current_query:
                    # Query changed: emit the finished group, start a new one.
                    if current_query != '' and len(clicked_ids) != 0:
                        samples.append(sample_line())
                    shown_ids.clear()
                    clicked_ids.clear()
                    current_query = q

                shown_ids.update(to_ids(showed))
                clicked_ids.update(to_ids(clicked))

            # End of file: emit the last group; groups never span two files.
            if current_query != '' and len(clicked_ids) != 0:
                samples.append(sample_line())
            shown_ids.clear()
            clicked_ids.clear()
            current_query = ''

        if (cnt + 1) % 10 == 0:
            print(idx, 'processed', cnt+1)

            with open('./clicks/filtered/' + str(idx) + '_' + str(cnt) + '.txt', 'w') as fout:
                fout.write('\n'.join(samples))
                samples.clear()

    # Remainder collected after the last periodic write
    if len(samples) != 0:
        with open('./clicks/filtered/' + str(idx) + '_' + str(cnt) + '.txt', 'w') as fout:
            fout.write('\n'.join(samples))
            samples.clear()

In [18]:
# Click-log files, split into n_threads contiguous chunks
files = os.listdir('./clicks/2017/')
n_threads = 32
# Chunk size per worker; the +1 rounds up so n_threads chunks cover every file
per = len(files) // n_threads + 1

In [19]:
# Run processCL over n_threads contiguous slices of `files`; results are written
# to ./clicks/filtered/ by the workers, so the returned list holds only None.
Parallel(n_jobs=n_threads)(delayed(processCL)(j, files[j*per : (j+1)*per]) for j in range(n_threads))

13 processed 10
26 processed 10
5 processed 10
24 processed 10
21 processed 10
27 processed 10
8 processed 10
11 processed 10
16 processed 10
18 processed 10
15 processed 10
10 processed 10
19 processed 10
23 processed 10
20 processed 10
2 processed 10
9 processed 10
22 processed 10
0 processed 10
25 processed 10
7 processed 10
3 processed 10
30 processed 10
14 processed 10
31 processed 10
1 processed 10
17 processed 10
6 processed 10
4 processed 10
28 processed 10
29 processed 10
26 processed 20
10 processed 20
24 processed 20
13 processed 20
23 processed 20
8 processed 20
16 processed 20
21 processed 20
12 processed 10
27 processed 20
15 processed 20
5 processed 20
20 processed 20
3 processed 20
11 processed 20
7 processed 20
13 processed 30
14 processed 20
0 processed 20
2 processed 20
19 processed 20
17 processed 20
25 processed 20
30 processed 20
22 processed 20
29 processed 20
1 processed 20
24 processed 30
4 processed 20
28 processed 20
18 processed 20
6 processed 20
31 processe

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [47]:
# NOTE(review): no module-level `samples` is assigned anywhere in the visible
# notebook (the `samples` inside processCL is local to that function), so this
# presumably relies on state from an earlier interactive session — confirm.
len(samples)

5722

In [None]:
samples

In [11]:
arr = np.array([10, 11, 32, 14, 12])

In [16]:
# First four elements of a random permutation of arr; no seed is set in this
# notebook, so the displayed values change from run to run.
np.random.permutation(arr)[0:4]

array([10, 12, 32, 11])