In [29]:
import os
import bz2
import pickle as pkl

import pandas as pd
from tqdm.notebook import tqdm
from joblib import Parallel, delayed

In [28]:
# Bare file names (not paths) of the bz2-compressed inputs found in data/2017.
files = [entry for entry in os.listdir('data/2017') if entry.endswith('bz2')]

In [37]:
# Lookup from URL string (column 1) to URL id (column 0).
# Built column-wise instead of with DataFrame.iterrows(), which creates one
# Series per row and is very slow on a large table. As with the original dict
# comprehension, a later duplicate URL overrides an earlier one.
_url_table = pd.read_csv('data/url.data/url.data', sep='\t', header=None)
url2id_total = dict(zip(_url_table[1].tolist(), _url_table[0].tolist()))
del _url_table  # only the dict is needed from here on

In [46]:
# Lookup from query text (column 1) to query id (column 0).
# Built column-wise instead of with DataFrame.iterrows(), which creates one
# Series per row. As with the original dict comprehension, a later duplicate
# query text overrides an earlier one.
_query_table = pd.read_csv('data/queries.tsv', sep='\t', header=None)
queries = dict(zip(_query_table[1].tolist(), _query_table[0].tolist()))
del _query_table  # only the dict is needed from here on

In [77]:
def process(filename):
    """Aggregate show/click counts per known URL for one compressed log file.

    Reads ``data/2017/{filename}`` (bz2, text mode) line by line. Every line
    must split into exactly four tab-separated fields: query, shown URLs,
    clicked URLs and timestamps (the last one is not used). Both URL fields
    are split on ``',h'`` and the first character of the first item is
    dropped, so every entry has lost its leading character (presumably the
    ``h`` of ``http`` -- TODO confirm against the raw logs).

    A shown URL is counted only if the part after its first ``'//'`` (or the
    whole string when there is no ``'//'``) is a key of the module-level
    ``url2id_total``. The accumulated result maps that URL id to
    ``[shown, clicked, {query_id: [shown, clicked]}]``; the per-query part is
    filled only when the text before the first ``'@'`` of the query field is
    a key of the module-level ``queries``.

    The result is pickled to ``data/2017/{filename}.pkl``.

    Args:
        filename: Name of a file inside ``data/2017``.

    Returns:
        int: Number of lines that raised while being parsed and were skipped.
    """
    out = {}
    error_count = 0

    with bz2.open(f'data/2017/{filename}', 'rt') as f:
        for line in tqdm(f):
            try:
                # Unpacking doubles as validation: a wrong field count raises.
                query, shown, clicked, _timestamps = line.split('\t')

                # None means "query not tracked"; a real id may be 0.
                query_id = queries.get(query.split('@')[0])

                shown = shown.split(',h')
                shown[0] = shown[0][1:]

                # str.split always yields at least one item, so index 0 exists.
                clicked = clicked.split(',h')
                clicked[0] = clicked[0][1:]
                clicked = set(clicked)

                for url in shown:
                    # maxsplit=1 strips only the scheme separator; a later
                    # '//' inside the path or query string must be kept.
                    after_http = url.split('//', 1)[1] if '//' in url else url

                    if after_http not in url2id_total:
                        continue

                    idx = url2id_total[after_http]
                    was_clicked = int(url in clicked)

                    if idx not in out:
                        out[idx] = [0, 0, {}]

                    stats = out[idx]
                    stats[0] += 1
                    stats[1] += was_clicked

                    # Explicit None test so that a query id of 0 is counted.
                    if query_id is not None:
                        per_query = stats[2]
                        if query_id not in per_query:
                            per_query[query_id] = [0, 0]

                        per_query[query_id][0] += 1
                        per_query[query_id][1] += was_clicked

            except Exception:
                # Best effort: skip malformed lines, but let KeyboardInterrupt
                # and SystemExit propagate (a bare ``except`` swallowed them).
                error_count += 1

    with open(f'data/2017/{filename}.pkl', 'wb') as f:
        pkl.dump(out, f)

    return error_count

In [78]:
# Run `process` once per input file on 8 joblib workers. The cell displays the
# list of values returned by `process` (its per-file error counts).
jobs = (delayed(process)(name) for name in files)
Parallel(n_jobs=8)(jobs)

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [79]:
# Load every pickle found in data/2017 into memory, one object per file.
# NOTE: this rebinds `files` from the .bz2 input names to the .pkl names.
files = [entry for entry in os.listdir('data/2017') if entry.endswith('pkl')]

all_dicts = []
for filename in files:
    with open(f'data/2017/{filename}', 'rb') as f:
        loaded = pkl.load(f)
    all_dicts.append(loaded)

In [84]:
# Spot-check one loaded object (index 10). For pickles written by `process`
# the layout is {url_id: [shown, clicked, {query_id: [shown, clicked]}]}.
all_dicts[10]

{118873: [486, 161, {}],
 543350: [2, 0, {}],
 71044: [6250, 1712, {}],
 258544: [1714, 587, {}],
 8983: [2, 0, {}],
 103976: [14, 2, {}],
 6950: [7, 0, {}],
 28425: [30, 0, {}],
 580134: [7, 2, {}],
 507763: [3, 0, {}],
 397374: [32, 1, {}],
 546553: [5, 0, {}],
 19205: [13, 4, {}],
 278811: [13, 2, {}],
 236591: [11, 0, {}],
 551327: [5, 0, {}],
 179179: [1, 0, {}],
 285953: [7, 0, {}],
 352334: [2, 0, {}],
 539303: [7, 2, {}],
 382324: [5, 0, {}],
 410622: [1, 0, {}],
 239347: [1, 0, {}],
 266014: [2, 0, {}],
 285955: [2, 0, {}],
 239852: [2, 0, {}],
 462567: [3, 0, {}],
 396899: [1, 1, {}],
 23361: [2, 0, {}],
 211108: [2, 0, {}],
 352365: [4, 0, {}],
 8633: [6, 0, {}],
 274689: [2, 0, {}],
 23379: [2, 2, {}],
 375761: [2, 0, {}],
 512237: [1, 0, {}],
 564815: [65, 24, {}],
 47351: [47, 0, {}],
 16478: [14, 1, {}],
 116678: [75, 26, {}],
 304776: [1, 1, {}],
 165305: [2, 0, {}],
 103963: [588, 232, {}],
 251498: [4508, 25, {}],
 507759: [1, 0, {}],
 58568: [2, 0, {}],
 332218: [202

In [82]:
# Accumulator for the statistics merged across all loaded dicts; it is filled
# by the merge loop that follows and must be empty before that loop runs.
giant_dict = {}

In [85]:
# Merge every per-file dict into `giant_dict`, summing the shown/clicked
# counters per URL id and, inside each URL, per query id.
# Entries are copied on first insert. Storing `v` / `occ` directly would alias
# the lists held in `all_dicts`, so the `+=` below would silently rewrite
# `all_dicts` and any later re-merge would double-count.
# `giant_dict` is expected to be empty when this cell starts.
for dict_ in tqdm(all_dicts):
    for k, v in dict_.items():
        if k not in giant_dict:
            giant_dict[k] = [v[0], v[1], {q: list(occ) for q, occ in v[2].items()}]
            continue

        total = giant_dict[k]
        total[0] += v[0]
        total[1] += v[1]

        for q, occ in v[2].items():
            if q not in total[2]:
                total[2][q] = list(occ)
            else:
                total[2][q][0] += occ[0]
                total[2][q][1] += occ[1]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1900.0), HTML(value='')))




In [86]:
# Display the merged statistics: {url_id: [shown, clicked, {query_id: [shown, clicked]}]}.
giant_dict

{1585: [378639, 123669, {}],
 411660: [2050, 520, {1136: [2, 0]}],
 437971: [19292, 5193, {}],
 117626: [6158, 1256, {4455: [16, 6]}],
 332746: [2395, 93, {1983: [5, 3], 1417: [2, 0]}],
 275184: [1998, 245, {}],
 80453: [159328, 26983, {3252: [12, 0]}],
 540505: [88893, 736, {3544: [111, 0], 2311: [2, 0]}],
 26510: [291007, 6412, {3535: [28, 0], 3544: [3, 0]}],
 117446: [154856, 63425, {2339: [2, 0]}],
 374940: [30298, 3687, {}],
 418178: [1753, 221, {}],
 171303: [2972, 365, {}],
 170796: [952, 258, {}],
 307912: [1008, 82, {}],
 425862: [1690, 360, {1432: [2, 0]}],
 294454: [1774, 548, {}],
 411665: [3329, 1102, {}],
 369853: [5412, 57, {}],
 393699: [2479, 509, {3723: [2, 0]}],
 471374: [341, 65, {}],
 202965: [10925, 1938, {}],
 346016: [97326, 29598, {}],
 459635: [14804, 4230, {}],
 339920: [433, 85, {2404: [2, 2]}],
 239732: [2146, 755, {}],
 70426: [444, 188, {2404: [2, 2]}],
 510256: [24817, 2009, {}],
 119183: [3047, 645, {4935: [2, 2]}],
 243356: [76168, 989, {}],
 280640: [

In [88]:
# Persist the merged statistics as a single pickle file.
with open('data/clicks.pkl', 'wb') as sink:
    pkl.dump(giant_dict, sink)