In [23]:
import os
import bz2
import pickle as pkl
import numpy as np
import pandas as pd
import tldextract
from tqdm.notebook import tqdm
from joblib import Parallel, delayed

In [24]:
files = [file for file in os.listdir('data/2017') if file.endswith('bz2')]

In [25]:
queries = {row[1]:row[0] for i, row in pd.read_csv('data/queries.tsv', sep='\t', header=None).iterrows()}

In [30]:
def process(filename):
    out = {}
    error_count = 0
    
    with bz2.open(f'data/2017/{filename}', 'rt') as f:
        for line in tqdm(f):
            try:
                query, shown, clicked, timestamps = line.split('\t')
                
                query_extracted = query.split('@')[0]

                if query_extracted in queries:
                    query_id = queries[query_extracted]                    
                    timestamps = [int(timestamp) for timestamp in timestamps.strip().split(',')]
                    timestamps = [int((timestamp - timestamps[0])/1000) for timestamp in timestamps]
                    mean_time = sum(timestamps)/len(timestamps)
                    max_time = max(timestamps)
                    
                    if query_id not in out:
                        out[query_id] = [1, [mean_time], [max_time], [len(clicked.split(','))]]
                    else:
                        out[query_id][0] += 1
                        out[query_id][1].append(mean_time)
                        out[query_id][2].append(max_time)
                        out[query_id][3].append(len(clicked.split(',')))
                        
            except:
                error_count += 1
                
    for q in out:
        out[q] = [out[q][0], np.mean(out[q][1]), np.mean(out[q][2]), np.mean(out[q][3])]
            
    with open(f'data/2017/{filename}_Qs.pkl', 'wb') as f:
        pkl.dump(out, f)
        
    return error_count

In [29]:
process(files[0])

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




{5482: [53, 70.03975741239893, 136.35849056603774, 3.69811320754717]}

In [31]:
Parallel(n_jobs=8)(delayed(process)(filename) for filename in files)

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [40]:
giant_dict = {}

In [41]:
files = [file for file in os.listdir('data/2017') if file.endswith('_Qs.pkl')]

for filename in tqdm(files):
    with open(f'data/2017/{filename}', 'rb') as f:
        dict_ = pkl.load(f)
        
    for k, v in dict_.items():
        if k not in giant_dict:
            giant_dict[k] = [v[0], v[1], v[2], v[3]]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1900.0), HTML(value='')))




In [42]:
giant_dict

{1916: [42, 23.07936507936508, 47.04761904761905, 1.2857142857142858],
 1926: [2, 0.0, 0.0, 1.0],
 1925: [2, 0.0, 0.0, 1.0],
 1930: [2, 36.0, 72.0, 2.0],
 1928: [2, 1473.3333333333333, 2286.0, 3.0],
 1931: [3, 0.0, 0.0, 1.0],
 1906: [76883, 129.94820789845795, 258.3957181691661, 1.132786181600614],
 1903: [16029, 64.07557032046083, 126.50901491047476, 1.0772973984652816],
 1907: [523, 134.55736137667304, 269.10133843212236, 1.1644359464627152],
 1909: [2047, 67.2041914363116, 136.9369809477284, 1.5578895945285784],
 1935: [2, 0.0, 0.0, 1.0],
 1932: [2, 0.0, 0.0, 1.0],
 5511: [55, 90.90181818181817, 188.8181818181818, 2.018181818181818],
 5390: [2, 7.0, 14.0, 2.0],
 5396: [2, 42.666666666666664, 87.0, 3.0],
 5395: [1, 0.0, 0.0, 1.0],
 5432: [3, 84.5, 169.0, 2.0],
 5426: [2, 0.0, 0.0, 1.0],
 5433: [2, 155.5, 311.0, 2.0],
 5427: [19, 1437.136842105263, 2182.7368421052633, 2.263157894736842],
 5428: [2, 0.0, 0.0, 1.0],
 5429: [742, 14.207929020664869, 25.9177897574124, 1.3409703504043127],

In [43]:
with open('data/query_stats.pkl', 'wb') as f:
    pkl.dump(giant_dict, f)

In [11]:
giant_dict['pravo']

[413384,
 72694,
 {1350: [4, 2, [3, 3], [0, 0]],
  1376: [13, 3, [7, 7, 7], [35, 35, 35]],
  4000: [2, 0, [], []],
  1436: [2, 0, [], []],
  2029: [2, 0, [], []],
  448: [10, 0, [], []],
  2178: [4, 0, [], []],
  4674: [2, 0, [], []]},
 [0,
  0,
  5,
  5,
  3,
  3,
  0,
  0,
  2,
  6,
  6,
  6,
  1,
  1,
  1,
  1,
  1,
  1,
  7,
  7,
  7,
  5,
  5,
  3,
  3,
  4,
  4,
  3,
  1,
  1,
  1,
  4,
  4,
  2,
  5,
  5,
  5,
  5,
  9,
  9,
  9,
  2,
  2,
  2,
  3,
  3,
  5,
  5,
  3,
  3,
  1,
  4,
  5,
  6,
  6,
  5,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  0,
  0,
  0,
  7,
  7,
  4,
  4,
  6,
  6,
  4,
  4,
  3,
  3,
  3,
  3,
  3,
  3,
  1,
  1,
  1,
  6,
  6,
  2,
  2,
  7,
  7,
  1,
  1,
  7,
  5,
  5,
  0,
  0,
  4,
  4,
  2,
  2,
  7,
  7,
  9,
  9,
  9,
  9,
  3,
  3,
  1,
  0,
  0,
  0,
  8,
  5,
  5,
  1,
  4,
  4,
  3,
  3,
  3,
  1,
  3,
  3,
  8,
  1,
  1,
  2,
  3,
  3,
  0,
  0,
  1,
  1,
  2,
  2,
  0,
  0,
  1,
  1,
  4,
  4,
  2,
  2,

In [2]:
import pickle as pkl
with open('data/clicks_times.pkl', 'rb') as f:
    click_data_times = pkl.load(f)

In [3]:
import numpy as np
from tqdm.notebook import tqdm

In [12]:
def process_value(k, v):
    value = v
    value[3] = np.mean(value[3]) if value[3] else -1 
    
    value[4] = np.mean(value[4]) if value[4] else -1
    
    for q in value[2]:
        value[2][q][2] = np.mean(value[2][q][2]) if value[2][q][2] else -1 
        
        value[2][q][3] = np.mean(value[2][q][3]) if value[2][q][3] else -1
        
    return k, v

In [15]:
from joblib import Parallel, delayed
results = Parallel(n_jobs=8)(delayed(process_value)(k, v) for k, v in click_data_times.items())

In [17]:
results_dict = {k: v for k, v in results}

In [18]:
len(results_dict)

1847528

In [19]:
import pickle as pkl
with open('data/clicks_times_processed.pkl', 'wb') as f:
    pkl.dump(results_dict, f)