In [1]:
import os
import bz2
import pickle as pkl

import pandas as pd
import tldextract
from tqdm.notebook import tqdm
from joblib import Parallel, delayed

In [2]:
files = [file for file in os.listdir('data/2017') if file.endswith('bz2')]

In [3]:
queries = {row[1]:row[0] for i, row in pd.read_csv('data/queries.tsv', sep='\t', header=None).iterrows()}

In [4]:
def process(filename):
    out = {}
    error_count = 0
    
    with bz2.open(f'data/2017/{filename}', 'rt') as f:
        for line in tqdm(f):
            try:
                query, shown, clicked, timestamps = line.split('\t')
                
                query_extracted = query.split('@')[0]

                if query_extracted in queries:
                    query_id = queries[query_extracted]
                else:
                    query_id = None

                shown = shown.split(',h')
                shown[0] = shown[0][1:]
                clicked = clicked.split(',h')
                
                if len(clicked) != 0:
                    clicked[0] = clicked[0][1:]
                    
                shown2idx = {url:i for i, url in enumerate(shown)}
                    
                timestamps = [int(timestamp) for timestamp in timestamps.strip().split(',')]
                timestamps = [int((timestamp - timestamps[0])/1000) for timestamp in timestamps]
                positions = [shown2idx[click] for click in clicked]
                
                clicked_set = set(clicked)
                click2idx = {url:i for i, url in enumerate(clicked)}

                for url in shown:
                    domain = tldextract.extract('h'+url).domain

                    if domain not in out:
                        out[domain] = [0, 0, {}, [], []]

                    out[domain][0] += 1
                    out[domain][1] += int(url in clicked_set)

                    if query_id:
                        if query_id not in out[domain][2]:
                            out[domain][2][query_id] = [0, 0, [], []]

                        out[domain][2][query_id][0] += 1
                        out[domain][2][query_id][1] += int(url in clicked)
                        
                        if url in clicked_set:
                            out[domain][2][query_id][2].append(positions[click2idx[url]])
                            out[domain][2][query_id][3].append(timestamps[click2idx[url]])
                    
                    if url in clicked_set:
                        out[domain][3].append(positions[click2idx[url]])
                        out[domain][4].append(timestamps[click2idx[url]])
                        
            except:
                error_count += 1
            
    with open(f'data/2017/{filename}_DOMAIN_TIMES.pkl', 'wb') as f:
        pkl.dump(out, f)
        
    return error_count

In [6]:
Parallel(n_jobs=8)(delayed(process)(filename) for filename in files)

[7099,
 577,
 12630,
 5123,
 15354,
 12037,
 3695,
 6388,
 23487,
 1345,
 39712,
 13013,
 64562,
 3316,
 46062,
 10004,
 41965,
 8680,
 56131,
 14709,
 6219,
 17263,
 35412,
 13350,
 3159,
 8836,
 17937,
 16680,
 438155,
 15037,
 8730,
 75938,
 10014,
 1941,
 5477,
 1588,
 17498,
 21731,
 43648,
 17391,
 175,
 12910,
 5124,
 12939,
 86969,
 1868,
 37495,
 3643,
 18926,
 21514,
 283622,
 16055,
 45562,
 7790,
 42798,
 18498,
 16126,
 7534,
 31983,
 9256,
 20277,
 7828,
 237840,
 9226,
 68463,
 6471,
 13840,
 2914,
 4541,
 504,
 78961,
 17003,
 2879,
 9208,
 15141,
 12221,
 3111,
 57748,
 599,
 3169,
 50959,
 17000,
 28956,
 421157,
 11837,
 940,
 428215,
 273,
 60723,
 36252,
 14347,
 9399,
 24650,
 3434,
 2977,
 23178,
 12621,
 104511,
 8925,
 383611,
 35996,
 26512,
 1399,
 23038,
 15919,
 36170,
 16835,
 8988,
 11330,
 4593,
 10561,
 14277,
 29747,
 1340,
 29637,
 10306,
 6748,
 43102,
 153182,
 20195,
 25513,
 20360,
 36810,
 33288,
 12587,
 86871,
 32056,
 56858,
 324660,
 96211,
 

In [7]:
giant_dict = {}

In [8]:
files = [file for file in os.listdir('data/2017') if file.endswith('_TIMES.pkl')]

for filename in tqdm(files):
    with open(f'data/2017/{filename}', 'rb') as f:
        dict_ = pkl.load(f)
        
    for k, v in dict_.items():
        if k not in giant_dict:
            giant_dict[k] = v
        else:
            giant_dict[k][0] += v[0]
            giant_dict[k][1] += v[1]

            giant_dict[k][3] += v[3]
            giant_dict[k][4] += v[4]

            for q, occ in v[2].items():
                if q not in giant_dict[k][2]:
                    giant_dict[k][2][q] = occ
                else:
                    giant_dict[k][2][q][0] += occ[0]
                    giant_dict[k][2][q][1] += occ[1]
                    giant_dict[k][2][q][2] += occ[2]
                    giant_dict[k][2][q][3] += occ[3]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1900.0), HTML(value='')))




In [None]:
giant_dict

In [None]:
with open('data/clicks_times.pkl', 'wb') as f:
    pkl.dump(giant_dict, f)

In [11]:
giant_dict['pravo']

[413384,
 72694,
 {1350: [4, 2, [3, 3], [0, 0]],
  1376: [13, 3, [7, 7, 7], [35, 35, 35]],
  4000: [2, 0, [], []],
  1436: [2, 0, [], []],
  2029: [2, 0, [], []],
  448: [10, 0, [], []],
  2178: [4, 0, [], []],
  4674: [2, 0, [], []]},
 [0,
  0,
  5,
  5,
  3,
  3,
  0,
  0,
  2,
  6,
  6,
  6,
  1,
  1,
  1,
  1,
  1,
  1,
  7,
  7,
  7,
  5,
  5,
  3,
  3,
  4,
  4,
  3,
  1,
  1,
  1,
  4,
  4,
  2,
  5,
  5,
  5,
  5,
  9,
  9,
  9,
  2,
  2,
  2,
  3,
  3,
  5,
  5,
  3,
  3,
  1,
  4,
  5,
  6,
  6,
  5,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  0,
  0,
  0,
  7,
  7,
  4,
  4,
  6,
  6,
  4,
  4,
  3,
  3,
  3,
  3,
  3,
  3,
  1,
  1,
  1,
  6,
  6,
  2,
  2,
  7,
  7,
  1,
  1,
  7,
  5,
  5,
  0,
  0,
  4,
  4,
  2,
  2,
  7,
  7,
  9,
  9,
  9,
  9,
  3,
  3,
  1,
  0,
  0,
  0,
  8,
  5,
  5,
  1,
  4,
  4,
  3,
  3,
  3,
  1,
  3,
  3,
  8,
  1,
  1,
  2,
  3,
  3,
  0,
  0,
  1,
  1,
  2,
  2,
  0,
  0,
  1,
  1,
  4,
  4,
  2,
  2,

In [2]:
import pickle as pkl
with open('data/clicks_times.pkl', 'rb') as f:
    click_data_times = pkl.load(f)

In [3]:
import numpy as np
from tqdm.notebook import tqdm

In [12]:
def process_value(k, v):
    value = v
    value[3] = np.mean(value[3]) if value[3] else -1 
    
    value[4] = np.mean(value[4]) if value[4] else -1
    
    for q in value[2]:
        value[2][q][2] = np.mean(value[2][q][2]) if value[2][q][2] else -1 
        
        value[2][q][3] = np.mean(value[2][q][3]) if value[2][q][3] else -1
        
    return k, v

In [15]:
from joblib import Parallel, delayed
results = Parallel(n_jobs=8)(delayed(process_value)(k, v) for k, v in click_data_times.items())

In [17]:
results_dict = {k: v for k, v in results}

In [18]:
len(results_dict)

1847528

In [19]:
import pickle as pkl
with open('data/clicks_times_processed.pkl', 'wb') as f:
    pkl.dump(results_dict, f)