In [1]:
import sqlite3
import pandas as pd
import re
import json
import numpy as np
import os
import pickle
from os.path import dirname, abspath, join, isfile
pd.options.display.max_colwidth = 200
pd.options.display.html.use_mathjax = False


In [2]:
SEGMENTS_QUERY = '''
    SELECT sv.site_url, sv.visit_id, se.node_id,
        se.top, se.left, se.width, se.height, se.inner_text, se.time_stamp
        FROM SEGMENTS as se LEFT JOIN site_visits as sv ON se.visit_id = sv.visit_id
        WHERE se.node_name != 'BODY' and se.inner_text GLOB '*[0-9]*';
    '''

def load_segments(crawler_name, check_cache=True, dump_pickle=True):
    assert crawler_name in ["odin", "webtap"]
    pickle_file = "%s_segments.pickle" % crawler_name
    if check_cache and isfile(pickle_file):
        print("Will load segments from pickle %s" % pickle_file)
        return pd.read_pickle(pickle_file)
    final_crawl_dir = join(dirname(dirname(os.getcwd())), 'data', 'final-crawl')
    db_path = join(final_crawl_dir, "%s.sqlite" % crawler_name)
    con = sqlite3.connect(db_path)
    segments = pd.read_sql_query(SEGMENTS_QUERY, con)
    if dump_pickle:
        segments.to_pickle(pickle_file)
    return segments

def preprocess_segments(segments):
    segments['time_stamp'] = pd.to_datetime(segments['time_stamp'])
    segments['inner_processed'] = segments['inner_text'].map(lambda x: re.sub(r'\d+', 'DPNUM', x))
    segments['inner_digits'] = segments['inner_text'].map(lambda x: re.sub(r'\D+', '', x))

In [3]:
def join_with_comma(series):
    return reduce(lambda x, y: x + "," + y, series)

def difference(series):
    # return reduce(lambda x, y: int(x)-int(y), series)
    return [int(j)-int(i) for i, j in zip(series[:-1], series[1:])]

def time_difference(series):
    # return reduce(lambda x, y: int(x)-int(y), series)
    return [(j-i).total_seconds() for i, j in zip(series[:-1], series[1:])]

TIMER_MIN_NEG_POS_UPDATE_RATIO = 5  # there should be 5X more -ve updates than +ve
from collections import Counter

def most_common(diffs):
    if not diffs: return None
    counts = Counter(diffs)
    return counts.most_common(1)[0][0]

def most_common_neg(diffs):
    if not diffs: return None
    neg_diffs = [x for x in diffs if x <0]
    if not neg_diffs: return None
    counts = Counter(neg_diffs)
    return counts.most_common(1)[0][0]

def num_most_common_neg(diffs):
    neg_mode = most_common_neg(diffs)
    if not neg_mode: return 0
    return diffs.count(neg_mode)


def is_decreasing(series):
    diffs = difference(series)
    if not diffs: return False
    #n_negs = sum([1 for diff in diffs if diff<0])  # diff==59, 59 when seconds 
    n_negs = sum([1 for diff in diffs if diff<0 and diff not in [59, 5, 9]])
    n_pos = sum([1 for diff in diffs if diff>0])
    n_zeroes = diffs.count(0)
    if n_negs < 5: return False  # fewer than 5 decreasing updates
    if not n_pos: return True
    return float(n_negs)/n_pos > TIMER_MIN_NEG_POS_UPDATE_RATIO


def is_decreasing_relaxed(series):
    diffs = difference(series)
    if not diffs: return False
    #n_negs = sum([1 for diff in diffs if diff<0])  # diff==59, 59 when seconds 
    n_negs = sum([1 for diff in diffs if diff<0 and diff not in [59, 5, 9]])
    n_pos = sum([1 for diff in diffs if diff>0])
    n_zeroes = diffs.count(0)
    if n_negs < 5: return False  # fewer than 5 decreasing updates
    if not n_pos: return True
    return n_negs > n_pos


def is_decreasing_mode(series):
    diffs = difference(series)
    if not diffs: return False
    if len(set(series)) < 5: return False
    # 10->09, 00->59
    n_negs = sum([1 for diff in diffs if diff<0 and diff not in [59, 5, 9]])
    n_pos = sum([1 for diff in diffs if diff>0])
    n_zeroes = diffs.count(0)
    if n_negs < 5: return False  # fewer than 5 decreasing updates
    if not n_pos: return True
    mode = most_common(diffs)
    if mode > 0: return False
    neg_mode_cnt = num_most_common_neg(diffs)
    if neg_mode_cnt < 5: return False
    return n_negs > n_pos
    #return n_negs/n_post > TIMER_MIN_DECREASING_UPDATES_RATIO


def neg_pos_ratio(series):
    diffs = difference(series)
    if not diffs: return False
    n_negs = sum([1 for diff in diffs if diff<0])  # diff==59, 59 when seconds 
    n_pos = sum([1 for diff in diffs if diff>0])
    n_zeroes = diffs.count(0)
    if n_negs < 5: return False  # fewer than 5 decreasing updates
    if not n_pos: return True
    return float(n_negs) / n_pos
    #return n_negs/n_post > TIMER_MIN_DECREASING_UPDATES_RATIO


def n_neg_diffs(series):
    diffs = difference(series)
    if not diffs: return 0
    return sum([1 for diff in diffs if diff<0])

def num_unique(series):
    return len(set(series))

from collections import OrderedDict

def ts_check(series):
    ts_seconds = [int(ts.timestamp()) for ts in series]
    uniq_ts_seconds = list(OrderedDict.fromkeys(ts_seconds))
    return len(uniq_ts_seconds) >= 5


## Grouping

In [4]:
def detect_timers(segments, limit=None):
    tmp = segments
    if limit:
        tmp = segments.head(limit)
    segments_grouped = tmp.\
        groupby(['visit_id', 'top', 'left', 'inner_processed'], as_index=False).\
        agg({'node_id': num_unique,
             'time_stamp': ts_check,
             'inner_digits': [is_decreasing, is_decreasing_mode, is_decreasing_relaxed], 'site_url': 'first'})
    segments_grouped.columns = segments_grouped.columns.map('_'.join)
    timers = segments_grouped[segments_grouped.inner_digits_is_decreasing &
                              segments_grouped.inner_digits_is_decreasing_mode &
                              segments_grouped.time_stamp_ts_check]
    return timers, segments_grouped

In [8]:
def dump_timer_urls(timers, crawler_name):
    pd.Series(timers.site_url_first.unique()).\
        to_csv("%s_timer_urls.csv" % crawler_name, sep='\t', index=False)
    

def get_timers(crawler_name, disable_cache=False):
    pickle_path = "%s_grouped_segments.pickle" % crawler_name
    if isfile(pickle_path) and not disable_cache:
        print("Will load grouped segments from pickle %s" % pickle_path)
        grouped_segments = pd.read_pickle(pickle_path)
        timers = grouped_segments[
            grouped_segments.inner_digits_is_decreasing &
            grouped_segments.inner_digits_is_decreasing_mode &
            grouped_segments.time_stamp_ts_check]
        return timers, grouped_segments
    else:
        segments = load_segments(crawler_name)
        preprocess_segments(segments)
        timers, grouped_segments = detect_timers(segments)
        dump_timer_urls(timers, crawler_name)
        grouped_segments.to_pickle(pickle_path)
        return timers, grouped_segments


## Run timer detection

In [9]:
odin_timers, odin_segments_grouped = get_timers("odin")


Will load segments from pickle odin_segments.pickle


In [10]:
webtap_timers, webtap_segments_grouped = get_timers("webtap")
odin_timers, odin_segments_grouped = get_timers("odin")
all_timers = pd.concat([webtap_timers, odin_timers])
all_segments_grouped = pd.concat([webtap_segments_grouped, odin_segments_grouped])

Will load grouped segments from pickle webtap_grouped_segments.pickle
Will load grouped segments from pickle odin_grouped_segments.pickle


In [18]:
len(set(list(odin_timers.site_url_first.unique()) + list(webtap_timers.site_url_first.unique())))

1618

In [24]:
all_timers.site_url_first.nunique()
len(all_timers.drop_duplicates('site_url_first'))

1618

In [27]:
all_timers.drop_duplicates('site_url_first').sort_values(
    'site_url_first')[['visit_id_', 'top_', 'left_', 'site_url_first']].to_csv(
    "timer_coords.csv", sep='\t', index=False, header=False)

### Pickle grouped segments for verification

In [35]:
import numpy


In [22]:
a = ['b', 'a']
a.sort()
a
pd.Series(all_urls)

NameError: name 'all_urls' is not defined

In [23]:
all_urls = list(all_timers.site_url_first.unique())
all_urls.sort()
pd.Series(all_urls).to_csv("timer_urls.csv", index=False)

In [46]:
URL_CSV_CNT=8
for num, urls in enumerate(numpy.array_split(all_urls, URL_CSV_CNT)):
    pd.Series(urls).to_csv("timer_urls_%d.csv" % (num+1), sep='\t', index=False)

In [7]:
! wc -l  timer_urls*.csv
! wc -l  *.pickle

   886 timer_urls.csv
   203 timer_urls_1.csv
   203 timer_urls_2.csv
   202 timer_urls_3.csv
   202 timer_urls_4.csv
   202 timer_urls_5.csv
   202 timer_urls_6.csv
   202 timer_urls_7.csv
   202 timer_urls_8.csv
  2504 total
   1596553 odin_grouped_segments.pickle
  11612039 odin_segments.pickle
   1459610 webtap_grouped_segments.pickle
  12456395 webtap_segments.pickle
  27124597 total


In [None]:
def dump_timer_urls(webtap_timers, odin_timers):
    
    pd.Series(timers.site_url_first.unique()).\
        to_csv("%s_timer_urls.csv" % crawler_name, sep='\t', index=False)
    


In [51]:
! ls -lh *.pickle

-rw-rw-r-- 1 gacar gacar 272M Feb 25 18:21 odin_grouped_segments.pickle
-rw-rw-r-- 1 gacar gacar 1.1G Feb 25 16:40 odin_segments.pickle
-rw-rw-r-- 1 gacar gacar 272M Feb 25 18:21 webtap_grouped_segments.pickle
-rw-rw-r-- 1 gacar gacar 870M Feb 25 16:33 webtap_segments.pickle


In [7]:
webtap_timers.inner_processed_.head()

Series([], Name: inner_processed_, dtype: object)

In [7]:
## The effect of different approaches
print segments_grouped[segments_grouped.inner_digits_is_decreasing].visit_id_.nunique()
print segments_grouped[segments_grouped.inner_digits_is_decreasing & segments_grouped.inner_digits_is_decreasing_mode].visit_id_.nunique()
print timers.visit_id_.nunique()

888
888
886


In [None]:
# Where different methods disagree
segments_grouped[~segments_grouped.inner_digits_is_decreasing & segments_grouped.inner_digits_is_decreasing_relaxed]

In [21]:
segments_grouped[~segments_grouped.inner_digits_is_decreasing & segments_grouped.inner_digits_is_decreasing_mode]

Unnamed: 0,visit_id_,top_,left_,inner_processed_,inner_text_list,inner_text_num_unique,time_stamp_avg_delta,time_stamp_ts_check,time_stamp_std_dev,time_stamp_uniq_ts_seconds,...,site_url_first,inner_digits_len,inner_digits_neg_pos_ratio,inner_digits_n_neg_diffs,inner_digits_list,inner_digits_is_decreasing,inner_digits_is_decreasing_mode,inner_digits_is_decreasing_relaxed,inner_digits_difference,digits_n_unique
12891,243,608,845,DPNUM\nDPNUM\nHours\n\t\nDPNUM\nDPNUM\nMinutes\n\t\nDPNUM\nDPNUM\nDPNUM\nDPNUM\nSeconds\n\t\nDPNUM\nDPNUM\nDPNUM\nDPNUM\nDPNUM/DPNUM s,"[0\n1\nHours\n\t\n4\n6\nMinutes\n\t\n3\n2\n0\n9\nSeconds\n\t\n3\n0\n4\n0\n1/10 s, 0\n1\nHours\n\t\n4\n6\nMinutes\n\t\n3\n2\n0\n9\nSeconds\n\t\n0\n8\n4\n0\n1/10 s, 0\n1\nHours\n\t\n4\n6\nMinutes\n\...",562,1.000000,True,0.190557,"[1549507249, 1549507250, 1549507251, 1549507252, 1549507253, 1549507254, 1549507255, 1549507256, 1549507257, 1549507258, 1549507259, 1549507260, 1549507261, 1549507262, 1549507263, 1549507264, 154...",...,https://rosymerry.com/collections/hot-dress-best-selling/products/zipper-cashmere-solid-sweet-long-sleeve-hoodie-teddy-bear-coats,2319,2.14381,641,"[014632093040110, 014632090840110, 014632090840110, 014632090840110, 014632098640110, 014632098606110, 014632098606110, 014632098606110, 014632098606110, 014632096306110, 014632096362110, 01463209...",False,True,True,"[-2200000, 0, 0, 7800000, -34000, 0, 0, 0, -2300000, 56000, 0, 0, 0, -3200000, -34000, 0, 0, 0, 56000, 0, 0, -1300000, 6800000, 0, -44000, 0, 0, -2200000, -34000, 0, 0, 0, 56000, 0, 0, -2300000, 0...",562
12892,243,608,845,DPNUM\nDPNUM\nHours\n\t\nDPNUM\nDPNUM\nMinutes\n\t\nDPNUM\nDPNUM\nDPNUM\nSeconds\n\t\nDPNUM\nDPNUM\nDPNUM\nDPNUM\nDPNUM/DPNUM s,"[0\n1\nHours\n\t\n4\n6\nMinutes\n\t\n3\n8\n7\nSeconds\n\t\n0\n7\n0\n6\n1/10 s, 0\n1\nHours\n\t\n4\n6\nMinutes\n\t\n3\n8\n7\nSeconds\n\t\n0\n7\n0\n6\n1/10 s, 0\n1\nHours\n\t\n4\n6\nMinutes\n\t\n3\n...",426,0.500000,True,0.000000,"[1549507211, 1549507212, 1549507213, 1549507214, 1549507215, 1549507216, 1549507217, 1549507218, 1549507219, 1549507220, 1549507221, 1549507222, 1549507223, 1549507224, 1549507225, 1549507226, 154...",...,https://rosymerry.com/collections/hot-dress-best-selling/products/zipper-cashmere-solid-sweet-long-sleeve-hoodie-teddy-bear-coats,1659,2.14953,460,"[01463870706110, 01463870706110, 01463870762110, 01463870762110, 01463870762110, 01463877562110, 01463877562110, 01463877528110, 01463877528110, 01463877528110, 01463875228110, 01463875228110, 014...",False,True,True,"[0, 56000, 0, 0, 6800000, 0, -34000, 0, 0, -2300000, 0, 56000, 0, -3200000, 0, 0, -44000, 0, 0, -1200000, 0, -34000, 0, 0, 7700000, 0, 56000, 0, 0, -3200000, 0, -34000, 0, 0, -2300000, 0, 56000, 0...",426
86349,1668,608,845,DPNUM\nDPNUM\nHours\n\t\nDPNUM\nDPNUM\nMinutes\n\t\nDPNUM\nDPNUM\nDPNUM\nDPNUM\nSeconds\n\t\nDPNUM\nDPNUM\nDPNUM\nDPNUM\nDPNUM/DPNUM s,"[0\n2\nHours\n\t\n0\n1\nMinutes\n\t\n2\n1\n0\n9\nSeconds\n\t\n0\n7\n0\n6\n1/10 s, 0\n2\nHours\n\t\n0\n1\nMinutes\n\t\n2\n1\n0\n9\nSeconds\n\t\n0\n7\n0\n6\n1/10 s, 0\n2\nHours\n\t\n0\n1\nMinutes\n\...",788,1.000000,True,0.137335,"[1549525051, 1549525052, 1549525054, 1549525055, 1549525056, 1549525057, 1549525058, 1549525059, 1549525060, 1549525061, 1549525062, 1549525063, 1549525064, 1549525065, 1549525066, 1549525067, 154...",...,https://rosymerry.com/collections/dresses/products/casual-solid-v-neck-long-sleeve-maxi-dress,3219,2.13494,886,"[020121090706110, 020121090706110, 020121090706110, 020121090762110, 020121090762110, 020121090762110, 020121097562110, 020121097562110, 020121097528110, 020121097528110, 020121097528110, 02012109...",False,True,True,"[0, 0, 56000, 0, 0, 6800000, 0, -34000, 0, 0, -2300000, 0, 56000, 0, 0, -3200000, 0, -44000, 0, 0, -1200000, 0, -34000, 0, 0, 7700000, 0, 56000, 0, 0, -3200000, 0, -34000, 0, 0, -2300000, 0, 56000...",788
120484,2388,687,1243,DPNUM,"[9, 0, 9, 9, 9, 9, 0, 9, 0, 9, 8, 8, 8, 8, 9, 8, 7, 7, 7, 7, 8, 7, 6, 6, 6, 6, 7, 6, 5, 5, 5, 5, 6, 5, 4, 4, 4, 4, 5, 4, 3, 3, 3, 3, 4, 3, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 2, 1, 0, 0, 0, 0, 1, 0, 9, ...",10,1.333333,True,0.242061,"[1549533606, 1549533607, 1549533608, 1549533610, 1549533611, 1549533612, 1549533613, 1549533614, 1549533615, 1549533616, 1549533617, 1549533618, 1549533619, 1549533620, 1549533621, 1549533622, 154...",...,https://swagtron.com/product/swagtron-t380-hoverboard-bluetooth-speaker-lights-personalize-experience-w-android-ios-app/,106,1.7,34,"[9, 0, 9, 9, 9, 9, 0, 9, 0, 9, 8, 8, 8, 8, 9, 8, 7, 7, 7, 7, 8, 7, 6, 6, 6, 6, 7, 6, 5, 5, 5, 5, 6, 5, 4, 4, 4, 4, 5, 4, 3, 3, 3, 3, 4, 3, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 2, 1, 0, 0, 0, 0, 1, 0, 9, ...",False,True,True,"[-9, 9, 0, 0, 0, -9, 9, -9, 9, -1, 0, 0, 0, 1, -1, -1, 0, 0, 0, 1, -1, -1, 0, 0, 0, 1, -1, -1, 0, 0, 0, 1, -1, -1, 0, 0, 0, 1, -1, -1, 0, 0, 0, 1, -1, -1, 0, 0, 0, 1, -1, -1, 0, 0, 0, 1, -1, -1, 0...",10
154008,3047,193,531,DPNUM,"[7, 7, 8, 8, 7, 7, 7, 7, 8, 8, 7, 8, 8, 6, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1]",6,8.400000,True,10.965400,"[1549541414, 1549541418, 1549541448, 1549541449, 1549541450, 1549541456]",...,https://sexyrealsexdolls.com/collections/featured-sex-dolls/products/156cm-5ft1-b-cup-ssbbw-love-doll-with-huge-butt-and-small-tits-autumn,61,1.4,7,"[7, 7, 8, 8, 7, 7, 7, 7, 8, 8, 7, 8, 8, 6, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1]",False,True,True,"[0, 1, 0, -1, 0, 0, 0, 1, 0, -1, 1, 0, -2, -1, 0, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 1, 0, -1, 1, 0]",6
204542,4048,1470,469,"$DPNUM,DPNUM","[$1,200, $1,200, $3,699, $6,399, $3,600, $2,400, $1,200, $3,699, $6,399, $3,600, $2,400, $1,200, $3,699, $6,399, $3,600, $2,400, $1,200, $3,699, $6,399]",5,3.000000,True,0.687184,"[1549553459, 1549553460, 1549553464, 1549553468, 1549553472, 1549553476, 1549553480, 1549553484, 1549553488, 1549553492, 1549553496, 1549553500, 1549553504, 1549553508, 1549553512, 1549553516, 154...",...,https://www.michaelhill.com.au/ring-with-4-carat-tw-of-diamonds-in-10ct-white-gold-15850713.html?cgid=simply-irresistible,19,1.125,9,"[1200, 1200, 3699, 6399, 3600, 2400, 1200, 3699, 6399, 3600, 2400, 1200, 3699, 6399, 3600, 2400, 1200, 3699, 6399]",False,True,True,"[0, 2499, 2700, -2799, -1200, -1200, 2499, 2700, -2799, -1200, -1200, 2499, 2700, -2799, -1200, -1200, 2499, 2700]",5
280335,5484,1380,277,$DPNUM.DPNUM,"[$19.99, $19.99, $19.99, $29.99, $29.99, $29.99, $29.99, $29.99, $29.99, $24.99, $24.99, $24.99, $14.99, $14.99, $24.99, $24.99, $14.99, $14.99, $39.99, $39.99, $39.99, $39.99, $39.99, $34.99, $34...",7,6.111111,True,2.366378,"[1549572062, 1549572065, 1549572069, 1549572073, 1549572075, 1549572078, 1549572079, 1549572083, 1549572087, 1549572088, 1549572092, 1549572093, 1549572097, 1549572098, 1549572103, 1549572104, 154...",...,https://www.bouclair.com/en/marketplace/shop-all/decorative-pillows/inhale-decorative-lumbar-pillow-11-x-21-9396431M.html,82,1.16667,14,"[1999, 1999, 1999, 2999, 2999, 2999, 2999, 2999, 2999, 2499, 2499, 2499, 1499, 1499, 2499, 2499, 1499, 1499, 3999, 3999, 3999, 3999, 3999, 3499, 3499, 1999, 1999, 1999, 3499, 3499, 3499, 1299, 129...",False,True,True,"[0, 0, 1000, 0, 0, 0, 0, 0, -500, 0, 0, -1000, 0, 1000, 0, -1000, 0, 2500, 0, 0, 0, 0, -500, 0, -1500, 0, 0, 1500, 0, 0, -2200, 0, 0, 1700, 0, 0, -1700, 0, 0, 1700, 0, 0, -1000, 0, 0, 500, 0, -500...",7
280353,5484,1380,467,$DPNUM.DPNUM,"[$29.99, $29.99, $29.99, $29.99, $29.99, $29.99, $24.99, $24.99, $24.99, $14.99, $14.99, $14.99, $24.99, $24.99, $14.99, $14.99, $39.99, $39.99, $39.99, $39.99, $39.99, $34.99, $34.99, $19.99, $19...",7,5.555556,True,2.183132,"[1549572062, 1549572065, 1549572069, 1549572073, 1549572075, 1549572078, 1549572079, 1549572083, 1549572087, 1549572088, 1549572092, 1549572093, 1549572097, 1549572098, 1549572103, 1549572104, 154...",...,https://www.bouclair.com/en/marketplace/shop-all/decorative-pillows/inhale-decorative-lumbar-pillow-11-x-21-9396431M.html,80,1.36364,15,"[2999, 2999, 2999, 2999, 2999, 2999, 2499, 2499, 2499, 1499, 1499, 1499, 2499, 2499, 1499, 1499, 3999, 3999, 3999, 3999, 3999, 3499, 3499, 1999, 1999, 3499, 3499, 3499, 1299, 1299, 1299, 2999, 299...",False,True,True,"[0, 0, 0, 0, 0, -500, 0, 0, -1000, 0, 0, 1000, 0, -1000, 0, 2500, 0, 0, 0, 0, -500, 0, -1500, 0, 1500, 0, 0, -2200, 0, 0, 1700, 0, 0, -1700, 0, 0, 1700, 0, 0, -1000, 0, 0, 500, 0, 0, -500, 0, -700...",7
280370,5484,1380,657,$DPNUM.DPNUM,"[$29.99, $29.99, $24.99, $24.99, $14.99, $14.99, $24.99, $24.99, $14.99, $39.99, $39.99, $34.99, $34.99, $19.99, $34.99, $12.99, $12.99, $29.99, $29.99, $12.99, $12.99, $29.99, $29.99, $19.99, $19...",7,6.111111,True,2.045912,"[1549572062, 1549572065, 1549572069, 1549572073, 1549572075, 1549572078, 1549572083, 1549572087, 1549572092, 1549572097, 1549572103, 1549572105, 1549572110, 1549572116, 1549572122, 1549572124, 154...",...,https://www.bouclair.com/en/marketplace/shop-all/decorative-pillows/inhale-decorative-lumbar-pillow-11-x-21-9396431M.html,50,1.36364,15,"[2999, 2999, 2499, 2499, 1499, 1499, 2499, 2499, 1499, 3999, 3999, 3499, 3499, 1999, 3499, 1299, 1299, 2999, 2999, 1299, 1299, 2999, 2999, 1999, 1999, 2499, 2499, 1999, 1999, 1299, 1299, 1299, 129...",False,True,True,"[0, -500, 0, -1000, 0, 1000, 0, -1000, 2500, 0, -500, 0, -1500, 1500, -2200, 0, 1700, 0, -1700, 0, 1700, 0, -1000, 0, 500, 0, -500, 0, -700, 0, 0, 0, 0, 1200, 1500, -2700, 2200, -500, 0, -1700, 12...",7
280387,5484,1380,847,$DPNUM.DPNUM,"[$24.99, $24.99, $14.99, $14.99, $24.99, $24.99, $14.99, $14.99, $39.99, $39.99, $34.99, $19.99, $19.99, $34.99, $12.99, $29.99, $29.99, $12.99, $12.99, $29.99, $29.99, $19.99, $19.99, $24.99, $24...",7,6.555556,True,2.268443,"[1549572062, 1549572065, 1549572069, 1549572073, 1549572075, 1549572078, 1549572083, 1549572087, 1549572092, 1549572097, 1549572103, 1549572105, 1549572110, 1549572116, 1549572122, 1549572124, 154...",...,https://www.bouclair.com/en/marketplace/shop-all/decorative-pillows/inhale-decorative-lumbar-pillow-11-x-21-9396431M.html,49,1.16667,14,"[2499, 2499, 1499, 1499, 2499, 2499, 1499, 1499, 3999, 3999, 3499, 1999, 1999, 3499, 1299, 2999, 2999, 1299, 1299, 2999, 2999, 1999, 1999, 2499, 2499, 1999, 1999, 1299, 1299, 1299, 1299, 2499, 249...",False,True,True,"[0, -1000, 0, 1000, 0, -1000, 0, 2500, 0, -500, -1500, 0, 1500, -2200, 1700, 0, -1700, 0, 1700, 0, -1000, 0, 500, 0, -500, 0, -700, 0, 0, 0, 1200, 0, 1500, -2700, 2200, -500, -1700, 0, 1200, 1000,...",7


In [18]:
timers.node_id_num_unique.value_counts().head()

1    1070
2     114
3      50
4      25
5      24
Name: node_id_num_unique, dtype: int64

In [16]:
timers.head()

Unnamed: 0,visit_id_,top_,left_,inner_processed_,inner_text_list,inner_text_num_unique,time_stamp_avg_delta,time_stamp_ts_check,time_stamp_std_dev,time_stamp_uniq_ts_seconds,...,site_url_first,inner_digits_len,inner_digits_neg_pos_ratio,inner_digits_n_neg_diffs,inner_digits_list,inner_digits_is_decreasing,inner_digits_is_decreasing_mode,inner_digits_is_decreasing_relaxed,inner_digits_difference,digits_n_unique
105,3,172,859,DPNUM Hours DPNUM Mins DPNUM Secs,"[06 Hours 09 Mins 30 Secs, 06 Hours 09 Mins 29 Secs, 06 Hours 09 Mins 28 Secs, 06 Hours 09 Mins 27 Secs, 06 Hours 09 Mins 26 Secs, 06 Hours 09 Mins 25 Secs, 06 Hours 09 Mins 24 Secs, 06 Hours 09 M...",45,1.666667,True,0.897686,"[1549504231, 1549504232, 1549504233, 1549504234, 1549504235, 1549504236, 1549504237, 1549504238, 1549504239, 1549504240, 1549504241, 1549504242, 1549504243, 1549504244, 1549504245, 1549504246, 154...",...,https://www.rugs-direct.com/Details/OrientalWeavers-Atlas-8037/130600/209717,45,True,44,"[060930, 060929, 060928, 060927, 060926, 060925, 060924, 060923, 060922, 060921, 060920, 060919, 060918, 060917, 060916, 060915, 060914, 060913, 060909, 060908, 060907, 060903, 060902, 060858, 060...",True,True,True,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -4, -1, -1, -4, -1, -44, -1, -1, -4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",45
1781,30,293,0,DPNUM\nDPNUM\nHOURS\n:\nDPNUM\nDPNUM\nDPNUM\nDPNUM\nDPNUM\nDPNUM\nDPNUM\nDPNUM\nMINUTES\n:\nDPNUM\nDPNUM\nDPNUM\nDPNUM\nDPNUM\nDPNUM\nDPNUM\nDPNUM\nSECONDS,"[0\n0\nHOURS\n:\n5\n4\n4\n5\n0\n9\n9\n0\nMINUTES\n:\n0\n5\n5\n0\n8\n7\n7\n8\nSECONDS, 0\n0\nHOURS\n:\n5\n4\n4\n5\n0\n9\n9\n0\nMINUTES\n:\n0\n5\n5\n0\n8\n7\n7\n8\nSECONDS, 0\n0\nHOURS\n:\n5\n4\n4\n...",156,0.5,True,0.0,"[1549504567, 1549504568, 1549504569, 1549504570, 1549504571, 1549504572, 1549504573, 1549504574, 1549504575, 1549504576, 1549504577, 1549504578, 1549504579, 1549504580, 1549504581, 1549504582, 154...",...,http://www.noahscave.com/product/360-degree-rotation-flexible-phone-selfie-holder-snake-like-neck-bed-mount-anti-skid-iphone-android-phones/,489,7.84211,149,"[005445099005508778, 005445099005508778, 005445099005508778, 005445099005507667, 005445099005507667, 005445099005507667, 005445099005506556, 005445099005506556, 005445099005506556, 005445099005505...",True,True,True,"[0, 0, -1111, 0, 0, -1111, 0, 0, -1111, 0, 0, -1111, 0, 0, -1111, 0, 0, -1111, 0, 0, -1111, 0, 0, -11, 0, 0, 8899, 48950000, 0, 0, 0, -1111, 0, 0, -1111, 0, 0, -1111, 0, 0, -1111, 0, 0, -1111, 0, ...",156
4887,81,763,707,DPNUMmins DPNUMsecs,"[14mins 59secs, 14mins 58secs, 14mins 57secs, 14mins 56secs, 14mins 55secs, 14mins 54secs]",6,1.0,True,0.0,"[1549505294, 1549505295, 1549505296, 1549505297, 1549505298, 1549505299]",...,https://www.theproteinworks.com/diet-meal-replacement-extreme,6,True,5,"[1459, 1458, 1457, 1456, 1455, 1454]",True,True,True,"[-1, -1, -1, -1, -1]",6
4899,81,949,239,WANT IT TOMORROW DPNUM FEB?\n\nOrder within DPNUMh DPNUMm DPNUMs and it'll leave us today!,"[WANT IT TOMORROW 8 FEB?\n\nOrder within 14h 53m 10s and it'll leave us today!, WANT IT TOMORROW 8 FEB?\n\nOrder within 14h 53m 8s and it'll leave us today!, WANT IT TOMORROW 8 FEB?\n\nOrder wit...",89,1.0,True,0.0,"[1549505211, 1549505212, 1549505213, 1549505214, 1549505215, 1549505216, 1549505217, 1549505218, 1549505219, 1549505220, 1549505221, 1549505222, 1549505223, 1549505224, 1549505225, 1549505226, 154...",...,https://www.theproteinworks.com/diet-meal-replacement-extreme,89,43,86,"[8145310, 814538, 814537, 814536, 814535, 814534, 814533, 814532, 814531, 814530, 8145259, 8145258, 8145257, 8145256, 8145255, 8145254, 8145253, 8145252, 8145251, 8145250, 8145249, 8145248, 814524...",True,True,True,"[-7330772, -1, -1, -1, -1, -1, -1, -1, -1, 7330729, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, ...",89
9074,169,440,942,DPNUM:DPNUM:DPNUM,"[20:34:34, 20:34:33, 20:34:32, 20:34:31, 20:34:30, 20:34:29, 20:34:28, 20:34:27, 20:34:26, 20:34:25, 20:34:24, 20:34:23, 20:34:22, 20:34:21, 20:34:20, 20:34:19, 20:34:18, 20:34:17, 20:34:16, 20:34...",28,1.0,True,0.0,"[1549506358, 1549506359, 1549506360, 1549506361, 1549506362, 1549506363, 1549506364, 1549506365, 1549506366, 1549506367, 1549506368, 1549506369, 1549506370, 1549506371, 1549506372, 1549506373, 154...",...,https://www.1-day.co.nz/products/smart-powerbank-10000mah-w-dual-usb,28,True,27,"[203434, 203433, 203432, 203431, 203430, 203429, 203428, 203427, 203426, 203425, 203424, 203423, 203422, 203421, 203420, 203419, 203418, 203417, 203416, 203415, 203414, 203413, 203412, 203411, 203...",True,True,True,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",28
