In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from urllib.parse import urlparse,parse_qs
from os.path import splitext, basename, dirname
from urllib.parse import urlparse,parse_qs
from operator import methodcaller as call

In [2]:
def load_dataframe():
    #read panel dataset and drop beacons without cookies
    panel_df=pd.read_json("data/panel_dataset.json.gz", convert_dates=["date"], compression="gzip", lines=True)
    panel_df.loc[panel_df["actor_cookie"]=="NA","actor_cookie"]=""
    panel_df = panel_df[panel_df.actor_cookie != ""]

    #split panel into sub fields
    panel_df["panel_prot"]=panel_df["panel_url"].apply(urlparse).apply(lambda x: x[0])
    panel_df["panel_path"]=panel_df["panel_url"].apply(urlparse).apply(lambda x: (dirname(x[2]) if (splitext(x[2])[1] != '') else x[2]))
    panel_df["panel_ext"]=panel_df["panel_url"].apply(urlparse).apply(lambda x: (basename(x[2]) if (splitext(x[2])[1] != '') else ""))
    panel_df["panel_id"]=panel_df["panel_domain"].str.lstrip("www.")+panel_df["panel_path"].str.rstrip("/").str.replace("bots","").str.replace("dashboard","") # trim trailing /

    #load UA subfields
    uadf=pd.read_json("data/parsed_user_agents.json.gz", compression="gzip", lines=True)
    panel_df=panel_df.merge(uadf,how="left",on="actor_ua").fillna("")
    
    return panel_df

In [3]:
def parse_ver(ver=""):  
    """ Function to parse browser version string into integer for comparision
        :return: an integer representation of browser version. 
    """
    
    ver_numeric=[]
    for ver_i in ver.split("."):
        if ver_i.isnumeric():
            ver_numeric.append(int(ver_i))
        else:
            ver_numeric.append(ver_i)
    return ver_numeric

def merge_panel_cookies(dataframe):
    """ Function to merge cookies into distinct device IDs.
        :dataframe: Pandas dataframe containing panel beacons with columns:[date,cookie,user_agent_operating_system,user_agent_browser,user_agent_browser_version]
        :return: a dictionary of merged cookies (device ids) and their respective merged cookies.
    """
    
    # Initialize variables
    df=dataframe.copy()
    cluster_id=0
    merged_cc={}
    cluster_group={}
    collision=[]
    all_cookies = set(df.actor_cookie.unique())
    merged_cookies = set()
    processed_cookie_clusters = set()
    device_id=0
    merged_devices = {}
    
    # Create graph for calculated connected components of panels and cookies
    df.loc[df.actor_cookie!="","vtheta_label"]="o>"+df[df.actor_cookie!=""].actor_cookie
    df.loc[df.actor_cookie!="","vpi_label"]="p>"+df[df.actor_cookie!=""].panel_id
    c2pG = nx.Graph()
    c2pG.add_nodes_from(df.loc[df.actor_cookie!="","vtheta_label"].unique())
    c2pG.add_nodes_from(df.loc[df.actor_cookie!="","vpi_label"].unique())
    c2pG.add_edges_from(df.loc[df.actor_cookie!="",["vtheta_label","vpi_label"]].drop_duplicates().values)

    # merging algorithm according to A. Dasgupta :“Overcoming browser cookie churn with clustering”
    for c in sorted(nx.connected_components(c2pG), key=len, reverse=True)[:]:
        # For each connected component of panels and cookies looks for potential cookies that can be merged

        cc=c2pG.subgraph(list(c)).copy()
        op=set(filter(call('startswith', 'o>'),filter(lambda x: x == x , c)))
        pn=set(filter(call('startswith', 'p>'),c))

        #Skip components with only one cookie
        if len(op)>1:
            panels=list(map( lambda x: x[2:],pn)) # get list of panel ids and remove the p> chars
            df_l1=df[(df.actor_cookie!="") & ((df.browser!="") | (df.os!="")) & (df.panel_path!="/")].loc[df.panel_id.isin(panels),["date","actor_cookie","os","browser","browser_ver"]]

            # For each cookie in component calculate aggregate stats
            if len(df_l1) != 0:
                df_l1=df_l1.groupby(
                    "actor_cookie",as_index=False).agg(
                    lifespan=("date",lambda x: pd.Interval(x.min(),x.max(),closed="neither")),
                    os_count=("os", "nunique"),
                    bw_count=("browser", "nunique"),
                    bwv_count=("browser_ver", "nunique"),
                    os=("os",lambda x: x.values.all()),
                    browser=("browser",lambda x: x.values.all()),
                    browser_ver=("browser_ver",lambda x: x.values.all())).sort_values("lifespan")

        cluster_list=[]
        merged_count=0
        m=0
        for name, group in df_l1.groupby(["os","browser"]):
            #Group by device components(operating system and browser) and look for potential merging candidates
            
            collision_count=0
            cluster_key="{}_{}".format(name[0].strip().lower(),name[1].strip().lower().replace(" ","_"))
            clusters={} # initialize cluster
            merged_count+=m
            m=0
            merged_j=[] # track all the j_th elements that have been merged
            for i in range(0,len(group.index)):
                actor_i_cookie=group.iloc[i].actor_cookie
                actor_i_bwv=group.iloc[i].browser_ver
                actor_i_interval=group.iloc[i].lifespan
                actor_i_panels=df[df.actor_cookie==group.iloc[i].actor_cookie].panel_id.unique().tolist()
                skip_merge=False # already in a cluster, skip merging

                # check if already merged
                for k,v in clusters.items():
                    if actor_i_cookie in v.keys():
                        skip_merge=True
                        break

                if skip_merge:
                    continue

                clusters[m]={actor_i_cookie:{
                    #"cookie":actor_i_cookie,
                    "interval":actor_i_interval,
                    "panels":actor_i_panels,
                    "browser_ver":actor_i_bwv}}
                m+=1 # latest cluster id


                for j in range(i+1,len(group.index)):
                    # find the first non-overlaping cluster to merge cookie
                    cluster_found=False
                    actor_j_cookie=group.iloc[j].actor_cookie
                    actor_j_bwv=group.iloc[j].browser_ver
                    actor_j_interval=group.iloc[j].lifespan
                    actor_j_panels=df[df.actor_cookie==group.iloc[j].actor_cookie].panel_id.unique().tolist()
                    for k,v in clusters.items(): # iterate through clusters

                        # create interval to check
                        interval_list=[]
                        panel_list=[]
                        lifespan_list=[]
                        browser_ver_list=[]

                        # has cookie_j already been merged?
                        #if actor_j_cookie in v.keys():
                        #    cluster_found=True

                        for k2,v2 in v.items(): # iterate through cookies
                            for k3,v3 in v2.items(): # iterate through interval/panels
                                if k3=="interval":
                                    interval_list.append(v3)
                                    lifespan_list.append(v3.length.seconds)
                                elif k3=="panels":
                                    panel_list.append(v3)
                                elif k3=="browser_ver":
                                    browser_ver_list.append(v3)

                        panel_list=set(panel_list[0])

                        cookie_interval=pd.arrays.IntervalArray(interval_list)

                        # compare version number make sure j is >= i
                        ver_j_numeric=parse_ver(actor_j_bwv)

                        jlei=True # flag j>= for ALL of i by default we assume it is true
                        for actor_i_bwv in browser_ver_list:
                            ver_i_numeric=parse_ver(actor_i_bwv)
                            for i in range(0, min(len(ver_i_numeric),len(ver_j_numeric))):
                                if ver_j_numeric[i]<ver_i_numeric[i]:
                                    jlei=False

                        if not cookie_interval.overlaps(actor_j_interval).any() and len(set(actor_j_panels).intersection(panel_list))>0 and jlei:
                            if not (j in merged_j):# cluster_found:
                                clusters[k].update({actor_j_cookie:{
                                    "interval": actor_j_interval,
                                    "panels": actor_j_panels,
                                    "browser_ver":actor_j_bwv}})
                                merged_j.append(j)
                                #cluster_found=True
                            else:
                                collision.append({"compid":cluster_id,
                                                  "total_dev":len(op),
                                                  "os_browser":cluster_key,
                                                  "group_size":len(group),
                                                  "cookie_i":actor_i_cookie,
                                                  "cookie_j":actor_j_cookie})
                                collision_count+=1

            cluster_list.append({"before_merge":len(group.index),"after_merged":m,"os_browser":cluster_key,"clusters":clusters})
            
            # Start building the final merged_devices dictionary with the processed cookies (merged or not merged)
            for cluster_items in clusters.items():
                # These cookies are now one device 
                joined_cookies = ",".join(list(cluster_items[1].keys()))
                if joined_cookies not in processed_cookie_clusters:
                    processed_cookie_clusters.add(joined_cookies)
                    merged_devices[device_id] = joined_cookies
                    device_id += 1
                    for cookie in list(cluster_items[1].keys()):
                        merged_cookies.add(cookie)

        merged_cc[cluster_id]=cluster_list
        cluster_id+=1

    # For all the panels and cookies that had no potential merging candidates, assign them a new device id
    island_cookies = all_cookies.difference(merged_cookies)
    for island_cookie in sorted(island_cookies):
        merged_devices[device_id] = island_cookie
        device_id += 1

    return merged_devices

In [4]:
# Load panel dataset
df = load_dataframe()
# Apply the cookie merging algorithm and get the device id to cookies dictionary
final_merged_devices = merge_panel_cookies(df)

# Assign each cookie their respective device id based on the merged cookies dictionary
df_merged=df.copy()
df_merged["operator_device_id"]=-1
merged_cookies_num = 0
merged_devices_num = 0
for device_id, device_cookies in final_merged_devices.items():
    cookies=device_cookies.split(",")
    if len(cookies) > 1 :
        merged_cookies_num += len(cookies)
        merged_devices_num += 1
    df_merged.loc[df_merged.actor_cookie.isin(cookies),"operator_device_id"]=device_id

print("%s cookies merged to %s device IDs" % (merged_cookies_num,merged_devices_num))

#Write the final dataframe to JSON
df_merged.to_json('panel_dataset_cookie_merged_final.json.gz', compression="gzip", orient="records", lines=True)

560 cookies merged to 214 device IDs
