# XDM Event Creation

This section creates random XDM data according to a (fairly) arbitrary schema

There are a few hacks and shortcomings right now:
- Lists aren't created properly
- I ignore a few XDM fields where a regex pattern is specified

In [440]:
import random, string
import uuid
from datetime import datetime, timedelta
import mmh3
import rstr
from random import randrange

def random_date(start, end):
    """
    This function will return a random datetime between two datetime 
    objects.
    """
    delta = end - start
    int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
    random_second = randrange(int_delta)
    return start + timedelta(seconds=random_second)

def normalize_ecid(ecid_part):
    ecid_part_str = str(abs(ecid_part))
    if len(ecid_part_str) != 19:
        ecid_part_str = "".join([str(x) for x in range(
            0, 19 - len(ecid_part_str))]) + ecid_part_str
    return ecid_part_str


def get_ecid(email):
    """
    The ECID must be two valid 19 digit longs concatenated together
    """
    ecidpart1, ecidpart2 = mmh3.hash64(email)
    ecid1, ecid2 = (normalize_ecid(ecidpart1), normalize_ecid(ecidpart2))
    return ecid1 + ecid2


def create_string_value(props):
    if "format" in props:
        if props["format"] == "uri-reference":
            return str(uuid.uuid4())
        elif props["format"] == "uri":
            return f"https://www.{''.join(random.choices(string.ascii_letters + string.digits, k=10))}.com"
    elif "examples" in props:
        return random.choice([eg for eg in props["examples"]])
    elif "pattern" in props:
        return rstr.xeger(props["pattern"])
    elif "meta:enum" in props:
        return random.choice([enum for enum in props["meta:enum"].keys()])
    elif "enum" in props:
        return random.choice(props["enum"])
    else:
        nchars = random.randint(2,10)
        return ''.join(random.choices(string.ascii_letters + string.digits, k=nchars))

def create_identity_map(email=None):
    nchars = random.randint(2,10)
    if email is None:
        email = ''.join(random.choices(string.ascii_letters + string.digits, k=nchars)) + "@adobe.com"
    ecid = get_ecid(email)        
    return {
            "ECID": [
                {
                    "id": f"{ecid}",
                    "primary": True
                }
            ],
            "email": [
                {
                    "id": f"{email}",
                    "primary": True
                }
            ]
        }

def create_map_value(field_name, props):
    if field_name == "identityMap":
        return create_identity_map()
    



def value_creator(xdm_type, field_name, props, start, end):
    match xdm_type:
        case "string":
            return create_string_value(props)
        case "map":
            return create_map_value(field_name, props)
        case "date-time":
            return random_date(start, end).isoformat()
        case "number":
            return random.uniform(0,90)
        case "int":
            return random.randint(0,20)
        case "long":
            return random.randint(2,10000)
        case "boolean":
            return False
        case "array":
            return create_array_values(field_name, props, start, end)
        case "object":
            #this is technically wrong
            return {}

def create_array_values(field_name, props, start, end):

    return [value_creator(props["items"]["meta:xdmType"], field_name, props, start, end)]


def create_or_get_first_element_list_leaf_node(json_tree, node_name):
    list_nodes = [node_name.partition(node)[0] for node in node_name.split("/") if node.endswith("[]{}")]
    
    
def descend_dict_tree(base_dict, node_path, from_end=0):
    node = base_dict
    node_path_list = node_path.split("/")
    for node_name in node_path_list[:len(node_path_list)-from_end]:
        node = node[node_name]
        
    return node

        
def change_key(d, path_to_value, new_value):
    for key in path_to_value[:-1]:
        d = d[key]
    d[path_to_value[-1]] = new_value
    
    
def copy_dict_tree(base_dict, source_node_path, target_node_path):
    source_node = descend_dict_tree(base_dict, source_node_path)
    
    change_key(base_dict, target_node_path.split("/"), source_node)
    
    return base_dict
        
        
def create_xdm_event(schema, 
                     user_id=None,
                     event_type=None,
                     overrides=None,
                     duplicates=None,
                     timestamp = None,
                     start_date=datetime.now() - timedelta(days=100), 
                     end_date=datetime.now()):
    """
    schema - a Schema that was grabbed from platform UI's network tab
    user_id - optional string
    event_type - optional string overriding events
    timestamp - optional string overriding timestamps
    duplicates  - List[(String, String)] describing source and target path that should be viewed as duplicates
    overrides - List[(String, Any)] describing target field to be modified, and value that it should take. 
    """
    
    mixins_to_ignore = ["[]{}", "videoSegment", "stateProvince", "currencyCode", "optionContent"]
        
    base_event = {}

    #Create base fields
    for key, value in schema["data"]["schemaComposition"]["schemaClass"]["properties"].items():
        base_event[key] = value_creator(value["meta:xdmType"], key, value, start=start_date, end=end_date)
        

    
    #Create mixins
    for key, value in schema["data"]["schemaComposition"]["schemaMixins"][0]["properties"].items():
        if any(x in key for x in mixins_to_ignore):
            continue

        if "/" in key:
            node = base_event
            for node_name in key.split("/")[:-1]:
                if node_name not in node or node[node_name] is None:
                    node[node_name] = {}
                node = node[node_name]

            node[key.split("/")[-1]] = value_creator(value["meta:xdmType"], key, value,  start=start_date, end=end_date)

        else:

            base_event[key] = value_creator(value["meta:xdmType"], key, value,  start=start_date, end=end_date)

    #overrides of base fields
    if user_id:
        base_event["identityMap"] = create_identity_map(user_id)
    if event_type:
        base_event["eventType"] = event_type 
    if timestamp:
        base_event["timestamp"] = timestamp
    if overrides:
        for path, value in overrides:
            change_key(base_event, path.split("/"), value)
    if duplicates:
        for (source_node_path, target_node_path) in duplicates:
            base_event = copy_dict_tree(base_event, source_node_path, target_node_path)

        
        
    return base_event

## Load an arbitrary schema and create event
Here, we load an arbitrary schema from file and create the random XDM event

In [441]:
import json

with open("../resources/xdm_schema.json", "r") as f:
    analytics_ee_schema = json.load(f)
    
event = create_xdm_event(analytics_ee_schema)
event

{'_id': '22a2d3cd-3397-4071-a680-0ae965f533c7',
 'eventMergeId': '4CZ1KV59L',
 'eventType': 'opportunityEvent.removeFromOpportunity',
 'identityMap': {'ECID': [{'id': '06242027923987868946752329435998598170',
    'primary': True}],
  'email': [{'id': 'otkoIHz@adobe.com', 'primary': True}]},
 'producedBy': 'system',
 'timestamp': '2022-10-24T13:25:39.047907',
 '_experience': {'analytics': {'customDimensions': {'eVars': {'eVar1': 'u2PRu',
     'eVar10': 'g0w5vvbO',
     'eVar100': 'A0WlfE7',
     'eVar101': 'Eh',
     'eVar102': 'wMMuocvv',
     'eVar103': 'FLXHz8nm',
     'eVar104': '3VTGINK',
     'eVar105': 'UD4WV8',
     'eVar106': 'Kt8V4',
     'eVar107': 'SizTi1H',
     'eVar108': 'DFf2S',
     'eVar109': 'uzIx',
     'eVar11': '1K',
     'eVar110': 'yHfYYdCU',
     'eVar111': '1pbl',
     'eVar112': 'NQAIh8gA8',
     'eVar113': 'S8ni0hpO',
     'eVar114': '9HON',
     'eVar115': 'YYGLPF295B',
     'eVar116': 'uOyJmq',
     'eVar117': 'xvJ6J6',
     'eVar118': 'kfJ0',
     'eVar119

In [442]:
event = create_xdm_event(analytics_ee_schema, user_id = "abc@adobe.com", event_type = "commerce.purchases", 
                         timestamp = datetime.now().isoformat(),
                        duplicates = [
                            ("web", "_experience/analytics/endUser/firstWeb"), 
                        ("web", "_experience/analytics/session/web"),
                        ("environment/operatingSystem", "_experience/analytics/customDimensions/eVars/eVar1"),
                        ("environment/viewedScreen", "_experience/analytics/customDimensions/eVars/eVar2"),
                        ("marketing/campaignGroup", "_experience/analytics/customDimensions/eVars/eVar3"),
                        ("placeContext/geo/city", "_experience/analytics/customDimensions/eVars/eVar4"),
                        ("placeContext/geo/countryCode", "_experience/analytics/customDimensions/eVars/eVar5"),
                        ("web/webInteraction/linkClicks/value", "_experience/analytics/event1to100/event1/value"),
                        ("web/webPageDetails/pageViews/value", "_experience/analytics/event1to100/event2/value"),
                        ("commerce/purchases/value", "_experience/analytics/event1to100/event3/value"),
                        ("commerce/productViews/value", "_experience/analytics/event1to100/event4/value"),
                        ("timestamp", "receivedTimestamp"),
                        ("timestamp", "userActivityRegion/captureTimestamp")],
                        overrides = [
                            ("web/webInteraction/linkClicks/value", 1.0)
                        ])


### Count number of keys in experience event
One interesting aspect of XDM data is that it dense, and contains a ton of data. 
Let's check the number of fields we've populated

In [443]:
import pandas as pd
#convert to pandas, which has nice json normalize
flat_dict = pd.json_normalize(event).to_dict(orient="records")[0]
print("Number of keys = ", len(flat_dict.keys()))

Number of keys =  3026


In [444]:
with open("example_event_indent_new.json", "w") as f:
    json.dump(event, f, indent=2)
with open("example_event_new.json", "w") as f:
    json.dump(event, f)

# Create a detailed simulation

The next step is to create a more detailed simulation that will allow a reasonable propensity model to be built. 

Our goal in this task will be to create a propensity model for "subscription" events.

A subscription event will be defined as an event where a `web.formFilledOut` event is recorded, with a specific 
These will be events where a customer subscribes to the 

## EventTypes and their contribution to propensity

We will allow for several types of experience events to be received for each user.
For each user, we will create a "generative" model of subscription as follows:

1. We sample randomly from a poisson distribution for the number of advertising impressions, webPageViews, emailsSent. **These events can happen at random times over a 10 week interval.**
2. For each of these "base" exposure events, we then have a corresponding conversion:
    - If an advertising impression occurs, we then allow for an advertising click to happen, with a certain probability.  
    - If a web Page view occurs, then linkClicks, productViews, purchases, propositionDisplays, Interacts and Dismisses can all occur.
    - For an Email Sent, opens and clicks can then also occur. 
    
3. After all these base events have been generated, we then have a timeseries of events for each user. Each of the timeseries events affects the user's propensity to subscribe. After each event the user then has a certain probability of subscribing. The subscription is then evaluated with a Bernoulli draw - if the user subscribes, no further subscription evaluations are made. If the subscription does not happen, the subscription possibility will continue to be evaluated. 

4. Extra - if more than 10 advertising impressions, or 5 emails are sent, the user churns, and no more events for that user are generated. 



In [445]:
advertising_events = {
 
    #eventType          : (weeklyAverageOccurrence, propensityDelta, [(field_to_replace, value)], timeInHoursFromDependent)
    "advertising.clicks": (0.01,                    0.002,            [("advertising/clicks/value", 1.0)], 0.5) , 
    "advertising.impressions": (0.1, 0.001, [("advertising/impressions/value", 1.0)], 0),

    "web.webpagedetails.pageViews": (0.1, 0.005, [("web/webPageDetails/pageViews/value", 1.0)], 0.1),
    "web.webinteraction.linkClicks": (0.05, 0.005, [("web/webInteraction/linkClicks/value", 1.0)], 0.1),
   
    
    "commerce.productViews": (0.05, 0.005, [("commerce/productViews/value", 1.0)], 0.2),
    "commerce.purchases": (0.01, 0.1, [("commerce/purchases/value", 1.0)], 1),
    
    
    "decisioning.propositionDisplay": (0.05, 0.005, [("_experience/decisioning/propositionEventType/display", 1)], 0.1),
    "decisioning.propositionInteract": (0.01, 0.1, [("_experience/decisioning/propositionEventType/interact", 1)], 0.05),
    "decisioning.propositionDismiss": (0.01, -0.2, [("_experience/decisioning/propositionEventType/dismiss", 1)], 0.05),

    
    "directMarketing.emailOpened": (0.2, 0.02, [("directMarketing/opens/value", 1.0)], 24),
    "directMarketing.emailClicked": (0.05, 0.1, [("directMarketing/clicks/value", 1.0)], 0.5),
    "directMarketing.emailSent": (0.5, 0.005, [("directMarketing/sends/value", 1.0)], 0),
    
    "web.formFilledOut": (0.0, 0.0, [("web/webPageDetails/name", "subscriptionForm")], 0),

}

event_dependencies = {
    "advertising.impressions": ["advertising.clicks"],
    "directMarketing.emailSent": ["directMarketing.emailOpened"],
    "directMarketing.emailOpened": ["directMarketing.emailClicked"],
    "directMarketing.emailClicked": ["web.webpagedetails.pageViews"],
    "web.webpagedetails.pageViews": ["web.webinteraction.linkClicks", "commerce.productViews", "decisioning.propositionDisplay"],
    "commerce.productViews": ["commerce.purchases"],
    "decisioning.propositionDisplay": ["decisioning.propositionInteract", "decisioning.propositionDismiss"]
    
}


In [516]:
import numpy as np
from datetime import datetime
import math

N_USERS = 1000
FIRST_USER = 9000


N_WEEKS = 10
GLOBAL_START_DATE = datetime.now() - timedelta(weeks=12)
GLOBAL_END_DATE = GLOBAL_START_DATE + timedelta(weeks=N_WEEKS)

events = []
for user in range(N_USERS):
    user_id = FIRST_USER + user
    user_events = []
    base_events = {}
    for event_type in ["advertising.impressions", "web.webpagedetails.pageViews", "directMarketing.emailSent"]:
        n_events = np.random.poisson(advertising_events[event_type][0] * N_WEEKS)
        times = []
        for _ in range(n_events):
            times.append(random_date(GLOBAL_START_DATE, GLOBAL_END_DATE))
        
        base_events[event_type] = times
        
    for event_type, dependent_event_types in event_dependencies.items():
        
        if event_type in base_events:
            #for each originating event
            for event_time in base_events[event_type]:
                #Look for possible later on events
                for dependent_event in dependent_event_types:
                            n_events = np.random.poisson(advertising_events[dependent_event][0] * N_WEEKS)
                            times = []
                            for _ in range(n_events):
                                times.append(event_time + timedelta(hours = np.random.exponential(advertising_events[dependent_event][3])))
                            base_events[dependent_event] = times
        

    for event_type, times in base_events.items():
        for time in times:
            user_events.append({"userId": user_id, "eventType": event_type, "timestamp": time})
        
    user_events = sorted(user_events, key = lambda x: (x["userId"], x["timestamp"]))
    
   
    cumulative_probability = 0.001
    subscribed = False
    for event in user_events:
        cumulative_probability = min(1.0, max(cumulative_probability + advertising_events[event["eventType"]][1], 0))
        event["subscriptionPropensity"] = cumulative_probability
        if subscribed == False and "directMarketing" not in event["eventType"] and "advertising" not in event["eventType"]:
            subscribed = np.random.binomial(1, cumulative_probability) > 0
            if subscribed:
                subscriptiontime = event["timestamp"] + timedelta(seconds = 60)
                user_events.append({"userId": user_id, "eventType": "web.formFilledOut",  "timestamp": subscriptiontime})
        event["subscribed"] = subscribed
    user_events = sorted(user_events, key = lambda x: (x["userId"], x["timestamp"]))
           
    events = events + user_events
    
    

In [517]:
df = pd.DataFrame(events)


users_subscribing = df[df["subscribed"]]["userId"].unique()

print("Total Rows = ",len(df))

df[df["userId"] == users_subscribing[0]]

Total Rows =  10801


Unnamed: 0,userId,eventType,timestamp,subscriptionPropensity,subscribed
0,9000,directMarketing.emailSent,2022-08-03 16:32:37.231502,0.006,False
1,9000,directMarketing.emailSent,2022-08-05 12:09:59.231502,0.011,False
2,9000,directMarketing.emailSent,2022-08-19 17:17:59.231502,0.016,False
3,9000,directMarketing.emailSent,2022-08-26 01:45:06.231502,0.021,False
4,9000,directMarketing.emailSent,2022-09-03 15:57:23.231502,0.026,False
5,9000,advertising.impressions,2022-09-04 20:21:17.231502,0.027,False
6,9000,directMarketing.emailSent,2022-09-16 07:02:27.231502,0.032,False
7,9000,directMarketing.emailSent,2022-09-20 11:42:47.231502,0.037,False
8,9000,directMarketing.emailSent,2022-09-30 04:31:28.231502,0.042,False
9,9000,directMarketing.emailOpened,2022-09-30 23:13:10.873274,0.062,False


# Create XDM Data with simulation from above

We now create the underlying XDM data from the simualtion above. We loop through events, and generate xdm events, storing these in an array that is written to a file.

In [518]:
with open("../resources/xdm_schema.json", "r") as f:
    analytics_ee_schema = json.load(f)



xdm_events = []


for event in sorted(events, key = lambda x: x["timestamp"]):    
    xdm_event = create_xdm_event(analytics_ee_schema, 
                                 user_id = f"synthetic-user-{event['userId']}@adobe.com", 
                                 event_type = event["eventType"], 
                                 timestamp = event["timestamp"].isoformat(),
                                duplicates = [
                                    ("web", "_experience/analytics/endUser/firstWeb"), 
                                ("web", "_experience/analytics/session/web"),
                                ("environment/operatingSystem", "_experience/analytics/customDimensions/eVars/eVar1"),
                                ("environment/viewedScreen", "_experience/analytics/customDimensions/eVars/eVar2"),
                                ("marketing/campaignGroup", "_experience/analytics/customDimensions/eVars/eVar3"),
                                ("placeContext/geo/city", "_experience/analytics/customDimensions/eVars/eVar4"),
                                ("placeContext/geo/countryCode", "_experience/analytics/customDimensions/eVars/eVar5"),
                                ("web/webInteraction/linkClicks/value", "_experience/analytics/event1to100/event1/value"),
                                ("web/webPageDetails/pageViews/value", "_experience/analytics/event1to100/event2/value"),
                                ("commerce/purchases/value", "_experience/analytics/event1to100/event3/value"),
                                ("commerce/productViews/value", "_experience/analytics/event1to100/event4/value"),
                                ("timestamp", "receivedTimestamp"),
                                ("timestamp", "userActivityRegion/captureTimestamp")],
                                overrides = advertising_events[event["eventType"]][2])
    xdm_events.append(xdm_event)
    
    
    

In [506]:
from typing import List, Dict
def write_rows_to_file(rows: List[Dict], outputfile: str, prefix="test-analytics-data") -> str:

    if outputfile is None:
        outputfile = f"{prefix}-{datetime.now().isoformat()}.json"

    json_data = [json.dumps(row) for row in rows]

    with open(outputfile, 'w') as f:
        f.write("\n".join(json_data))

    return outputfile

In [519]:
write_rows_to_file(xdm_events, outputfile=None)

'test-analytics-data-2022-10-25T20:48:28.679200.json'