In [1]:
%run ../base_setup.ipynb

Foundation: 2.09.024
Transition: 2.09.022
Engines   : 2.03.015
Behavioral: 2.04.016


-------------------------
## Build the raw synthetic file

In [2]:
builder = SyntheticBuilder.from_env('members')

In [3]:
tools = builder.intent_model

In [4]:
builder.pm.reset_intents()
builder.pm_persist()

In [5]:
sample_size = 1000

In [6]:
df = pd.DataFrame()

In [7]:
# add a reference id
df['member_id'] = tools.get_number(range_value=10000000, to_value=100000000, precision=0, at_most=1, size=sample_size, column_name='member_id')
builder.add_column_description(column_name='member_id', description='A unique reference number limited to one million-1')

In [8]:
df['gender'] = tools.get_category(selection=['M', 'F'], weight_pattern=[5,3], size=sample_size, column_name='gender')
builder.add_column_description(column_name='gender', description="A gender catagory of Males and Females with bias of 5 to 3 of male to frmale")

In [9]:
state = ["California", "New York", "Louisiana", "New Jersey", "Virginia", "Colorado", "Nevada", 
         "Georgia", "Indiana", "Ohio", "Kentucky", "Maine", "Missouri", "Wisconsin"]
df['state'] = tools.get_category(selection=state, weight_pattern=[20, 30, 10, 5, 2, 5, 5, 1, 2, 2, 1, 1, 5, 1], size=sample_size, column_name='state')
builder.add_column_description(column_name='state', description="Only covering 14 states with a majority prediminace in California, New York and Louisiana")

In [10]:
# has phone number
df['has_phone_number'] = tools.get_category(selection=[1, 0], weight_pattern=[3,1], size=sample_size, column_name='has_phone_number')
builder.add_column_description(column_name='has_phone_number', description="If the person has a phone number on record with a weight bias of 3 to 1 they have")
builder.add_column_description(column_name='has_phone_number', description="There are no nulls by request")


In [11]:
# ensure only thoise with Phone numbers elect to not be called
selection = [builder.tools.select2dict(column='has_phone_number', condition='==1')]

action = builder.tools.action2dict(method='get_category', selection=[1, 0], weight_pattern=[1,9])
default = builder.tools.action2dict(method='@constant', value=0)

df['do_not_call'] = builder.tools.correlate_selection(df, selection=selection, action=action, default_action=default, column_name='do_not_call')
builder.add_column_description(column_name='do_not_call', description="if thas a phone number, if they electyed to be called")

In [12]:
# communication channel preference
channels = ["SydneyCare", "RCP", "Phone", "Email", "SMS"]
df['channel_pref'] = tools.get_category(selection=channels, weight_pattern=[2,1,1,2,2], size=sample_size, column_name='channel_pref', intent_order=0)
builder.add_column_description(column_name='channel_pref', description="The channel through which the member contacted")

In [13]:
# ensure if the preference is do not call the channel preference is not Phone
selection = [builder.tools.select2dict(column='do_not_call', condition='==1'),
             builder.tools.select2dict(column='channel_pref', condition="'Phone'", operator='==', logic='AND')]

action = builder.tools.action2dict(method='get_category', selection=[1, 0], weight_pattern=[1,9])
default = builder.tools.action2dict(method='@constant', value=0)

df['channel_pref'] = builder.tools.correlate_selection(df, selection=selection, action=action, default_action=default, column_name='channel_pref', intent_order=1)
builder.add_column_description(column_name='channel_pref', description="ensure that if do_not_call is true the preference isn't by phone")

In [14]:
# ensure if the member has no phone number the channel preference is not SMS or Phone
selection = [builder.tools.select2dict(column='has_phone_number', condition='==0')]

action = builder.tools.action2dict(method='get_category', selection=["SydneyCare", "RCP", "Email"], weight_pattern=[2,1,2])
default = builder.tools.action2dict(method='@header', header='channel_pref')

df['channel_pref'] = builder.tools.correlate_selection(df, selection=selection, action=action, default_action=default, column_name='channel_pref', intent_order=2)
builder.add_column_description(column_name='channel_pref', description="ensure if the member has no phone number the channel preference is not SMS or Phone")

In [15]:
# segment
segment = ["Local Group", "National Accounts", "Medicaid", "BlueCard", "Medicare", "FEP", "Individual"]
df['segment'] = tools.get_category(selection=segment, weight_pattern=[38, 19, 18, 15, 5, 4, 2], size=sample_size, column_name='segment')
builder.add_column_description(column_name='segment', description="The heathcare segment of the member with bias weighting")

In [16]:
# age groups
df['age'] =  tools.get_number(range_value=18, to_value=90, weight_pattern=[1,2,6,4,3,1,0.1], size=sample_size, column_name='age', intent_order=0)
builder.add_column_description(column_name='age', description="age has a distribution that peaks around the mid 40's with a negative skew and older outliers")

In [17]:
# Adjust age for medicare being over 65
selection = [tools.select2dict(column='segment', condition="=='Medicare'")]


action = tools.action2dict(method='get_number', range_value=65, to_value=90, weight_pattern=[4,1,0.1])
default = tools.action2dict(method='@header', header='age')

df['age'] = tools.correlate_selection(df, selection=selection, action=action, default_action=default, column_name='age', intent_order=1)
builder.add_column_description(column_name='segment', description="Age is a conditional influencer with Medicare being exclusivly over 65")

In [18]:
# flu_shot
df['flu_shot'] = tools.get_category(selection=[1, 0], weight_pattern=[4.5, 5.5], size=sample_size, column_name='flu_shot', intent_order=0)
builder.add_column_description(column_name='flu_shot', description="If the idividual has had a flu shot in the last year")
builder.add_column_description(column_name='flu_shot', description="Distribution based upon the figures taken from The Advisory Committee on Immunization Practices (ACIP)")

In [19]:
# adjust flu shot for under 49
selection = [builder.tools.select2dict(column='age', condition='<49')]

action = builder.tools.action2dict(method='get_category', selection=[1, 0], weight_pattern=[3,7])
default = builder.tools.action2dict(method='@header', header='flu_shot')

df['flu_shot'] = builder.tools.correlate_selection(df, selection=selection, action=action, default_action=default, column_name='flu_shot', intent_order=1)

In [20]:
# adjust flu shot for over 75
selection = [builder.tools.select2dict(column='age', condition='>65')]

action = builder.tools.action2dict(method='get_category', selection=[1, 0], weight_pattern=[6.5, 3.5])
default = builder.tools.action2dict(method='@header', header='flu_shot')

df['flu_shot'] = builder.tools.correlate_selection(df, selection=selection, action=action, default_action=default, column_name='flu_shot', intent_order=2)

In [21]:
# email
df['email'] = tools.get_email(size=sample_size, column_name='email')
builder.add_column_description(column_name='email', description="A realistc scaled email address that is guaranteed unique")

In [22]:
# Latent_has_children
df['latent_has_children'] = tools.get_category(selection=[1, 0], weight_pattern=[4, 5], size=sample_size, column_name='latent_has_children')
builder.add_column_description(column_name='latent_has_children', description="a latent ditribution to indicate a member has children. Based on number of population with children")


In [23]:
# latent_has_travelled
df['latent_has_travelled'] = tools.get_category(selection=[1, 0], weight_pattern=[1,20], size=sample_size, column_name='latent_has_travelled')
builder.add_column_description(column_name='latent_has_travelled', description="a latent ditribution to indicate a member has recently travelled.")


In [24]:
builder.run_synthetic_pipeline(size=1000)

In [25]:
builder.report_column_catalog()

Unnamed: 0,column_name,description
0,age,age has a distribution that peaks around the mid 40's with a negative skew and older outliers
1,channel_pref,"The channel through which the member contacted, ensure that if do_not_call is true the preference isn't by phone, ensure if the member has no phone number the channel preference is not SMS or Phone"
2,do_not_call,"if thas a phone number, if they electyed to be called"
3,email,A realistc scaled email address that is guaranteed unique
4,flu_shot,"If the idividual has had a flu shot in the last year, Distribution based upon the figures taken from The Advisory Committee on Immunization Practices (ACIP)"
5,gender,A gender catagory of Males and Females with bias of 5 to 3 of male to frmale
6,has_phone_number,"If the person has a phone number on record with a weight bias of 3 to 1 they have, There are no nulls by request"
7,latent_has_children,a latent ditribution to indicate a member has children. Based on number of population with children
8,latent_has_travelled,a latent ditribution to indicate a member has recently travelled.
9,member_id,A unique reference number limited to one million-1


In [26]:
builder.report_intent()

Unnamed: 0_level_0,order,intent,parameters,creator
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
age,0,get_number,"[range_value=18, to_value=90, weight_pattern=[1, 2, 6, 4, 3, 1, 0.1], column_name='age']",doatridge
,1,correlate_selection,"[selection=[{'column': 'segment', 'condition': ""=='Medicare'""}], action={'method': 'get_number', 'range_value': 65, 'to_value': 90, 'weight_pattern': [4, 1, 0.1]}, default_action={'method': '@head...",doatridge
channel_pref,0,get_category,"[selection=['SydneyCare', 'RCP', 'Phone', 'Email', 'SMS'], weight_pattern=[2, 1, 1, 2, 2], column_name='channel_pref']",doatridge
,1,correlate_selection,"[selection=[{'column': 'do_not_call', 'condition': '==1'}, {'column': 'channel_pref', 'condition': ""'Phone'"", 'operator': '==', 'logic': 'AND'}], action={'method': 'get_category', 'selection': [1,...",doatridge
,2,correlate_selection,"[selection=[{'column': 'has_phone_number', 'condition': '==0'}], action={'method': 'get_category', 'selection': ['SydneyCare', 'RCP', 'Email'], 'weight_pattern': [2, 1, 2]}, default_action={'metho...",doatridge
do_not_call,0,correlate_selection,"[selection=[{'column': 'has_phone_number', 'condition': '==1'}], action={'method': 'get_category', 'selection': [1, 0], 'weight_pattern': [1, 9]}, default_action={'method': '@constant', 'value': 0...",doatridge
email,0,get_email,[column_name='email'],doatridge
flu_shot,0,get_category,"[selection=[1, 0], weight_pattern=[4.5, 5.5], column_name='flu_shot']",doatridge
,1,correlate_selection,"[selection=[{'column': 'age', 'condition': '<49'}], action={'method': 'get_category', 'selection': [1, 0], 'weight_pattern': [3, 7]}, default_action={'method': '@header', 'header': 'flu_shot'}, co...",doatridge
,2,correlate_selection,"[selection=[{'column': 'age', 'condition': '>65'}], action={'method': 'get_category', 'selection': [1, 0], 'weight_pattern': [6.5, 3.5]}, default_action={'method': '@header', 'header': 'flu_shot'}...",doatridge


-------------------------
## Transition

In [27]:
tr = Transition.from_env('members')

In [28]:
tr.pm.reset_intents()
tr.pm_persist()

In [29]:
df = tr.load_source_canonical()

In [30]:
df = tr.intent_model.auto_clean_header(df)
df = tr.intent_model.auto_transition(df)
df = tr.intent_model.to_remove(df, regex=['latent_'])

In [31]:
tr.canonical_report(df)

Unnamed: 0,Attributes (10),dType,%_Null,%_Dom,Count,Unique,Observations
0,age,int64,0.0%,4.2%,1000,65,max=85 | min=18 | mean=49.34 | dominant=39
1,channel_pref,category,0.0%,77.9%,1000,4,Sample: 0 | SydneyCare | Email | RCP
2,do_not_call,bool,0.0%,92.4%,1000,2,True | False
3,email,object,0.0%,0.1%,1000,1000,Sample: mvasos@msn.com | ocupps@gmx.net | fmounts@facebook.com | irongo@facebook.com | jappelbaum@go...
4,flu_shot,bool,0.0%,55.8%,1000,2,True | False
5,gender,category,0.0%,59.1%,1000,2,Sample: M | F
6,has_phone_number,bool,0.0%,77.9%,1000,2,True | False
7,member_id,int64,0.0%,0.1%,1000,1000,"max=99977261 | min=10034108 | mean=56845317.84 | dominant=[10034108, 10170540]"
8,segment,category,0.0%,35.2%,1000,7,Sample: Local Group | National Accounts | Medicaid | BlueCard | Medicare
9,state,category,0.0%,34.3%,1000,14,Sample: New York | California | Louisiana | New Jersey | Nevada


In [32]:
tr.run_transition_pipeline()