In [1]:
import pandas as pd

In [2]:
df_retail = pd.read_csv('../data/retailer.csv')
programatic = pd.read_csv('../data/programmatic_publisher.csv')
map = pd.read_csv('../data/mapping_transac_publisher_tv.csv')
tv = pd.read_csv('../data/tv_publisher.csv')


In [3]:
unique_customer_retail = set(df_retail['customer_id'].to_list())
map = map[map['customer_id'].isin(unique_customer_retail)]

In [4]:
pro_map = map[map['dsp_id']!='unknown'].merge(programatic, on='dsp_id', how='left')
pro_map = pro_map[pro_map['customer_id']!='unknown']

In [5]:
tv_map = map[map['device_id']!='unknown'].merge(tv, on='device_id', how='left')
tv_map = tv_map[tv_map['customer_id']!='unknown']

In [6]:
df_retail = df_retail[df_retail['customer_id']!='unknown']

In [7]:
pro_map['source_data'] = 'programatic'
tv_map['source_data'] = 'tv'
df_retail['source_data'] = 'retail'

In [8]:
pro_map['timestamp_utc'] = pd.to_datetime(pro_map['timestamp_utc'])
df_retail['timestamp_utc'] = pd.to_datetime(df_retail['timestamp_utc'])
tv_map['timestamp_utc'] = pd.to_datetime(tv_map['timestamp_utc'])

In [9]:
pro_map = pro_map[pro_map['timestamp_utc']>'2024-02-29 00:00:00']
df_retail = df_retail[df_retail['timestamp_utc']>'2024-02-29 00:00:00']
tv_map = tv_map[tv_map['timestamp_utc']>'2024-02-29 00:00:00']

In [10]:
pro_map.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1811293 entries, 3 to 4264085
Data columns (total 8 columns):
 #   Column           Dtype         
---  ------           -----         
 0   customer_id      object        
 1   dsp_id           object        
 2   device_id        object        
 3   timestamp_utc    datetime64[ns]
 4   campaign_name    object        
 5   device_type      object        
 6   cost_milli_cent  float64       
 7   source_data      object        
dtypes: datetime64[ns](1), float64(1), object(6)
memory usage: 124.4+ MB


In [11]:
pro_map = pd.get_dummies(pro_map, columns=['campaign_name'], dtype=int)
pro_map = pd.get_dummies(pro_map, columns=['device_type'], dtype=int)
pro_map.drop(columns=['device_id', 'dsp_id'], inplace=True)
pro_map.rename(columns={'cost_milli_cent':'cost_milli_cent_programmatic'}, inplace=True)

In [12]:
pro_map

Unnamed: 0,customer_id,timestamp_utc,cost_milli_cent_programmatic,source_data,campaign_name_Contextual,campaign_name_Retargeting,device_type_PC,device_type_Phone,device_type_TV,device_type_Unknown
3,reutQ3jiBX9Li4Ggqi,2024-06-16 20:55:27,601.930,programatic,1,0,0,0,1,0
4,reH7UgH29AreRh8wWy,2024-06-26 19:15:54,1191.750,programatic,1,0,0,0,1,0
5,reH7UgH29AreRh8wWy,2024-05-31 18:38:54,739.262,programatic,0,1,0,0,1,0
6,reH7UgH29AreRh8wWy,2024-06-05 22:50:03,153.500,programatic,0,1,1,0,0,0
7,reHAnmLXgIZqJT0i64,2024-05-09 21:06:22,5878.010,programatic,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
4264069,reekGgat3Kzr52qMUX,2024-03-13 07:54:30,425.913,programatic,0,1,0,0,1,0
4264070,reekGgat3Kzr52qMUX,2024-03-31 18:16:15,280.017,programatic,0,1,0,0,1,0
4264071,reekGgat3Kzr52qMUX,2024-04-20 00:55:23,192.684,programatic,0,1,0,0,1,0
4264077,re41cv7tOaVBHcqCPM,2024-05-27 13:11:54,2875.684,programatic,0,1,1,0,0,0


In [13]:
df_retail.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6146286 entries, 1 to 9866045
Data columns (total 8 columns):
 #   Column         Dtype         
---  ------         -----         
 0   customer_id    object        
 1   timestamp_utc  datetime64[ns]
 2   event_name     object        
 3   brand          object        
 4   product_name   object        
 5   sales          float64       
 6   quantity       float64       
 7   source_data    object        
dtypes: datetime64[ns](1), float64(2), object(5)
memory usage: 422.0+ MB


In [14]:
tv_map.info()

<class 'pandas.core.frame.DataFrame'>
Index: 341760 entries, 0 to 1570129
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   customer_id      341760 non-null  object        
 1   dsp_id           341760 non-null  object        
 2   device_id        341760 non-null  object        
 3   timestamp_utc    341760 non-null  datetime64[ns]
 4   cost_milli_cent  341760 non-null  float64       
 5   source_data      341760 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 18.3+ MB


In [15]:
tv_map.drop(columns=['device_id', 'dsp_id'], inplace=True)
tv_map.rename(columns={'cost_milli_cent':'cost_milli_cent_tv'}, inplace=True)

In [16]:
full_history = pd.concat([df_retail, pro_map, tv_map], ignore_index=True)

In [17]:
full_history.sort_values(by=['customer_id', 'timestamp_utc'], inplace=True)

In [18]:
len(full_history[full_history['event_name']=='Order']) + len(full_history[full_history['event_name']!='Order']) == len(full_history)

True

### Order

In [19]:
order_history = full_history[(full_history['event_name']!='Product Page View') & (full_history['event_name']!='Add to cart')]

In [20]:
order_history['cost_milli_cent_programmatic'].sum()/100000

np.float64(14074.67311117)

In [21]:
order_history

Unnamed: 0,customer_id,timestamp_utc,event_name,brand,product_name,sales,quantity,source_data,cost_milli_cent_programmatic,campaign_name_Contextual,campaign_name_Retargeting,device_type_PC,device_type_Phone,device_type_TV,device_type_Unknown,cost_milli_cent_tv
2257678,re000fIO9QXTWYjOfn,2024-03-13 02:58:00,Order,Science Diet,SD Ca Adt SavStw S&TB Bf&Vg 12x3.5oz cs,34.32,1.0,retail,,,,,,,,
2257679,re000fIO9QXTWYjOfn,2024-05-24 18:10:37,Order,Science Diet,SD Ca Adt SavStw S&TB Bf&Vg 12x3.5oz cs,34.32,1.0,retail,,,,,,,,
2257680,re000fIO9QXTWYjOfn,2024-06-04 18:27:57,Order,Science Diet,SD Ca Adt SavStw S&TB Bf&Vg 12x3.5oz cs,34.32,1.0,retail,,,,,,,,
2257681,re000fIO9QXTWYjOfn,2024-06-21 22:04:39,Order,Science Diet,SD Ca Adt SavStw S&TB Bf&Vg 12x3.5oz cs,34.32,1.0,retail,,,,,,,,
3352445,re000pHbVOysCXRHgt,2024-03-27 16:13:49,Order,Science Diet,SD Ca Adt Lt Ckn 30lb bg,78.99,1.0,retail,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2203902,rezzz8WfdxS4BBGCo8,2024-04-20 21:15:13,Order,Prescription Diet,PD Ca c/d Mul Ckn&VgStew 24x5.5oz cs,60.99,1.0,retail,,,,,,,,
8062821,rezzzZvkIaiWNQ1AmV,2024-06-26 23:50:31,,,,,,tv,,,,,,,,2325.51
8062822,rezzzZvkIaiWNQ1AmV,2024-06-30 04:39:58,,,,,,tv,,,,,,,,2325.51
4142213,rezzzipns16pTCb4OS,2024-04-18 01:35:45,Order,Science Diet,SD Ca Adt Lt SB Ckn 5lb bg,19.99,1.0,retail,,,,,,,,


In [None]:
order_history = order_history.sort_values(by=['customer_id', 'timestamp_utc'])

ad_columns = [
    'cost_milli_cent_programmatic', 'campaign_name_Contextual', 'campaign_name_Retargeting',
    'device_type_PC', 'device_type_Phone', 'device_type_TV', 'device_type_Unknown', 'cost_milli_cent_tv'
]

ad_type_column = 'source_data'
order_indicator = 'retail'

order_data = []
current_sums = {col: 0 for col in ad_columns}
count_tv = 0
count_programmatic = 0
ad_timestamps = []
tv_timestamps = []
programmatic_timestamps = []

for _, row in order_history.iterrows():
    if row['source_data'] == order_indicator:
        ad_frequency_count = len(ad_timestamps)

        if len(ad_timestamps) > 1:
            ad_timestamps = sorted(ad_timestamps)
            time_diffs = [(ad_timestamps[i] - ad_timestamps[i - 1]).total_seconds() / 86400 for i in range(1, len(ad_timestamps))]
            avg_time_between_ads = sum(time_diffs) / len(time_diffs)
        else:
            avg_time_between_ads = None

        if len(tv_timestamps) > 1:
            tv_timestamps = sorted(tv_timestamps)
            tv_time_diffs = [(tv_timestamps[i] - tv_timestamps[i - 1]).total_seconds() / 86400 for i in range(1, len(tv_timestamps))]
            avg_time_between_tv_ads = sum(tv_time_diffs) / len(tv_time_diffs)
        else:
            avg_time_between_tv_ads = None

        if len(programmatic_timestamps) > 1:
            programmatic_timestamps = sorted(programmatic_timestamps)
            programmatic_time_diffs = [(programmatic_timestamps[i] - programmatic_timestamps[i - 1]).total_seconds() / 86400 for i in range(1, len(programmatic_timestamps))]
            avg_time_between_programmatic_ads = sum(programmatic_time_diffs) / len(programmatic_time_diffs)
        else:
            avg_time_between_programmatic_ads = None

        order_data.append({
            'customer_id': row['customer_id'],
            'timestamp_utc': row['timestamp_utc'],
            'event_name' : row['event_name'],
            'brand' : row['brand'],
            'product_name' : row['product_name'],
            'sales' : row['sales'],
            'quantity': row['quantity'],
            **current_sums,
            'count_tv_ads': count_tv,
            'count_programmatic_ads': count_programmatic,
            'ad_frequency_count': ad_frequency_count,
            'avg_time_between_ads_days': avg_time_between_ads,
            'avg_time_between_tv_ads_days': avg_time_between_tv_ads,
            'avg_time_between_programmatic_ads_days': avg_time_between_programmatic_ads
        })

        current_sums = {col: 0 for col in ad_columns}
        count_tv = 0
        count_programmatic = 0
        ad_timestamps = []
        tv_timestamps = []
        programmatic_timestamps = []

    else:
        for col in ad_columns:
            if pd.notna(row[col]):
                current_sums[col] += row[col]

        if row[ad_type_column] == 'programatic':
            count_programmatic += 1
            programmatic_timestamps.append(row['timestamp_utc'])
        elif row[ad_type_column] == 'tv':
            count_tv += 1
            tv_timestamps.append(row['timestamp_utc'])

        ad_timestamps.append(row['timestamp_utc'])

df_orders = pd.DataFrame(order_data)


In [23]:
df_orders

Unnamed: 0,customer_id,timestamp_utc,event_name,brand,product_name,sales,quantity,cost_milli_cent_programmatic,campaign_name_Contextual,campaign_name_Retargeting,...,device_type_Phone,device_type_TV,device_type_Unknown,cost_milli_cent_tv,count_tv_ads,count_programmatic_ads,ad_frequency_count,avg_time_between_ads_days,avg_time_between_tv_ads_days,avg_time_between_programmatic_ads_days
0,re000fIO9QXTWYjOfn,2024-03-13 02:58:00,Order,Science Diet,SD Ca Adt SavStw S&TB Bf&Vg 12x3.5oz cs,34.32,1.0,0.000,0.0,0.0,...,0.0,0.0,0.0,0.00,0,0,0,,,
1,re000fIO9QXTWYjOfn,2024-05-24 18:10:37,Order,Science Diet,SD Ca Adt SavStw S&TB Bf&Vg 12x3.5oz cs,34.32,1.0,0.000,0.0,0.0,...,0.0,0.0,0.0,0.00,0,0,0,,,
2,re000fIO9QXTWYjOfn,2024-06-04 18:27:57,Order,Science Diet,SD Ca Adt SavStw S&TB Bf&Vg 12x3.5oz cs,34.32,1.0,0.000,0.0,0.0,...,0.0,0.0,0.0,0.00,0,0,0,,,
3,re000fIO9QXTWYjOfn,2024-06-21 22:04:39,Order,Science Diet,SD Ca Adt SavStw S&TB Bf&Vg 12x3.5oz cs,34.32,1.0,0.000,0.0,0.0,...,0.0,0.0,0.0,0.00,0,0,0,,,
4,re000pHbVOysCXRHgt,2024-03-27 16:13:49,Order,Science Diet,SD Ca Adt Lt Ckn 30lb bg,78.99,1.0,0.000,0.0,0.0,...,0.0,0.0,0.0,0.00,0,0,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
934783,rezzymDnsupIX7Q3TS,2024-05-22 14:36:42,Order,Science Diet,SD Ca A7+ SB Ckn 5lb bg,19.99,1.0,1901.966,3.0,1.0,...,2.0,1.0,0.0,2325.51,1,4,5,26.884560,,33.012681
934784,rezzz8WfdxS4BBGCo8,2024-03-23 18:24:01,Order,Prescription Diet,PD Ca c/d Mul Ckn&VgStew 24x5.5oz cs,60.99,1.0,146.610,1.0,0.0,...,1.0,0.0,0.0,0.00,0,1,1,,,
934785,rezzz8WfdxS4BBGCo8,2024-04-20 21:15:13,Order,Prescription Diet,PD Ca c/d Mul Ckn&VgStew 24x5.5oz cs,60.99,1.0,0.000,0.0,0.0,...,0.0,0.0,0.0,0.00,0,0,0,,,
934786,rezzzipns16pTCb4OS,2024-04-18 01:35:45,Order,Science Diet,SD Ca Adt Lt SB Ckn 5lb bg,19.99,1.0,0.000,0.0,0.0,...,0.0,0.0,0.0,4651.02,2,0,2,3.201007,3.201007,


In [24]:
df_orders['cost_milli_cent_programmatic'].sum()/100000

np.float64(14074.673111170003)

In [30]:
df_orders.to_csv('../data/orders_ads_history.csv')

## Product page view

In [25]:
product_page_history = full_history[(full_history['event_name']!='Order') & (full_history['event_name']!='Add to cart')]

In [26]:
product_page_history['cost_milli_cent_programmatic'].sum()/100000

np.float64(14074.673111169972)

In [None]:
product_page_history = product_page_history.sort_values(by=['customer_id', 'timestamp_utc'])

ad_columns = [
    'cost_milli_cent_programmatic', 'campaign_name_Contextual', 'campaign_name_Retargeting',
    'device_type_PC', 'device_type_Phone', 'device_type_TV', 'device_type_Unknown', 'cost_milli_cent_tv'
]

ad_type_column = 'source_data'
order_indicator = 'retail'

order_data = []
current_sums = {col: 0 for col in ad_columns}
count_tv = 0
count_programmatic = 0
ad_timestamps = []
tv_timestamps = []
programmatic_timestamps = []

for _, row in product_page_history.iterrows():
    if row['source_data'] == order_indicator:
        ad_frequency_count = len(ad_timestamps)

        if len(ad_timestamps) > 1:
            ad_timestamps = sorted(ad_timestamps)
            time_diffs = [(ad_timestamps[i] - ad_timestamps[i - 1]).total_seconds() / 86400 for i in range(1, len(ad_timestamps))]
            avg_time_between_ads = sum(time_diffs) / len(time_diffs)
        else:
            avg_time_between_ads = None

        if len(tv_timestamps) > 1:
            tv_timestamps = sorted(tv_timestamps)
            tv_time_diffs = [(tv_timestamps[i] - tv_timestamps[i - 1]).total_seconds() / 86400 for i in range(1, len(tv_timestamps))]
            avg_time_between_tv_ads = sum(tv_time_diffs) / len(tv_time_diffs)
        else:
            avg_time_between_tv_ads = None

        if len(programmatic_timestamps) > 1:
            programmatic_timestamps = sorted(programmatic_timestamps)
            programmatic_time_diffs = [(programmatic_timestamps[i] - programmatic_timestamps[i - 1]).total_seconds() / 86400 for i in range(1, len(programmatic_timestamps))]
            avg_time_between_programmatic_ads = sum(programmatic_time_diffs) / len(programmatic_time_diffs)
        else:
            avg_time_between_programmatic_ads = None

        order_data.append({
            'customer_id': row['customer_id'],
            'timestamp_utc': row['timestamp_utc'],
            'event_name' : row['event_name'],
            'brand' : row['brand'],
            'product_name' : row['product_name'],
            'sales' : row['sales'],
            'quantity': row['quantity'],
            **current_sums,
            'count_tv_ads': count_tv,
            'count_programmatic_ads': count_programmatic,
            'ad_frequency_count': ad_frequency_count,
            'avg_time_between_ads_days': avg_time_between_ads,
            'avg_time_between_tv_ads_days': avg_time_between_tv_ads,
            'avg_time_between_programmatic_ads_days': avg_time_between_programmatic_ads
        })

        current_sums = {col: 0 for col in ad_columns}
        count_tv = 0
        count_programmatic = 0
        ad_timestamps = []
        tv_timestamps = []
        programmatic_timestamps = []

    else:
        for col in ad_columns:
            if pd.notna(row[col]):
                current_sums[col] += row[col]

        if row[ad_type_column] == 'programatic':
            count_programmatic += 1
            programmatic_timestamps.append(row['timestamp_utc'])
        elif row[ad_type_column] == 'tv':
            count_tv += 1
            tv_timestamps.append(row['timestamp_utc'])

        ad_timestamps.append(row['timestamp_utc'])

df_product_page = pd.DataFrame(order_data)


In [31]:
df_product_page.to_csv('../data/product_page_ads_history.csv')

In [29]:
df_product_page

Unnamed: 0,customer_id,timestamp_utc,event_name,brand,product_name,sales,quantity,cost_milli_cent_programmatic,campaign_name_Contextual,campaign_name_Retargeting,...,device_type_Phone,device_type_TV,device_type_Unknown,cost_milli_cent_tv,count_tv_ads,count_programmatic_ads,ad_frequency_count,avg_time_between_ads_days,avg_time_between_tv_ads_days,avg_time_between_programmatic_ads_days
0,re000kbtVVzPwZcEr4,2024-03-06 23:57:04,Product Page View,Science Diet,SD Pup SmPws Ckn 4.5lb bg,,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00,0,0,0,,,
1,re000kbtVVzPwZcEr4,2024-04-15 15:04:05,Product Page View,Science Diet,SD Ca Adt SenSt&Sk Ckn 30lb bg,,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00,0,0,0,,,
2,re000kbtVVzPwZcEr4,2024-04-15 16:15:59,Product Page View,Science Diet,SD Ca Adt SenSt&Sk Ckn 30lb bg,,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00,0,0,0,,,
3,re000kbtVVzPwZcEr4,2024-04-15 16:33:34,Product Page View,Science Diet,SD Ca Adt SenSt&Sk Ckn 30lb bg,,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00,0,0,0,,,
4,re000kbtVVzPwZcEr4,2024-04-15 19:05:05,Product Page View,Science Diet,SD Ca Adt SenSt&Sk Ckn 30lb bg,,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00,0,0,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3908049,rezzz8a320jhOvmL3A,2024-04-26 23:11:28,Product Page View,Science Diet,SD Ca Adt LB LM&BR 33lb bg,,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00,0,0,0,,,
3908050,rezzz8a320jhOvmL3A,2024-04-26 23:40:20,Product Page View,Science Diet,SD Ca A6+ LB Ckn 15lb bg,,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00,0,0,0,,,
3908051,rezzzYRiwreLF23ot3,2024-06-04 23:56:40,Product Page View,Science Diet,SD Ca Adt PerWgt Ckn SB 4lb bg,,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00,0,0,0,,,
3908052,rezzzZvkIaiWNQ1AmV,2024-04-16 22:01:01,Product Page View,Prescription Diet,PD m/d Feline 8.5lb bg,,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00,0,0,0,,,


## Filtre

In [53]:
clean_agg = pd.read_csv('/Users/victor/code/escp/hackaton/data/clean_hills_data_without_unknown.csv')
clean_agg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1354583 entries, 0 to 1354582
Data columns (total 25 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   customer_id                1354583 non-null  object 
 1   Add to cart                1354583 non-null  float64
 2   Order                      1354583 non-null  float64
 3   Product Page View          1354583 non-null  float64
 4   product_name               1330504 non-null  object 
 5   first_web_visit_timestamp  1354583 non-null  object 
 6   sales                      1354583 non-null  float64
 7   brand                      1330504 non-null  object 
 8   quantity                   1354583 non-null  float64
 9   total_website_interaction  1354583 non-null  float64
 10  breed                      1354583 non-null  object 
 11  age                        1354583 non-null  object 
 12  income                     1354583 non-null  object 
 13  Contextual  

In [54]:
clean_agg['total_ads_programmatic_seen'] = clean_agg['PC'] + clean_agg['Phone'] + clean_agg['Robot'] + clean_agg['TV'] + clean_agg['Unknown']

In [55]:
clean_agg = clean_agg.rename(columns={'device_id':'TV_ads', 'total_ads_tv_count':'total_cost_TV_ads'})

In [56]:
clean_agg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1354583 entries, 0 to 1354582
Data columns (total 26 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   customer_id                  1354583 non-null  object 
 1   Add to cart                  1354583 non-null  float64
 2   Order                        1354583 non-null  float64
 3   Product Page View            1354583 non-null  float64
 4   product_name                 1330504 non-null  object 
 5   first_web_visit_timestamp    1354583 non-null  object 
 6   sales                        1354583 non-null  float64
 7   brand                        1330504 non-null  object 
 8   quantity                     1354583 non-null  float64
 9   total_website_interaction    1354583 non-null  float64
 10  breed                        1354583 non-null  object 
 11  age                          1354583 non-null  object 
 12  income                       1354583 non-n

In [57]:
clean_agg.sort_values(by='total_website_interaction', ascending=False)

Unnamed: 0,customer_id,Add to cart,Order,Product Page View,product_name,first_web_visit_timestamp,sales,brand,quantity,total_website_interaction,...,Phone,Robot,TV,Unknown,total_ads_cost,first_ads_timestamp,TV_ads,total_cost_TV_ads,first_ads_tv_timestamp,total_ads_programmatic_seen
1217141,reth7mdoDJCbkDalaK,2051.0,2013.0,2126.0,SD Ca Adt SmPws Ckn 4.5lb bg,2024-01-06 06:18:06,47571.74,Science Diet,2120.0,6190.0,...,0.0,0.0,24.0,0.0,10210.488,2024-01-15 06:22:19,1.0,0.00,,24.0
426942,reJZhNiD0cXQyewGn3,1400.0,1259.0,2999.0,SD Pup SB Ckn 4.5lb bg,2024-01-01 11:04:37,28827.65,Science Diet,1321.0,5658.0,...,0.0,0.0,2.0,0.0,8942.652,2024-02-11 15:15:23,1.0,0.00,,3.0
380313,reHQRAI2c9b35Tij3O,1.0,1.0,2228.0,PD Ca i/d Ckn&VgStew 24x5.5oz cs,2024-01-02 17:56:51,36.49,Science Diet,1.0,2230.0,...,1.0,0.0,3.0,0.0,5286.136,2024-01-06 10:06:03,1.0,0.00,,9.0
1001731,rejr2YF7XyHh2RavMT,3.0,0.0,2078.0,PD Ca i/d 27.5lb bg,2024-01-02 17:12:38,0.00,Prescription Diet,0.0,2081.0,...,0.0,0.0,21.0,0.0,44694.653,2024-01-02 23:03:19,1.0,0.00,,21.0
798960,reacR0yQWqUW9qnbE8,2.0,0.0,1350.0,SD Ca Adt SenSt&Sk Ckn 30lb bg,2024-01-02 20:23:19,0.00,Science Diet,0.0,1352.0,...,1.0,0.0,30.0,0.0,70834.873,2024-01-11 21:33:53,1.0,0.00,,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
519549,reNpJF8Zqlpq8k8yZF,0.0,1.0,0.0,SD Ca Adt SmPws Ckn 15.5lb bg,2024-05-27 19:39:47,55.99,Science Diet,1.0,1.0,...,,,,,,,1.0,2325.51,2024-05-25 19:43:22,
519554,reNpJu9G4NMt21YbVi,0.0,0.0,1.0,SD Ca Adt Lt SB Ckn 30lb bg,2024-04-03 04:13:38,0.00,Science Diet,0.0,1.0,...,0.0,0.0,1.0,0.0,237.971,2024-04-25 03:57:14,1.0,0.00,,1.0
1025281,rekw5tNPStMtGRAi50,0.0,0.0,1.0,,2024-02-02 16:29:29,0.00,,0.0,1.0,...,2.0,0.0,0.0,0.0,7995.317,2024-01-01 16:07:21,1.0,0.00,,12.0
1025280,rekw5l2xhat3WGcbtd,0.0,0.0,1.0,SD Fel Adt Oral Ckn 7lb bg,2024-01-28 22:07:23,0.00,Science Diet,0.0,1.0,...,0.0,0.0,0.0,0.0,2053.527,2024-01-28 22:45:33,1.0,0.00,,1.0


In [92]:
client_to_remove2 = clean_agg[
    (clean_agg['Order'] == 0) &
    (clean_agg['total_website_interaction']< clean_agg['total_website_interaction'].median()) &
    (clean_agg['total_ads_cost'] >= (clean_agg['total_ads_cost'].median())) &
    (clean_agg['total_cost_TV_ads'] >= (clean_agg['total_cost_TV_ads'].median()))
    ]

(client_to_remove2['total_cost_TV_ads'].sum() + client_to_remove2['total_ads_cost'].sum())/100000

9463.785050110002

In [93]:
len(client_to_remove2['customer_id'].to_list())

139616

In [94]:
client_rem_list = client_to_remove2['customer_id'].to_list()

In [95]:
client_to_promote = clean_agg[
    (clean_agg['Order'] == 0) &
    (clean_agg['total_website_interaction']>=clean_agg['total_website_interaction'].median()) &
    (clean_agg['total_ads_cost'] <= (clean_agg['total_ads_cost'].mean())) &
    (clean_agg['total_cost_TV_ads'] <= (clean_agg['total_cost_TV_ads'].mean()))
    ]

(client_to_promote['total_cost_TV_ads'].sum() + client_to_promote['total_ads_cost'].sum())/100000

1512.08360843

In [96]:
len(set(client_to_promote['customer_id'].to_list()))

94223

In [97]:
client_keep_list = client_to_promote['customer_id'].to_list()

In [98]:
df_client_to_target_less = clean_agg[clean_agg['customer_id'].isin(client_rem_list)]
df_client_to_target_more = clean_agg[clean_agg['customer_id'].isin(client_keep_list)]

In [100]:
len(df_client_to_target_more)

94223

In [103]:
df_client_to_target_less.to_csv('../data/customer_less_targetting.csv')

In [104]:
df_client_to_target_more.to_csv('../data/customer_more_targetting.csv')

In [74]:
df_client_to_target_more.head()

Unnamed: 0,customer_id,Add to cart,Order,Product Page View,product_name,first_web_visit_timestamp,sales,brand,quantity,total_website_interaction,...,Phone,Robot,TV,Unknown,total_ads_cost,first_ads_timestamp,TV_ads,total_cost_TV_ads,first_ads_tv_timestamp,total_ads_programmatic_seen
230,re00dd24Rh3U8BqHBm,0.0,0.0,10.0,SD Pup LB LM&BR 30lb bg,2024-05-08 21:18:58,0.0,Science Diet,0.0,10.0,...,0.0,0.0,1.0,0.0,3008.45,2024-04-21 18:37:34,1.0,0.0,,1.0
398,re016LDM75PuJUatzn,2.0,0.0,11.0,SD Ca A7+ SrVit Ckn&VgStew 12x12.5oz cs,2024-04-08 19:12:32,0.0,Science Diet,0.0,13.0,...,0.0,0.0,0.0,0.0,2578.082,2024-05-14 02:45:56,1.0,0.0,,1.0
601,re01gHUjmNQec0lyfU,0.0,0.0,17.0,SD Ca Adt SenSt&Sk Sm&Min Ckn 15lb bg,2024-03-02 16:00:16,0.0,Science Diet,0.0,17.0,...,2.0,0.0,0.0,0.0,1823.67,2024-01-14 16:25:39,1.0,0.0,,6.0
635,re01mZmnPbvIZW6asd,3.0,0.0,15.0,PD Canine k/d 8.5lb bg,2024-01-17 09:25:04,0.0,Prescription Diet,0.0,18.0,...,1.0,0.0,0.0,0.0,415.641,2024-01-31 22:22:51,1.0,0.0,,1.0
678,re01vhkYhckSVwRmir,2.0,0.0,19.0,PD Canine k/d 8.5lb bg,2024-01-02 21:04:34,0.0,Prescription Diet,0.0,21.0,...,0.0,0.0,0.0,0.0,976.937,2024-01-02 21:05:48,1.0,0.0,,1.0


In [77]:
df_client_to_target_more['total_ads_cost']/100000

230        0.030084
398        0.025781
601        0.018237
635        0.004156
678        0.009769
             ...   
1354076    0.021305
1354088    0.029467
1354296    0.027001
1354322    0.013377
1354354    0.030960
Name: total_ads_cost, Length: 14427, dtype: float64

In [101]:
len(df_client_to_target_more)

94223

In [102]:
len(df_client_to_target_less)

139616

### Stats conversion

In [105]:
len(df_retail)

6146286

In [106]:
df_retail = pd.read_csv('../data/retailer.csv')
programatic = pd.read_csv('../data/programmatic_publisher.csv')
map = pd.read_csv('../data/mapping_transac_publisher_tv.csv')
tv = pd.read_csv('../data/tv_publisher.csv')

In [107]:
cust_list_retail = set(df_retail['customer_id'].to_list())

In [108]:
map_cust = map[map['customer_id'].isin(cust_list_retail)]

In [110]:
map_prog = map_cust[(map_cust['customer_id']!='unknown') & (map_cust['dsp_id']!='unknown')].merge(programatic, how='left', on='dsp_id')

In [111]:
len(map_prog)/len(programatic)

0.2388458111240404

In [113]:
map_tv = map_cust[(map_cust['customer_id']!='unknown') & (map_cust['device_id']!='unknown')].merge(tv, how='left', on='device_id')

In [114]:
len(map_tv)/len(tv)

0.2647355054363784