In [223]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline

### Load data

In [238]:
data_path = '.'
costs_df = pd.read_csv(os.path.join(data_path, 'Costs.csv'), parse_dates=['date_created'], dayfirst=False)
relations_df = pd.read_csv(os.path.join(data_path, 'Relations.csv'), usecols=['name', 'chanel', 'id_partner'])
users_df = pd.read_csv(os.path.join(data_path, 'Users.csv'), index_col=0, 
                       usecols=['id', 'Reg_date', 'name', 'id_partner'], parse_dates=['Reg_date'], dayfirst=False)
visits_df = pd.read_csv(os.path.join(data_path, 'Visits.csv'), parse_dates=['Visit_date'], dayfirst=False)
orders_df = pd.read_csv(os.path.join(data_path, 'Orders.csv'), parse_dates=['Order Date'], dayfirst=False)

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)


### Explore data

In [246]:
def explore_df(df):
    print(df.head())
    print(df.dtypes)
    null_cols = df.columns[df.isnull().any()].values
    if(len(null_cols)):
        print('Columns with nulls: {}'.format(null_cols))

#### Users

In [252]:
explore_df(users_df)

          Reg_date id_partner                         name  chanel
id                                                                
6745955 2017-10-04     rt_DBM        ${INSERTION_ORDER_ID}  RT DBM
5159878 2017-07-01        DBM  %24%7BINSERTION_ORDER_ID%7D     DBM
5159881 2017-07-01        DBM  %24%7BINSERTION_ORDER_ID%7D     DBM
5159940 2017-07-01        DBM  %24%7BINSERTION_ORDER_ID%7D     DBM
5295496 2017-07-08        DBM  %24%7BINSERTION_ORDER_ID%7D     DBM
Reg_date      datetime64[ns]
id_partner            object
name                  object
chanel                object
dtype: object


#### Orders

In [251]:
explore_df(orders_df)

   id_user          Order Date   Amount  Order ID
0  4618612 2017-08-17 02:08:40   2.6312  23732390
1  4618612 2017-08-17 02:29:26  17.5912  23732304
2  4618645 2017-06-01 04:39:41   8.7912  12441805
3  4618815 2017-06-01 02:11:01   2.6312  12439685
4  4618982 2017-10-04 03:43:24   8.7912  25568622
id_user                int64
Order Date    datetime64[ns]
Amount               float64
Order ID               int64
dtype: object


#### Relations

In [250]:
explore_df(relations_df)

  chanel id_partner name
0      7          7   19
1      7          7   81
2      7          7  108
3      7          7  112
4      7          7  131
chanel        object
id_partner    object
name          object
dtype: object


#### Visits

In [249]:
explore_df(visits_df)

   id_user          Visit_date
0  6362904 2017-09-15 10:55:42
1  6362904 2017-09-18 07:35:11
2  7145363 2017-11-05 19:37:53
3  7270453 2017-11-15 17:06:20
4  7270453 2017-11-15 17:12:46
id_user                int64
Visit_date    datetime64[ns]
dtype: object


#### Costs

In [248]:
explore_df(costs_df)

       Costs id_partner campaign date_created
0  15.701622        NaN  4147836   2018-01-05
1  17.163542        NaN  4157838   2018-01-05
2  17.408304        NaN  4164809   2018-01-05
3   0.000000          0  2008634   2017-09-01
4   0.000000          0  2008634   2017-09-02
Costs                  float64
id_partner              object
campaign                object
date_created    datetime64[ns]
dtype: object
Columns with nulls: ['id_partner' 'campaign']


## Задание №1:	
## С помощью инструментов R или Python посчитать следующие метрики:

### ROI для 0-го, 6-го 14-го и 30-го дней жизни пользователя на сайте в разрезе маркетинговых каналов (chanel из relations).

#### Calculate marketing costs per day

In [239]:
users_df = users_df.reset_index().merge(relations_df, on=['name', 'id_partner']).set_index('id')
users_df.head()

Unnamed: 0_level_0,Reg_date,id_partner,name,chanel
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6745955,2017-10-04,rt_DBM,${INSERTION_ORDER_ID},RT DBM
5159878,2017-07-01,DBM,%24%7BINSERTION_ORDER_ID%7D,DBM
5159881,2017-07-01,DBM,%24%7BINSERTION_ORDER_ID%7D,DBM
5159940,2017-07-01,DBM,%24%7BINSERTION_ORDER_ID%7D,DBM
5295496,2017-07-08,DBM,%24%7BINSERTION_ORDER_ID%7D,DBM


In [228]:
costs_per_date = costs_df[['date_created','Costs']].groupby('date_created').agg(np.sum)['Costs']

In [229]:
num_users_per_date = users_df.groupby('Reg_date')['id_partner'].count()
cost_per_user_per_date = costs_per_date[num_users_per_date.index] / num_users_per_date
users_df['cost_per_reg'] = cost_per_user_per_date[users_df['Reg_date'].values].values

In [230]:
orders_user_reg_df = orders_df.merge(users_df, how='left', left_on='id_user', right_index=True)
orders_user_reg_df.head()

Unnamed: 0,id_user,Order Date,Amount,Order ID,Reg_date,id_partner,name,chanel,cost_per_reg
0,4618612,2017-08-17 02:08:40,2.6312,23732390,2017-06-01,DBM_mob,3260707,DBM_mob,6.035521
1,4618612,2017-08-17 02:29:26,17.5912,23732304,2017-06-01,DBM_mob,3260707,DBM_mob,6.035521
2,4618645,2017-06-01 04:39:41,8.7912,12441805,2017-06-01,vh_s,Adw_VH_Tier2_S_Key_Brides_Old,VH Search Desktop tier2,6.035521
3,4618815,2017-06-01 02:11:01,2.6312,12439685,2017-06-01,1020,l6507,LosPollos,6.035521
4,4618982,2017-10-04 03:43:24,8.7912,25568622,2017-06-01,vh_uu,Adw_VH_GSP_Similar,VH Desktop GSP USA,6.035521


In [231]:
def roi_on_day(day, users_df, orders_user_reg_df):
    users_ids = users_df[users_df['Reg_date']<np.max(users_df['Reg_date']) + np.timedelta64(-day - 1, 'D')].index
    orders_for_roi_mask = orders_user_reg_df['Order Date'] < orders_user_reg_df['Reg_date'] + np.timedelta64(day + 1, 'D')
    orders_for_roi_mask = orders_for_roi_mask & orders_user_reg_df['id_user'].isin(users_ids)
    cost_order_amount_per_user_df = orders_user_reg_df[orders_for_roi_mask].groupby('id_user').agg(
        {'Amount': 'sum'})

    order_amount_day_label = 'order_amount_day_' + str(day)
    users_df[order_amount_day_label] = cost_order_amount_per_user_df['Amount']
    users_df[order_amount_day_label].fillna(0, inplace=True)
    
    cost_amout_per_channel_df = users_df.groupby('chanel').agg({'cost_per_reg': 'sum', order_amount_day_label: 'sum'})
    cost_amout_per_channel_df['roi'] = 100 * (cost_amout_per_channel_df[order_amount_day_label] 
        - cost_amout_per_channel_df['cost_per_reg'])/cost_amout_per_channel_df['cost_per_reg']
    return cost_amout_per_channel_df

In [232]:
roi = roi_on_day(0, users_df, orders_user_reg_df)
roi['roi_14'] = roi_on_day(14, users_df, orders_user_reg_df)['roi']
# cost_amout_per_channel_df['roi'].hist()
roi['roi_30'] = cost_amout_per_channel_df = roi_on_day(30, users_df, orders_user_reg_df)['roi']
roi.describe()
# np.max(cost_amout_per_channel_df['roi'])
# print('ROI 14 day: {}'.format(order_amount_days_after_reg(14))   
# print('ROI 30 day: {}'.format(order_amount_days_after_reg(30))            

Unnamed: 0,cost_per_reg,order_amount_day_0,roi,roi_14,roi_30
count,364.0,364.0,364.0,364.0,364.0
mean,42169.657982,474.849402,-98.610016,-86.495664,-77.234316
std,102948.995889,1135.506884,3.287037,25.54893,42.840748
min,4.829477,0.0,-100.0,-100.0,-100.0
25%,830.019881,0.0,-100.0,-100.0,-100.0
50%,7650.21511,41.3336,-99.664024,-96.356723,-93.926845
75%,34746.05386,388.5332,-98.743669,-85.183704,-74.337571
max,828510.346466,10787.744,-69.269694,110.141282,228.303274


In [233]:
print(roi.nlargest(10, 'roi')['roi'])
print(roi.nlargest(10, 'roi')['roi_14'])
print(roi.nlargest(10, 'roi')['roi_30'])

chanel
Zero                             -69.269694
VH Bing Desktop top3 Ethnic      -73.794870
RT Search Desktop FR Ethnic      -77.265383
VH Search Desktop top2 Ethnic    -83.999327
VH Search Desktop top3 Ethnic    -84.194464
VH Bing Desktop top2 Ethnic      -84.680172
VH Search Desktop tier1 Ethnic   -88.764662
VH Search Desktop top6 Ethnic    -89.750513
PPS-Offer                        -89.873238
RT Search Desktop Brand ads      -90.177787
Name: roi, dtype: float64
chanel
Zero                              110.141282
VH Bing Desktop top3 Ethnic      -100.000000
RT Search Desktop FR Ethnic        17.376328
VH Search Desktop top2 Ethnic      60.965295
VH Search Desktop top3 Ethnic      19.685374
VH Bing Desktop top2 Ethnic       -78.016913
VH Search Desktop tier1 Ethnic     23.058240
VH Search Desktop top6 Ethnic      10.874666
PPS-Offer                         -40.769250
RT Search Desktop Brand ads       -46.961873
Name: roi_14, dtype: float64
chanel
Zero                              

In [235]:
def cost_user_paid_on_day(day, users_df, orders_user_reg_df):
    users_ids = users_df[users_df['Reg_date']<np.max(users_df['Reg_date']) + np.timedelta64(-day - 1, 'D')].index
    orders_for_roi_mask = orders_user_reg_df['Order Date'] < orders_user_reg_df['Reg_date'] + np.timedelta64(day + 1, 'D')
    orders_for_roi_mask = orders_for_roi_mask & orders_user_reg_df['id_user'].isin(users_ids)
    cost_order_amount_per_user_df = orders_user_reg_df[orders_for_roi_mask].groupby('id_user').agg(
        {'cost_per_reg': 'first'})

#     order_amount_day_label = 'order_amount_day_' + str(day)
#     users_df[order_amount_day_label] = cost_order_amount_per_user_df['Amount']
#     users_df[order_amount_day_label].fillna(0, inplace=True)
    
    cost_amout_per_partner_df = users_df.groupby('id_partner').agg({'cost_per_reg': 'mean'})
    print(cost_amout_per_partner_df.head())
#     cost_amout_per_channel_df['roi'] = 100 * (cost_amout_per_channel_df[order_amount_day_label] 
#         - cost_amout_per_channel_df['cost_per_reg'])/cost_amout_per_channel_df['cost_per_reg']
#     return cost_amout_per_channel_df

In [236]:
cost_on_day = cost_user_paid_on_day(0, users_df, orders_user_reg_df)

                cost_per_reg
id_partner                  
0                   5.689981
10                 10.842643
1020                5.715563
1020_mobile         5.710902
1020_not_valid      5.908413


In [240]:
def pct_users_return_first_week(day, visits_users_df):
    visits_mask = visits_users_df['Visit_date'] < visits_users_df['Reg_date'] + np.timedelta64(day + 1, 'D')
    cost_order_amount_per_user_df = visits_users_df[visits_mask].groupby('id_user').agg(
        {'Visit_date': lambda x: print(x)})

#     order_amount_day_label = 'order_amount_day_' + str(day)
#     users_df[order_amount_day_label] = cost_order_amount_per_user_df['Amount']
#     users_df[order_amount_day_label].fillna(0, inplace=True)
    
    cost_amout_per_partner_df = users_df.groupby('id_partner').agg({'cost_per_reg': 'mean'})
    print(cost_amout_per_partner_df.head())
#     cost_amout_per_channel_df['roi'] = 100 * (cost_amout_per_channel_df[order_amount_day_label] 
#         - cost_amout_per_channel_df['cost_per_reg'])/cost_amout_per_channel_df['cost_per_reg']
#     return cost_amout_per_channel_df

In [254]:
visits_users_df = visits_df.sort_values(by='Visit_date').drop_duplicates('Visit_date', keep='first')
visits_users_df = users_df.merge(visits_df, how='left', right_on='id_user', left_index=True)
pct_users_return_first_week(7, visits_users_df[:10], users_df)

Unnamed: 0,Reg_date,id_partner,name,chanel,id_user,Visit_date
26902908,2017-10-04,rt_DBM,${INSERTION_ORDER_ID},RT DBM,6745955,2017-10-04 03:27:02
8327163,2017-07-01,DBM,%24%7BINSERTION_ORDER_ID%7D,DBM,5159878,2017-07-01 10:40:57
8327165,2017-07-01,DBM,%24%7BINSERTION_ORDER_ID%7D,DBM,5159881,2017-07-04 05:56:17
8327166,2017-07-01,DBM,%24%7BINSERTION_ORDER_ID%7D,DBM,5159881,2017-07-04 10:48:26
8327167,2017-07-01,DBM,%24%7BINSERTION_ORDER_ID%7D,DBM,5159940,2017-07-01 10:46:08


#### Determine that Reg_date is not accounted as the first record in the Visit_date

In [274]:
(visits_users_df['Visit_date'].dt.normalize() == visits_users_df['Reg_date'].dt.normalize()).value_counts()

False    12833766
True      1818405
dtype: int64