In [183]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline

## Load data

In [184]:
data_path = '.'
costs_df = pd.read_csv(os.path.join(data_path, 'Costs.csv'), parse_dates=['date_created'], dayfirst=False)
relations_df = pd.read_csv(os.path.join(data_path, 'Relations.csv'), usecols=['name', 'chanel', 'id_partner'])
# relations_df.index = relations_df.index.map(str)
users_df = pd.read_csv(os.path.join(data_path, 'Users.csv'), index_col=0, 
                       usecols=['id', 'Reg_date', 'name', 'id_partner'], parse_dates=['Reg_date'], dayfirst=False)
visits_df = pd.read_csv(os.path.join(data_path, 'Visits.csv'), parse_dates=['Visit_date'], dayfirst=False)
orders_df = pd.read_csv(os.path.join(data_path, 'Orders.csv'), parse_dates=['Order Date'], dayfirst=False)

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)


## Explore data

In [185]:
def explore_df(df):
    print(df.head())
    print(df.dtypes)
    null_cols = df.columns[df.isnull().any()].values
    if(len(null_cols)):
        print('Columns with nulls: {}'.format(null_cols))
    print()

In [186]:
for df in [costs_df, relations_df, users_df, visits_df, orders_df]:
    explore_df(df)

       Costs id_partner campaign date_created
0  15.701622        NaN  4147836   2018-01-05
1  17.163542        NaN  4157838   2018-01-05
2  17.408304        NaN  4164809   2018-01-05
3   0.000000          0  2008634   2017-09-01
4   0.000000          0  2008634   2017-09-02
Costs                  float64
id_partner              object
campaign                object
date_created    datetime64[ns]
dtype: object
Columns with nulls: ['id_partner' 'campaign']

  chanel id_partner name
0      7          7   19
1      7          7   81
2      7          7  108
3      7          7  112
4      7          7  131
chanel        object
id_partner    object
name          object
dtype: object

          Reg_date id_partner                         name
id                                                        
6745955 2017-10-04     rt_DBM        ${INSERTION_ORDER_ID}
5159878 2017-07-01        DBM  %24%7BINSERTION_ORDER_ID%7D
5159881 2017-07-01        DBM  %24%7BINSERTION_ORDER_ID%7D
5159940 2017-07-

## Задание №1:	
## С помощью инструментов R или Python посчитать следующие метрики:

### ROI для 0-го, 6-го 14-го и 30-го дней жизни пользователя на сайте в разрезе маркетинговых каналов (chanel из relations).

#### Calculate marketing costs per day

In [187]:
users_df = users_df.reset_index().merge(relations_df, on=['name', 'id_partner']).set_index('id')

In [188]:
costs_per_date = costs_df[['date_created','Costs']].groupby('date_created').agg(np.sum)['Costs']

In [189]:
num_users_per_date = users_df.groupby('Reg_date')['id_partner'].count()
cost_per_user_per_date = costs_per_date[num_users_per_date.index] / num_users_per_date
users_df['cost_per_reg'] = cost_per_user_per_date[users_df['Reg_date'].values].values

In [190]:
orders_user_reg_df = orders_df.merge(users_df, how='left', left_on='id_user', right_index=True)
orders_user_reg_df.head()

Unnamed: 0,id_user,Order Date,Amount,Order ID,Reg_date,id_partner,name,chanel,cost_per_reg
0,4618612,2017-08-17 02:08:40,2.6312,23732390,2017-06-01,DBM_mob,3260707,DBM_mob,6.035521
1,4618612,2017-08-17 02:29:26,17.5912,23732304,2017-06-01,DBM_mob,3260707,DBM_mob,6.035521
2,4618645,2017-06-01 04:39:41,8.7912,12441805,2017-06-01,vh_s,Adw_VH_Tier2_S_Key_Brides_Old,VH Search Desktop tier2,6.035521
3,4618815,2017-06-01 02:11:01,2.6312,12439685,2017-06-01,1020,l6507,LosPollos,6.035521
4,4618982,2017-10-04 03:43:24,8.7912,25568622,2017-06-01,vh_uu,Adw_VH_GSP_Similar,VH Desktop GSP USA,6.035521


In [214]:
def roi_on_day(day, users_df, orders_user_reg_df):
    users_ids = users_df[users_df['Reg_date']<np.max(users_df['Reg_date']) + np.timedelta64(-day - 1, 'D')].index
    orders_for_roi_mask = orders_user_reg_df['Order Date'] < orders_user_reg_df['Reg_date'] + np.timedelta64(day + 1, 'D')
    orders_for_roi_mask = orders_for_roi_mask & orders_user_reg_df['id_user'].isin(users_ids)
    cost_order_amount_per_user_df = orders_user_reg_df[orders_for_roi_mask].groupby('id_user').agg(
        {'Amount': 'sum'})

    order_amount_day_label = 'order_amount_day_' + str(day)
    users_df[order_amount_day_label] = cost_order_amount_per_user_df['Amount']
    users_df[order_amount_day_label].fillna(0, inplace=True)
    
    cost_amout_per_channel_df = users_df.groupby('chanel').agg({'cost_per_reg': 'sum', order_amount_day_label: 'sum'})
    cost_amout_per_channel_df['roi'] = 100 * (cost_amout_per_channel_df[order_amount_day_label] 
        - cost_amout_per_channel_df['cost_per_reg'])/cost_amout_per_channel_df['cost_per_reg']
    return cost_amout_per_channel_df

In [217]:
roi = roi_on_day(0, users_df, orders_user_reg_df)
roi['roi_14'] = roi_on_day(14, users_df, orders_user_reg_df)['roi']
# cost_amout_per_channel_df['roi'].hist()
roi['roi_30'] = cost_amout_per_channel_df = roi_on_day(30, users_df, orders_user_reg_df)['roi']
cost_amout_per_channel_df['roi'].describe()
# np.max(cost_amout_per_channel_df['roi'])
# print('ROI 14 day: {}'.format(order_amount_days_after_reg(14))   
# print('ROI 30 day: {}'.format(order_amount_days_after_reg(30))            

count    364.000000
mean     -77.234316
std       42.840748
min     -100.000000
25%     -100.000000
50%      -93.926845
75%      -74.337571
max      228.303274
Name: roi, dtype: float64