#First time users stay time

以下の3通りに分けて初回起動ユーザーの平均起動時間を計算する
+ daily
+ weekly
+ monthly

##Input Parameters
+ DATE 集計期間の終わりの日
+ DEBUG 手動実行時のみTrue
+ FREQUENCY 実行頻度
+ BIGQUERY_PROJECT_ID bigqueryのプロジェクト名
+ BIGQUERY_DATASET bigqueryのデータセット
+ PACKAGE_NAME bigqueryのパッケージ名
+ OUTPUT_BIGQUERY_PROJECT_ID 出力先のBQのプロジェクト名
+ ISLATEST 最新の日付を対象にする場合はTrue,任意の日付を指定する場合はFalse

# Output Range
+ daily
DATEの1日前を対象に集計
 
  ex.DATE="2021-02-02"の場合は"2021-02-01を対象に集計"
+ weekly
DATEの1日前から7日を対象に集計

  ex.DATE="2021-02-22"の場合は"2021-02-15"から"2021-02-21を対象に集計"
+ monthly
DATEの1日前から1ヶ月を対象に集計

  ex.DATE="2021-02-01"の場合は"2021-01-01"から"2021-01-31"を対象に集計"

## Output Data
+ date　集計の開始日
+ android_first_users_stay_time	Android初回起動ユーザーの平均滞在時間
+ ios_first_users_stay_time	iOS初回起動ユーザーの平均滞在時間
+ all_first_users_stay_time 全ユーザーの平均滞在時間


# Parameters

In [41]:
DATE = "2020-01-01" # @param {type: "date"}
DEBUG = True # @param {type: "boolean"} 手動実行時のみTrueにする。Cloud FunctionsからFalseを渡される。
FREQUENCY = "monthly" # @param {type: "string"}
BIGQUERY_PROJECT_ID = "fl-komtar-herbert-offer" # @param {type: "string"}
BIGQUERY_DATASET = "analytics_211559993.events_*" # @param {type: "string"}
PACKAGE_NAME = "jp.co.hardoff.renk.app.offer" # @param {type: "string"}
OUTPUT_BIGQUERY_PROJECT_ID = "fl-komtar-analytics-dashboard" # @param {type: "string"}
IS_LATEST = False# @param {type:"boolean"}

# Constants

In [42]:
SESSION_TIMEOUT_MINS = 30 #@param {type:"number"}
METRICS_NAME = "first-time-users-stay-time"

# Version

In [43]:
VERSION = "7"

# Authorize

In [44]:
if DEBUG:
    from google.colab import auth
    auth.authenticate_user()

# Imports

In [45]:
import pandas as pd
import numpy as np
from datetime import timedelta,datetime
from pytz import timezone

# Get Input Datasets

## データの取得期間

In [46]:
if IS_LATEST:
  date = (datetime.now(timezone("Asia/Tokyo"))-timedelta(days=1))
else:
  date = datetime.strptime(DATE,"%Y-%m-%d") - timedelta(days=1)

if FREQUENCY == "daily":
  start = date
  end = date
elif FREQUENCY =="weekly":
  start = date-timedelta(days=6)
  end = date 
elif FREQUENCY == "monthly":
  end = date
  start = datetime(end.year,end.month,1)
else:
  raise Exception("Invalid frequency value")
start_date = start.strftime("%Y%m%d")
end_date = end.strftime("%Y%m%d")
base_start = start-timedelta(days=1)
base_end = end+timedelta(days=1)
start_date, end_date

('20191201', '20191231')

## GBQからデータを取得する

In [47]:
duration_func = """
var current = arr[0];
var result = [];
for (var i = 0; i < arr.length - 1; i ++) {
  var diff = arr[i + 1] - arr[i];
  if ((diff / 1000000) >= 1800) {
    result.push({
      event_timestamp: current,
      duration: (arr[i] - current) / 1000000,
    });
    current = arr[i + 1];       
  }
}
result.push({
  event_timestamp: current,
  duration: (arr[arr.length - 1] - current) / 1000000,
});
result = result.filter((r) => r.duration > 0)
return result;
"""
query = f"""
  CREATE TEMP FUNCTION
  DURATION(arr ARRAY<INT64>)
  RETURNS ARRAY<STRUCT<event_timestamp INT64,
  duration INT64>>
  LANGUAGE js AS '''
  {duration_func}
  '''; 
  WITH
  base_table as (
    SELECT DISTINCT FORMAT_TIMESTAMP("%Y%m%d", TIMESTAMP_MICROS(event_timestamp), 'Asia/Tokyo') AS JST,user_pseudo_id,platform, event_timestamp,user_first_touch_timestamp
    FROM `{BIGQUERY_PROJECT_ID}.{BIGQUERY_DATASET}` 
    WHERE app_info.id like "{PACKAGE_NAME}%"  
    AND _table_suffix BETWEEN "{base_start.strftime("%Y%m%d")}" AND "{base_end.strftime("%Y%m%d")}"
    ),
    event_arr AS (
        SELECT
        DATE(TIMESTAMP_MICROS(event_timestamp), "Asia/Tokyo") AS date,
        user_pseudo_id,
        UPPER(platform)　as OS,
        user_first_touch_timestamp,
        DURATION(ARRAY_AGG(event_timestamp ORDER BY event_timestamp)) AS duration_arr
    FROM (
    SELECT * FROM base_table
    WHERE JST BETWEEN "{start_date}" AND "{end_date}"
    UNION ALL
    SELECT * FROM base_table 
    WHERE JST BETWEEN "{start_date}" AND "{end_date}" 
    )
  GROUP BY
    date,
    user_pseudo_id,
    user_first_touch_timestamp,
    OS ),
  events AS (
    SELECT
      date,
      TIMESTAMP_MICROS(event_timestamp) AS Time,
      user_pseudo_id,
      user_first_touch_timestamp,
      TIMESTAMP_MICROS(user_first_touch_timestamp) AS First_touch_Time,
      OS,
      darr.event_timestamp,
      darr.duration
    FROM
      event_arr
    CROSS JOIN UNNEST(event_arr.duration_arr) AS darr
  )
  SELECT *
  FROM
  events
  ORDER BY user_pseudo_id 
"""
df_session_duration = pd.DataFrame(columns = ['date', 'Time', 'user_pseudo_id', 'user_first_touch_timestamp','First_touch_Time', 'OS', 'event_timestamp', 'duration'])
df_gbq = pd.io.gbq.read_gbq(query, project_id = BIGQUERY_PROJECT_ID)
df_session_duration = pd.concat([df_session_duration, df_gbq])

df_session_duration = df_session_duration[(df_session_duration["date"] >= start_date) & (df_session_duration["date"] <= end_date)]
df_session_duration

Unnamed: 0,date,Time,user_pseudo_id,user_first_touch_timestamp,First_touch_Time,OS,event_timestamp,duration
0,2019-12-01,2019-11-30 22:31:36.621000+00:00,0011eaf47c077feeff7bb38a729e22b7,1574332514097000,2019-11-21 10:35:14.097000+00:00,ANDROID,1575153096621000,324
1,2019-12-01,2019-12-01 08:45:10.220000+00:00,0011eaf47c077feeff7bb38a729e22b7,1574332514097000,2019-11-21 10:35:14.097000+00:00,ANDROID,1575189910220000,497
2,2019-12-04,2019-12-04 01:34:43.038000+00:00,0011eaf47c077feeff7bb38a729e22b7,1574332514097000,2019-11-21 10:35:14.097000+00:00,ANDROID,1575423283038000,10
3,2019-12-04,2019-12-04 10:22:55.925000+00:00,0011eaf47c077feeff7bb38a729e22b7,1574332514097000,2019-11-21 10:35:14.097000+00:00,ANDROID,1575454975925000,112
4,2019-12-03,2019-12-02 15:05:56.330000+00:00,0011eaf47c077feeff7bb38a729e22b7,1574332514097000,2019-11-21 10:35:14.097000+00:00,ANDROID,1575299156330000,7
...,...,...,...,...,...,...,...,...
29580,2019-12-25,2019-12-25 09:01:32.642000+00:00,ff42bb689f989b49bb0b522942627d94,1572590773186000,2019-11-01 06:46:13.186000+00:00,ANDROID,1577264492642000,125
29581,2019-12-01,2019-12-01 04:27:34.036000+00:00,ffb37e2bc44e6c6a832b3d1056871462,1574162881095000,2019-11-19 11:28:01.095000+00:00,ANDROID,1575174454036000,1
29582,2019-12-31,2019-12-31 11:57:22.906000+00:00,ffb37e2bc44e6c6a832b3d1056871462,1574162881095000,2019-11-19 11:28:01.095000+00:00,ANDROID,1577793442906000,10
29583,2019-12-04,2019-12-04 08:25:42.985000+00:00,ffb37e2bc44e6c6a832b3d1056871462,1574162881095000,2019-11-19 11:28:01.095000+00:00,ANDROID,1575447942985000,2


## 期間内の初回起動者を取得

In [48]:
unique_first_users = df_session_duration[df_session_duration["Time"]==df_session_duration["First_touch_Time"]]["user_pseudo_id"].unique()
unique_first_users

array(['00230558A7FF4F399B1D98EFDC358FFC',
       '0042D7600F454FBBADBCE1C245F2DF8B',
       '0046f44fe541de8c3182502667310e19', ...,
       'fee0e76dd28a262688165434565ad24f',
       'ff3263fe3b9e3fffbf06fd1b57db68e2',
       'ffeb5006b1e331d26dbd6bedfeda225d'], dtype=object)

## 期間内のユニークなユーザーの行取得

In [49]:
df_first_users_duration = df_session_duration[df_session_duration["user_pseudo_id"].isin(unique_first_users)]
df_first_users_duration

Unnamed: 0,date,Time,user_pseudo_id,user_first_touch_timestamp,First_touch_Time,OS,event_timestamp,duration
23,2019-12-05,2019-12-04 22:29:52.195001+00:00,00230558A7FF4F399B1D98EFDC358FFC,1575251548873000,2019-12-02 01:52:28.873000+00:00,IOS,1575498592195001,1606
24,2019-12-05,2019-12-05 00:29:32.215001+00:00,00230558A7FF4F399B1D98EFDC358FFC,1575251548873000,2019-12-02 01:52:28.873000+00:00,IOS,1575505772215001,7
25,2019-12-05,2019-12-05 03:15:06.412001+00:00,00230558A7FF4F399B1D98EFDC358FFC,1575251548873000,2019-12-02 01:52:28.873000+00:00,IOS,1575515706412001,6
26,2019-12-05,2019-12-05 05:55:00.408000+00:00,00230558A7FF4F399B1D98EFDC358FFC,1575251548873000,2019-12-02 01:52:28.873000+00:00,IOS,1575525300408000,4
27,2019-12-05,2019-12-05 08:35:40.019001+00:00,00230558A7FF4F399B1D98EFDC358FFC,1575251548873000,2019-12-02 01:52:28.873000+00:00,IOS,1575534940019001,4
...,...,...,...,...,...,...,...,...
29570,2019-12-09,2019-12-09 05:25:15.880000+00:00,fee0e76dd28a262688165434565ad24f,1575869115880000,2019-12-09 05:25:15.880000+00:00,ANDROID,1575869115880000,17
29571,2019-12-09,2019-12-09 08:51:06.358000+00:00,fee0e76dd28a262688165434565ad24f,1575869115880000,2019-12-09 05:25:15.880000+00:00,ANDROID,1575881466358000,80
29572,2019-12-26,2019-12-25 18:20:54.756000+00:00,fee0e76dd28a262688165434565ad24f,1575869115880000,2019-12-09 05:25:15.880000+00:00,ANDROID,1577298054756000,2
29576,2019-12-07,2019-12-06 23:53:23.018000+00:00,ff3263fe3b9e3fffbf06fd1b57db68e2,1575676403018000,2019-12-06 23:53:23.018000+00:00,ANDROID,1575676403018000,16


# All Device Duration

In [50]:
session_timeout = pd.Timedelta('%d min' % SESSION_TIMEOUT_MINS).total_seconds()
session_timeout

1800.0

In [51]:
session_duration_list = []
# ユーザ毎に滞在時間を求める。
for g_user_id, df_g_user in df_first_users_duration.groupby(["user_pseudo_id","OS"]):
  previous_row = None
  session_duration = pd.Timedelta('0 min')
# 各ユーザのイベントを1つ1つ確認する。
  for row in df_g_user.itertuples():
    # 一番初めのイベントの値を保存する。
    if previous_row is None:
      previous_row = row
      session_duration = pd.Timedelta(seconds = previous_row.duration)
      continue
    # 前後のイベント間隔が30分以上ひらいた時、滞在時間を求める。
    diff_session = datetime.fromtimestamp(row.event_timestamp/1000000)-datetime.fromtimestamp(previous_row.event_timestamp/1000000) #追加
    diff_session = diff_session.total_seconds() #追加
    if diff_session > session_timeout:
      session_duration_list.append({
          "date": previous_row.date, 
          "event_timestamp": previous_row.event_timestamp,
          "user_pseudo_id": g_user_id[0],
          "OS" : g_user_id[1], 
          "session_duration": session_duration,
      })
      session_duration = pd.Timedelta(seconds = row.duration)
    else:
        # 30分以下の場合は同じセッションに該当するとみなす。
      session_duration += pd.Timedelta(seconds = row.duration)
    previous_row = row
    # 一番最後のイベントを計測する。
  session_duration = pd.Timedelta(seconds = row.duration)
  session_duration_list.append({
      "date" : previous_row.date, 
      "event_timestamp": previous_row.event_timestamp,
      "user_pseudo_id" : g_user_id[0],
      "OS" : g_user_id[1], 
      "session_duration": session_duration,
  })
df_session_duration_list = pd.DataFrame(session_duration_list)
df_session_duration_list = df_session_duration_list.sort_values("date").reset_index(drop=True)
df_session_duration_list["session_duration"] = (df_session_duration_list.session_duration.astype(np.int64) / 1000000000).astype("int64")
df_session_duration_list.sort_values("user_pseudo_id")

Unnamed: 0,date,event_timestamp,user_pseudo_id,OS,session_duration
293,2019-12-02,1575265104849000,00230558A7FF4F399B1D98EFDC358FFC,IOS,10
8638,2019-12-23,1577067092087000,00230558A7FF4F399B1D98EFDC358FFC,IOS,304
8623,2019-12-23,1577061940197001,00230558A7FF4F399B1D98EFDC358FFC,IOS,3227
294,2019-12-02,1575269227588000,00230558A7FF4F399B1D98EFDC358FFC,IOS,1275
10292,2019-12-26,1577313581400000,00230558A7FF4F399B1D98EFDC358FFC,IOS,10
...,...,...,...,...,...
2731,2019-12-09,1575869115880000,fee0e76dd28a262688165434565ad24f,ANDROID,856
10248,2019-12-26,1577298054756000,fee0e76dd28a262688165434565ad24f,ANDROID,2
2732,2019-12-09,1575881466358000,fee0e76dd28a262688165434565ad24f,ANDROID,80
1759,2019-12-07,1575676403018000,ff3263fe3b9e3fffbf06fd1b57db68e2,ANDROID,16


In [52]:
all_first_users_stay_time = df_session_duration_list["session_duration"].mean()
all_first_users_stay_time

489.54651919323356

# Android

In [53]:
df_android_users = df_session_duration_list[df_session_duration_list["OS"]=="ANDROID"]
df_android_users

Unnamed: 0,date,event_timestamp,user_pseudo_id,OS,session_duration
2,2019-12-01,1575175402590000,3cccd2c1ff74ffec1e0725e92fb14778,ANDROID,5463
3,2019-12-01,1575184582984000,3cccd2c1ff74ffec1e0725e92fb14778,ANDROID,3688
5,2019-12-01,1575198908012000,0e53fbc76e24d1058fcc6c2e1591a3f1,ANDROID,69
6,2019-12-01,1575169076306000,0dfa16f07929fe68a463c69624813e5c,ANDROID,112
7,2019-12-01,1575190540599000,048f18deb743521dea454f00382d0442,ANDROID,485
...,...,...,...,...,...
12289,2019-12-31,1577752523036000,5a197a8b0116d704070f870d09a560c2,ANDROID,760
12290,2019-12-31,1577759736682000,5a197a8b0116d704070f870d09a560c2,ANDROID,1887
12291,2019-12-31,1577769605370000,5a197a8b0116d704070f870d09a560c2,ANDROID,2942
12292,2019-12-31,1577753418909000,5a6a95d72f42f8638c9c8d58bc5fb92e,ANDROID,15


In [54]:
android_avg_session_duration = df_android_users["session_duration"].mean()
android_avg_session_duration

545.6745667686034

# IOS

In [55]:
df_ios_users = df_session_duration_list[df_session_duration_list["OS"]=="IOS"]
df_ios_users

Unnamed: 0,date,event_timestamp,user_pseudo_id,OS,session_duration
0,2019-12-01,1575208062892000,8ED62920A8794BB78270156546EF2B52,IOS,1372
1,2019-12-01,1575194635635000,E1D03428BA964B18941DF9506D376861,IOS,559
4,2019-12-01,1575171234369000,6586AF7B0E914DC09A46E4D5455F055D,IOS,511
13,2019-12-01,1575181462414001,DADFE3CF76D343CD8ACC76BD398C4212,IOS,209
14,2019-12-01,1575168623672000,DADFE3CF76D343CD8ACC76BD398C4212,IOS,2105
...,...,...,...,...,...
12285,2019-12-31,1577723361002000,BC9B332D06384C459CEFEF3657B0235E,IOS,531
12286,2019-12-31,1577776573980000,BD182B931CBA4CB9B5C0EE329CF4A1E2,IOS,16
12287,2019-12-31,1577780025933000,BD182B931CBA4CB9B5C0EE329CF4A1E2,IOS,5
12294,2019-12-31,1577782156933000,5E14F2A710014F7F8FB55BDEC05E638F,IOS,319


In [56]:
ios_avg_session_duration = df_ios_users["session_duration"].mean()
ios_avg_session_duration

463.239010989011

# Output

In [57]:
df_output = pd.DataFrame(columns=["android_first_users_stay_time","ios_first_users_stay_time","all_first_users_stay_time"],index=[0])
df_output.insert(0, "date", start.strftime(format="%Y-%m-%d"))
df_output["date"] = pd.to_datetime(df_output["date"], format="%Y-%m-%d").dt.date
df_output["android_first_users_stay_time"] = android_avg_session_duration
df_output["ios_first_users_stay_time"] = ios_avg_session_duration
df_output["all_first_users_stay_time"] = all_first_users_stay_time
df_output = df_output.round(3)
df_output.to_gbq(f"""{PACKAGE_NAME.replace(".","_")}_{METRICS_NAME.replace("-","_")}.{FREQUENCY}_events_{start.strftime(format="%Y-%m-%d").replace("-","")}""",
                 if_exists="replace",
                 table_schema=[{'name': 'date','type': 'DATE'},
                               {'name': 'android_first_users_stay_time','type': 'FLOAT64'},
                               {'name': 'ios_first_users_stay_time','type': 'FLOAT64'},
                               {'name': 'all_first_users_stay_time','type': 'FLOAT64'}
                               ],
                 project_id=OUTPUT_BIGQUERY_PROJECT_ID)
df_output

1it [00:04,  4.28s/it]


Unnamed: 0,date,android_first_users_stay_time,ios_first_users_stay_time,all_first_users_stay_time
0,2019-12-01,545.675,463.239,489.547
