In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

## 4. 지표 모니터링을 위한 데이터 시각화
### 1) 데이터 추출
- SQL을 활용해 DB에서 데이터를 추출
### 2) 데이터 전처리
- SQL 또는 Python을 활용해 데이터 전처리 & 스택형 데이터로 Transform
### 3) 데이터 저장 또는 적재 (데이터 마트 생성)
- 서버에 가공된 데이터를 적재하거나(Automation), 로컬 환경에 데이터를 저장한 뒤 시각화 대시보드 제작

In [2]:
df = pd.read_csv('bootcamp_chapter4-1_data.csv')

  df = pd.read_csv('bootcamp_chapter4-1_data.csv')


#### 제품 주요 관찰지표
1) Acquisition: 얼마나 많은 신규 유저들을 획득했는지를 확인할 수 있는 지표
2) Activation: 신규 유저 중 Wow Moment를 경험한 유저의 비중을 확인할 수 있는 지표
3) Retention: 목표로 한 주요 행동을 유저가 하고 있는지를 확인할 수 있는 지표

In [3]:
#데이터 전처리
##가입날짜 포맷 'yyyy-mm-dd' datetime 포맷으로 변경
df['join_date'] = df['join_date'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))

##거래날짜 포맷 'yyyy-mm-dd' datetime 포맷으로 변경
tmp = df[df['transaction_date'].notnull()]
tmp2 = df[df['transaction_date'].isnull()]
tmp['transaction_date'] = tmp['transaction_date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))

df = pd.concat([tmp, tmp2], axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp['transaction_date'] = tmp['transaction_date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
  df = pd.concat([tmp, tmp2], axis=0)


In [4]:
acquisition = df.groupby('join_date').customer_id.nunique().reset_index()
acquisition.columns = ['partition_day', 'acquisition_cnt']

In [5]:
tmp = df[df['transaction_date'].notnull()]
tmp = tmp.groupby(['customer_id', 'join_date']).transaction_date.max().reset_index()
tmp['transaction_yn'] = 1

tmp2 = pd.merge(df[['customer_id', 'join_date']], tmp[['customer_id', 'transaction_yn']], how='left', on='customer_id')
tmp2.fillna(0, inplace=True)

tmp2 = tmp2.groupby('customer_id')[['join_date', 'transaction_yn']].max().reset_index()

In [6]:
agg_df = tmp2.groupby('join_date').agg({'customer_id': 'nunique',
                                        'transaction_yn': 'sum'}).reset_index()

agg_df.columns = ['partition_date', 'acquisition', 'activation']

agg_df['partition_month'] = agg_df['partition_date'].dt.to_period('M')
agg_df['conversion'] = agg_df['activation'] / agg_df['acquisition']

agg_df = agg_df.melt(id_vars=['partition_month', 'partition_date'], value_vars=['acquisition', 'activation', 'conversion'], var_name='category') 

agg_df

Unnamed: 0,partition_month,partition_date,category,value
0,2020-01,2020-01-01,acquisition,518.000000
1,2020-01,2020-01-02,acquisition,535.000000
2,2020-01,2020-01-03,acquisition,524.000000
3,2020-01,2020-01-04,acquisition,544.000000
4,2020-01,2020-01-05,acquisition,493.000000
...,...,...,...,...
2095,2021-11,2021-11-26,conversion,0.024887
2096,2021-11,2021-11-27,conversion,0.018947
2097,2021-11,2021-11-28,conversion,0.021459
2098,2021-11,2021-11-29,conversion,0.027335


In [7]:
#결제 월과 코호트 월 생성
tmp = df[df['transaction_date'].notnull()]

tmp["transaction_month"] = tmp["transaction_date"].dt.to_period("M")
tmp["cohort_month"] = tmp.groupby("customer_id")["transaction_month"].transform("min")

# 코호트 기간 계산
tmp["period"] = (tmp["transaction_month"] - tmp["cohort_month"]).apply(lambda x: x.n)

# 코호트 테이블 생성
cohort_table = tmp.groupby(["cohort_month", "period"])["customer_id"].nunique().reset_index()

cohort_table.columns = ['cohort_month', 'period', 'user_cnt']

cohort_table

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp["transaction_month"] = tmp["transaction_date"].dt.to_period("M")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp["cohort_month"] = tmp.groupby("customer_id")["transaction_month"].transform("min")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp["period"] = (tmp["transaction_month"] - tmp["

Unnamed: 0,cohort_month,period,user_cnt
0,2020-01,0,123
1,2020-01,1,4
2,2020-01,2,3
3,2020-01,3,5
4,2020-01,4,5
...,...,...,...
295,2021-10,1,206
296,2021-10,2,198
297,2021-11,0,2712
298,2021-11,1,194


In [8]:
agg_df.to_excel('acq_act.xlsx', index=False)
cohort_table.to_excel('cohort.xlsx', index=False)