# 1. 환경설정

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns



In [5]:
# Render floats with two fixed decimals instead of scientific notation.
pd.set_option('display.float_format', '{:.2f}'.format)

# 2. 데이터 불러오기 및 전처리


## 2-1. 노출 관련 테이블

In [19]:
# Load the pre-processed school dump.
processed_school_path = 'dumps/processed/processed_accounts_school.csv'
processed_school = pd.read_csv(filepath_or_buffer=processed_school_path)

# Load the pre-processed user-contacts dump.
processed_user_contacts_path = 'dumps/processed/processed_accounts_user_contacts.csv'
processed_user_contacts = pd.read_csv(filepath_or_buffer=processed_user_contacts_path)

In [7]:
# Inspect column dtypes and non-null counts of the school table as loaded.
processed_school.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5951 entries, 0 to 5950
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                5951 non-null   int64 
 1   address           5951 non-null   object
 2   student_count     5951 non-null   int64 
 3   school_type       5951 non-null   object
 4   address_clean     5948 non-null   object
 5   is_active_school  5951 non-null   bool  
dtypes: bool(1), int64(2), object(3)
memory usage: 238.4+ KB


In [None]:
# Preprocessing: expose `id` as `school_id` and store it as a string identifier.
# The rename runs first and is not in-place so this cell is idempotent: on a
# re-run the `id` column no longer exists, rename() ignores missing labels by
# default, and astype(str) on an already-string column is a no-op. The previous
# version raised KeyError: 'id' when the cell was executed a second time.
processed_school = processed_school.rename(columns={'id': 'school_id'})
processed_school['school_id'] = processed_school['school_id'].astype(str)

In [14]:
# Re-check dtypes after preprocessing (the id column should now be `school_id`, dtype object).
processed_school.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5951 entries, 0 to 5950
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   school_id         5951 non-null   object
 1   address           5951 non-null   object
 2   student_count     5951 non-null   int64 
 3   school_type       5951 non-null   object
 4   address_clean     5948 non-null   object
 5   is_active_school  5951 non-null   bool  
dtypes: bool(1), int64(1), object(4)
memory usage: 238.4+ KB


In [15]:
# Preview the first rows of the school table.
processed_school.head()

Unnamed: 0,school_id,address,student_count,school_type,address_clean,is_active_school
0,4,충청북도 충주시,239,H,충청북도 충주시,True
1,5,충청북도 충주시,160,M,충청북도 충주시,True
2,6,충청북도 충주시,200,H,충청북도 충주시,True
3,7,충청북도 충주시,114,H,충청북도 충주시,True
4,8,충청북도 충주시,139,M,충청북도 충주시,True


In [16]:
# Inspect column dtypes and non-null counts of the user-contacts table as loaded.
processed_user_contacts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5063 entries, 0 to 5062
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   5063 non-null   int64 
 1   user_id              5063 non-null   int64 
 2   contacts_count       5063 non-null   int64 
 3   invite_user_id_list  1158 non-null   object
 4   invite_cnt           5063 non-null   int64 
dtypes: int64(4), object(1)
memory usage: 197.9+ KB


In [None]:
# 전처리
import ast
def convert_to_list(x):
    """Parse a stringified Python literal (e.g. ``"[1, 2, 3]"``) into its value.

    Args:
        x: A cell value. Expected to be a string holding a list literal, but it
            may also be missing (NaN/None), an empty string, or a list that
            has already been parsed.

    Returns:
        ``x`` itself when it is already a list; the result of
        ``ast.literal_eval(x)`` for a non-empty string; otherwise ``[]``
        (missing values, empty strings, other non-string inputs, and strings
        that are not valid literals).
    """
    # Already parsed (e.g. the calling cell was re-run): pass it through.
    # Previously such a value ended up in the except branch and was silently
    # replaced by [], wiping the column on a second run.
    if isinstance(x, list):
        return x
    # NaN, None and any other non-string input carry no parsable text.
    if not isinstance(x, str) or x == "":
        return []
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError):
        # Best effort: malformed text yields an empty list instead of raising.
        # TypeError covers literals such as a dict with an unhashable key.
        return []

# Turn the list-looking strings into real Python lists. Values that are already
# lists are kept untouched so that re-running this cell cannot wipe parsed data.
processed_user_contacts['invite_user_id_list'] = processed_user_contacts['invite_user_id_list'].apply(
    lambda value: value if isinstance(value, list) else convert_to_list(value)
)
processed_user_contacts['user_id'] = processed_user_contacts['user_id'].astype(str)  # ids are keys, not numbers
# Drop the unneeded surrogate `id` column and rename `invite_cnt`.
# Both steps are non-inplace and tolerate already-removed/renamed columns
# (drop(errors='ignore'); rename() skips missing labels by default), so a second
# execution no longer raises KeyError: "['id'] not found in axis".
processed_user_contacts = (
    processed_user_contacts
    .drop(columns='id', errors='ignore')
    .rename(columns={'invite_cnt': 'invite_count'})
)

## 2-2. 유입 관련 테이블

In [52]:
# Load the pre-processed user accounts dump.
processed_user_path = 'dumps/processed/processed_accounts_user.csv'
processed_user = pd.read_csv(filepath_or_buffer=processed_user_path)

In [53]:
# Inspect column dtypes and non-null counts of the user table as loaded.
processed_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 677080 entries, 0 to 677079
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   user_id             677080 non-null  int64  
 1   gender              677080 non-null  object 
 2   point               677080 non-null  int64  
 3   friend_id_list      674210 non-null  object 
 4   is_push_on          677080 non-null  int64  
 5   created_at          677080 non-null  object 
 6   block_user_id_list  677080 non-null  object 
 7   hide_user_id_list   677080 non-null  object 
 8   ban_status          677080 non-null  object 
 9   report_count        677080 non-null  int64  
 10  alarm_count         677080 non-null  int64  
 11  pending_chat        677080 non-null  int64  
 12  pending_votes       677080 non-null  int64  
 13  group_id            677080 non-null  float64
 14  friend_count        677080 non-null  int64  
dtypes: float64(1), int64(8), object(6)

In [54]:
# Preprocessing: ids become string keys; timestamps are parsed and floored to whole seconds.
processed_user['user_id'] = processed_user['user_id'].astype(str)
processed_user['created_at'] = pd.to_datetime(processed_user['created_at']).dt.floor(freq='s')

In [57]:
# Analysis window: keep rows whose created_at lies between 2023-05-13 and 2024-05-07.
# NOTE(review): the upper bound is presumably compared as 2024-05-07 00:00:00,
# which would leave out the rest of that day — confirm this is intended.
timeframe_user_condition = processed_user['created_at'].between(left='2023-05-13', right='2024-05-07')
processed_user = processed_user.loc[timeframe_user_condition]

In [58]:
# Summary statistics of the date-filtered user table.
processed_user.describe()

Unnamed: 0,point,is_push_on,created_at,report_count,alarm_count,pending_chat,pending_votes,group_id,friend_count
count,363208.0,363208.0,363208,363208.0,363208.0,363208.0,363208.0,363208.0,363208.0
mean,1705.89,0.84,2023-05-23 02:00:42.264344320,0.03,0.88,0.09,66.42,49870.43,48.09
min,0.0,0.0,2023-05-13 00:00:02,0.0,0.0,-1.0,0.0,1.0,0.0
25%,383.0,1.0,2023-05-15 17:40:55.500000,0.0,1.0,0.0,1.0,37476.0,27.0
50%,912.0,1.0,2023-05-19 19:47:55.500000,0.0,1.0,0.0,17.0,53135.0,43.0
75%,2096.0,1.0,2023-05-24 13:41:35,0.0,1.0,0.0,89.0,64868.0,64.0
max,23115730.0,1.0,2024-05-06 22:48:40,71.0,238.0,1801.0,2744.0,84544.0,1373.0
std,41836.5,0.36,,0.35,1.01,3.06,106.41,19887.69,30.9


## 2-3. 참여 관련 테이블

In [60]:
# Load the pre-processed poll question-set dump.
processed_questionset_path = 'dumps/processed/processed_polls_questionset.csv'
processed_questionset = pd.read_csv(filepath_or_buffer=processed_questionset_path)

In [61]:
# Inspect column dtypes and non-null counts of the question-set table as loaded.
processed_questionset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157705 entries, 0 to 157704
Data columns (total 6 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   id                      157705 non-null  int64 
 1   question_piece_id_list  157705 non-null  object
 2   opening_time            157705 non-null  object
 3   status                  157705 non-null  object
 4   created_at              157705 non-null  object
 5   user_id                 157705 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 7.2+ MB


In [62]:
# Basic preprocessing (safe to re-run).
# Parse the stringified id lists; values that are already lists are kept as-is
# so a second execution of this cell cannot replace them with empty lists.
processed_questionset['question_piece_id_list'] = processed_questionset['question_piece_id_list'].apply(
    lambda value: value if isinstance(value, list) else convert_to_list(value)
)
processed_questionset['created_at'] = pd.to_datetime(processed_questionset['created_at']).dt.floor('s')
processed_questionset['user_id'] = processed_questionset['user_id'].astype(str)
# NOTE(review): opening_time is left as a string column here — confirm whether
# it should be parsed to datetime like created_at.
# Drop the unneeded surrogate `id` column; errors='ignore' avoids a KeyError
# when the column was already removed by an earlier run of this cell.
processed_questionset = processed_questionset.drop(columns='id', errors='ignore')

In [63]:
# Analysis window: keep rows whose created_at lies between 2023-05-13 and 2024-05-07.
# NOTE(review): the upper bound is presumably compared as 2024-05-07 00:00:00,
# which would leave out the rest of that day — confirm this is intended.
timeframe_questionset_condition = processed_questionset['created_at'].between(left='2023-05-13', right='2024-05-07')
processed_questionset = processed_questionset.loc[timeframe_questionset_condition]

In [64]:
# Re-check row count and dtypes after preprocessing and date filtering.
processed_questionset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 102120 entries, 55582 to 157701
Data columns (total 5 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   question_piece_id_list  102120 non-null  object        
 1   opening_time            102120 non-null  object        
 2   status                  102120 non-null  object        
 3   created_at              102120 non-null  datetime64[ns]
 4   user_id                 102120 non-null  object        
dtypes: datetime64[ns](1), object(4)
memory usage: 4.7+ MB


In [36]:
# Preview the first rows of the question-set table.
# NOTE(review): the saved output under this cell appears to predate the date
# filter (rows dated 2023-04-28, index starting at 0, earlier execution count)
# — re-run the notebook top to bottom to refresh it.
processed_questionset.head()

Unnamed: 0,question_piece_id_list,opening_time,status,created_at,user_id
0,"[998588, 998589, 998590, 998591, 998592, 99859...",2023-04-28 12:28:07,F,2023-04-28 12:28:07,849438
1,"[998689, 998691, 998693, 998695, 998697, 99869...",2023-04-28 12:28:38,F,2023-04-28 12:28:38,847375
2,"[998688, 998690, 998692, 998694, 998696, 99869...",2023-04-28 12:28:38,F,2023-04-28 12:28:38,849446
3,"[998768, 998769, 998770, 998771, 998772, 99877...",2023-04-28 12:28:57,F,2023-04-28 12:28:57,849477
4,"[998808, 998809, 998810, 998811, 998813, 99881...",2023-04-28 12:29:04,F,2023-04-28 12:29:04,849469


## 2-4. 수익 관련 테이블

In [42]:
# Load the pre-processed payment history dump.
processed_payment_path = 'dumps/processed/processed_accounts_paymenthistory.csv'
processed_payment = pd.read_csv(filepath_or_buffer=processed_payment_path)

# Load the pre-processed failed-payment history dump.
processed_failpayment_path = 'dumps/processed/processed_accounts_failpaymenthistory.csv'
processed_failpayment = pd.read_csv(filepath_or_buffer=processed_failpayment_path)

In [43]:
# Inspect column dtypes and non-null counts of the payment table as loaded.
processed_payment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95140 entries, 0 to 95139
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     95140 non-null  int64 
 1   productId   95140 non-null  object
 2   phone_type  95140 non-null  object
 3   created_at  95140 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.9+ MB


In [44]:
# Inspect column dtypes and non-null counts of the failed-payment table as loaded.
processed_failpayment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          160 non-null    int64 
 1   user_id     160 non-null    int64 
 2   productId   56 non-null     object
 3   phone_type  160 non-null    object
 4   created_at  160 non-null    object
dtypes: int64(2), object(3)
memory usage: 6.4+ KB


In [45]:
# Basic preprocessing (safe to re-run).
processed_payment['created_at'] = pd.to_datetime(processed_payment['created_at']).dt.floor('s')
processed_payment['user_id'] = processed_payment['user_id'].astype(str)
# Convert product names to their heart count (numeric).
product_map = {
    'heart.777': 777,
    'heart.200': 200,
    'heart.4000': 4000,
    'heart.1000': 1000
}
# Map only while the column still holds product-name strings. Mapping the
# already-numeric column again would find no matching keys and turn every
# value into NaN, silently destroying the data on a second run of this cell.
# Product ids that are missing from product_map become NaN.
if not pd.api.types.is_numeric_dtype(processed_payment['productId']):
    processed_payment['productId'] = processed_payment['productId'].map(product_map)

# NOTE(review): productId of the failed payments is left as the raw product
# name (and is mostly null) — confirm whether it should be mapped as well.
processed_failpayment['created_at'] = pd.to_datetime(processed_failpayment['created_at']).dt.floor('s')
processed_failpayment['user_id'] = processed_failpayment['user_id'].astype(str)
# errors='ignore' avoids a KeyError when `id` was already dropped by an earlier run.
processed_failpayment = processed_failpayment.drop(columns='id', errors='ignore')

In [65]:
# Analysis window: keep payments whose created_at lies between 2023-05-13 and 2024-05-07.
# NOTE(review): the upper bound is presumably compared as 2024-05-07 00:00:00,
# which would leave out the rest of that day — confirm this is intended.
# NOTE(review): processed_failpayment is not date-filtered here — confirm that is deliberate.
timeframe_payment_condition = processed_payment['created_at'].between(left='2023-05-13', right='2024-05-07')
processed_payment = processed_payment.loc[timeframe_payment_condition]

In [66]:
# Re-check both payment tables after preprocessing, separated by a divider line.
processed_payment.info()
print('=' * 10)
processed_failpayment.info()

<class 'pandas.core.frame.DataFrame'>
Index: 95136 entries, 0 to 95135
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   user_id     95136 non-null  object        
 1   productId   95136 non-null  int64         
 2   phone_type  95136 non-null  object        
 3   created_at  95136 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 3.6+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   user_id     160 non-null    object        
 1   productId   56 non-null     object        
 2   phone_type  160 non-null    object        
 3   created_at  160 non-null    datetime64[ns]
dtypes: datetime64[ns](1), object(3)
memory usage: 5.1+ KB


# 3. 데이터 파악

In [67]:
# Count distinct user_id values in the date-filtered user table.
processed_user['user_id'].nunique()

363208