# 1. 환경설정

In [1]:
import pandas as pd
import numpy as np
import koreanize_matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
import ast
from pathlib import Path

# 2. Votes 전처리

In [72]:
ROOT = Path.cwd()
ORIG_DIR = ROOT / 'dumps/votes_csv'
HACKLE_DIR = ROOT / 'dumps/hackle_csv'
PROCESSED_DIR = ROOT / 'dumps/processed_csv'

In [8]:
# accounts_userquestionrecord = pd.read_csv(ORIG_DIR / 'userquestionrecord.csv')
# accounts_group = pd.read_csv(ORIG_DIR / 'accounts_group.csv')
# accounts_attendance = pd.read_csv(ORIG_DIR / 'accounts_attendance.csv')

accounts_paymenthistory = pd.read_csv(ORIG_DIR / 'accounts_paymenthistory.csv')
accounts_failpaymenthistory = pd.read_csv(ORIG_DIR / 'accounts_failpaymenthistory.csv')
accounts_user = pd.read_csv(ORIG_DIR / 'accounts_user.csv')
polls_questionset = pd.read_csv(ORIG_DIR / 'polls_questionset.csv')
accounts_user_contacts  = pd.read_csv(ORIG_DIR / 'accounts_user_contacts.csv')
accounts_school  = pd.read_csv(ORIG_DIR / 'accounts_school.csv')

In [9]:
# 날짜 확인
for name, df in {
    "fail": accounts_failpaymenthistory,
    "pay": accounts_paymenthistory,
    "user": accounts_user,
    "poll": polls_questionset,
    "contacts": accounts_user_contacts,
    "school": accounts_school
}.items():
    print(name, ":", df.shape)

fail : (163, 5)
pay : (95140, 5)
user : (677085, 16)
poll : (158384, 6)
contacts : (5063, 4)
school : (5951, 4)


In [10]:
tables = {
    "accounts_failpaymenthistory": accounts_failpaymenthistory,
    "accounts_paymenthistory": accounts_paymenthistory,
    "accounts_user": accounts_user,
    "polls_questionset": polls_questionset,
    "accounts_user_contacts": accounts_user_contacts,
    "accounts_school": accounts_school
}

In [11]:
for name, df in tables.items():
    if "created_at" in df.columns:
        dt = pd.to_datetime(df["created_at"], errors="coerce").dt.floor("s")

        print(f"\n[{name}]")
        print("최소 날짜:", dt.min())
        print("최대 날짜:", dt.max())
        print("파싱 실패:", dt.isna().sum())


[accounts_failpaymenthistory]
최소 날짜: 2023-05-14 05:49:22
최대 날짜: 2023-09-17 09:12:53
파싱 실패: 0

[accounts_paymenthistory]
최소 날짜: 2023-05-13 21:28:34
최대 날짜: 2024-05-08 14:12:45
파싱 실패: 0

[accounts_user]
최소 날짜: 2023-03-29 03:44:14
최대 날짜: 2024-05-09 08:31:17
파싱 실패: 0

[polls_questionset]
최소 날짜: 2023-04-28 12:27:23
최대 날짜: 2024-05-07 11:32:30
파싱 실패: 0


## 2-1. accounts_failpaymenthistory 
- [전처리 기준]

1. 전체 row 중복: 없음
2. user_id 기준 중복 결제 기록 존재
    중복 시 최신 created_at 기준 keep='last'
3. 컬럼 유지: id, user_id, productId, phone_type, created_at
4. create_at 데이터 파싱, ns 삭제
5. 결과 저장 위치: clean_vote_ver2/

In [12]:
print("shape:", accounts_failpaymenthistory.shape)
accounts_failpaymenthistory.head()
accounts_failpaymenthistory.dtypes

shape: (163, 5)


id             int64
productId     object
phone_type    object
created_at    object
user_id        int64
dtype: object

In [13]:
accounts_failpaymenthistory["created_at"] = (pd.to_datetime(accounts_failpaymenthistory["created_at"], errors="coerce").dt.floor("s"))

summary = pd.DataFrame([{
    "row_cnt": len(accounts_failpaymenthistory),
    "null_id": accounts_failpaymenthistory["id"].isna().sum(),
    "null_user_id": accounts_failpaymenthistory["user_id"].isna().sum(),
    "null_productId": accounts_failpaymenthistory["productId"].isna().sum(),
    "null_phone_type": accounts_failpaymenthistory["phone_type"].isna().sum(),
    "null_created_at": accounts_failpaymenthistory["created_at"].isna().sum(),
    "non_positive_id": (accounts_failpaymenthistory["id"] <= 0).sum(),
    "non_positive_user_id": (accounts_failpaymenthistory["user_id"] <= 0).sum(),
}])
summary

Unnamed: 0,row_cnt,null_id,null_user_id,null_productId,null_phone_type,null_created_at,non_positive_id,non_positive_user_id
0,163,0,0,107,0,0,0,0


In [14]:
bad_dt = accounts_failpaymenthistory[accounts_failpaymenthistory["created_at"].isna()][["id", "user_id", "productId", "phone_type", "created_at"]]

print("created_at 파싱 실패 건수:", len(bad_dt))
bad_dt.head(20)

print("created_at min:", accounts_failpaymenthistory["created_at"].min())
print("created_at max:", accounts_failpaymenthistory["created_at"].max())

created_at 파싱 실패 건수: 0
created_at min: 2023-05-14 05:49:22
created_at max: 2023-09-17 09:12:53


In [15]:
dup_all_cnt = accounts_failpaymenthistory.duplicated(keep=False).sum()
print("완전 동일 행 중복(keep=False):", dup_all_cnt)

완전 동일 행 중복(keep=False): 0


In [16]:
accounts_failpaymenthistory_clean = (
    accounts_failpaymenthistory.sort_values(["user_id", "created_at"], ascending=[True, True])
    .drop_duplicates(subset=["user_id"], keep="last")
    .reset_index(drop=True)
)

print("원본 행 수:", len(accounts_failpaymenthistory))
print("정리 후 행 수:", len(accounts_failpaymenthistory_clean))
print("제거된 행 수:", len(accounts_failpaymenthistory) - len(accounts_failpaymenthistory_clean))
print("유니크 여부:", accounts_failpaymenthistory_clean["user_id"].is_unique)

원본 행 수: 163
정리 후 행 수: 160
제거된 행 수: 3
유니크 여부: True


In [24]:
out_path = PROCESSED_DIR / "processed_accounts_failpaymenthistory.csv"
accounts_failpaymenthistory_clean[["id", "user_id", "productId", "phone_type", "created_at"]].to_csv(out_path, index=False, encoding="utf-8-sig")

print("saved:", out_path)

saved: d:\나\공부\데이터 분석\03_고급_프로젝트\code\dumps\test\processed_accounts_failpaymenthistory.csv


## 2-2. accounts_paymenthistory
- [전처리 기준]
1. 데이터 수집 기간
    min   2023-05-13 21:28:34
    max   2024-05-08 14:12:45
2. create_at 파싱
    ns 삭제

In [20]:
print("shape:", accounts_paymenthistory.shape)
accounts_paymenthistory.head()
accounts_paymenthistory.dtypes

shape: (95140, 5)


id             int64
productId     object
phone_type    object
created_at    object
user_id        int64
dtype: object

In [21]:
summary = pd.DataFrame([{
    "row_cnt": len(accounts_paymenthistory),
    "null_user_id": accounts_paymenthistory["user_id"].isna().sum(),
    "null_productId": accounts_paymenthistory["productId"].isna().sum(),
    "null_phone_type": accounts_paymenthistory["phone_type"].isna().sum(),
    "null_created_at": accounts_paymenthistory["created_at"].isna().sum(),
    "duplicate_all_rows": accounts_paymenthistory.duplicated(keep=False).sum(),
    "duplicate_user_id": accounts_paymenthistory.duplicated(subset=["user_id"], keep=False).sum(),
}])

summary

Unnamed: 0,row_cnt,null_user_id,null_productId,null_phone_type,null_created_at,duplicate_all_rows,duplicate_user_id
0,95140,0,0,0,0,0,52091


In [22]:
accounts_paymenthistory["user_id"].nunique()

59192

In [23]:
accounts_paymenthistory["created_at"] = (
    pd.to_datetime(accounts_paymenthistory["created_at"], errors="coerce").dt.floor("s")
)

print("created_at dtype:", accounts_paymenthistory["created_at"].dtype)
print("created_at min/max:")
accounts_paymenthistory["created_at"].agg(["min", "max"])

created_at dtype: datetime64[ns]
created_at min/max:


min   2023-05-13 21:28:34
max   2024-05-08 14:12:45
Name: created_at, dtype: datetime64[ns]

In [26]:
out_path = PROCESSED_DIR / "processed_accounts_paymenthistory.csv"

accounts_paymenthistory[["user_id", "productId", "phone_type", "created_at"]].to_csv(
    out_path,
    index=False,
    encoding="utf-8-sig"
)
print("saved:", out_path)

saved: d:\나\공부\데이터 분석\03_고급_프로젝트\code\dumps\test\processed_accounts_paymenthistory.csv


## 2-3. accounts_school
- [전처리 기준]
1. 주소 정제
    담연님 코드 참고('서울'과 '서울 ' 구분 주의)
2. 학생 수가 40명이 넘는 곳과 그렇지 않는 곳을 구분하고자 한다고 말씀주셔서, 추가 컬럼 생성   
    is_active_school: 학생 수가 40 이상이면 True, 미만이면 False 

In [27]:
print("shape:", accounts_school.shape)
accounts_school.head()
accounts_school.dtypes

shape: (5951, 4)


id                int64
address          object
student_count     int64
school_type      object
dtype: object

In [28]:
null_summary = accounts_school.isna().sum().to_frame("null_cnt")
null_summary["null_ratio"] = null_summary["null_cnt"] / len(accounts_school)

null_summary

Unnamed: 0,null_cnt,null_ratio
id,0,0.0
address,0,0.0
student_count,0,0.0
school_type,0,0.0


In [29]:
dup_all_cnt = accounts_school.duplicated(keep=False).sum()
print("완전 동일 행 중복 수:", dup_all_cnt)

완전 동일 행 중복 수: 0


In [30]:
def clean_address(addr):
    if pd.isna(addr) or addr == '-':
        return None

    addr = addr.replace('대한민국 ', '')

    city_map = {
        '서울 ': '서울특별시',
        '경기 ': '경기도',
        '인천 ': '인천광역시',
        '대전 ': '대전광역시',
        '대구 ': '대구광역시',
        '부산 ': '부산광역시',
        '울산 ': '울산광역시',
        '광주 ': '광주광역시',
        '강원 ': '강원도',
        '충남 ': '충청남도',
        '충북 ': '충청북도',
        '경남 ': '경상남도',
        '경북 ': '경상북도',
        '전남 ': '전라남도',
        '전북 ': '전라북도',
        '제주 ': '제주특별자치도'
    }

    for short, long in city_map.items():
        if addr.startswith(short):
            return addr.replace(short, long, 1)

    return addr

accounts_school["address_clean"] = accounts_school["address"].apply(clean_address)

In [31]:
changed_cnt = (accounts_school["address"] != accounts_school["address_clean"]).sum()
print("주소가 변경된 행 수:", changed_cnt)

주소가 변경된 행 수: 31


In [32]:
accounts_school["address_clean"].value_counts().head(10)

address_clean
경기도 화성시        78
경기도 부천시        60
경상남도 김해시       58
경기도 남양주시       58
대구광역시 달서구      56
서울특별시 노원구      54
경기도 성남시 분당구    53
서울특별시 송파구      52
제주특별자치도 제주시    52
광주광역시 북구       52
Name: count, dtype: int64

In [33]:
accounts_school["is_active_school"] = accounts_school["student_count"] >= 40

In [34]:
accounts_school.head(10)

Unnamed: 0,id,address,student_count,school_type,address_clean,is_active_school
0,4,충청북도 충주시,239,H,충청북도 충주시,True
1,5,충청북도 충주시,160,M,충청북도 충주시,True
2,6,충청북도 충주시,200,H,충청북도 충주시,True
3,7,충청북도 충주시,114,H,충청북도 충주시,True
4,8,충청북도 충주시,139,M,충청북도 충주시,True
5,9,충청북도 충주시,3,H,충청북도 충주시,False
6,10,충청북도 충주시,159,M,충청북도 충주시,True
7,11,충청북도 충주시,17,M,충청북도 충주시,False
8,12,충청북도 충주시,154,M,충청북도 충주시,True
9,13,충청북도 충주시,80,H,충청북도 충주시,True


In [35]:
out_path = PROCESSED_DIR / "processed_accounts_school.csv"
df.to_csv(out_path, index=False, encoding="utf-8-sig")

print("saved:", out_path)

saved: d:\나\공부\데이터 분석\03_고급_프로젝트\code\dumps\test\processed_accounts_school.csv


## 2-4. accounts_user_contacts
- [전처리 기준]
1. 유저 아이디 형변환: int -> str
2. 그냥 아이디는 그대로 둠
3. invite user id list는 리스트로 형변환, 빈 리스트는 null로 대체
4. invite cnt 생성

In [36]:
print("shape:", accounts_user_contacts.shape)
accounts_user_contacts.head()
accounts_user_contacts.dtypes

shape: (5063, 4)


id                      int64
contacts_count          int64
invite_user_id_list    object
user_id                 int64
dtype: object

In [37]:
summary = pd.DataFrame([{
    "row_cnt": len(accounts_user_contacts),
    "null_id": accounts_user_contacts["id"].isna().sum(),
    "null_user_id": accounts_user_contacts["user_id"].isna().sum(),
    "null_contacts_count": accounts_user_contacts["contacts_count"].isna().sum(),
    "null_invite_user_id_list": accounts_user_contacts["invite_user_id_list"].isna().sum(),
}])
summary

Unnamed: 0,row_cnt,null_id,null_user_id,null_contacts_count,null_invite_user_id_list
0,5063,0,0,0,0


In [38]:
accounts_user_contacts["user_id"] = accounts_user_contacts["user_id"].astype("Int64").astype("string")

In [39]:
print(accounts_user_contacts["user_id"].dtype)
accounts_user_contacts[["user_id"]].head()

string


Unnamed: 0,user_id
0,1167696
1,863169
2,857205
3,851431
4,855476


In [40]:
neg_contacts = accounts_user_contacts[accounts_user_contacts["contacts_count"] < 0]
print("contacts_count 음수 건수:", len(neg_contacts))
display(neg_contacts.head(20))

contacts_count 음수 건수: 0


Unnamed: 0,id,contacts_count,invite_user_id_list,user_id


In [41]:
def parse_invite_list(x):
    if pd.isna(x):
        return np.nan
    try:
        v = ast.literal_eval(x)
        if isinstance(v, list) and len(v) > 0:
            return [int(i) for i in v if pd.notna(i)]
        else:
            return np.nan
    except Exception:
        return np.nan

accounts_user_contacts["invite_user_id_list"] = accounts_user_contacts["invite_user_id_list"].apply(parse_invite_list)

In [42]:
accounts_user_contacts[["invite_user_id_list"]].head(20)

Unnamed: 0,invite_user_id_list
0,
1,
2,[854615]
3,
4,[849318]
5,[855829]
6,"[849318, 849421]"
7,
8,"[855626, 856042, 837947]"
9,


In [43]:
accounts_user_contacts["invite_cnt"] = accounts_user_contacts["invite_user_id_list"].apply(
    lambda x: len(x) if isinstance(x, list) else 0
)

In [44]:
type(accounts_user_contacts.loc[0, "invite_user_id_list"])

float

In [45]:
accounts_user_contacts["invite_user_id_list"].apply(type).value_counts()

invite_user_id_list
<class 'float'>    3905
<class 'list'>     1158
Name: count, dtype: int64

In [46]:
dup_cols = ["id", "user_id", "contacts_count"]

dup_all_cnt = accounts_user_contacts.duplicated(subset=dup_cols, keep=False).sum()
print("완전 동일 행 중복 수:", dup_all_cnt)

dup_user_cnt = accounts_user_contacts.duplicated(subset=["user_id"], keep=False).sum()
print("user_id 중복 행 수:", dup_user_cnt)

완전 동일 행 중복 수: 0
user_id 중복 행 수: 0


In [47]:
accounts_user_contacts.head(20)

Unnamed: 0,id,contacts_count,invite_user_id_list,user_id,invite_cnt
0,259,30,,1167696,0
1,1756,79,,863169,0
2,13742,21,[854615],857205,1
3,13754,29,,851431,0
4,13756,28,[849318],855476,1
5,13784,31,[855829],1482744,1
6,13798,45,"[849318, 849421]",854615,2
7,13807,28,,854372,0
8,13815,26,"[855626, 856042, 837947]",858674,3
9,21155,28,,855526,0


In [48]:
out_path = PROCESSED_DIR / "processed_accounts_user_contacts.csv"

accounts_user_contacts_out = accounts_user_contacts[["id", "user_id", "contacts_count", "invite_user_id_list", "invite_cnt"]].copy()

accounts_user_contacts_out.to_csv(out_path, index=False, encoding="utf-8-sig")
print("saved:", out_path)

saved: d:\나\공부\데이터 분석\03_고급_프로젝트\code\dumps\test\processed_accounts_user_contacts.csv


## 2-5. accounts_user
- [전처리 기준]
1. id -> user_id 컬럼명 변경, str로 형변환
2. is_superuser, is_staff 컬럼 drop
    gender, group_id 결측 자연스럽게 사라짐
3. friend_id_list 형변환
4. friend_count 컬럼 생성

In [49]:
print("shape:", accounts_user.shape)
accounts_user.dtypes

shape: (677085, 16)


id                      int64
is_superuser            int64
is_staff                int64
gender                 object
point                   int64
friend_id_list         object
is_push_on              int64
created_at             object
block_user_id_list     object
hide_user_id_list      object
ban_status             object
report_count            int64
alarm_count             int64
pending_chat            int64
pending_votes           int64
group_id              float64
dtype: object

In [50]:
accounts_user.head(3)

Unnamed: 0,id,is_superuser,is_staff,gender,point,friend_id_list,is_push_on,created_at,block_user_id_list,hide_user_id_list,ban_status,report_count,alarm_count,pending_chat,pending_votes,group_id
0,831956,1,1,,600,"[1292473, 913158, 1488461, 1064695, 1043565, 1...",0,2023-03-29 03:44:14.047130,[],[],N,0,0,0,0,
1,831962,0,0,F,2248,"[833025, 832642, 982531, 879496, 838541, 83752...",1,2023-03-29 05:18:56.162368,[],[],N,253,40878,5499,110,12.0
2,832151,0,0,M,1519,"[838785, 982531, 882567, 879496, 838541, 83649...",0,2023-03-29 12:56:34.989468,[],[],N,0,37,0,47,1.0


In [51]:
print("제거 전 행 수:", len(accounts_user))

before = len(accounts_user)

accounts_user = accounts_user[(accounts_user["is_staff"] == 0) & (accounts_user["is_superuser"] == 0)].copy()

after = len(accounts_user)

print("제거 후 행 수:", after)
print("제거된 행 수:", before - after)

제거 전 행 수: 677085
제거 후 행 수: 677081
제거된 행 수: 4


In [52]:
print("group_id 결측 제거 전 행 수:", len(accounts_user))
before = len(accounts_user)

accounts_user = accounts_user[accounts_user["group_id"].notna()].copy()

after = len(accounts_user)
print("group_id 결측 제거 후 행 수:", after)
print("제거된 행 수:", before - after)

group_id 결측 제거 전 행 수: 677081
group_id 결측 제거 후 행 수: 677080
제거된 행 수: 1


In [54]:
accounts_user.info()

<class 'pandas.core.frame.DataFrame'>
Index: 677080 entries, 1 to 677084
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  677080 non-null  int64  
 1   is_superuser        677080 non-null  int64  
 2   is_staff            677080 non-null  int64  
 3   gender              677080 non-null  object 
 4   point               677080 non-null  int64  
 5   friend_id_list      677080 non-null  object 
 6   is_push_on          677080 non-null  int64  
 7   created_at          677080 non-null  object 
 8   block_user_id_list  677080 non-null  object 
 9   hide_user_id_list   677080 non-null  object 
 10  ban_status          677080 non-null  object 
 11  report_count        677080 non-null  int64  
 12  alarm_count         677080 non-null  int64  
 13  pending_chat        677080 non-null  int64  
 14  pending_votes       677080 non-null  int64  
 15  group_id            677080 non-null  fl

In [55]:
null_summary = pd.DataFrame({
    "null_cnt": accounts_user.isna().sum()
}).sort_values("null_cnt", ascending=False)

null_summary

Unnamed: 0,null_cnt
id,0
is_superuser,0
is_staff,0
gender,0
point,0
friend_id_list,0
is_push_on,0
created_at,0
block_user_id_list,0
hide_user_id_list,0


In [56]:
accounts_user = accounts_user.drop(columns=["is_staff", "is_superuser"], errors="ignore")

print("삭제 후 컬럼:", accounts_user.columns.tolist())

삭제 후 컬럼: ['id', 'gender', 'point', 'friend_id_list', 'is_push_on', 'created_at', 'block_user_id_list', 'hide_user_id_list', 'ban_status', 'report_count', 'alarm_count', 'pending_chat', 'pending_votes', 'group_id']


In [57]:
dup_all_cnt = accounts_user.duplicated(keep=False).sum()
print("완전 동일 행 중복 수:", dup_all_cnt)

if dup_all_cnt > 0:
    display(accounts_user[accounts_user.duplicated(keep=False)].head(20))

완전 동일 행 중복 수: 0


In [58]:
accounts_user = accounts_user.rename(columns={"id": "user_id"})
accounts_user["user_id"] = accounts_user["user_id"].astype("Int64").astype("string")  # 결측 안전

In [59]:
accounts_user["created_at"] = pd.to_datetime(accounts_user["created_at"], errors="coerce").dt.floor("s")

In [60]:
accounts_user['created_at'] = accounts_user['created_at'] + timedelta(hours=9)

In [61]:
def parse_listlike(x):

    if pd.isna(x):
        return np.nan
    s = str(x).strip()

    if s == "" or s == "[]" or s.lower() == "nan":
        return np.nan

    try:
        v = ast.literal_eval(s)
        if isinstance(v, list):
            out = []
            for i in v:
                if i is None or (isinstance(i, float) and np.isnan(i)):
                    continue
                try:
                    out.append(int(i))
                except Exception:
                    out.append(i)
            return out if len(out) > 0 else np.nan
        return np.nan
    except Exception:
        return np.nan

In [62]:
if "friend_id_list" in accounts_user.columns:
    accounts_user["friend_id_list"] = accounts_user["friend_id_list"].apply(parse_listlike)
    accounts_user["friend_count"] = accounts_user["friend_id_list"].apply(lambda x: len(x) if isinstance(x, list) else 0)

In [63]:
accounts_user.head(10)

Unnamed: 0,user_id,gender,point,friend_id_list,is_push_on,created_at,block_user_id_list,hide_user_id_list,ban_status,report_count,alarm_count,pending_chat,pending_votes,group_id,friend_count
1,831962,F,2248,"[833025, 832642, 982531, 879496, 838541, 83752...",1,2023-03-29 14:18:56,[],[],N,253,40878,5499,110,12.0,43
2,832151,M,1519,"[838785, 982531, 882567, 879496, 838541, 83649...",0,2023-03-29 21:56:34,[],[],N,0,37,0,47,1.0,51
3,832340,F,57,"[841345, 982531, 838785, 963714, 882567, 83252...",1,2023-03-29 21:56:35,[],[],N,0,19,0,21,1.0,57
4,832520,M,1039,"[874050, 849763, 874212, 844297, 838541, 84004...",0,2023-03-29 21:56:35,[],[],N,0,29,0,15,12.0,18
5,832614,M,1048,"[838541, 833041, 832151, 837806, 1437874, 1142...",1,2023-03-29 21:56:35,[],[],N,0,28,0,14,12.0,21
6,832740,M,1094,"[874050, 849763, 832894, 832614, 837806, 83304...",0,2023-03-29 22:20:46,[],[],NB,0,26,0,3,12.0,15
7,832857,M,1439,"[874050, 832894, 832740, 832614, 837806, 83304...",1,2023-03-29 22:20:46,[],[],N,0,28,0,16,12.0,16
8,832894,M,1535,"[982531, 879496, 833041, 832151, 1082907, 1426...",1,2023-03-29 22:20:46,[],[],N,0,36,0,24,1.0,34
9,832920,F,213,"[982531, 882567, 836496, 833041, 836498, 83215...",1,2023-03-29 22:20:46,[],[],N,0,35,0,18,1.0,26
10,832986,M,305,"[838785, 1426466, 874050, 832740, 832894, 8326...",1,2023-03-29 22:20:46,[],[],N,0,26,0,19,12.0,18


In [64]:
print("gender 결측:", accounts_user["gender"].isna().sum())
print("group_id 결측:", accounts_user["group_id"].isna().sum())

gender 결측: 0
group_id 결측: 0


In [65]:
out_path = PROCESSED_DIR / "processed_accounts_user.csv"
accounts_user.to_csv(out_path, index=False, encoding="utf-8-sig")
print("saved:", out_path)

saved: d:\나\공부\데이터 분석\03_고급_프로젝트\code\dumps\test\processed_accounts_user.csv


## 2-6. polls_questionset
- [전처리 기준]
1. create at 시간 파싱
    ns 제거
2. create > open 이상치 679건 제거

In [66]:
print("shape:", polls_questionset.shape)
polls_questionset.head()
polls_questionset.dtypes

shape: (158384, 6)


id                         int64
question_piece_id_list    object
opening_time              object
status                    object
created_at                object
user_id                    int64
dtype: object

In [67]:
for col in ["created_at", "opening_time"]:
    polls_questionset[col] = (
        pd.to_datetime(polls_questionset[col], errors="coerce")
        .dt.floor("s")
    )

In [68]:
polls_questionset['created_at'] = polls_questionset['created_at'] + timedelta(hours=9)

In [69]:
print("created_at null:", polls_questionset["created_at"].isna().sum())
print("opening_time null:", polls_questionset["opening_time"].isna().sum())

created_at null: 0
opening_time null: 0


In [70]:
invalid_time = polls_questionset["created_at"] > polls_questionset["opening_time"]

print("created_at > opening_time 건수:", invalid_time.sum())

polls_questionset_clean = polls_questionset.loc[~invalid_time].reset_index(drop=True)

print("원본 행 수:", len(polls_questionset))
print("정리 후 행 수:", len(polls_questionset_clean))
print("삭제된 행 수:", len(polls_questionset) - len(polls_questionset_clean))

created_at > opening_time 건수: 158314
원본 행 수: 158384
정리 후 행 수: 70
삭제된 행 수: 158314


In [71]:
out_path = PROCESSED_DIR / "processed_questionset_clean.csv"
polls_questionset_clean.to_csv(out_path, index=False, encoding="utf-8-sig")

print("saved:", out_path)

saved: d:\나\공부\데이터 분석\03_고급_프로젝트\code\dumps\test\processed_questionset_clean.csv


# 3. Hackle 전처리

## 3-1. hackle_merge(1)
[user_id 중복값, 결측치 처리]
- hackle_merge: hackle_events + hackle_properties

In [None]:
hackle_events = pd.read_csv(HACKLE_DIR / 'hackle_events.csv')
hackle_properties = pd.read_csv(HACKLE_DIR / 'hackle_properties.csv')
accounts_user = pd.read_csv(PROCESSED_DIR / 'processed_accounts_user.csv')

In [7]:
accounts_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 677080 entries, 0 to 677079
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   user_id             677080 non-null  int64  
 1   gender              677080 non-null  object 
 2   point               677080 non-null  int64  
 3   friend_id_list      674210 non-null  object 
 4   is_push_on          677080 non-null  int64  
 5   created_at          677080 non-null  object 
 6   block_user_id_list  677080 non-null  object 
 7   hide_user_id_list   677080 non-null  object 
 8   ban_status          677080 non-null  object 
 9   report_count        677080 non-null  int64  
 10  alarm_count         677080 non-null  int64  
 11  pending_chat        677080 non-null  int64  
 12  pending_votes       677080 non-null  int64  
 13  group_id            677080 non-null  float64
 14  friend_count        677080 non-null  int64  
dtypes: float64(1), int64(8), object(6)

In [3]:
accounts_user['user_id'] = accounts_user['user_id'].astype(str)

In [22]:
accounts_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 677080 entries, 0 to 677079
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   user_id             677080 non-null  object 
 1   gender              677080 non-null  object 
 2   point               677080 non-null  int64  
 3   friend_id_list      674210 non-null  object 
 4   is_push_on          677080 non-null  int64  
 5   created_at          677080 non-null  object 
 6   block_user_id_list  677080 non-null  object 
 7   hide_user_id_list   677080 non-null  object 
 8   ban_status          677080 non-null  object 
 9   report_count        677080 non-null  int64  
 10  alarm_count         677080 non-null  int64  
 11  pending_chat        677080 non-null  int64  
 12  pending_votes       677080 non-null  int64  
 13  group_id            677080 non-null  float64
 14  friend_count        677080 non-null  int64  
dtypes: float64(1), int64(7), object(7)

In [4]:
# events, properties merge
hackle_merge = pd.merge(hackle_properties, hackle_events, on='session_id', how='left')

In [25]:
# 내부데이터의 user list 추출해서 hackle event에 필터링
users = accounts_user['user_id'].unique()
filtered_hackle_merge = hackle_merge[hackle_merge['user_id'].isin(users)] # 필터링된 hackle

In [None]:
# 담연님 코드 참고했슴다!!!
# 헤헤 감사합니당

# 잘 필터링 됐는지 확인용 (1)
filtered_session_duplicated_user = filtered_hackle_merge.groupby(by='session_id').agg(user_count=('user_id', 'nunique')).reset_index().sort_values(by='user_count', ascending=False)
filtered_sdu_cond1 = filtered_session_duplicated_user['user_count'] > 1
filtered_hackle_merge_duplicated_user = filtered_session_duplicated_user[filtered_sdu_cond1]
print(f"필터링된 해클 이벤트에서 한 세션당 여러개의 유저아이디를 가진 세션 개수: {len(filtered_hackle_merge_duplicated_user)}")

필터링된 해클 이벤트에서 한 세션당 여러개의 유저아이디를 가진 세션 개수: 52


In [27]:
# 잘 필터링 됐는지 확인용 (2)
filtered_hackle_merge_duplicated_user.head(10)

Unnamed: 0,session_id,user_count
947,040914e1-61ac-40ef-b76a-718066d880dc,5
65740,B941F9F9-CF53-4DAE-A204-75E666B5D277,3
55479,9803ebdb-0c89-4d93-b248-71fd1982e550,2
22852,3i1Ja5p3EOM5o7b1UQGL9MHchHp2,2
56146,9B0ABDE8-D744-4462-AF6D-876B2DE45CEA,2
34366,5c89abcf-a789-4944-8c26-da288ee48d2a,2
386,01D2BF59-D959-4381-8F2B-C979C003FF3E,2
98413,JZWG739A2gVOXuDZwx8l3MEF9zN2,2
71453,CF5F77A4-2C29-456F-9E23-68E403D0B960,2
106226,MFiut9juQMOsatlqKSWzWIvCsTw1,2


In [None]:
# 아???
# 필터링하면서 자동으로 NULL이 사라졌구나 헐
filtered_hackle_merge[filtered_hackle_merge['user_id'].isnull()]

Unnamed: 0,id_x,session_id,user_id,language,osname,osversion,versionname,device_id,event_id,event_datetime,event_key,id_y,item_name,page_name,friend_count,votes_count,heart_balance,question_id


In [5]:
# 그냥,,, 따로따로 구해서 concat하는 거로,,,,,ㅠ
# NULL값들만 concat해야 함 
# 안그럼 중복값들 파티 ~!!
null_session_id = hackle_merge[hackle_merge['user_id'].isnull()]['session_id'].unique()

# null과 user_id를 가지고 있는 session_id를 가진 행을 모두 빼줌
both_user_hackle = hackle_merge[hackle_merge['session_id'].isin(null_session_id)]

In [6]:
# ession_id 별 user_id의 종류 확인 -> set으로 중복값 없음
both_user_kind = both_user_hackle.groupby('session_id')['user_id'].apply(set).reset_index()
both_user_kind['user_id'] = both_user_kind['user_id'].apply(list)
both_user_kind

Unnamed: 0,session_id,user_id
0,000137bc-80de-4bb5-b61d-df7f217a4501,[nan]
1,00039F52-AC9A-4474-A645-DFE225C56753,[nan]
2,0004482B-03B0-4638-8C5A-C2E49C085C7F,[nan]
3,000638C3-2BC2-4068-A482-861B526B2651,"[860893, nan]"
4,0009B464-970A-4030-9990-C28F90D4D0D8,[nan]
...,...,...
78192,zzqcZU6eYRQZtZzDABeLgNFmjTj2,"[1419700, nan]"
78193,zzwY14BTCLUB13lFw8g7FETq1Wn1,"[1434640, nan]"
78194,zzwY2oU4oAZcU7Y1Lr2Wqd8wGEo2,"[1017419, nan]"
78195,zzwdcJbazOPmYWLKoUFQKhvZTRu1,"[1122713, nan]"


In [7]:
# kind의 개수 확인
both_user_kind['kind_counts'] = both_user_kind['user_id'].apply(len)
both_user_kind

Unnamed: 0,session_id,user_id,kind_counts
0,000137bc-80de-4bb5-b61d-df7f217a4501,[nan],1
1,00039F52-AC9A-4474-A645-DFE225C56753,[nan],1
2,0004482B-03B0-4638-8C5A-C2E49C085C7F,[nan],1
3,000638C3-2BC2-4068-A482-861B526B2651,"[860893, nan]",2
4,0009B464-970A-4030-9990-C28F90D4D0D8,[nan],1
...,...,...,...
78192,zzqcZU6eYRQZtZzDABeLgNFmjTj2,"[1419700, nan]",2
78193,zzwY14BTCLUB13lFw8g7FETq1Wn1,"[1434640, nan]",2
78194,zzwY2oU4oAZcU7Y1Lr2Wqd8wGEo2,"[1017419, nan]",2
78195,zzwdcJbazOPmYWLKoUFQKhvZTRu1,"[1122713, nan]",2


In [None]:
both_user_kind[both_user_kind['kind_counts'] > 3]

Unnamed: 0,session_id,user_id,kind_counts
451,040914e1-61ac-40ef-b76a-718066d880dc,"[nan, 838541, 1577954, 1577938, 849763, 1577930]",6
1281,0CC53548-EB9A-4AE5-A72E-5D765B3BB1F9,"[nan, 1187499, jj0KB3IjtoOfKRXGhKDOdUIsM6G3, 1...",4
5170,23235867-49A8-4240-8B34-6342357440F9,"[nan, 967617, 1579185, 1572211]",4
20442,87046FBA-B84B-4124-ADE9-EA03DF95E7B8,"[89BQd6PzUzOkAQFKAHJvMFAlQs22, nan, 1327236, 1...",4
27287,B941F9F9-CF53-4DAE-A204-75E666B5D277,"[nan, 1579057, 1579831, 947584]",4


In [8]:
only_null_session_id = both_user_kind[both_user_kind['kind_counts'] <= 1]['session_id'].unique()
two_kinds_session_id = both_user_kind[both_user_kind['kind_counts'] > 1]['session_id'].unique()

In [9]:
# null만 있는 hackle_merge
only_null = hackle_merge[hackle_merge['session_id'].isin(only_null_session_id)]

# 나머지 hackle_merge
the_rest = hackle_merge[~hackle_merge['session_id'].isin(only_null_session_id)]

In [10]:
# 내부데이터의 user list 추출해서 필터링
users = accounts_user['user_id'].unique()
filtered_the_rest = the_rest[the_rest['user_id'].isin(users)] # 필터링된 나머지 hackle

In [None]:
# 잘 필터링 됐는지 확인용 (1)
filtered_the_rest_user = filtered_the_rest.groupby(by='session_id').agg(user_count=('user_id', 'nunique')).reset_index().sort_values(by='user_count', ascending=False)
cond1 = filtered_the_rest_user['user_count'] > 1
filtered_the_rest_merge = filtered_the_rest_user[cond1]
print(f"필터링된 해클 이벤트에서 한 세션당 여러개의 유저아이디를 가진 세션 개수: {len(filtered_the_rest_merge)}")

필터링된 해클 이벤트에서 한 세션당 여러개의 유저아이디를 가진 세션 개수: 52


In [None]:
# 잘 필터링 됐는지 확인용 (2)
filtered_the_rest.head(10)

Unnamed: 0,session_id,user_count
947,040914e1-61ac-40ef-b76a-718066d880dc,5
65740,B941F9F9-CF53-4DAE-A204-75E666B5D277,3
55479,9803ebdb-0c89-4d93-b248-71fd1982e550,2
22852,3i1Ja5p3EOM5o7b1UQGL9MHchHp2,2
56146,9B0ABDE8-D744-4462-AF6D-876B2DE45CEA,2
34366,5c89abcf-a789-4944-8c26-da288ee48d2a,2
386,01D2BF59-D959-4381-8F2B-C979C003FF3E,2
98413,JZWG739A2gVOXuDZwx8l3MEF9zN2,2
71453,CF5F77A4-2C29-456F-9E23-68E403D0B960,2
106226,MFiut9juQMOsatlqKSWzWIvCsTw1,2


In [17]:
# 필터링한 나머지에는 null 없음
filtered_the_rest[filtered_the_rest['user_id'].isnull()]

Unnamed: 0,id_x,session_id,user_id,language,osname,osversion,versionname,device_id,event_id,event_datetime,event_key,id_y,item_name,page_name,friend_count,votes_count,heart_balance,question_id


In [None]:
# 나머지 concat으로 합치기
# 최종본

columns = filtered_the_rest.columns
columns = columns.drop(['id_x', 'id_y'])

final = pd.concat([filtered_the_rest[columns], only_null[columns]])

KeyError: 'id'

In [19]:
# 잘 합쳐졌는지 중복값으로 확인
final[final['session_id'].duplicated(keep=False)].sort_values(by=['session_id', 'user_id']).head(10)

Unnamed: 0,session_id,user_id,language,osname,osversion,versionname,device_id,event_id,event_datetime,event_key,item_name,page_name,friend_count,votes_count,heart_balance,question_id
24312479,000137bc-80de-4bb5-b61d-df7f217a4501,,ko,Android,13,2.0.3,000137bc-80de-4bb5-b61d-df7f217a4501,584085d4-95a9-47e4-a00b-070c433d0996,2023-07-21 14:23:33,launch_app,,,,,,
24312480,000137bc-80de-4bb5-b61d-df7f217a4501,,ko,Android,13,2.0.3,000137bc-80de-4bb5-b61d-df7f217a4501,ac63bbaa-a96e-49ce-8980-878fa1cb50de,2023-07-21 14:23:33,$session_start,,,,,,
3471045,00025EE1-BA46-4853-8FDD-B991FABA328F,1555160.0,ko-KR,iOS,16.5.1,2.0.5,00025EE1-BA46-4853-8FDD-B991FABA328F,006ae41e-52b2-4f81-aec8-16917d1bd364,2023-07-30 19:39:47,click_question_ask,,home,52.0,68.0,210.0,
3471046,00025EE1-BA46-4853-8FDD-B991FABA328F,1555160.0,ko-KR,iOS,16.5.1,2.0.5,00025EE1-BA46-4853-8FDD-B991FABA328F,072338f9-778b-48f6-b74d-97793fe4bf4c,2023-07-30 19:39:49,view_lab_tap,,,52.0,68.0,210.0,
3471047,00025EE1-BA46-4853-8FDD-B991FABA328F,1555160.0,ko-KR,iOS,16.5.1,2.0.5,00025EE1-BA46-4853-8FDD-B991FABA328F,09a2e4d8-d521-4b52-b325-631a3d3a73d9,2023-07-30 19:39:49,view_profile_tap,,,52.0,68.0,210.0,
3471048,00025EE1-BA46-4853-8FDD-B991FABA328F,1555160.0,ko-KR,iOS,16.5.1,2.0.5,00025EE1-BA46-4853-8FDD-B991FABA328F,0b7e4def-fb8d-4435-be8e-ad1124666734,2023-07-22 23:30:31,launch_app,,,52.0,68.0,210.0,
3471049,00025EE1-BA46-4853-8FDD-B991FABA328F,1555160.0,ko-KR,iOS,16.5.1,2.0.5,00025EE1-BA46-4853-8FDD-B991FABA328F,0ef36ccd-80fa-41c6-8b7c-607d61d21317,2023-07-30 19:40:09,click_bottom_navigation_profile,,,52.0,68.0,210.0,
3471050,00025EE1-BA46-4853-8FDD-B991FABA328F,1555160.0,ko-KR,iOS,16.5.1,2.0.5,00025EE1-BA46-4853-8FDD-B991FABA328F,11c3100d-0936-4100-96da-de169d29613f,2023-07-30 19:40:02,view_questions_tap,,,52.0,68.0,210.0,
3471051,00025EE1-BA46-4853-8FDD-B991FABA328F,1555160.0,ko-KR,iOS,16.5.1,2.0.5,00025EE1-BA46-4853-8FDD-B991FABA328F,127f5545-c25b-4330-81d5-c42106d6bbd0,2023-07-30 19:40:19,click_appbar_alarm_center,,,52.0,68.0,210.0,
3471052,00025EE1-BA46-4853-8FDD-B991FABA328F,1555160.0,ko-KR,iOS,16.5.1,2.0.5,00025EE1-BA46-4853-8FDD-B991FABA328F,150f6dd1-c04f-4190-86b4-a4714d2e0e2c,2023-07-30 19:40:09,view_timeline_tap,,,52.0,68.0,210.0,


In [21]:
test = final.groupby('session_id')['user_id'].apply(set).reset_index()
test['user_id'] = test['user_id'].apply(list)
test['kind_counts'] = test['user_id'].apply(len)
test

Unnamed: 0,session_id,user_id,kind_counts
0,000137bc-80de-4bb5-b61d-df7f217a4501,[nan],1
1,00025EE1-BA46-4853-8FDD-B991FABA328F,[1555160],1
2,00039F52-AC9A-4474-A645-DFE225C56753,[nan],1
3,0004482B-03B0-4638-8C5A-C2E49C085C7F,[nan],1
4,0004F43C-3A7A-4DE4-A02B-55AFDF07E9AD,[1189864],1
...,...,...,...
240089,zzwY14BTCLUB13lFw8g7FETq1Wn1,[1434640],1
240090,zzwY2oU4oAZcU7Y1Lr2Wqd8wGEo2,[1017419],1
240091,zzwdcJbazOPmYWLKoUFQKhvZTRu1,[1122713],1
240092,zzx2YLx97obWfJyKjGyXzmqQ97k2,[1313497],1


In [None]:
final.to_csv(PROCESSED_DIR / 'processed_hackle_merge.csv', index=False)

## 3-2. hackle_merge(2)
[session_id 중복값 & 특정 이벤트 키, 하트 이상치 드롭]

- session_id 내에서 user_id가 event_datetime(초단위), event_key가 겹치는걸 먼저 drop
- button, click_appbar_setting 전처리 대상
- 누적구매기록 최고값 이상을 가지고있는 user_id는 drop

In [None]:
hackle_merge = pd.read_csv(PROCESSED_DIR / 'processed_hackle_merge.csv')

  hackle_merge = pd.read_csv('./dumps/processed_csv/processed_hackle_merge.csv')


In [3]:
hackle_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20203847 entries, 0 to 20203846
Data columns (total 16 columns):
 #   Column          Dtype  
---  ------          -----  
 0   session_id      object 
 1   user_id         float64
 2   language        object 
 3   osname          object 
 4   osversion       object 
 5   versionname     object 
 6   device_id       object 
 7   event_id        object 
 8   event_datetime  object 
 9   event_key       object 
 10  item_name       object 
 11  page_name       object 
 12  friend_count    float64
 13  votes_count     float64
 14  heart_balance   float64
 15  question_id     float64
dtypes: float64(5), object(11)
memory usage: 2.4+ GB


In [None]:
# 데이터 타입 변경
hackle_merge['user_id'] = hackle_merge['user_id'].astype('Int64').astype(str)
hackle_merge['event_datetime'] = pd.to_datetime(hackle_merge['event_datetime']).dt.floor('s')

In [None]:
# session_id 내에서 user_id가 event_datetime(초단위), event_key가 겹치는걸 먼저 drop
hackle_merge.drop_duplicates(subset=['session_id', 'user_id', 'event_datetime', 'event_key'], keep='last', inplace=True)

In [11]:
# button, click_appbar_setting 전처리 대상
remove_list = ['button',
               'click_appbar_setting']
hackle_merge = hackle_merge[~hackle_merge['event_key'].isin(remove_list)]

In [12]:
#누적 구매기록 최고값 이상을 가지고있는 user_id는 drop
hackle_merge = hackle_merge[hackle_merge['heart_balance'] < 75977]

In [None]:
# 잘 전처리됐는지 테스트
hackle_merge[hackle_merge['heart_balance'] >= 75977]
hackle_merge[hackle_merge['event_key'] == 'click_appbar_setting']

Unnamed: 0,session_id,user_id,language,osname,osversion,versionname,device_id,event_id,event_datetime,event_key,item_name,page_name,friend_count,votes_count,heart_balance,question_id


In [None]:
hackle_merge.to_csv(PROCESSED_DIR / 'processed_hackle_merge.csv', index=False)

## 3-3. hackle_merge(3) 
[52개 세션 아이디 목록이 내부데이터에서도 이상치로 나옴 -> 내부데이터, hackle 모두 drop]

In [3]:
# session 당 user_id 여러개였던 52개 세션 아이디 목록
remove_users = ['947584', '974697', '1579712', '1578128', '1168569', '1577938',
       '1265729', '1308656', '1579521', '1192757', '1577954', '1579167',
       '1077681', '1572689', '1199005', '859876', '957661', '1184239',
       '1216565', '1279934', '1579057', '1579544', '1273256', '1218078',
       '1437875', '1200087', '1579212', '1211977', '1187499', '1572309',
       '1579609', '1199656', '1382387', '849763', '1006061', '1579564',
       '1390860', '1129431', '1579831', '1144883', '1491385', '1472888',
       '1526565', '1472203', '1298566', '1534869', '1391728', '1579123',
       '1493619', '1221007', '1086267', '1579522', '1123260', '1579185',
       '1578476', '1411626', '1579543', '1579245', '1351421', '1049018',
       '1579184', '1120312', '1066410', '1571870', '1562323', '1327236',
       '983082', '1577930', '1105237', '1577350', '1456760', '1579864',
       '1579780', '1579812', '838541', '1579339', '1579805', '1557061',
       '1166812', '1300433', '1579787', '1579612', '967618', '1129084',
       '1172318', '1120726', '1567305', '1324867', '1145452', '1356760',
       '967617', '1579680', '1579550', '1559744', '1573311', '1482081',
       '1061143', '1425325', '1579837', '1231394', '1040098', '1579246']

accounts_user = accounts_user[~accounts_user['user_id'].isin(remove_users)]
accounts_user.to_csv('./processed_accounts_user.csv', index=False)

In [None]:
hackle_merge = hackle_merge[~hackle_merge['user_id'].isin(remove_users)]
hackle_merge.to_csv(PROCESSED_DIR / 'processed_hackle_merge.csv', index=False)

# 4. Votes 추가 전처리

## 4-1. accounts_group(1) 
(user, group)   
(중복된 group_id 변경)

In [None]:
accounts_user = pd.read_csv(PROCESSED_DIR / 'processed_accounts_user.csv')
accounts_group = pd.read_csv(ORIG_DIR / 'accounts_group.csv')

In [37]:
accounts_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 676978 entries, 0 to 676977
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   user_id             676978 non-null  int64  
 1   gender              676978 non-null  object 
 2   point               676978 non-null  int64  
 3   friend_id_list      674111 non-null  object 
 4   is_push_on          676978 non-null  int64  
 5   created_at          676978 non-null  object 
 6   block_user_id_list  676978 non-null  object 
 7   hide_user_id_list   676978 non-null  object 
 8   ban_status          676978 non-null  object 
 9   report_count        676978 non-null  int64  
 10  alarm_count         676978 non-null  int64  
 11  pending_chat        676978 non-null  int64  
 12  pending_votes       676978 non-null  int64  
 13  group_id            676978 non-null  float64
 14  friend_count        676978 non-null  int64  
dtypes: float64(1), int64(8), object(6)

In [38]:
accounts_user.head(2)

Unnamed: 0,user_id,gender,point,friend_id_list,is_push_on,created_at,block_user_id_list,hide_user_id_list,ban_status,report_count,alarm_count,pending_chat,pending_votes,group_id,friend_count
0,831962,F,2248,"[833025, 832642, 982531, 879496, 838541, 83752...",1,2023-03-29 14:18:56,[],[],N,253,40878,5499,110,12.0,43
1,832151,M,1519,"[838785, 982531, 882567, 879496, 838541, 83649...",0,2023-03-29 21:56:34,[],[],N,0,37,0,47,1.0,51


In [39]:
accounts_group.rename(columns={'id': 'group_id'}, inplace=True)
accounts_group['group_id'] = accounts_group['group_id'].astype(int).astype(str)

accounts_user['user_id'] = accounts_user['user_id'].astype(str)
accounts_user['group_id'] = accounts_user['group_id'].astype(int).astype(str)

In [40]:
accounts_group.sort_values(by=['school_id', 'group_id'], inplace=True)
dup_group_before = accounts_group[accounts_group.duplicated(subset=['school_id', 'grade', 'class_num'], keep=False)]
dup_group_before

Unnamed: 0,group_id,grade,class_num,school_id
58686,58711,1,4,4383
75735,75762,3,1,4383
76437,76464,1,4,4383
79779,79806,3,1,4383
58699,58724,1,1,5899
77093,77120,1,1,5899
58902,58927,3,1,5900
73208,73234,3,1,5900
80781,80809,2,20,5900
82087,82115,2,20,5900


In [41]:
# 바꿔야 하는 애들 편하게 보기 위해 merge로 null로 구분함
dup_group_after = dup_group_before.drop_duplicates(subset=['school_id', 'grade', 'class_num'], keep='first')
dup_group_final = pd.merge(dup_group_before, dup_group_after, on='group_id', how='left')
dup_group_final

Unnamed: 0,group_id,grade_x,class_num_x,school_id_x,grade_y,class_num_y,school_id_y
0,58711,1,4,4383,1.0,4.0,4383.0
1,75762,3,1,4383,3.0,1.0,4383.0
2,76464,1,4,4383,,,
3,79806,3,1,4383,,,
4,58724,1,1,5899,1.0,1.0,5899.0
5,77120,1,1,5899,,,
6,58927,3,1,5900,3.0,1.0,5900.0
7,73234,3,1,5900,,,
8,80809,2,20,5900,2.0,20.0,5900.0
9,82115,2,20,5900,,,


In [42]:
map_dup_group = {
    '76464': '58711',
    '79806': '75762',
    '77120': '58724',
    '73234': '58927',
    '82115': '80809'
}
accounts_group.sort_values(by=['school_id', 'group_id'], inplace=True)
accounts_group.drop_duplicates(subset=['school_id', 'grade', 'class_num'], keep='first', inplace=True)

# 특정 값을 바꿔주고 싶을 때 replace 사용
# map으로 하면 특정 값말고 다른 정상적인 값들은 모두 날라감
accounts_user['group_id'] = accounts_user['group_id'].replace(map_dup_group)

In [43]:
accounts_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 676978 entries, 0 to 676977
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   user_id             676978 non-null  object
 1   gender              676978 non-null  object
 2   point               676978 non-null  int64 
 3   friend_id_list      674111 non-null  object
 4   is_push_on          676978 non-null  int64 
 5   created_at          676978 non-null  object
 6   block_user_id_list  676978 non-null  object
 7   hide_user_id_list   676978 non-null  object
 8   ban_status          676978 non-null  object
 9   report_count        676978 non-null  int64 
 10  alarm_count         676978 non-null  int64 
 11  pending_chat        676978 non-null  int64 
 12  pending_votes       676978 non-null  int64 
 13  group_id            676978 non-null  object
 14  friend_count        676978 non-null  int64 
dtypes: int64(7), object(8)
memory usage: 77.5+ MB


In [45]:
accounts_user.head(3)

Unnamed: 0,user_id,gender,point,friend_id_list,is_push_on,created_at,block_user_id_list,hide_user_id_list,ban_status,report_count,alarm_count,pending_chat,pending_votes,group_id,friend_count
0,831962,F,2248,"[833025, 832642, 982531, 879496, 838541, 83752...",1,2023-03-29 14:18:56,[],[],N,253,40878,5499,110,12,43
1,832151,M,1519,"[838785, 982531, 882567, 879496, 838541, 83649...",0,2023-03-29 21:56:34,[],[],N,0,37,0,47,1,51
2,832340,F,57,"[841345, 982531, 838785, 963714, 882567, 83252...",1,2023-03-29 21:56:35,[],[],N,0,19,0,21,1,57


In [47]:
accounts_user[accounts_user['group_id'] == '82115']

Unnamed: 0,user_id,gender,point,friend_id_list,is_push_on,created_at,block_user_id_list,hide_user_id_list,ban_status,report_count,alarm_count,pending_chat,pending_votes,group_id,friend_count


In [48]:
accounts_user[accounts_user['group_id'] == '80809']

Unnamed: 0,user_id,gender,point,friend_id_list,is_push_on,created_at,block_user_id_list,hide_user_id_list,ban_status,report_count,alarm_count,pending_chat,pending_votes,group_id,friend_count
366042,1241840,F,595,"[1555843, 1550725, 1367431, 1538824, 1567368, ...",1,2023-05-14 10:10:52,[],[],N,0,0,0,17,80809,57
631036,1534664,M,300,"[1531583, 1549229, 1552046, 1574787]",0,2023-05-28 16:20:15,[],[],W,0,0,0,0,80809,4
647982,1552462,F,2732,"[1552502, 1552463, 1549229, 1555843, 1531583, ...",0,2023-05-30 20:43:34,[],[],N,0,1,0,246,80809,14
648021,1552502,F,9133,"[1555843, 1574020, 1550725, 1574788, 1367431, ...",1,2023-05-30 20:49:49,[],[],N,0,2,0,4,80809,57
660218,1565387,M,3441,"[1538531, 1555843, 1543237, 1561829, 1128933, ...",1,2023-06-03 17:40:16,[],[],N,0,4,0,115,80809,38
662021,1567368,F,265,"[1555843, 1574020, 1550725, 1367431, 1538824, ...",1,2023-06-04 14:27:11,[],[],N,0,0,0,1,80809,69
665430,1571151,F,664,"[1555843, 1574020, 1574788, 1538824, 1567368, ...",1,2023-06-07 01:01:20,[],[],N,0,0,0,20,80809,27
668019,1573988,M,192,"[1555843, 1567368, 1565387, 1571151, 1241840, ...",1,2023-06-10 23:36:37,[],[],N,0,2,0,0,80809,23
668068,1574040,F,444,"[1555843, 1574020, 1574788, 1560580, 1367431, ...",1,2023-06-11 00:39:31,[],[],N,0,0,0,0,80809,26
668757,1574788,F,4131,"[1555843, 1574020, 1567368, 1538824, 1576208, ...",1,2023-06-12 08:39:41,[],[],N,0,17,0,46,80809,28


In [49]:
accounts_user.to_csv('./processed_accounts_user.csv', index=False)

In [None]:
accounts_group = accounts_group[accounts_group['grade'] <= 3]
accounts_group.to_csv(PROCESSED_DIR / 'processed_accounts_group.csv', index=False)

## 4-2. accounts_group(2)   
(group)   
(활성화 group 유무 컬럼 추가)

In [None]:
accounts_user = pd.read_csv(PROCESSED_DIR / 'processed_accounts_user.csv')
accounts_group = pd.read_csv(ORIG_DIR / 'accounts_group.csv')
accounts_school = pd.read_csv(ORIG_DIR / 'accounts_school.csv')

In [None]:
funnel_user_cond = (accounts_user['created_at'] >= '2023-05-13') & (accounts_user['created_at'] <= '2024-05-07')
funnel_user = accounts_user[funnel_user_cond]

In [None]:
accounts_user['user_id'] = accounts_user['user_id'].astype(str)
accounts_user['group_id'] = accounts_user['group_id'].astype(str)

accounts_school['id'] = accounts_school['id'].astype(str)
accounts_group['group_id'] = accounts_group['group_id'].astype(str)
accounts_group['school_id'] = accounts_group['school_id'].astype(str)

In [None]:
test = pd.merge(funnel_user[['user_id', 'group_id']], accounts_group[['group_id', 'school_id']], on='group_id', how='left')
test = pd.merge(test, accounts_school[['school_id', 'student_count']], on='school_id', how='left')
test = test[test['student_count'] >= 40]
test

Unnamed: 0,user_id,group_id,school_id,student_count
0,855179,5532,5220,214.0
1,866522,7389,5256,198.0
2,916697,18094,1856,117.0
3,920278,16478,4609,284.0
4,921725,683,5214,67.0
...,...,...,...,...
363127,1583718,58635,2753,222.0
363129,1583720,30066,4642,392.0
363130,1583721,25568,3035,119.0
363132,1583723,63052,216,225.0


In [None]:
# 학교, 그룹 별 유저 수 확인
# 활성화 기준 그룹 당 유저 4명 이상을 넘는가?
students_count = test.groupby(['school_id', 'group_id'])['user_id'].count().reset_index(name='user_counts')
activate_school = students_count[students_count['user_counts'] >= 4]

activate_group_id = activate_school['group_id'].unique()

activate_school

Unnamed: 0,school_id,group_id,user_counts
0,10,35966,14
1,10,35997,10
2,10,43146,19
3,10,56412,18
4,10,63440,17
...,...,...,...
59328,999,72160,9
59329,999,72178,12
59330,999,72185,8
59331,999,72190,7


In [None]:
accounts_group['is_active'] = accounts_group['group_id'].apply(lambda x: 1 if x in activate_group_id else 0)
accounts_group

Unnamed: 0,group_id,grade,class_num,school_id,is_active
0,1,1,1,1,0
1,10,2,2,1,0
2,11,2,3,1,0
3,11373,1,2,1,0
4,1184,1,5,1,0
...,...,...,...,...,...
84503,84172,1,10,5965,0
84504,84173,1,1,5965,0
84505,84184,2,5,5965,0
84506,84225,1,7,5965,0


In [None]:
accounts_group.to_csv(PROCESSED_DIR / 'processed_accounts_group.csv', index=False)