[ accounts user ]
1. id -> user_id 컬럼명 변경, str로 형변환
2. is_superuser / is_staff 값이 1인 행 제거 후 두 컬럼 drop
    group_id 결측 행 제거 (gender 결측도 함께 사라짐)
3. friend_id_list 형변환
4. friend_count 컬럼 생성

In [77]:
import pandas as pd
import numpy as np
from pathlib import Path
import ast


# 0) Path setup: every location is resolved relative to the current working directory.
ROOT = Path.cwd()
DATA_DIR = ROOT.joinpath("dump_vote_ver2")  # directory the source CSV is read from
OUT_DIR = ROOT.joinpath("clean_vote_ver2")  # directory the cleaned CSV is written to

# Source file for the accounts_user table handled in this notebook.
csv_path = DATA_DIR.joinpath("accounts_user.csv")

In [78]:
# 1) Load the accounts_user CSV into a DataFrame

df = pd.read_csv(csv_path)
print("shape:", df.shape)

# Last expression of the cell: the notebook renders each column's dtype.
df.dtypes

shape: (677085, 16)


id                      int64
is_superuser            int64
is_staff                int64
gender                 object
point                   int64
friend_id_list         object
is_push_on              int64
created_at             object
block_user_id_list     object
hide_user_id_list      object
ban_status             object
report_count            int64
alarm_count             int64
pending_chat            int64
pending_votes           int64
group_id              float64
dtype: object

In [79]:
# Preview the first rows of the freshly loaded table.
df.head()

Unnamed: 0,id,is_superuser,is_staff,gender,point,friend_id_list,is_push_on,created_at,block_user_id_list,hide_user_id_list,ban_status,report_count,alarm_count,pending_chat,pending_votes,group_id
0,831956,1,1,,600,"[1292473, 913158, 1488461, 1064695, 1043565, 1...",0,2023-03-29 03:44:14.047130,[],[],N,0,0,0,0,
1,831962,0,0,F,2248,"[833025, 832642, 982531, 879496, 838541, 83752...",1,2023-03-29 05:18:56.162368,[],[],N,253,40878,5499,110,12.0
2,832151,0,0,M,1519,"[838785, 982531, 882567, 879496, 838541, 83649...",0,2023-03-29 12:56:34.989468,[],[],N,0,37,0,47,1.0
3,832340,0,0,F,57,"[841345, 982531, 838785, 963714, 882567, 83252...",1,2023-03-29 12:56:35.020790,[],[],N,0,19,0,21,1.0
4,832520,0,0,M,1039,"[874050, 849763, 874212, 844297, 838541, 84004...",0,2023-03-29 12:56:35.049311,[],[],N,0,29,0,15,12.0


In [81]:
# Remove staff / superuser accounts: keep only rows where both flags are 0,
# and report the row count before and after.
before = len(df)
print("제거 전 행 수:", before)

df = df.loc[df["is_staff"].eq(0) & df["is_superuser"].eq(0)].copy()

after = len(df)
print("제거 후 행 수:", after)
print("제거된 행 수:", before - after)

제거 전 행 수: 677085
제거 후 행 수: 677081
제거된 행 수: 4


In [82]:
# Drop rows whose group_id is missing and report how many rows were removed.
before = len(df)
print("group_id 결측 제거 전 행 수:", before)

df = df.dropna(subset=["group_id"]).copy()

after = len(df)
print("group_id 결측 제거 후 행 수:", after)
print("제거된 행 수:", before - after)

group_id 결측 제거 전 행 수: 677081
group_id 결측 제거 후 행 수: 677080
제거된 행 수: 1


In [83]:
# Show column dtypes and non-null counts for the current DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 677080 entries, 1 to 677084
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  677080 non-null  int64  
 1   is_superuser        677080 non-null  int64  
 2   is_staff            677080 non-null  int64  
 3   gender              677080 non-null  object 
 4   point               677080 non-null  int64  
 5   friend_id_list      677080 non-null  object 
 6   is_push_on          677080 non-null  int64  
 7   created_at          677080 non-null  object 
 8   block_user_id_list  677080 non-null  object 
 9   hide_user_id_list   677080 non-null  object 
 10  ban_status          677080 non-null  object 
 11  report_count        677080 non-null  int64  
 12  alarm_count         677080 non-null  int64  
 13  pending_chat        677080 non-null  int64  
 14  pending_votes       677080 non-null  int64  
 15  group_id            677080 non-null  fl

In [84]:
# Missing-value count per column, sorted with the largest count first.
null_summary = (
    df.isna()
    .sum()
    .to_frame(name="null_cnt")
    .sort_values(by="null_cnt", ascending=False)
)

null_summary

Unnamed: 0,null_cnt
id,0
is_superuser,0
is_staff,0
gender,0
point,0
friend_id_list,0
is_push_on,0
created_at,0
block_user_id_list,0
hide_user_id_list,0


In [85]:
# Drop the is_staff / is_superuser flag columns.
# errors="ignore" means no error is raised if they are already gone.
df = df.drop(["is_staff", "is_superuser"], axis="columns", errors="ignore")

print("삭제 후 컬럼:", list(df.columns))

삭제 후 컬럼: ['id', 'gender', 'point', 'friend_id_list', 'is_push_on', 'created_at', 'block_user_id_list', 'hide_user_id_list', 'ban_status', 'report_count', 'alarm_count', 'pending_chat', 'pending_votes', 'group_id']


In [86]:
# Sanity check: count rows that are fully identical to another row
# (keep=False marks every member of a duplicate group, not just the repeats).
dup_mask = df.duplicated(keep=False)
dup_all_cnt = dup_mask.sum()
print("완전 동일 행 중복 수:", dup_all_cnt)

# Only show a sample when duplicates actually exist.
if dup_all_cnt > 0:
    display(df[dup_mask].head(20))

완전 동일 행 중복 수: 0


In [87]:
# Column / dtype cleanup: rename id -> user_id and store it as a string.
df = df.rename({"id": "user_id"}, axis="columns")

# Cast through the nullable Int64 dtype first so a missing id would stay
# missing instead of breaking the conversion.
user_id_as_int = df["user_id"].astype("Int64")
df["user_id"] = user_id_as_int.astype("string")

In [88]:
# Parse created_at as datetime (unparseable values become NaT),
# then truncate to whole seconds to discard the sub-second part.
df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")
df["created_at"] = df["created_at"].dt.floor("s")

In [89]:
# Convert to KST by shifting created_at forward 9 hours.
# NOTE(review): assumes the stored timestamps are UTC — confirm against the source DB.
# NOTE: running this cell a second time shifts the column by another 9 hours.

from datetime import timedelta

df = df.assign(created_at=df["created_at"] + timedelta(hours=9))

In [90]:
# Convert list-looking strings into real Python lists.
#   '[1, 2]'                                   -> [1, 2]
#   '[]' / '' / 'nan' / NaN / unparseable text -> NaN

def _clean_items(items):
    """Drop None/NaN items and cast the remaining ones to int where possible.

    Returns the cleaned list, or ``np.nan`` when nothing usable is left.
    """
    cleaned = []
    for item in items:
        if item is None or (isinstance(item, float) and np.isnan(item)):
            continue
        try:
            cleaned.append(int(item))
        except (TypeError, ValueError, OverflowError):
            # Not convertible to int (e.g. a non-numeric string): keep as-is.
            cleaned.append(item)
    return cleaned if cleaned else np.nan


def parse_listlike(x):
    """Parse a stringified list such as ``"[1, 2]"`` into a Python list.

    Args:
        x: A raw cell value: a string holding a Python list literal, a
            missing value, or a list that has already been parsed.

    Returns:
        A list whose items are cast to ``int`` where possible (``None`` and
        NaN items are dropped), or ``np.nan`` when the value is missing,
        blank, ``"[]"``, not a list literal, or has no usable items.
    """
    # Already-parsed input (e.g. the calling cell was re-run). pd.isna() on a
    # list returns an array, which cannot be used in an ``if``, so this case
    # must be handled before the missing-value check.
    if isinstance(x, list):
        return _clean_items(x)

    if pd.isna(x):
        return np.nan

    text = str(x).strip()
    if text in ("", "[]") or text.lower() == "nan":
        return np.nan

    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Malformed text: treat it the same as a missing value.
        return np.nan

    # Any other literal (tuple, dict, number, ...) is not a list-like cell.
    if not isinstance(parsed, list):
        return np.nan
    return _clean_items(parsed)

In [91]:
# Parse friend_id_list into real lists, then derive friend_count from them;
# rows without a usable list (NaN) count as 0 friends.
if "friend_id_list" in df.columns:
    df["friend_id_list"] = df["friend_id_list"].map(parse_listlike)
    df["friend_count"] = [
        len(friends) if isinstance(friends, list) else 0
        for friends in df["friend_id_list"]
    ]

In [92]:
# Preview the first 20 rows after the transformations above.
df.head(20)

Unnamed: 0,user_id,gender,point,friend_id_list,is_push_on,created_at,block_user_id_list,hide_user_id_list,ban_status,report_count,alarm_count,pending_chat,pending_votes,group_id,friend_count
1,831962,F,2248,"[833025, 832642, 982531, 879496, 838541, 83752...",1,2023-03-29 14:18:56,[],[],N,253,40878,5499,110,12.0,43
2,832151,M,1519,"[838785, 982531, 882567, 879496, 838541, 83649...",0,2023-03-29 21:56:34,[],[],N,0,37,0,47,1.0,51
3,832340,F,57,"[841345, 982531, 838785, 963714, 882567, 83252...",1,2023-03-29 21:56:35,[],[],N,0,19,0,21,1.0,57
4,832520,M,1039,"[874050, 849763, 874212, 844297, 838541, 84004...",0,2023-03-29 21:56:35,[],[],N,0,29,0,15,12.0,18
5,832614,M,1048,"[838541, 833041, 832151, 837806, 1437874, 1142...",1,2023-03-29 21:56:35,[],[],N,0,28,0,14,12.0,21
6,832740,M,1094,"[874050, 849763, 832894, 832614, 837806, 83304...",0,2023-03-29 22:20:46,[],[],NB,0,26,0,3,12.0,15
7,832857,M,1439,"[874050, 832894, 832740, 832614, 837806, 83304...",1,2023-03-29 22:20:46,[],[],N,0,28,0,16,12.0,16
8,832894,M,1535,"[982531, 879496, 833041, 832151, 1082907, 1426...",1,2023-03-29 22:20:46,[],[],N,0,36,0,24,1.0,34
9,832920,F,213,"[982531, 882567, 836496, 833041, 836498, 83215...",1,2023-03-29 22:20:46,[],[],N,0,35,0,18,1.0,26
10,832986,M,305,"[838785, 1426466, 874050, 832740, 832894, 8326...",1,2023-03-29 22:20:46,[],[],N,0,26,0,19,12.0,18


In [93]:
# Confirm how many missing values remain in gender and group_id.
for column in ("gender", "group_id"):
    print(f"{column} 결측:", df[column].isna().sum())

gender 결측: 0
group_id 결측: 0


In [94]:
# Save the cleaned table as CSV (UTF-8 with BOM, index column omitted).

out_path = OUT_DIR / "accounts_user_clean.csv"
# DataFrame.to_csv does not create missing directories, so make sure the
# output directory exists before writing.
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False, encoding="utf-8-sig")
print("saved:", out_path)

saved: /Users/10moo/260128_proj/clean_vote_ver2/accounts_user_clean.csv
