In [1]:
from pathlib import Path
import pandas as pd

ROOT = Path.cwd()
CLEAN_DIR = ROOT / "clean_vote_ver2"

In [2]:
df_fail = pd.read_csv(CLEAN_DIR / "accounts_failpaymenthistory_clean.csv")
df_pay = pd.read_csv(CLEAN_DIR / "accounts_paymenthistory_clean.csv")
df_user = pd.read_csv(CLEAN_DIR / "accounts_user_clean.csv")
df_poll = pd.read_csv(CLEAN_DIR / "polls_questionset_clean.csv")
df_contacts = pd.read_csv(CLEAN_DIR / "accounts_user_contacts_clean.csv")
df_school = pd.read_csv(CLEAN_DIR / "accounts_school_clean.csv")

In [3]:
for name, df in {
    "fail": df_fail,
    "pay": df_pay,
    "user": df_user,
    "poll": df_poll,
    "contacts": df_contacts,
    "school": df_school
}.items():
    print(name, ":", df.shape)

fail : (160, 5)
pay : (95140, 4)
user : (677080, 15)
poll : (157705, 6)
contacts : (5063, 5)
school : (5951, 6)


In [4]:
tables = {
    "accounts_failpaymenthistory": df_fail,
    "accounts_paymenthistory": df_pay,
    "accounts_user": df_user,
    "polls_questionset": df_poll,
    "accounts_user_contacts": df_contacts,
    "accounts_school": df_school,
}

In [7]:
# Report the created_at range of every cleaned table that has that column.
for name, df in tables.items():
    # Tables without a created_at column are skipped without any output.
    if "created_at" in df.columns:
        # errors="coerce" turns unparseable values into NaT; floor("s") drops sub-second precision.
        dt = pd.to_datetime(df["created_at"], errors="coerce").dt.floor("s")

        print(f"\n[{name}]")
        print("최소 날짜:", dt.min())
        print("최대 날짜:", dt.max())
        # NaT count: values that failed to parse plus values that were already missing.
        print("파싱 실패:", dt.isna().sum())


[accounts_failpaymenthistory]
최소 날짜: 2023-05-14 05:49:22
최대 날짜: 2023-09-17 09:12:53
파싱 실패: 0

[accounts_paymenthistory]
최소 날짜: 2023-05-13 21:28:34
최대 날짜: 2024-05-08 14:12:45
파싱 실패: 0

[accounts_user]
최소 날짜: 2023-03-29 14:18:56
최대 날짜: 2024-05-09 17:31:17
파싱 실패: 0

[polls_questionset]
최소 날짜: 2023-04-28 12:28:07
최대 날짜: 2024-05-07 11:32:30
파싱 실패: 0


[accounts_failpaymenthistory 전처리 기준]

1. 전체 row 중복: 없음
2. user_id 기준 중복 결제 기록 존재
    중복 시 최신 created_at 기준 keep='last'
3. 컬럼 유지: id, user_id, productId, phone_type, created_at
4. created_at 데이터 파싱, ns 삭제
5. 결과 저장 위치: clean_vote_ver2/

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path


ROOT = Path.cwd()
DATA_DIR = ROOT / "dump_vote_ver2"
OUT_DIR = ROOT / "clean_vote_ver2"

csv_path = DATA_DIR / "accounts_failpaymenthistory.csv"



df = pd.read_csv(csv_path)

print("shape:", df.shape)
df.head()
df.dtypes

shape: (163, 5)


id             int64
productId     object
phone_type    object
created_at    object
user_id        int64
dtype: object

In [None]:
# Parse created_at in place; unparseable values become NaT and sub-second precision is dropped.
df["created_at"] = (pd.to_datetime(df["created_at"], errors="coerce").dt.floor("s"))

# One-row quality summary: row count, per-column null counts and non-positive id counts.
summary = pd.DataFrame([{
    "row_cnt": len(df),
    "null_id": df["id"].isna().sum(),
    "null_user_id": df["user_id"].isna().sum(),
    "null_productId": df["productId"].isna().sum(),
    "null_phone_type": df["phone_type"].isna().sum(),
    # Counted after coercion, so this includes parse failures as well as originally missing values.
    "null_created_at": df["created_at"].isna().sum(),
    "non_positive_id": (df["id"] <= 0).sum(),
    "non_positive_user_id": (df["user_id"] <= 0).sum(),
}])
summary

Unnamed: 0,row_cnt,null_id,null_user_id,null_productId,null_phone_type,null_created_at,non_positive_id,non_positive_user_id
0,163,0,0,107,0,0,0,0


In [None]:
bad_dt = df[df["created_at"].isna()][["id", "user_id", "productId", "phone_type", "created_at"]]

print("created_at 파싱 실패 건수:", len(bad_dt))
bad_dt.head(20)

print("created_at min:", df["created_at"].min())
print("created_at max:", df["created_at"].max())

created_at 파싱 실패 건수: 0
created_at min: 2023-05-14 05:49:22
created_at max: 2023-09-17 09:12:53


In [None]:
dup_all_cnt = df.duplicated(keep=False).sum()
print("완전 동일 행 중복(keep=False):", dup_all_cnt)

완전 동일 행 중복(keep=False): 0


In [None]:
# Keep one row per user_id: the one with the latest created_at.
# Sorting ascending and then keeping the last duplicate selects the most recent record.
# NOTE(review): sort_values puts NaT last by default, so a row with an unparseable
# created_at would be the one kept; the check above reported 0 such rows.
df_clean = (
    df.sort_values(["user_id", "created_at"], ascending=[True, True])
    .drop_duplicates(subset=["user_id"], keep="last")
    .reset_index(drop=True)
)

# Before/after row counts and a uniqueness check on the deduplicated key.
print("원본 행 수:", len(df))
print("정리 후 행 수:", len(df_clean))
print("제거된 행 수:", len(df) - len(df_clean))
print("유니크 여부:", df_clean["user_id"].is_unique)

원본 행 수: 163
정리 후 행 수: 160
제거된 행 수: 3
user_id unique?: True


In [None]:
# Create the output directory if it is missing; to_csv does not create parent directories,
# so this cell would otherwise fail on a fresh checkout.
OUT_DIR.mkdir(parents=True, exist_ok=True)

out_path = OUT_DIR / "accounts_failpaymenthistory_clean.csv"
# The explicit column list fixes which columns are exported and in what order.
# utf-8-sig prepends a BOM to the file.
df_clean[["id", "user_id", "productId", "phone_type", "created_at"]].to_csv(out_path, index=False, encoding="utf-8-sig")

print("saved:", out_path)


saved: /Users/10moo/260128_proj/clean_vote_ver2/accounts_failpaymenthistory_clean.csv


[ accounts_paymenthistory ]
1. 데이터 수집 기간
    min   2023-05-13 21:28:34
    max   2024-05-08 14:12:45
2. created_at 파싱
    ns 삭제

In [1]:
import pandas as pd
from pathlib import Path


ROOT = Path.cwd()
DATA_DIR = ROOT / "dump_vote_ver2"
OUT_DIR = ROOT / "clean_vote_ver2"

csv_path = DATA_DIR / "accounts_paymenthistory.csv"

In [None]:
df = pd.read_csv(csv_path)

print("shape:", df.shape)
df.head()
df.dtypes

shape: (95140, 5)


id             int64
productId     object
phone_type    object
created_at    object
user_id        int64
dtype: object

In [None]:
summary = pd.DataFrame([{
    "row_cnt": len(df),
    "null_user_id": df["user_id"].isna().sum(),
    "null_productId": df["productId"].isna().sum(),
    "null_phone_type": df["phone_type"].isna().sum(),
    "null_created_at": df["created_at"].isna().sum(),
    "duplicate_all_rows": df.duplicated(keep=False).sum(),
    "duplicate_user_id": df.duplicated(subset=["user_id"], keep=False).sum(),
}])

summary

Unnamed: 0,row_cnt,null_user_id,null_productId,null_phone_type,null_created_at,duplicate_all_rows,duplicate_user_id
0,95140,0,0,0,0,0,52091


In [6]:
df["user_id"].nunique()

59192

In [None]:
# Parse created_at in place; unparseable values become NaT and sub-second precision is dropped.
df["created_at"] = (
    pd.to_datetime(df["created_at"], errors="coerce").dt.floor("s")
)

# Confirm the resulting dtype and show the covered date range.
print("created_at dtype:", df["created_at"].dtype)
print("created_at min/max:")
df["created_at"].agg(["min", "max"])

created_at dtype: datetime64[ns]
created_at min/max:


min   2023-05-13 21:28:34
max   2024-05-08 14:12:45
Name: created_at, dtype: datetime64[ns]

In [None]:
# Create the output directory if it is missing; to_csv does not create parent directories,
# so this cell would otherwise fail on a fresh checkout.
OUT_DIR.mkdir(parents=True, exist_ok=True)

out_path = OUT_DIR / "accounts_paymenthistory_clean.csv"

# NOTE(review): `id` is not in the exported column list, whereas the
# accounts_failpaymenthistory export keeps it — confirm this is intentional.
df[["user_id", "productId", "phone_type", "created_at"]].to_csv(
    out_path,
    index=False,
    encoding="utf-8-sig"
)
print("saved:", out_path)

saved: /Users/10moo/260128_proj/clean_vote_ver2/accounts_paymenthistory_clean.csv


[ accounts_school ]
1. 주소 정제
    담연님 코드 참고('서울'과 '서울 ' 구분 주의)
2. 학생 수가 40명 이상인 곳과 그렇지 않은 곳을 구분하고자 한다고 말씀 주셔서, 추가 컬럼 생성
    is_active_school: 학생 수가 40 이상이면 True, 미만이면 False 

In [1]:
import pandas as pd
from pathlib import Path

ROOT = Path.cwd()
DATA_DIR = ROOT / "dump_vote_ver2"
OUT_DIR = ROOT / "clean_vote_ver2"

In [27]:
df = pd.read_csv(DATA_DIR / "accounts_school.csv")

print("shape:", df.shape)
df.head()
df.dtypes

shape: (5951, 4)


id                int64
address          object
student_count     int64
school_type      object
dtype: object

In [28]:
null_summary = df.isna().sum().to_frame("null_cnt")
null_summary["null_ratio"] = null_summary["null_cnt"] / len(df)

null_summary

Unnamed: 0,null_cnt,null_ratio
id,0,0.0
address,0,0.0
student_count,0,0.0
school_type,0,0.0


In [4]:
dup_all_cnt = df.duplicated(keep=False).sum()
print("완전 동일 행 중복 수:", dup_all_cnt)

완전 동일 행 중복 수: 0


In [None]:
def clean_address(addr):
    """Normalize a school address to its full region name.

    Steps:
      1. Missing values and the '-' placeholder become None.
      2. Every occurrence of the country prefix '대한민국 ' is removed.
      3. An abbreviated leading region token (e.g. '서울 ', '경기 ') is expanded
         to its full name (e.g. '서울특별시', '경기도').

    The separator between the region and the rest of the address is preserved:
    '서울 강남구' -> '서울특별시 강남구'.

    Parameters
    ----------
    addr : str or missing value
        Raw address string.

    Returns
    -------
    str or None
        The normalized address, or None when the input is missing or '-'.
    """
    if pd.isna(addr) or addr == '-':
        return None

    addr = addr.replace('대한민국 ', '')

    # Keys carry a trailing space so that only the abbreviated token matches
    # (an address already starting with '서울특별시' does not start with '서울 ').
    city_map = {
        '서울 ': '서울특별시',
        '경기 ': '경기도',
        '인천 ': '인천광역시',
        '대전 ': '대전광역시',
        '대구 ': '대구광역시',
        '부산 ': '부산광역시',
        '울산 ': '울산광역시',
        '광주 ': '광주광역시',
        '강원 ': '강원도',
        '충남 ': '충청남도',
        '충북 ': '충청북도',
        '경남 ': '경상남도',
        '경북 ': '경상북도',
        '전남 ': '전라남도',
        '전북 ': '전라북도',
        '제주 ': '제주특별자치도'
    }

    for short, long in city_map.items():
        if addr.startswith(short):
            # The matched key includes the separating space, so swapping it for
            # the bare full name would glue the region to the next token.
            # Re-insert exactly one space between them.
            rest = addr[len(short):].lstrip()
            return f"{long} {rest}" if rest else long

    return addr

df["address_clean"] = df["address"].apply(clean_address)

In [None]:
changed_cnt = (df["address"] != df["address_clean"]).sum()
print("주소가 변경된 행 수:", changed_cnt)

주소가 변경된 행 수: 31


In [7]:
df["address_clean"].value_counts().head(10)

address_clean
경기도 화성시        78
경기도 부천시        60
경기도 남양주시       58
경상남도 김해시       58
대구광역시 달서구      56
서울특별시 노원구      54
경기도 성남시 분당구    53
서울특별시 송파구      52
제주특별자치도 제주시    52
광주광역시 북구       52
Name: count, dtype: int64

In [None]:
# Flag schools with at least 40 students (the threshold is inclusive: exactly 40 is True).
df["is_active_school"] = df["student_count"] >= 40

In [33]:
df.head(10)

Unnamed: 0,id,address,student_count,school_type,is_active_school
0,4,충청북도 충주시,239,H,True
1,5,충청북도 충주시,160,M,True
2,6,충청북도 충주시,200,H,True
3,7,충청북도 충주시,114,H,True
4,8,충청북도 충주시,139,M,True
5,9,충청북도 충주시,3,H,False
6,10,충청북도 충주시,159,M,True
7,11,충청북도 충주시,17,M,False
8,12,충청북도 충주시,154,M,True
9,13,충청북도 충주시,80,H,True


In [None]:
# Create the output directory if it is missing; to_csv does not create parent directories,
# so this cell would otherwise fail on a fresh checkout.
OUT_DIR.mkdir(parents=True, exist_ok=True)

out_path = OUT_DIR / "accounts_school_clean.csv"
# Writes every column currently on df, i.e. the raw `address` alongside the derived columns.
df.to_csv(out_path, index=False, encoding="utf-8-sig")

print("saved:", out_path)

saved: /Users/10moo/260128_proj/clean_vote_ver2/accounts_school_clean.csv


[ accounts_user_contacts ]
1. 유저 아이디 형변환: int -> str
2. 그냥 아이디는 그대로 둠
3. invite user id list는 리스트로 형변환, 빈 리스트는 null로 대체
4. invite cnt 생성

In [39]:
import pandas as pd
import numpy as np
from pathlib import Path
import ast

ROOT = Path.cwd()
DATA_DIR = ROOT / "dump_vote_ver2"
OUT_DIR = ROOT / "clean_vote_ver2"

csv_path = DATA_DIR / "accounts_user_contacts.csv"

In [40]:
df = pd.read_csv(csv_path)

print("shape:", df.shape)
df.head()
df.dtypes

shape: (5063, 4)


id                      int64
contacts_count          int64
invite_user_id_list    object
user_id                 int64
dtype: object

In [41]:
summary = pd.DataFrame([{
    "row_cnt": len(df),
    "null_id": df["id"].isna().sum(),
    "null_user_id": df["user_id"].isna().sum(),
    "null_contacts_count": df["contacts_count"].isna().sum(),
    "null_invite_user_id_list": df["invite_user_id_list"].isna().sum(),
}])
summary

Unnamed: 0,row_cnt,null_id,null_user_id,null_contacts_count,null_invite_user_id_list
0,5063,0,0,0,0


In [42]:
# Convert user_id to pandas' string dtype, going through nullable Int64 first
# so the ids are rendered as integers and missing values stay missing.
df["user_id"] = df["user_id"].astype("Int64").astype("string")

In [43]:
print(df["user_id"].dtype)
df[["user_id"]].head()

string


Unnamed: 0,user_id
0,1167696
1,863169
2,857205
3,851431
4,855476


In [None]:
neg_contacts = df[df["contacts_count"] < 0]
print("contacts_count 음수 건수:", len(neg_contacts))
display(neg_contacts.head(20))

contacts_count 음수 건수: 0


Unnamed: 0,id,contacts_count,invite_user_id_list,user_id


In [None]:
def parse_invite_list(x):
    """Convert one raw `invite_user_id_list` cell into a list of ints, or NaN.

    Parameters
    ----------
    x : str, list or missing value
        Either the raw CSV text of a Python-style list literal (e.g.
        "[849318, 849421]") or an already-parsed list. Accepting lists makes
        the conversion idempotent, so re-running the cell on the converted
        column does not corrupt it.

    Returns
    -------
    list[int] or float
        The ids as ints, or np.nan when the cell is missing, is not a list
        literal, cannot be parsed, holds a non-integer element, or contains no
        usable ids (empty list, or only None/NaN elements).
    """
    if isinstance(x, list):
        # Already parsed (e.g. the cell was executed twice): just re-validate.
        values = x
    else:
        if pd.isna(x):
            return np.nan
        try:
            values = ast.literal_eval(x)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Exceptions literal_eval is documented to raise on malformed input;
            # such cells are treated as "no invites" on purpose.
            return np.nan
        if not isinstance(values, list):
            return np.nan

    try:
        ids = [int(i) for i in values if pd.notna(i)]
    except (ValueError, TypeError, OverflowError):
        # A non-integer element invalidates the whole cell, as in the original logic.
        return np.nan

    # An empty result is reported as missing, including the case where
    # filtering out None/NaN elements removed everything.
    return ids if ids else np.nan

df["invite_user_id_list"] = df["invite_user_id_list"].apply(parse_invite_list)

In [54]:
df[["invite_user_id_list"]].head(20)

Unnamed: 0,invite_user_id_list
0,
1,
2,[854615]
3,
4,[849318]
5,[855829]
6,"[849318, 849421]"
7,
8,"[855626, 856042, 837947]"
9,


In [None]:
# Number of invited users per row: list length for parsed lists, 0 for anything else
# (the NaN placeholder used for empty or unparseable cells).
df["invite_cnt"] = df["invite_user_id_list"].apply(
    lambda x: len(x) if isinstance(x, list) else 0
)

In [56]:
type(df.loc[0, "invite_user_id_list"])

float

In [None]:
df["invite_user_id_list"].apply(type).value_counts()

invite_user_id_list
<class 'float'>    3905
<class 'list'>     1158
Name: count, dtype: int64

In [None]:
# Duplicate check restricted to the scalar columns.
# NOTE(review): invite_user_id_list is left out, presumably because list values are
# unhashable and duplicated() cannot handle them; the "완전 동일 행" label below
# therefore describes a check on these three columns only.
dup_cols = ["id", "user_id", "contacts_count"]

# keep=False counts every member of a duplicate group, not just the extras.
dup_all_cnt = df.duplicated(subset=dup_cols, keep=False).sum()
print("완전 동일 행 중복 수:", dup_all_cnt)

# Rows whose user_id appears more than once.
dup_user_cnt = df.duplicated(subset=["user_id"], keep=False).sum()
print("user_id 중복 행 수:", dup_user_cnt)

완전 동일 행 중복 수: 0
user_id 중복 행 수: 0


In [59]:
df.head(20)

Unnamed: 0,id,contacts_count,invite_user_id_list,user_id,invite_cnt
0,259,30,,1167696,0
1,1756,79,,863169,0
2,13742,21,[854615],857205,1
3,13754,29,,851431,0
4,13756,28,[849318],855476,1
5,13784,31,[855829],1482744,1
6,13798,45,"[849318, 849421]",854615,2
7,13807,28,,854372,0
8,13815,26,"[855626, 856042, 837947]",858674,3
9,21155,28,,855526,0


In [None]:
# Create the output directory if it is missing; to_csv does not create parent directories,
# so this cell would otherwise fail on a fresh checkout.
OUT_DIR.mkdir(parents=True, exist_ok=True)

out_path = OUT_DIR / "accounts_user_contacts_clean.csv"

# Fix the exported columns and their order; copy so the export frame is independent of df.
df_out = df[["id", "user_id", "contacts_count", "invite_user_id_list", "invite_cnt"]].copy()

# CSV has no list type: list cells are written as their text representation and
# must be parsed again by whoever reads this file back.
df_out.to_csv(out_path, index=False, encoding="utf-8-sig")
print("saved:", out_path)

saved: /Users/10moo/260128_proj/clean_vote_ver2/accounts_user_contacts_clean.csv


[ accounts user ]
1. id -> user_id 컬럼명 변경, str로 형변환
2. is_superuser, is_staff 계정(행) 제거 후 두 컬럼 drop
    남은 group_id 결측 1건은 행 제거 → 이후 gender, group_id 결측 없음
3. friend_id_list 형변환
4. friend_count 컬럼 생성

In [77]:
import pandas as pd
import numpy as np
from pathlib import Path
import ast



ROOT = Path.cwd()
DATA_DIR = ROOT / "dump_vote_ver2"
OUT_DIR = ROOT / "clean_vote_ver2"


csv_path = DATA_DIR / "accounts_user.csv"

In [None]:
df = pd.read_csv(csv_path)
print("shape:", df.shape)

df.dtypes

shape: (677085, 16)


id                      int64
is_superuser            int64
is_staff                int64
gender                 object
point                   int64
friend_id_list         object
is_push_on              int64
created_at             object
block_user_id_list     object
hide_user_id_list      object
ban_status             object
report_count            int64
alarm_count             int64
pending_chat            int64
pending_votes           int64
group_id              float64
dtype: object

In [79]:
df.head()

Unnamed: 0,id,is_superuser,is_staff,gender,point,friend_id_list,is_push_on,created_at,block_user_id_list,hide_user_id_list,ban_status,report_count,alarm_count,pending_chat,pending_votes,group_id
0,831956,1,1,,600,"[1292473, 913158, 1488461, 1064695, 1043565, 1...",0,2023-03-29 03:44:14.047130,[],[],N,0,0,0,0,
1,831962,0,0,F,2248,"[833025, 832642, 982531, 879496, 838541, 83752...",1,2023-03-29 05:18:56.162368,[],[],N,253,40878,5499,110,12.0
2,832151,0,0,M,1519,"[838785, 982531, 882567, 879496, 838541, 83649...",0,2023-03-29 12:56:34.989468,[],[],N,0,37,0,47,1.0
3,832340,0,0,F,57,"[841345, 982531, 838785, 963714, 882567, 83252...",1,2023-03-29 12:56:35.020790,[],[],N,0,19,0,21,1.0
4,832520,0,0,M,1039,"[874050, 849763, 874212, 844297, 838541, 84004...",0,2023-03-29 12:56:35.049311,[],[],N,0,29,0,15,12.0


In [None]:
# Remove staff and superuser accounts, reporting row counts before and after.
print("제거 전 행 수:", len(df))

before = len(df)

# Keep only rows where both flags are 0; .copy() detaches the result from the original frame.
# This rebinds df, so re-running the cell operates on the already-filtered frame.
df = df[(df["is_staff"] == 0) & (df["is_superuser"] == 0)].copy()

after = len(df)

print("제거 후 행 수:", after)
print("제거된 행 수:", before - after)

제거 전 행 수: 677085
제거 후 행 수: 677081
제거된 행 수: 4


In [82]:
# Drop rows whose group_id is missing, reporting row counts before and after.
print("group_id 결측 제거 전 행 수:", len(df))
before = len(df)

# .copy() detaches the filtered result from the previous frame.
df = df[df["group_id"].notna()].copy()

after = len(df)
print("group_id 결측 제거 후 행 수:", after)
print("제거된 행 수:", before - after)

group_id 결측 제거 전 행 수: 677081
group_id 결측 제거 후 행 수: 677080
제거된 행 수: 1


In [83]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 677080 entries, 1 to 677084
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  677080 non-null  int64  
 1   is_superuser        677080 non-null  int64  
 2   is_staff            677080 non-null  int64  
 3   gender              677080 non-null  object 
 4   point               677080 non-null  int64  
 5   friend_id_list      677080 non-null  object 
 6   is_push_on          677080 non-null  int64  
 7   created_at          677080 non-null  object 
 8   block_user_id_list  677080 non-null  object 
 9   hide_user_id_list   677080 non-null  object 
 10  ban_status          677080 non-null  object 
 11  report_count        677080 non-null  int64  
 12  alarm_count         677080 non-null  int64  
 13  pending_chat        677080 non-null  int64  
 14  pending_votes       677080 non-null  int64  
 15  group_id            677080 non-null  fl

In [84]:
null_summary = pd.DataFrame({
    "null_cnt": df.isna().sum()
}).sort_values("null_cnt", ascending=False)

null_summary

Unnamed: 0,null_cnt
id,0
is_superuser,0
is_staff,0
gender,0
point,0
friend_id_list,0
is_push_on,0
created_at,0
block_user_id_list,0
hide_user_id_list,0


In [85]:
df = df.drop(columns=["is_staff", "is_superuser"], errors="ignore")

print("삭제 후 컬럼:", df.columns.tolist())

삭제 후 컬럼: ['id', 'gender', 'point', 'friend_id_list', 'is_push_on', 'created_at', 'block_user_id_list', 'hide_user_id_list', 'ban_status', 'report_count', 'alarm_count', 'pending_chat', 'pending_votes', 'group_id']


In [86]:
dup_all_cnt = df.duplicated(keep=False).sum()
print("완전 동일 행 중복 수:", dup_all_cnt)

if dup_all_cnt > 0:
    display(df[df.duplicated(keep=False)].head(20))

완전 동일 행 중복 수: 0


In [None]:
# Rename the primary key to user_id and store it as pandas' string dtype.
df = df.rename(columns={"id": "user_id"})
df["user_id"] = df["user_id"].astype("Int64").astype("string")  # via nullable Int64 so missing values are handled safely

In [88]:
df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce").dt.floor("s")

In [None]:
from datetime import timedelta

# Shift created_at forward by 9 hours.
# NOTE(review): presumably a UTC -> KST conversion — confirm the source timezone.
# Not idempotent: each execution adds another 9 hours, so run this cell exactly
# once after the parsing cell.
df['created_at'] = df['created_at'] + timedelta(hours=9)

In [None]:
def parse_listlike(x):
    """Convert one raw list-like cell (e.g. `friend_id_list`) into a Python list, or NaN.

    Parameters
    ----------
    x : str, list or missing value
        Either the raw CSV text of a Python-style list literal (e.g.
        "[833025, 832642]") or an already-parsed list. Accepting lists makes
        the conversion idempotent, so re-running the cell on the converted
        column no longer raises.

    Returns
    -------
    list or float
        The elements with None/NaN removed and each element converted to int
        where possible (elements that cannot be converted are kept as they
        are). np.nan is returned when the cell is missing, blank, "[]", "nan",
        not a list literal, unparseable, or empty after filtering.
    """
    if isinstance(x, list):
        # Already parsed (e.g. the cell was executed twice): only re-normalize.
        values = x
    else:
        if pd.isna(x):
            return np.nan
        s = str(x).strip()

        # Cheap textual shortcuts for "no data" before invoking the parser.
        if s == "" or s == "[]" or s.lower() == "nan":
            return np.nan

        try:
            values = ast.literal_eval(s)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Exceptions literal_eval is documented to raise on malformed input;
            # such cells are treated as missing on purpose.
            return np.nan
        if not isinstance(values, list):
            return np.nan

    out = []
    for i in values:
        if i is None or (isinstance(i, float) and np.isnan(i)):
            continue
        try:
            out.append(int(i))
        except (TypeError, ValueError, OverflowError):
            # Keep elements that are not integer-like unchanged rather than dropping them.
            out.append(i)
    return out if len(out) > 0 else np.nan

In [91]:
# Parse friend_id_list into real lists and derive the number of friends per user.
if "friend_id_list" in df.columns:
    df["friend_id_list"] = df["friend_id_list"].apply(parse_listlike)
    # Anything that is not a list (the NaN placeholder) counts as 0 friends.
    df["friend_count"] = df["friend_id_list"].apply(lambda x: len(x) if isinstance(x, list) else 0)

In [92]:
df.head(20)

Unnamed: 0,user_id,gender,point,friend_id_list,is_push_on,created_at,block_user_id_list,hide_user_id_list,ban_status,report_count,alarm_count,pending_chat,pending_votes,group_id,friend_count
1,831962,F,2248,"[833025, 832642, 982531, 879496, 838541, 83752...",1,2023-03-29 14:18:56,[],[],N,253,40878,5499,110,12.0,43
2,832151,M,1519,"[838785, 982531, 882567, 879496, 838541, 83649...",0,2023-03-29 21:56:34,[],[],N,0,37,0,47,1.0,51
3,832340,F,57,"[841345, 982531, 838785, 963714, 882567, 83252...",1,2023-03-29 21:56:35,[],[],N,0,19,0,21,1.0,57
4,832520,M,1039,"[874050, 849763, 874212, 844297, 838541, 84004...",0,2023-03-29 21:56:35,[],[],N,0,29,0,15,12.0,18
5,832614,M,1048,"[838541, 833041, 832151, 837806, 1437874, 1142...",1,2023-03-29 21:56:35,[],[],N,0,28,0,14,12.0,21
6,832740,M,1094,"[874050, 849763, 832894, 832614, 837806, 83304...",0,2023-03-29 22:20:46,[],[],NB,0,26,0,3,12.0,15
7,832857,M,1439,"[874050, 832894, 832740, 832614, 837806, 83304...",1,2023-03-29 22:20:46,[],[],N,0,28,0,16,12.0,16
8,832894,M,1535,"[982531, 879496, 833041, 832151, 1082907, 1426...",1,2023-03-29 22:20:46,[],[],N,0,36,0,24,1.0,34
9,832920,F,213,"[982531, 882567, 836496, 833041, 836498, 83215...",1,2023-03-29 22:20:46,[],[],N,0,35,0,18,1.0,26
10,832986,M,305,"[838785, 1426466, 874050, 832740, 832894, 8326...",1,2023-03-29 22:20:46,[],[],N,0,26,0,19,12.0,18


In [93]:
print("gender 결측:", df["gender"].isna().sum())
print("group_id 결측:", df["group_id"].isna().sum())

gender 결측: 0
group_id 결측: 0


In [None]:
# Create the output directory if it is missing; to_csv does not create parent directories,
# so this cell would otherwise fail on a fresh checkout.
OUT_DIR.mkdir(parents=True, exist_ok=True)

out_path = OUT_DIR / "accounts_user_clean.csv"
# CSV has no list type: parsed friend_id_list cells are written as their text
# representation and must be parsed again by whoever reads this file back.
df.to_csv(out_path, index=False, encoding="utf-8-sig")
print("saved:", out_path)

saved: /Users/10moo/260128_proj/clean_vote_ver2/accounts_user_clean.csv


[ polls_questionset ]
1. created_at, opening_time 시간 파싱
    ns 제거, created_at에 +9시간 적용
2. created_at > opening_time 이상치 679건 제거

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path


ROOT = Path.cwd()
DATA_DIR = ROOT / "dump_vote_ver2"
OUT_DIR = ROOT / "clean_vote_ver2"

csv_path = DATA_DIR / "polls_questionset.csv"

In [None]:
df = pd.read_csv(csv_path)

print("shape:", df.shape)
df.head()
df.dtypes

shape: (158384, 6)


id                         int64
question_piece_id_list    object
opening_time              object
status                    object
created_at                object
user_id                    int64
dtype: object

In [None]:
# Parse both timestamp columns in place; unparseable values become NaT and
# sub-second precision is dropped.
for col in ["created_at", "opening_time"]:
    df[col] = (
        pd.to_datetime(df[col], errors="coerce")
        .dt.floor("s")
    )

In [None]:
from datetime import timedelta

# Shift created_at forward by 9 hours; opening_time is left as parsed.
# NOTE(review): presumably a UTC -> KST conversion — confirm the source timezone
# of both columns.
# Not idempotent: each execution adds another 9 hours, so run this cell exactly
# once after the parsing cell.
df['created_at'] = df['created_at'] + timedelta(hours=9)

In [None]:
print("created_at null:", df["created_at"].isna().sum())
print("opening_time null:", df["opening_time"].isna().sum())

created_at null: 0
opening_time null: 0


In [None]:
# Rows whose creation time is later than their opening time are treated as invalid.
# NOTE(review): created_at was shifted by +9h in an earlier cell while opening_time
# was not; this comparison is only meaningful if opening_time is already on the
# same clock — TODO confirm.
invalid_time = df["created_at"] > df["opening_time"]

print("created_at > opening_time 건수:", invalid_time.sum())

# Keep the valid rows in a new frame (df itself stays unfiltered) with a fresh 0..n-1 index.
df_clean = df.loc[~invalid_time].reset_index(drop=True)

print("원본 행 수:", len(df))
print("정리 후 행 수:", len(df_clean))
print("삭제된 행 수:", len(df) - len(df_clean))

created_at > opening_time 건수: 679
원본 행 수: 158384
정리 후 행 수: 157705
삭제된 행 수: 679


In [None]:
# Create the output directory if it is missing; to_csv does not create parent directories,
# so this cell would otherwise fail on a fresh checkout.
OUT_DIR.mkdir(parents=True, exist_ok=True)

out_path = OUT_DIR / "polls_questionset_clean.csv"
# Export the filtered frame (df_clean), not the unfiltered df.
df_clean.to_csv(out_path, index=False, encoding="utf-8-sig")

print("saved:", out_path)

saved: /Users/10moo/260128_proj/clean_vote_ver2/polls_questionset_clean.csv
