In [1]:
# Core data libraries
import pandas as pd
import numpy as np

# Silence warnings: installs a blanket 'ignore' filter with no category or
# module restriction.
# NOTE(review): this also hides deprecation notices from pandas/numpy —
# confirm that is intended.
import warnings
warnings.filterwarnings('ignore')

### 파일패스 설정 ⚠️ 이 부분 꼭 확인해서 설정한 뒤에 실행!!

In [8]:
# Notebook configuration — review these values before running anything below.
# data_type: dataset split; it picks the sub-folder and is part of the parquet
#            file name. The Segment merge further down only runs for "train".
# month:     two-digit month; file names are built as 2018{month}_...
data_type = "train"
month = "07"
# category must name the same table as the folder used in drive_folder.
# It used to be "청구정보" while the folder is 3.승인매출정보, so the parquet path
# combined one table's folder with another table's file name, and the
# processed CSV was labelled with the wrong table.
category = "승인매출정보"

# local
root_path = '../data/open'

# colab
# (no Colab path is defined; point root_path at the mounted data folder there)

drive_folder = f'{root_path}/{data_type}/3.승인매출정보'

### 데이터 불러오기

In [7]:
# Assemble the parquet path from the configured folder, month, split and
# category, then read the table into df.
parquet_name = f"2018{month}_{data_type}_{category}.parquet"
file_name = f"{drive_folder}/{parquet_name}"
df = pd.read_parquet(file_name)

# Last expression of the cell: renders the frame as the cell output.
df

Unnamed: 0,기준년월,ID,최종이용일자_기본,최종이용일자_신판,최종이용일자_CA,최종이용일자_카드론,최종이용일자_체크,최종이용일자_일시불,최종이용일자_할부,이용건수_신용_B0M,...,승인거절건수_한도초과_B0M,승인거절건수_BL_B0M,승인거절건수_입력오류_B0M,승인거절건수_기타_B0M,승인거절건수_R3M,승인거절건수_한도초과_R3M,승인거절건수_BL_R3M,승인거절건수_입력오류_R3M,승인거절건수_기타_R3M,이용금액대
0,201807,TRAIN_000000,20180719,20180713,20180719,10101,20180203,20180709,20180713,11,...,0,0,0,0,3,3,0,0,0,01.100만원+
1,201807,TRAIN_000001,20180719,20180719,20170728,20170327,10101,20180719,20171231,13,...,0,0,0,0,3,3,0,0,0,03.30만원+
2,201807,TRAIN_000002,20180706,20180706,20180706,20151119,20141230,20180706,20180627,12,...,0,0,0,0,0,0,0,0,0,01.100만원+
3,201807,TRAIN_000003,20180721,20180715,20180721,10101,20141111,20180704,20180715,6,...,0,0,0,0,3,3,0,0,0,01.100만원+
4,201807,TRAIN_000004,20180124,20180124,10101,10101,20180512,20180124,10101,-2,...,0,0,0,0,0,0,0,0,0,09.미사용
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399995,201807,TRAIN_399995,20170217,20170217,10101,10101,20180719,20170217,20160501,0,...,0,0,0,0,0,0,0,0,0,09.미사용
399996,201807,TRAIN_399996,20180731,20180731,10101,20170112,10101,20180731,20180112,21,...,0,0,0,0,0,0,0,0,0,01.100만원+
399997,201807,TRAIN_399997,20180726,20180726,10101,10101,20131124,20180720,20180726,16,...,0,0,0,0,0,0,0,0,0,02.50만원+
399998,201807,TRAIN_399998,20161224,20161224,10101,10101,10101,20161224,20150122,-1,...,0,0,0,0,0,0,0,0,0,09.미사용


## 기본 전처리

#### 1. 결측치, 동일한값 컬럼 제거

In [None]:
# Step 1: drop columns that hold exactly one distinct value.
# 기준년월 is excluded from the check and therefore always kept by this step.
# nunique() does not count NaN, so "one distinct value" ignores missing entries.
distinct_counts = df.nunique()
candidate_cols = df.columns.difference(['기준년월'])
constant_cols = [col for col in candidate_cols if distinct_counts[col] == 1]
if constant_cols:
    print(f"🧹 동일값 컬럼 제거: {constant_cols}")
    df = df.drop(columns=constant_cols)

# Step 2: drop every remaining column that contains at least one missing value.
has_missing = df.isnull().any()
null_cols = df.columns[has_missing]
if not null_cols.empty:
    print(f"🧹 결측치 컬럼 제거: {list(null_cols)}")
    df = df.drop(columns=null_cols)

#### 2. Label Encoding (이용금액대 컬럼)

In [None]:
# Ordinal codes for the 이용금액대 labels. Each label carries a numeric prefix
# ('01.' ... '05.', '09.'); the codes run the other way round, so '09.미사용'
# becomes 0 and '01.100만원+' becomes 5.
dict1 = {
    '09.미사용' : 0,
    '05.10만원-' : 1,
    '04.10만원+' : 2,
    '03.30만원+' : 3,
    '02.50만원+' : 4,
    '01.100만원+' : 5
}

# Categorical -> ordinal encoding.
# Series.map() turns every value that is not a key of dict1 into NaN without
# raising. That silently corrupts the column when an unexpected label shows
# up, and also when this cell is executed a second time (the integers written
# by the first run are not keys of dict1). Both cases are handled explicitly.
amount_band = df['이용금액대']
already_encoded = len(amount_band) > 0 and amount_band.isin(list(dict1.values())).all()
if already_encoded:
    # Column already holds the integer codes: leave it untouched.
    print("⏭️ 이용금액대: 이미 인코딩됨 (건너뜀)")
else:
    encoded = amount_band.map(dict1)
    unknown_labels = amount_band[encoded.isna()].unique().tolist()
    if unknown_labels:
        # Fail loudly instead of writing NaN into the feature.
        raise ValueError(f"이용금액대에 매핑되지 않은 값이 있습니다: {unknown_labels}")
    df['이용금액대'] = encoded
    print("🔀 수치형변수로 인코딩")

#### 3. 타겟 One-hot Encoding (Segment 컬럼)

In [None]:
# Folder and table name of the file the Segment column is read from.
segment_folder = f'{root_path}/{data_type}/1.회원정보'
segment_category = "회원정보"

# Attach the Segment column and one-hot encode it. Only done for the train
# split (the branch is skipped for any other data_type).
if data_type == "train":
    if any(str(col).startswith('Segment_') for col in df.columns):
        # get_dummies names its output columns 'Segment_<value>'. If they are
        # already present this cell has run before; merging again would add a
        # second set of identically named dummy columns.
        print("⏭️ Segment 더미 컬럼이 이미 있어 병합을 건너뜀")
    else:
        segment_df = pd.read_parquet(f"{segment_folder}/2018{month}_{data_type}_{segment_category}.parquet")
        print("🔀 세그먼트 병합")
        # validate='many_to_one' makes pandas raise MergeError when an ID occurs
        # more than once in the segment table, instead of silently duplicating
        # rows of df in the left join.
        df = df.merge(
            segment_df[['ID', 'Segment']],
            on='ID',
            how='left',
            validate='many_to_one',
        )
        df = pd.get_dummies(df, columns=['Segment'])

## 기본 전처리된 파일 저장

In [10]:
# Write the preprocessed frame as CSV into drive_folder; the row index is not
# written.
csv_name = f"2018{month}_processed_{category}.csv"
output_file = f"{drive_folder}/{csv_name}"
df.to_csv(output_file, index=False)