In [1]:
# 필수 라이브러리 import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno
from sklearn.preprocessing import StandardScaler

In [2]:
# Mount Google Drive inside the Colab VM at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# cp -r /content/drive/MyDrive/Colab\ Notebooks/final_project/data /content

In [3]:
# Show the current working directory.
!pwd

/content


In [4]:
# Copy the project's data directory from the mounted Drive into /content.
# NOTE(review): a bare `cp` (no leading `!`) presumably relies on IPython's automagic/alias handling — confirm.
cp -r /content/drive/MyDrive/Colab\ Notebooks/final_project/data /content

In [None]:
# !sudo apt-get install -y fonts-nanum
# !sudo fc-cache -fv
# !rm ~/.cache/matplotlib -rf

In [5]:
# pyarrow 설치 (parquet 파일 읽기용)
!pip install pyarrow

# 경고 뜨지 않게 설정
import warnings
warnings.filterwarnings('ignore')



In [7]:
import pyarrow.parquet as pq

# Sanity check: opening the file with ParquetFile confirms that the suspicious
# file really is a parquet file (a non-parquet file would raise here).
sample_parquet = 'data/test/2.신용정보/201807_test_신용정보.parquet'
pq.ParquetFile(sample_parquet)

<pyarrow.parquet.core.ParquetFile at 0x798796383610>

In [8]:
# Directory prefix (with trailing slash) that is prepended to each parquet file name when loading.
path = 'data/test/2.신용정보/'


In [9]:
# Load data: read the six monthly credit-info parquet files (2018-07 .. 2018-12)
# and stack them into a single frame with a fresh 0..n-1 index.
file_list = [
    '201807_test_신용정보.parquet',
    '201808_test_신용정보.parquet',
    '201809_test_신용정보.parquet',
    '201810_test_신용정보.parquet',
    '201811_test_신용정보.parquet',
    '201812_test_신용정보.parquet',
]

monthly_frames = [pd.read_parquet(path + file_name) for file_name in file_list]
df_all = pd.concat(monthly_frames, ignore_index=True)
del monthly_frames  # the per-month frames are no longer needed once merged

print("병합 완료: ", df_all.shape)

병합 완료:  (600000, 42)


In [10]:
# Remove columns that are not needed downstream.
cols_to_drop = [
    '자발한도감액후경과월',
    '강제한도감액후경과월',
    '한도증액후경과월',
    '카드론동의여부',
    '한도심사요청후경과월',
    '한도심사거절후경과월',
]
# errors='ignore' skips names that are absent, so re-running the cell does not raise.
df_all = df_all.drop(columns=cols_to_drop, errors='ignore')
print("컬럼 제거 완료")

컬럼 제거 완료


In [11]:
# Overview of the data: column names, non-null counts and dtypes.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Data columns (total 36 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   기준년월            600000 non-null  int64  
 1   ID              600000 non-null  object 
 2   최초한도금액          600000 non-null  int64  
 3   카드이용한도금액        600000 non-null  int64  
 4   CA한도금액          600000 non-null  int64  
 5   일시상환론한도금액       600000 non-null  int64  
 6   월상환론한도금액        600000 non-null  int64  
 7   CA이자율_할인전       600000 non-null  float64
 8   CL이자율_할인전       600000 non-null  float64
 9   RV일시불이자율_할인전    600000 non-null  float64
 10  RV현금서비스이자율_할인전  600000 non-null  float64
 11  RV신청일자          112482 non-null  float64
 12  RV약정청구율         600000 non-null  float64
 13  RV최소결제비율        600000 non-null  float64
 14  자발한도감액횟수_R12M   600000 non-null  object 
 15  자발한도감액금액_R12M   600000 non-null  int64  
 16  강제한도감액횟수_R12M   600000 non-null  int64  
 17  강제한도감액금액_R

In [12]:
# Drop the RV신청일자 column: its share of missing values is too high (81%).
cols_to_drop = ['RV신청일자']
# errors='ignore' skips the name if it is already gone, so the cell can be re-run.
df_all = df_all.drop(columns=cols_to_drop, errors='ignore')
print("컬럼 제거 완료")

컬럼 제거 완료


In [13]:
# Re-check the column overview after dropping RV신청일자.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Data columns (total 35 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   기준년월            600000 non-null  int64  
 1   ID              600000 non-null  object 
 2   최초한도금액          600000 non-null  int64  
 3   카드이용한도금액        600000 non-null  int64  
 4   CA한도금액          600000 non-null  int64  
 5   일시상환론한도금액       600000 non-null  int64  
 6   월상환론한도금액        600000 non-null  int64  
 7   CA이자율_할인전       600000 non-null  float64
 8   CL이자율_할인전       600000 non-null  float64
 9   RV일시불이자율_할인전    600000 non-null  float64
 10  RV현금서비스이자율_할인전  600000 non-null  float64
 11  RV약정청구율         600000 non-null  float64
 12  RV최소결제비율        600000 non-null  float64
 13  자발한도감액횟수_R12M   600000 non-null  object 
 14  자발한도감액금액_R12M   600000 non-null  int64  
 15  강제한도감액횟수_R12M   600000 non-null  int64  
 16  강제한도감액금액_R12M   600000 non-null  int64  
 17  한도증액횟수_R12

In [14]:
# The RV신청일자 column has been dropped, so RV약정청구율 is left as it is.
# Missing values in the RV전환가능여부 column will be treated as the N value (0).
# dropna=False makes the missing values show up as their own row in the counts.
df_all['RV전환가능여부'].value_counts(dropna=False)

Unnamed: 0_level_0,count
RV전환가능여부,Unnamed: 1_level_1
Z,490584
N,101782
,7634


In [15]:
# Encode the RV conversion-eligibility flag as an integer: 'Z' -> 1, 'N' -> 0.
# Series.map turns every value that is not a key of the mapping into NaN, so the
# already-encoded integers 1/0 are mapped to themselves. Without those entries,
# re-running this cell would turn the whole column into NaN and then into 0.
rv_flag_codes = {'Z': 1, 'N': 0, 1: 1, 0: 0}
# Missing values are treated as 'N' (0) before casting to int.
df_all['RV전환가능여부'] = (
    df_all['RV전환가능여부'].map(rv_flag_codes).fillna(0).astype(int)
)

In [16]:
# 결측치 재확인
missing_cols = df_all.columns[df_all.isnull().any()]
missing_cols

Index([], dtype='object')

In [None]:
# # 데이터 샘플 추출
# # 1. (선택) 데이터 샘플링: df_all에서 500개 샘플 추출
# df_all_sample = df_all.sample(n=500, random_state=42)  # 또는 head(n)

# # 2. CSV 파일로 저장 (인코딩은 Excel 호환 위해 cp949 사용)
# df_all_sample.to_csv('df_all_sample.csv', index=False, encoding='cp949')

# # 3. 로컬로 다운로드
# from google.colab import files
# files.download('df_all_sample.csv')

In [None]:
# # 1. 컬럼명 확인
# print(df_all_sample.columns.tolist())

# # 2. 데이터 미리보기
# print(df_all_sample.head(3))

In [None]:
# from statsmodels.stats.outliers_influence import variance_inflation_factor
# from statsmodels.tools.tools import add_constant

# def calculate_vif(X):
#     X = add_constant(X)  # 상수항 추가 (intercept)
#     vif_data = pd.DataFrame()
#     vif_data["feature"] = X.columns
#     vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
#     return vif_data

In [None]:
# # parquet 파일 열기
# df1 = pd.read_parquet("201807_train_청구정보.parquet")

# # csv로 저장
# df1.to_csv("201807_train_청구정보.csv", index=False)


In [17]:
# Display the cleaned frame as a final visual check.
df_all

Unnamed: 0,기준년월,ID,최초한도금액,카드이용한도금액,CA한도금액,일시상환론한도금액,월상환론한도금액,CA이자율_할인전,CL이자율_할인전,RV일시불이자율_할인전,...,카드이용한도금액_B2M,특별한도보유여부_R3M,연체감액여부_R3M,한도심사요청건수,한도요청거절건수,시장단기연체여부_R6M,시장단기연체여부_R3M,시장연체상환여부_R6M,시장연체상환여부_R3M,rv최초시작후경과일
0,201807,TEST_00000,0,50902,18131,0,0,14.641712,11.894873,10.346302,...,50006,0,0,0회,0,0,0,0,0,99999999
1,201807,TEST_00001,0,50080,16819,0,0,18.992879,17.600092,14.021300,...,50003,0,0,0회,0,0,0,0,0,99999999
2,201807,TEST_00002,0,100045,30505,0,19499,14.823453,14.817385,10.679822,...,100056,0,0,0회,0,0,0,0,0,99999999
3,201807,TEST_00003,0,18508,6402,0,156102,15.121715,15.284812,11.674446,...,19693,0,0,0회,0,0,0,0,0,99999999
4,201807,TEST_00004,10000,4033,0,42195,52784,14.220050,10.897462,10.777585,...,3924,0,0,0회,0,0,0,0,0,99999999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,201812,TEST_99995,0,0,0,0,0,14.642972,11.898260,10.674317,...,0,0,0,0회,0,0,0,0,0,99999999
599996,201812,TEST_99996,0,49025,17876,48022,152797,14.221370,14.901302,10.324015,...,49990,0,0,0회,0,0,0,0,0,99999999
599997,201812,TEST_99997,0,29996,13332,89997,156313,15.207480,11.902432,11.495748,...,30004,0,0,0회,0,0,0,0,0,99999999
599998,201812,TEST_99998,0,42610,17362,90003,204480,15.268772,15.299862,11.264263,...,37139,0,0,0회,0,0,0,0,0,99999999


In [None]:
# # CSV 파일 불러오기
# df = pd.read_csv('df_all.csv', encoding='cp949') # 경로는 필요 시 수정

# # Parquet 파일로 저장
# df.to_parquet('df_all.parquet', index=False)

In [18]:
# Write the cleaned frame to a parquet file (without the index).
output_file = "test_2번eda.parquet"
df_all.to_parquet(output_file, index=False)

# Download the saved file from the Colab VM to the local machine.
from google.colab import files
files.download(output_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>