In [1]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 경고 뜨지 않게 설정
import warnings
warnings.filterwarnings('ignore')

# 그래프 설정
sns.set()

# 그래프 기본 설정
plt.rcParams['font.family'] = 'Malgun Gothic'
# plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
plt.rcParams['axes.unicode_minus'] = False

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 모델의 최적의 하이퍼 파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 머신러닝 알고리즘 - 분류
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 머신러닝 알고리즘 - 회귀
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# 차원 축소
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# 군집
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from sklearn.cluster import estimate_bandwidth


# 학습 모델 저장을 위한 라이브러리
import pickle

In [3]:
# parquet 파일 데이터를 읽어온다.
df1 = pd.read_parquet('open/train/5.잔액정보/201807_train_잔액정보.parquet')
df2 = pd.read_parquet('open/train/5.잔액정보/201808_train_잔액정보.parquet')
df3 = pd.read_parquet('open/train/5.잔액정보/201809_train_잔액정보.parquet')
df4 = pd.read_parquet('open/train/5.잔액정보/201810_train_잔액정보.parquet')
df5 = pd.read_parquet('open/train/5.잔액정보/201811_train_잔액정보.parquet')
df6 = pd.read_parquet('open/train/5.잔액정보/201812_train_잔액정보.parquet')

In [4]:
# 월별 데이터 프레임을 합친다.
all_df = pd.concat([df1, df2, df3, df4, df5, df6])
all_df.reset_index(inplace=True, drop=True)
all_df

Unnamed: 0,기준년월,ID,잔액_일시불_B0M,잔액_할부_B0M,잔액_현금서비스_B0M,잔액_리볼빙일시불이월_B0M,잔액_리볼빙CA이월_B0M,잔액_카드론_B0M,월중평잔_일시불_B0M,월중평잔_할부_B0M,...,평잔_6M,평잔_일시불_6M,평잔_일시불_해외_6M,평잔_RV일시불_6M,평잔_RV일시불_해외_6M,평잔_할부_6M,평잔_할부_해외_6M,평잔_CA_6M,평잔_CA_해외_6M,평잔_카드론_6M
0,201807,TRAIN_000000,998,962,22971,0,0,0,1084,547,...,15988,2440,0,0,0,572,0,17008,0,0
1,201807,TRAIN_000001,2565,2390,0,0,0,0,4090,2553,...,7045,2677,0,2830,0,2736,0,0,0,0
2,201807,TRAIN_000002,5312,5113,21531,6795,0,0,5006,8778,...,66549,9118,0,8870,0,4429,0,43351,0,0
3,201807,TRAIN_000003,730,5025,26284,0,0,0,487,5607,...,30139,884,0,0,0,5097,0,30697,0,0
4,201807,TRAIN_000004,0,0,0,0,0,0,0,0,...,28,21,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399995,201812,TRAIN_399995,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2399996,201812,TRAIN_399996,3351,0,0,0,0,27337,4412,0,...,67826,12524,0,0,0,0,0,0,0,23031
2399997,201812,TRAIN_399997,2524,2960,0,0,0,0,2694,3374,...,8627,3241,0,0,0,3995,0,0,0,0
2399998,201812,TRAIN_399998,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
all_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400000 entries, 0 to 2399999
Data columns (total 82 columns):
 #   Column             Dtype  
---  ------             -----  
 0   기준년월               int64  
 1   ID                 object 
 2   잔액_일시불_B0M         int64  
 3   잔액_할부_B0M          int64  
 4   잔액_현금서비스_B0M       int64  
 5   잔액_리볼빙일시불이월_B0M    int64  
 6   잔액_리볼빙CA이월_B0M     int64  
 7   잔액_카드론_B0M         int64  
 8   월중평잔_일시불_B0M       int64  
 9   월중평잔_할부_B0M        int64  
 10  월중평잔_CA_B0M        int64  
 11  카드론잔액_최종경과월        int64  
 12  연체일자_B0M           float64
 13  연체잔액_B0M           int64  
 14  연체잔액_일시불_B0M       int64  
 15  연체잔액_할부_B0M        int64  
 16  연체잔액_현금서비스_B0M     int64  
 17  연체잔액_카드론_B0M       int64  
 18  연체잔액_대환론_B0M       int64  
 19  잔액_현금서비스_B1M       int64  
 20  잔액_현금서비스_B2M       int64  
 21  잔액_카드론_B1M         int64  
 22  잔액_카드론_B2M         int64  
 23  잔액_카드론_B3M         int64  
 24  잔액_카드론_B4M         int64  
 25  잔액_카드론_B5M        

In [11]:
# 소수점 6자리까지 출력하고 지수 표기 비활성화
pd.set_option('display.float_format', '{:.0f}'.format)

In [12]:
for i in range(1, 81) :
    a1= all_df.iloc[:,i].describe()
    print(a1)
    print('-'*30)

count          2400000
unique          400000
top       TRAIN_000000
freq                 6
Name: ID, dtype: object
------------------------------
count   2400000
mean       3303
std        6428
min           0
25%           0
50%        1298
75%        3778
max      222767
Name: 잔액_일시불_B0M, dtype: float64
------------------------------
count   2400000
mean       1448
std        3257
min           0
25%           0
50%           0
75%        1787
max       71465
Name: 잔액_할부_B0M, dtype: float64
------------------------------
count   2400000
mean        708
std        3639
min           0
25%           0
50%           0
75%           0
max       77356
Name: 잔액_현금서비스_B0M, dtype: float64
------------------------------
count   2400000
mean        349
std        2183
min           0
25%           0
50%           0
75%           0
max       37547
Name: 잔액_리볼빙일시불이월_B0M, dtype: float64
------------------------------
count   2400000
mean          7
std         411
min           0
25%           0

In [21]:
# 각 컬럼에서 0의 개수를 구하는 코드
for i in range(1, 104) :
    a1= all_df.iloc[:,i].value_counts()
    print(a1)
    print('-'*30)

ID
TRAIN_000000    6
TRAIN_266650    6
TRAIN_266672    6
TRAIN_266671    6
TRAIN_266670    6
               ..
TRAIN_133331    6
TRAIN_133330    6
TRAIN_133329    6
TRAIN_133328    6
TRAIN_399999    6
Name: count, Length: 400000, dtype: int64
------------------------------
잔액_일시불_B0M
0         684962
7           1270
8           1240
9           1112
10           968
           ...  
33163          1
41698          1
40967          1
164026         1
23438          1
Name: count, Length: 46915, dtype: int64
------------------------------
잔액_할부_B0M
0        1629476
1714         289
1809         286
1979         285
1583         282
          ...   
37836          1
26109          1
28214          1
20330          1
23638          1
Name: count, Length: 27079, dtype: int64
------------------------------
잔액_현금서비스_B0M
0        2281554
8166          25
8157          22
7732          21
6833          21
          ...   
32479          1
30566          1
20452          1
31597          1
3912

IndexError: single positional indexer is out-of-bounds

In [16]:
results = []

for i in range(0, all_df.shape[1]):  # 1번 컬럼부터
    col_name = all_df.columns[i]
    series = all_df.iloc[:, i]
    desc = series.describe()

    # 숫자형만 정수로 변환, object나 category는 그대로 출력
    if pd.api.types.is_numeric_dtype(series):
        desc = desc.round().astype(int)
        desc_str = f"[{col_name}]\n" + "\n".join(f"{idx:<6} {val:>10}" for idx, val in desc.items())
    else:
        desc_str = f"[{col_name}]\n" + desc.to_string()

    results.append(desc_str)

# DataFrame으로 정리 (엑셀에서 셀 하나에 붙여넣기 가능)
df_result = pd.DataFrame({'통계 요약': results})
df_result.to_csv("describe_summary_integer.csv", index=False, encoding='utf-8-sig')

In [18]:
import csv

results = []

for i in range(0, all_df.shape[1]):
    col_name = all_df.columns[i]
    series = all_df.iloc[:, i]
    desc = series.describe()

    if pd.api.types.is_numeric_dtype(series):
        desc = desc.round().astype(int)
        desc_str = f"[{col_name}]\n" + "\n".join(f"{idx:<6} {val:>10}" for idx, val in desc.items())
    else:
        desc_str = f"[{col_name}]\n" + desc.to_string()

    results.append(desc_str)

# DataFrame 만들기
df_result = pd.DataFrame({'통계 요약': results})

# ✅ 셀 안에 줄바꿈 유지한 채 저장
df_result.to_csv(
    "describe_summary_integer.csv",
    index=False,
    encoding='utf-8-sig',
    quoting=csv.QUOTE_ALL  # 줄바꿈 포함 문자열을 셀 하나에 넣기 위해 전체를 따옴표로 감쌈
)

In [32]:
a1 = all_df.iloc[:,2].value_counts()
print(a1.loc[7])

1270
