In [1]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Silence library warnings so they do not clutter cell output.
# NOTE(review): this suppresses every warning category, including
# deprecation notices; consider narrowing the filter once the notebook is stable.
import warnings
warnings.filterwarnings('ignore')

# Apply seaborn's default theme before the matplotlib overrides below,
# so that the explicit rcParams assignments are the ones that take effect.
sns.set()

# Choose a Korean-capable font based on the operating system instead of
# toggling a commented-out line by hand: 'Malgun Gothic' on Windows,
# 'AppleGothic' on macOS (platform.system() reports 'Darwin' there).
# Any other system keeps the previous default of 'Malgun Gothic'.
import platform

_KOREAN_FONT_BY_OS = {'Windows': 'Malgun Gothic', 'Darwin': 'AppleGothic'}
plt.rcParams['font.family'] = _KOREAN_FONT_BY_OS.get(platform.system(), 'Malgun Gothic')

# Default figure size (inches), base font size, and an ASCII minus sign so
# negative tick labels still render when a non-Latin font is active.
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
plt.rcParams['axes.unicode_minus'] = False

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 모델의 최적의 하이퍼 파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 머신러닝 알고리즘 - 분류
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 머신러닝 알고리즘 - 회귀
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# 차원 축소
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# 군집
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from sklearn.cluster import estimate_bandwidth


# 학습 모델 저장을 위한 라이브러리
import pickle

In [2]:
# Read the six monthly channel-information parquet files (2018-07 through
# 2018-12), one DataFrame per month, in chronological order.
df1, df2, df3, df4, df5, df6 = (
    pd.read_parquet(f'open/train/6.채널정보/{year_month}_train_채널정보.parquet')
    for year_month in ('201807', '201808', '201809', '201810', '201811', '201812')
)

In [3]:
# Stack the monthly frames row-wise into a single frame; ignore_index=True
# discards the per-month indexes and numbers the rows 0..n-1.
all_df = pd.concat([df1, df2, df3, df4, df5, df6], ignore_index=True)
all_df

Unnamed: 0,기준년월,ID,인입횟수_ARS_R6M,이용메뉴건수_ARS_R6M,인입일수_ARS_R6M,인입월수_ARS_R6M,인입후경과월_ARS,인입횟수_ARS_B0M,이용메뉴건수_ARS_B0M,인입일수_ARS_B0M,...,당사PAY_방문횟수_R6M,당사PAY_방문월수_R6M,당사멤버쉽_방문횟수_B0M,당사멤버쉽_방문횟수_R6M,당사멤버쉽_방문월수_R6M,OS구분코드,홈페이지_금융건수_R6M,홈페이지_선결제건수_R6M,홈페이지_금융건수_R3M,홈페이지_선결제건수_R3M
0,201807,TRAIN_000000,10회 이상,10회 이상,8,6,0,2,6,2,...,0,0,22,221,6,Android,0,0,0,0
1,201807,TRAIN_000001,1회 이상,1회 이상,0,0,0,0,0,0,...,0,0,0,0,0,,0,0,0,0
2,201807,TRAIN_000002,1회 이상,1회 이상,1,1,0,2,5,1,...,0,0,0,0,0,Android,11,6,5,5
3,201807,TRAIN_000003,10회 이상,10회 이상,10,6,0,2,6,2,...,0,0,23,219,6,Android,0,0,0,0
4,201807,TRAIN_000004,1회 이상,1회 이상,0,0,0,0,0,0,...,0,0,0,0,0,Android,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399995,201812,TRAIN_399995,1회 이상,1회 이상,0,0,0,0,0,0,...,0,0,0,0,0,,0,0,0,0
2399996,201812,TRAIN_399996,1회 이상,1회 이상,0,0,0,0,0,0,...,0,0,0,0,1,,0,0,0,0
2399997,201812,TRAIN_399997,1회 이상,1회 이상,0,0,0,0,0,0,...,0,0,0,0,0,,0,0,0,0
2399998,201812,TRAIN_399998,1회 이상,1회 이상,0,0,0,0,0,0,...,0,0,0,0,0,,0,0,0,0


In [6]:
# Let DataFrame.info() print per-column details for frames with up to 200
# columns (wider frames would be summarised instead).
pd.options.display.max_info_columns = 200
# Secondary setting: never truncate columns when a frame is displayed.
pd.options.display.max_columns = None
all_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400000 entries, 0 to 2399999
Data columns (total 105 columns):
 #    Column             Dtype 
---   ------             ----- 
 0    기준년월               int64 
 1    ID                 object
 2    인입횟수_ARS_R6M       object
 3    이용메뉴건수_ARS_R6M     object
 4    인입일수_ARS_R6M       int64 
 5    인입월수_ARS_R6M       int64 
 6    인입후경과월_ARS         int64 
 7    인입횟수_ARS_B0M       int64 
 8    이용메뉴건수_ARS_B0M     int64 
 9    인입일수_ARS_B0M       int64 
 10   방문횟수_PC_R6M        object
 11   방문일수_PC_R6M        object
 12   방문월수_PC_R6M        int64 
 13   방문후경과월_PC_R6M      int64 
 14   방문횟수_앱_R6M         object
 15   방문일수_앱_R6M         int64 
 16   방문월수_앱_R6M         int64 
 17   방문후경과월_앱_R6M       int64 
 18   방문횟수_모바일웹_R6M      int64 
 19   방문일수_모바일웹_R6M      int64 
 20   방문월수_모바일웹_R6M      int64 
 21   방문후경과월_모바일웹_R6M    int64 
 22   방문횟수_PC_B0M        int64 
 23   방문일수_PC_B0M        int64 
 24   방문횟수_앱_B0M         int64 
 25   방문일수_앱_B0M      

In [25]:
# Count nulls per column and keep only the columns that have at least one.
missing_summary = all_df.isnull().sum().loc[lambda null_counts: null_counts > 0]
print(missing_summary)

OS구분코드    1633566
dtype: int64


In [7]:
# Display floats in fixed-point notation with zero decimal places, i.e.
# rounded to whole numbers and never in scientific notation.
pd.options.display.float_format = '{:.0f}'.format

In [8]:
# Print describe() for every column except the first one (기준년월).
# The upper bound is taken from the frame itself: the previous hard-coded
# bound of 104 stopped one column short of the 105-column frame.
for col_idx in range(1, all_df.shape[1]):
    col_stats = all_df.iloc[:, col_idx].describe()
    print(col_stats)
    print('-'*30)

count          2400000
unique          400000
top       TRAIN_000000
freq                 6
Name: ID, dtype: object
------------------------------
count     2400000
unique          2
top         1회 이상
freq      2328933
Name: 인입횟수_ARS_R6M, dtype: object
------------------------------
count     2400000
unique          4
top         1회 이상
freq      2313787
Name: 이용메뉴건수_ARS_R6M, dtype: object
------------------------------
count   2400000
mean          1
std           2
min           0
25%           0
50%           0
75%           1
max          12
Name: 인입일수_ARS_R6M, dtype: float64
------------------------------
count   2400000
mean          1
std           1
min           0
25%           0
50%           0
75%           1
max           6
Name: 인입월수_ARS_R6M, dtype: float64
------------------------------
count   2400000
mean          1
std           1
min           0
25%           0
50%           0
75%           0
max           5
Name: 인입후경과월_ARS, dtype: float64
----------------------------

In [12]:
# Frequency of each distinct value in the 방문월수_PC_R6M column.
all_df['방문월수_PC_R6M'].value_counts()

방문월수_PC_R6M
0    2064931
1     173058
6     161856
2        151
3          3
5          1
Name: count, dtype: int64

In [19]:
# Print the value frequencies of every column except the first one (기준년월).
# The upper bound is taken from the frame itself: the previous hard-coded
# bound of 104 stopped one column short of the 105-column frame.
for col_idx in range(1, all_df.shape[1]):
    col_counts = all_df.iloc[:, col_idx].value_counts()
    print(col_counts)
    print('-'*30)

ID
TRAIN_000000    6
TRAIN_266650    6
TRAIN_266672    6
TRAIN_266671    6
TRAIN_266670    6
               ..
TRAIN_133331    6
TRAIN_133330    6
TRAIN_133329    6
TRAIN_133328    6
TRAIN_399999    6
Name: count, Length: 400000, dtype: int64
------------------------------
인입횟수_ARS_R6M
1회 이상     2328933
10회 이상      71067
Name: count, dtype: int64
------------------------------
이용메뉴건수_ARS_R6M
1회 이상     2313787
10회 이상      70027
30회 이상      10172
20회 이상       6014
Name: count, dtype: int64
------------------------------
인입일수_ARS_R6M
0     1683964
1      266478
2      200257
3       74634
4       39222
5       26364
6       21823
8       20994
9       19334
7       19244
10      15987
12       9904
11       1795
Name: count, dtype: int64
------------------------------
인입월수_ARS_R6M
0    1696515
1     410652
2     208764
6      78256
3       4967
4        667
5        179
Name: count, dtype: int64
------------------------------
인입후경과월_ARS
0    1880190
5     128172
4     117176
3     104298


In [23]:
# Most frequent value of the column at position 2, together with its count.
# value_counts() sorts by descending frequency, so the first entry is the mode.
a1 = all_df.iloc[:, 2].value_counts()
# Use .iloc for the positional lookup: the index holds the column's values
# as labels, and integer keys passed to [] on such a Series are deprecated.
print(a1.index[0], a1.iloc[0])

1회 이상 2328933


In [27]:
# For every column except the first one (기준년월), print how many rows hold
# that column's most frequent value.
# The upper bound is taken from the frame itself: the previous hard-coded
# bound of 104 stopped one column short of the 105-column frame.
for col_idx in range(1, all_df.shape[1]):
    col_counts = all_df.iloc[:, col_idx].value_counts()
    print(col_counts.iloc[0])
    print('-'*30)

6
------------------------------
2328933
------------------------------
2313787
------------------------------
1683964
------------------------------
1696515
------------------------------
1880190
------------------------------
2251753
------------------------------
2250867
------------------------------
2251797
------------------------------
2221841
------------------------------
2236992
------------------------------
2064931
------------------------------
2092569
------------------------------
2082674
------------------------------
1850835
------------------------------
1840205
------------------------------
1865997
------------------------------
2349905
------------------------------
2349924
------------------------------
2347481
------------------------------
2370628
------------------------------
2230827
------------------------------
2231071
------------------------------
1970714
------------------------------
1971052
------------------------------
2399883
-----------------------

In [28]:
import pandas as pd
import csv


def _summarize_column(col_name, series):
    """Return a multi-line text summary of one column.

    Parameters
    ----------
    col_name : label of the column, used as the ``[name]`` header line.
    series : pandas.Series holding the column's values.

    Returns
    -------
    str
        For numeric columns: the header, each ``describe()`` statistic
        rounded to an integer and right-aligned, and a final line with the
        number of values equal to 0. For all other columns: the header
        followed by ``describe().to_string()``.
    """
    # describe() is computed once and reused by both branches.
    desc = series.describe()
    header = f"[{col_name}]"

    if pd.api.types.is_numeric_dtype(series):
        desc = desc.round().astype(int)
        # Count zeros directly rather than building a full frequency table.
        zero_count = (series == 0).sum()

        # Aligned "statistic value" lines, then the zero count.
        desc_text = "\n".join(f"{idx:<6} {val:>10}" for idx, val in desc.items())
        return f"{header}\n{desc_text}\n{'0의 개수:':<6} {zero_count:>10}"

    return f"{header}\n{desc.to_string()}"


# Summarise every column except the first one (position 0).
results = [
    _summarize_column(all_df.columns[i], all_df.iloc[:, i])
    for i in range(1, all_df.shape[1])
]

# Save as CSV; QUOTE_ALL keeps the embedded line breaks inside one cell.
df_result = pd.DataFrame({'통계 요약': results})
df_result.to_csv("describe_summary_with_zeros.csv", index=False, encoding='utf-8-sig', quoting=csv.QUOTE_ALL)
