# 1. Import - 환경설정

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pyarrow.parquet as pq
import tqdm
import optuna
import gc
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# test 데이터 준비 (샘플로 train에서 분리)
from sklearn.model_selection import train_test_split

# 모델 학습 및 평가
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score

* 최적화 관련 코드

In [2]:
# Shrink 64-bit numeric columns to the smallest dtype that holds their values.
def optimize_memory(df):
    """Downcast the 64-bit numeric columns of ``df`` in place and return it.

    ``int64`` columns go through ``pd.to_numeric(..., downcast='integer')``
    and ``float64`` columns through ``pd.to_numeric(..., downcast='float')``.
    Columns of any other dtype are left untouched.

    Returns the same DataFrame object that was passed in.
    """
    numeric_names = df.select_dtypes(include=['number']).columns
    for name in numeric_names:
        current = df[name].dtype
        if current == 'int64':
            target = 'integer'
        elif current == 'float64':
            target = 'float'
        else:
            # Already narrower than 64 bits (or another numeric kind): skip.
            continue
        df[name] = pd.to_numeric(df[name], downcast=target)
    return df

# 2. 파일을 불러오고, 폴더별로 합치기

In [3]:
# Helper that lists the raw parquet files of every data folder.
HOME = os.getcwd()


def makeFileList(option = "train"):
    """Return ``{folder_number: [file paths]}`` for ``<cwd>/data/<option>``.

    Parameters
    ----------
    option : str, default "train"
        Sub-directory of ``data`` to scan (the notebook uses "train" and
        "test").

    Returns
    -------
    dict[int, list[str]]
        Keys are the folder numbers 1-8; each value holds the full paths of
        the files in that folder, sorted by file name.

    Notes
    -----
    ``os.listdir`` returns entries in arbitrary order. Callers index the
    lists positionally, so the names are sorted to make the order
    deterministic and identical across folders.
    """
    folder_name = {1:"1.회원정보", 2:"2.신용정보", 3:"3.승인매출정보", 4:"4.청구입금정보", 5:"5.잔액정보", 6:"6.채널정보", 7:"7.마케팅정보", 8:"8.성과정보"}

    # Resolve the folders against the working directory at call time.
    base_dir = os.getcwd()
    folder_list = [
        os.path.join(base_dir, "data", option, value)
        for value in folder_name.values()
    ]
    print(len(folder_list))

    # File names per folder, keyed 1..8 in the order of ``folder_name``.
    fileNameList = {
        number: sorted(os.listdir(folder))
        for number, folder in enumerate(folder_list, start=1)
    }
    print(fileNameList)

    # Turn every file name into a full path.
    filePathList = {
        number: [os.path.join(folder_list[number - 1], name) for name in names]
        for number, names in fileNameList.items()
    }

    return filePathList

In [4]:
# Stack the monthly files of one folder into a single DataFrame.
def load_cat(i, filePathList):
    """Read the first six files of folder ``i`` and concatenate them row-wise.

    Parameters
    ----------
    i : int
        Folder number, used as the key into ``filePathList``.
    filePathList : dict[int, list[str]]
        Mapping produced by ``makeFileList``.

    Returns
    -------
    pandas.DataFrame
        The six files stacked in list order. Each file is passed through
        ``optimize_memory`` before concatenation, and the per-file index
        values are kept (no ``ignore_index``).

    Raises
    ------
    IndexError
        If the folder lists fewer than six files.
    """
    print("현재 진행 상황 : {}".format(i))
    paths = filePathList[i]
    # Read everything first and concatenate once; growing the frame with one
    # concat per file re-copies the accumulated rows every time.
    monthly_frames = [
        optimize_memory(pd.read_parquet(paths[j])) for j in range(6)
    ]
    return pd.concat(monthly_frames)

# Remove the key columns that every table shares.
def drop_useless(df):
    """Return a copy of ``df`` without the "기준년월" and "ID" columns.

    Raises ``KeyError`` (from ``DataFrame.drop``) if either column is missing.
    """
    key_columns = ["기준년월", "ID"]
    return df.drop(columns=key_columns)
# Attach the Segment target to a table that does not carry it.
def add_target(df, filePathList):
    """Left-join ``df`` onto the (기준년월, ID, Segment) rows of folder 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Table without the target; must contain "기준년월" and "ID".
    filePathList : dict[int, list[str]]
        Mapping produced by ``makeFileList``; the first six files of
        folder 1 are read as the source of "Segment".

    Returns
    -------
    pandas.DataFrame
        One row per row of the folder-1 data (left side of the join), with
        columns "기준년월", "ID", "Segment" followed by the columns of ``df``.
    """
    key_columns = ["기준년월", "ID"]
    wanted = key_columns + ["Segment"]
    # Only three columns are needed, so ask the parquet reader for just those
    # instead of loading every column of six files.
    target_frames = [
        pd.read_parquet(filePathList[1][j], columns=wanted) for j in range(6)
    ]
    target = pd.concat(target_frames)[wanted]
    return pd.merge(target, df, on=key_columns, how="left")

# Build a DataFrame from the raw monthly parquet files.
def make_df(i, option = False):
    """Load raw data for one folder, or for every folder, into a DataFrame.

    Parameters
    ----------
    i : int
        Folder number (key of the mapping returned by ``makeFileList``).
        Not used when ``option == "all"``.
    option : bool or str, default False
        * falsy - return the training data of folder ``i`` only. For
          ``1 < i <= 6`` the Segment target is merged in via ``add_target``;
          for any ``i`` other than 1-6 a notice is printed and the data is
          returned without that merge.
        * ``"all"`` - concatenate all eight folders column-wise for train and
          for test, then stack train and test rows.
        * any other truthy value - folder ``i`` of train (with the target
          merged in) stacked on top of folder ``i`` of test.

    Returns
    -------
    pandas.DataFrame
        In every mode "기준년월" and "ID" are removed by ``drop_useless``.
    """
    train_files = makeFileList()

    if not option:
        train = load_cat(i, train_files)
        print(train.shape)
        # NOTE(review): folders 7 and 8 exist in makeFileList but fall into
        # the notice branch and get no Segment column - confirm intent.
        if 1 < i <= 6:
            train = add_target(train, train_files)
        elif i != 1:
            print("숫자가 아니거나 할당 없는 카테고리입니다.")
        return drop_useless(train)

    # The test listing is only needed once test data is actually requested.
    test_files = makeFileList(option = "test")

    if option == "all":
        # One concat per split instead of growing a frame folder by folder.
        # NOTE(review): column-wise concat aligns on the index, not on
        # (기준년월, ID); presumably every folder has identical row order -
        # verify.
        train = pd.concat(
            [drop_useless(load_cat(k, train_files)) for k in range(1, 9)],
            axis = 1,
        )
        test = pd.concat(
            [drop_useless(load_cat(k, test_files)) for k in range(1, 9)],
            axis = 1,
        )
    else:
        # NOTE(review): for i == 1 the loaded table comes from the same files
        # add_target reads Segment from, so the merge would suffix that
        # column - confirm whether i == 1 is meant to be supported here.
        train = drop_useless(add_target(load_cat(i, train_files), train_files))
        test = drop_useless(load_cat(i, test_files))

    print(train.shape)
    print(test.shape)

    result = pd.concat([train, test], ignore_index=True)
    print(result.shape)
    return result



# Select the columns whose name contains a given substring.
# Handy for pulling out groups of related columns.
def find_R(df, param1):
    """Return a new DataFrame with the columns of ``df`` whose name contains ``param1``.

    Column order follows ``df``. When nothing matches, the result has no
    columns but keeps the index of ``df``.
    """
    matching = [name for name in df.columns if param1 in name]
    return pd.DataFrame(df[matching])

## 사용하면 되는 함수
- make_df(i, option = False)
    - 데이터를 불러와 데이터 프레임을 만들어 내는 함수
    - i는 폴더를 불러오기 위한 변수
    - 기본적으로 option = False
    - option에 참으로 평가되는 값(예: True)을 넣으면, 해당 폴더의 train 데이터(Segment 포함)와 test 데이터를 함께 불러와 행 방향으로 이어 붙인다.
    - option에 "all"이 들어가면 모든 데이터를 합쳐서 만들어줌

In [5]:
# Load the raw data through the helper defined above.
# With option="all" every folder is loaded for both train and test, so the
# result does not depend on the folder number passed as the first argument.
df = make_df(2, option = "all")

8
{1: ['201807_train_회원정보.parquet', '201808_train_회원정보.parquet', '201809_train_회원정보.parquet', '201810_train_회원정보.parquet', '201811_train_회원정보.parquet', '201812_train_회원정보.parquet'], 2: ['201807_train_신용정보.parquet', '201808_train_신용정보.parquet', '201809_train_신용정보.parquet', '201810_train_신용정보.parquet', '201811_train_신용정보.parquet', '201812_train_신용정보.parquet'], 3: ['201807_train_승인매출정보.parquet', '201808_train_승인매출정보.parquet', '201809_train_승인매출정보.parquet', '201810_train_승인매출정보.parquet', '201811_train_승인매출정보.parquet', '201812_train_승인매출정보.parquet'], 4: ['201807_train_청구정보.parquet', '201808_train_청구정보.parquet', '201809_train_청구정보.parquet', '201810_train_청구정보.parquet', '201811_train_청구정보.parquet', '201812_train_청구정보.parquet'], 5: ['201807_train_잔액정보.parquet', '201808_train_잔액정보.parquet', '201809_train_잔액정보.parquet', '201810_train_잔액정보.parquet', '201811_train_잔액정보.parquet', '201812_train_잔액정보.parquet'], 6: ['201807_train_채널정보.parquet', '201808_train_채널정보.parquet', '201809_train_채널정보.parquet',

# 3. 전처리 - concat & merge

In [12]:
import pandas as pd
import numpy as np
import gc

# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Training data: load the eight cleaned per-category tables.
(
    customer_train_df,
    credit_train_df,
    sales_train_df,
    billing_train_df,
    balance_train_df,
    channel_train_df,
    marketing_train_df,
    performance_train_df,
) = [
    pd.read_parquet(path)
    for path in (
        "./data/train/customer_train_cleaned.parquet",
        "./data/train/credit_train_cleaned.parquet",
        "./data/train/sales_train_cleaned.parquet",
        "./data/train/billing_train_cleaned.parquet",
        "./data/train/balance_train_cleaned.parquet",
        "./data/train/channel_train_cleaned.parquet",
        "./data/train/marketing_train_cleaned.parquet",
        "./data/train/performance_train_cleaned.parquet",
    )
]

In [None]:
# Left-join every category table onto the customer table, one after another,
# matching rows on the (기준년월, ID) key pair.
merge_keys = ['기준년월', 'ID']
train_df = customer_train_df
for right_df in (
    credit_train_df,
    sales_train_df,
    billing_train_df,
    balance_train_df,
    channel_train_df,
    marketing_train_df,
    performance_train_df,
):
    train_df = train_df.merge(right_df, on=merge_keys, how='left')

In [None]:
# Shrink int64 columns to int32 to save memory, but only when every value
# fits: both bounds are checked, because a column holding values below the
# int32 minimum would otherwise wrap around silently in astype().
int32_limits = np.iinfo(np.int32)
for col in train_df.select_dtypes(include='int64').columns:
    col_min, col_max = train_df[col].min(), train_df[col].max()
    if int32_limits.min <= col_min and col_max <= int32_limits.max:
        train_df[col] = train_df[col].astype('int32')

In [None]:
# Persist the merged training table; the modeling cells read this file back.
train_df.to_parquet('./data/train/train_df_cleaned.parquet')

In [None]:
# Test data: load the eight cleaned per-category tables.
(
    customer_test_df,
    credit_test_df,
    sales_test_df,
    billing_test_df,
    balance_test_df,
    channel_test_df,
    marketing_test_df,
    performance_test_df,
) = [
    pd.read_parquet(path)
    for path in (
        "./data/test/customer_test_cleaned.parquet",
        "./data/test/credit_test_cleaned.parquet",
        "./data/test/sales_test_cleaned.parquet",
        "./data/test/billing_test_cleaned.parquet",
        "./data/test/balance_test_cleaned.parquet",
        "./data/test/channel_test_cleaned.parquet",
        "./data/test/marketing_test_cleaned.parquet",
        "./data/test/performance_test_cleaned.parquet",
    )
]

In [None]:
# Left-join every category table onto the customer table, one after another,
# matching rows on the (기준년월, ID) key pair.
merge_keys = ['기준년월', 'ID']
test_df = customer_test_df
for right_df in (
    credit_test_df,
    sales_test_df,
    billing_test_df,
    balance_test_df,
    channel_test_df,
    marketing_test_df,
    performance_test_df,
):
    test_df = test_df.merge(right_df, on=merge_keys, how='left')

# Shrink int64 columns to int32, but only when every value fits: both bounds
# are checked, because a column holding values below the int32 minimum would
# otherwise wrap around silently in astype().
int32_limits = np.iinfo(np.int32)
for col in test_df.select_dtypes(include='int64').columns:
    col_min, col_max = test_df[col].min(), test_df[col].max()
    if int32_limits.min <= col_min and col_max <= int32_limits.max:
        test_df[col] = test_df[col].astype('int32')

# Every float64 column is stored as float32.
for col in test_df.select_dtypes(include='float64').columns:
    test_df[col] = test_df[col].astype('float32')

In [None]:
# Persist the merged test table; the prediction cell reads this file back.
test_df.to_parquet('./data/test/test_df_cleaned.parquet')

# 4. Modeling(1) - feature importance

In [None]:
import pandas as pd
import numpy as np
import gc

# from google.colab import drive
# drive.mount('/content/drive')

import sklearn
from sklearn.utils.class_weight import compute_class_weight
import imblearn
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from xgboost import XGBClassifier

In [None]:
# Load the merged training table and split it into features and encoded target.
train_df = pd.read_parquet('./data/train/train_df_cleaned.parquet')

non_feature_cols = ("ID", "Segment")
feature_cols = [name for name in train_df.columns if name not in non_feature_cols]
X = train_df[feature_cols].copy()
segment_to_code = {'A':0, 'B':1,'C':2,'D':3,'E':4}
y = train_df["Segment"].map(segment_to_code)

# The full table is no longer needed once X and y exist.
del train_df
gc.collect()

# Balanced class weights computed from the label distribution.
classes = np.unique(y)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y)
class_weights = {label: weight for label, weight in zip(classes, weights)}

# One weight per sample, looked up from its class.
w_train = pd.Series(y).map(class_weights)

# XGBoost trained on every feature; used only to extract feature importances.
temp_model_params = dict(
    objective='multi:softprob',
    num_class=5,
    eval_metric='mlogloss',
    n_estimators=700,
    tree_method='hist',
    device='cuda',
    random_state=42,
)
temp_model = xgb.XGBClassifier(**temp_model_params)

temp_model.fit(X, y, sample_weight=w_train, verbose=False)

In [None]:
# Rank features by XGBoost importance and keep the 300 highest.
feature_scores = pd.DataFrame({
    'feature': X.columns,
    'importance': temp_model.feature_importances_,
})
importance_df = feature_scores.sort_values(by='importance', ascending=False)

top300_features = importance_df['feature'].head(300).tolist()

print(top300_features)

# Save the selected names so the training cells can reload them.
top300_df = pd.DataFrame({'feature': top300_features})
top300_csv_path = "./data/top300_features_XGB_balanced.csv"  # adjust to your own path
top300_df.to_csv(top300_csv_path, index=False, encoding="utf-8-sig")

# 5. 표준화

In [None]:
# Standardization is not applied here (the scaler lines were left disabled).
# Rows that have a Segment value are written out as training data, rows
# without one as test data.
has_segment = df["Segment"].notna()
df.loc[has_segment].to_parquet("train_data.parquet")
df.loc[~has_segment].to_parquet("test_data.parquet")

# 6. Modeling(2) - final model train

## XGBoost 모델 학습

In [None]:
import pandas as pd
import numpy as np
import gc
import os

# [수정] 구글 드라이브 마운트 코드 삭제 (로컬에서는 불필요)
# from google.colab import drive
# drive.mount('/content/drive')

import sklearn
from sklearn.utils.class_weight import compute_class_weight
import imblearn
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from xgboost import XGBClassifier

# 현재 버전 및 준비 상태 확인
print(f"XGBoost version: {xgb.__version__}")
print("환경 준비 완료! 이제 데이터를 로드하세요.")

In [None]:
train_df = pd.read_parquet('./data/train/train_df_cleaned.parquet') # adjust the path to your environment

# Features are every column except the identifier and the target.
non_feature_cols = ("ID", "Segment")
feature_cols = [name for name in train_df.columns if name not in non_feature_cols]
X = train_df[feature_cols].copy()
segment_to_code = {'A':0, 'B':1,'C':2,'D':3,'E':4}
y = train_df["Segment"].map(segment_to_code)
inverse_label_map = {code: segment for segment, code in segment_to_code.items()}

# Restrict to the 300 features selected earlier.
top300_df = pd.read_csv("./data/top300_features_XGB_balanced.csv") # adjust the path to your environment
top300_features = top300_df['feature'].tolist()
X_top300 = X[top300_features]

# Oversampling: per-class sample counts handed to SMOTE for classes 0, 1 and 2.
smote_targets = {0: 30000, 1: 30000, 2: 250000}
smote = SMOTE(sampling_strategy=smote_targets, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_top300, y)

# Balanced class weights on the resampled labels, then one weight per sample.
classes = np.unique(y_resampled)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_resampled)
class_weights = {label: weight for label, weight in zip(classes, weights)}
sample_weights = pd.Series(y_resampled).map(class_weights)

for cls, cls_weight in sorted(class_weights.items()):
    print(f"클래스 {cls}: weight = {cls_weight:.2f}")

In [None]:
# Final XGBoost classifier configuration.
xgb_params = dict(
    objective='multi:softprob',
    num_class=5,
    eval_metric='mlogloss',
    n_estimators=5000,
    tree_method='hist',
    device='cuda',
    random_state=42,
)
xgb_model = xgb.XGBClassifier(**xgb_params)

# Train on the whole resampled set; no validation split is held out.
xgb_model.fit(X_resampled, y_resampled, sample_weight=sample_weights, verbose=False)

In [None]:
# Save the trained XGBoost model; the soft-voting cell loads this file.
xgb_model.save_model('./data/softvoting_xgb_xgb변수_3_3_25.json')

## LightGBM 모델 학습

In [None]:
import pandas as pd
import numpy as np
import gc
import sklearn
from sklearn.utils.class_weight import compute_class_weight
import imblearn
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation, LGBMClassifier

# 버전 확인 (이미 설치한 GPU 버전이 잘 나오는지 확인)
print(f"LightGBM 버전: {lgb.__version__}")

# from google.colab import drive
# drive.mount('/content/drive')

# # LightGBM을 위한 GPU driver 설치 코드
# !mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
# !sudo apt install nvidia-driver-460 nvidia-cuda-toolkit clinfo
# !apt-get update --fix-missing
# !pip install -q  lightgbm==4.1.0 \
#   --config-settings=cmake.define.USE_GPU=ON \
#   --config-settings=cmake.define.OpenCL_INCLUDE_DIR="/usr/local/cuda/include/" \
#   --config-settings=cmake.define.OpenCL_LIBRARY="/usr/local/cuda/lib64/libOpenCL.so"

In [None]:
train_df = pd.read_parquet('./data/train/train_df_cleaned.parquet')

# Features are every column except the identifier and the target.
non_feature_cols = ("ID", "Segment")
feature_cols = [name for name in train_df.columns if name not in non_feature_cols]
X = train_df[feature_cols].copy()
segment_to_code = {'A':0, 'B':1,'C':2,'D':3,'E':4}
y = train_df["Segment"].map(segment_to_code)
inverse_label_map = {code: segment for segment, code in segment_to_code.items()}

# Restrict to the 300 features selected earlier.
top300_df = pd.read_csv("./data/top300_features_XGB_balanced.csv")
top300_features = top300_df['feature'].tolist()
X_top300 = X[top300_features]

# Oversampling: per-class sample counts handed to SMOTE for classes 0, 1 and 2.
smote_targets = {0: 30000, 1: 30000, 2: 250000}
smote = SMOTE(sampling_strategy=smote_targets, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_top300, y)

# Balanced class weights on the resampled labels, then one weight per sample.
classes = np.unique(y_resampled)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_resampled)
class_weights = {label: weight for label, weight in zip(classes, weights)}
sample_weights = pd.Series(y_resampled).map(class_weights)

for cls, cls_weight in sorted(class_weights.items()):
    print(f"클래스 {cls}: weight = {cls_weight:.2f}")

In [None]:
# LightGBM configuration: fixed settings first, then the tuned hyperparameters.
lgb_params = dict(
    objective='multiclass',
    boosting_type='gbdt',
    metric='multi_logloss',
    verbosity=-1,
    random_state=42,
    device='gpu',
    num_class=5,
    gpu_platform_id=0,
    gpu_device_id=0,

    num_boost_round=3348,  # original author's note: n_estimators
    learning_rate=0.16304239034369372,
    num_leaves=260,
    max_depth=6,
    min_data_in_leaf=1200,
    bagging_fraction=0.8999999999999999,
    bagging_freq=4,
    feature_fraction=0.8,
    lambda_l1=4.4610495762623494e-05,
    lambda_l2=0.6898684039866835,
    min_gain_to_split=0.3944497642389666,
)
lgb_model = LGBMClassifier(**lgb_params)

# Train on the whole resampled set; no validation split is held out.
# log_evaluation(period=0): original author's note - same as verbose_eval=False.
quiet_logging = log_evaluation(period=0)
lgb_model.fit(
    X_resampled,
    y_resampled,
    sample_weight=sample_weights,
    callbacks=[quiet_logging],
)

In [None]:
# Save the underlying Booster as a text model file; the soft-voting cell
# loads it with lgb.Booster(model_file=...).
lgb_model.booster_.save_model('./data/softvoting_lgbm_model_3_3_25.txt')
print("모델 저장 완료!")

## CatBoost 모델 학습

In [None]:
import pandas as pd
import numpy as np
import gc

# from google.colab import drive
# drive.mount('/content/drive')

# # catboost 설치
# !pip install catboost

import sklearn
from sklearn.utils.class_weight import compute_class_weight
import imblearn
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from xgboost import XGBClassifier
import catboost
from catboost import CatBoostClassifier

print(f"CatBoost 버전: {catboost.__version__}")
print("환경 준비 완료!")

In [None]:
train_df = pd.read_parquet('./data/train/train_df_cleaned.parquet')

# Features are every column except the identifier and the target.
non_feature_cols = ("ID", "Segment")
feature_cols = [name for name in train_df.columns if name not in non_feature_cols]
X = train_df[feature_cols].copy()
segment_to_code = {'A':0, 'B':1,'C':2,'D':3,'E':4}
y = train_df["Segment"].map(segment_to_code)
inverse_label_map = {code: segment for segment, code in segment_to_code.items()}

# Restrict to the 300 features selected earlier.
top300_df = pd.read_csv("./data/top300_features_XGB_balanced.csv")
top300_features = top300_df['feature'].tolist()
X_top300 = X[top300_features]

# Oversampling: per-class sample counts handed to SMOTE for classes 0, 1 and 2
# (these differ from the counts used for the XGBoost and LightGBM cells).
smote_targets = {0: 60000, 1: 120000, 2: 240000}
smote = SMOTE(sampling_strategy=smote_targets, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_top300, y)

# Balanced class weights on the resampled labels, then one weight per sample.
classes = np.unique(y_resampled)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_resampled)
class_weights = {label: weight for label, weight in zip(classes, weights)}
sample_weights = pd.Series(y_resampled).map(class_weights)

for cls, cls_weight in sorted(class_weights.items()):
    print(f"클래스 {cls}: weight = {cls_weight:.2f}")

In [None]:
# CatBoost on GPU; the balanced weights computed above are passed as a plain
# list in the same order as `classes`.
cat_class_weights = weights.tolist()
cat_params = dict(
    task_type='GPU',
    devices='0',
    random_state=42,
    class_weights=cat_class_weights,
    iterations=5000,
)
cat_model = CatBoostClassifier(**cat_params)

# Train on the whole resampled set.
cat_model.fit(X_resampled, y_resampled)

In [None]:
# Save the trained CatBoost model; the soft-voting cell loads this file.
cat_model.save_model('./data/softvoting_cat_xgb변수_6_12_24.cbm')

# 7. 모델예측 - Soft voting

In [None]:
# Merged test table written by the preprocessing section.
test_df = pd.read_parquet("./data/test/test_df_cleaned.parquet")

# Names of the 300 selected features.
top300_df = pd.read_csv("./data/top300_features_XGB_balanced.csv")
top300_features = top300_df['feature'].to_list()

In [None]:
# Reload the three trained models from the files written by the training cells.
xgb_model_loaded = xgb.XGBClassifier()
xgb_model_loaded.load_model('./data/softvoting_xgb_xgb변수_3_3_25.json')

# LightGBM is restored as a raw Booster from the text model file.
lgb_model_loaded = lgb.Booster(model_file='./data/softvoting_lgbm_model_3_3_25.txt')

cat_model_loaded = CatBoostClassifier()
cat_model_loaded.load_model('./data/softvoting_cat_xgb변수_6_12_24.cbm')

In [None]:
# Test features: the same 300 columns the models were trained on.
X_test = test_df[top300_features]

# Per-model class probabilities.
# XGBoost: predict_proba over the explicit iteration range.
proba_xgb = xgb_model_loaded.predict_proba(X_test, iteration_range=(0, 5000))
# LightGBM Booster: predict() is used as the probability output here.
proba_lgb = lgb_model_loaded.predict(X_test, num_iteration=3348)
# CatBoost: predict_proba.
proba_cat = cat_model_loaded.predict_proba(X_test)

# Soft voting: weighted sum of the three probability matrices, then the most
# probable class per row mapped back to its segment letter.
weight_xgb, weight_lgb, weight_cat = 0.4, 0.3, 0.3
ensemble_proba = (weight_xgb * proba_xgb) + (weight_lgb * proba_lgb) + (weight_cat * proba_cat)
ensemble_preds = np.argmax(ensemble_proba, axis=1)
inverse_label_map = dict(enumerate(['A', 'B', 'C', 'D', 'E']))
ensemble_preds_label = pd.Series(ensemble_preds).map(inverse_label_map)