<a href="https://colab.research.google.com/github/DSHYOJIN/my-first-repository/blob/master/modu_ds6_test_notebook_ipynb%EC%9D%98_%EC%82%AC%EB%B3%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 신용카드 사기 탐지 분류 문제

이 노트북의 다음 라이브러리 버전은 "파이썬 머신러닝 완벽 가이드 개정2판"을 기준으로 설정되었습니다.

### 라이브러리 버전
2. `xgboost` : 1.5.0
3. `lightgbm` : 3.3.2

### 기타 사용 라이브러리
1. `hyperopt`

```python
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
```

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# 훈련 데이터와 테스트 데이터는 미리 준비되어 있습니다.

train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')
```

```python
train_df.info()
```

```python
test_df.info()
```

```python
train_copy = train_df.copy()
train_copy.drop('Time', axis=1, inplace=True)
```

```python
X_features = train_copy.iloc[:, :-1]
y_target = train_copy.iloc[:, -1]
```

```python
train_df.dropna(subset=['Class'], inplace=True)
print("Rows with NaN in 'Class' column removed from train_df.")
```

```python
train_copy = train_df.copy()
train_copy.drop('Time', axis=1, inplace=True)
X_features = train_copy.iloc[:, :-1]
y_target = train_copy.iloc[:, -1]

print("X_features and y_target have been updated with cleaned data.")
```

```python
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_features, y_target, test_size=0.2, random_state=42, stratify=y_target)

print("Data successfully split into training and validation sets.")
```

```python
print('학습 데이터 간 레이블 값 비율')
print(y_train.value_counts()/y_train.shape[0] * 100)
print('검증 데이터 간 레이블 값 비율')
print(y_val.value_counts()/y_val.shape[0] * 100)
```

```python
from sklearn.preprocessing import StandardScaler

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform 'Amount' column in X_train
X_train['Amount'] = scaler.fit_transform(X_train[['Amount']])

# Transform 'Amount' column in X_val using the scaler fitted on X_train
X_val['Amount'] = scaler.transform(X_val[['Amount']])

print("StandardScaler applied to 'Amount' feature in X_train and X_val.")
```

```python
from sklearn.linear_model import LogisticRegression

# 학습
lr_clf = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
lr_clf.fit(X_train, y_train)

# 검증
lr_val = lr_clf.predict(X_val)
lr_val_proba = lr_clf.predict_proba(X_val)[:,1]

print('Logistic Regression model training and prediction on validation set completed.')
```

```python
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

lr_val_confusion = confusion_matrix(y_val, lr_val)
lr_val_accuracy = accuracy_score(y_val, lr_val)
lr_val_precision = precision_score(y_val, lr_val)
lr_val_recall = recall_score(y_val, lr_val)
lr_val_f1 = f1_score(y_val, lr_val)

lr_val_roc_score = roc_auc_score(y_val, lr_val_proba)

print("검증 데이터 혼동행렬:")
print(lr_val_confusion)
print("검증 데이터 정확도: ",lr_val_accuracy)
print("검증 데이터 정밀도: ",lr_val_precision)
print("검증 데이터 재현율: ",lr_val_recall)
print("검증 데이터 F1 점수: ",lr_val_f1)
print("검증 데이터 AUC: ",lr_val_roc_score)
```

```python
from lightgbm import LGBMClassifier

# Calculate scale_pos_weight
neg_count = y_train.value_counts()[0.0]
pos_count = y_train.value_counts()[1.0]
scale_pos_weight_value = neg_count / pos_count

# Initialize and train LightGBM Classifier
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=-1, boost_from_average=False,
                          scale_pos_weight=scale_pos_weight_value, random_state=42)
lgbm_clf.fit(X_train, y_train)

# Predict on the validation set
lgbm_val = lgbm_clf.predict(X_val)
lgbm_val_proba = lgbm_clf.predict_proba(X_val)[:, 1]

print("LightGBM Classifier training and prediction on validation set completed.")
```

```python
lgbm_val_confusion = confusion_matrix(y_val, lgbm_val)
lgbm_val_accuracy = accuracy_score(y_val, lgbm_val)
lgbm_val_precision = precision_score(y_val, lgbm_val)
lgbm_val_recall = recall_score(y_val, lgbm_val)
lgbm_val_f1 = f1_score(y_val, lgbm_val)
lgbm_val_roc_score = roc_auc_score(y_val, lgbm_val_proba)

print("LightGBM 검증 데이터 혼동행렬:")
print(lgbm_val_confusion)
print("LightGBM 검증 데이터 정확도: ",lgbm_val_accuracy)
print("LightGBM 검증 데이터 정밀도: ",lgbm_val_precision)
print("LightGBM 검증 데이터 재현율: ",lgbm_val_recall)
print("LightGBM 검증 데이터 F1 점수: ",lgbm_val_f1)
print("LightGBM 검증 데이터 AUC: ",lgbm_val_roc_score)
```

```python
!pip install pandas>=2.2.0 xgboost>=2.1.0 scikit-learn>=1.5.0
```

```python
from xgboost import XGBClassifier

# Calculate scale_pos_weight (already calculated for LightGBM, can reuse)
# neg_count = y_train.value_counts()[0.0]
# pos_count = y_train.value_counts()[1.0]
# scale_pos_weight_value = neg_count / pos_count

# Initialize and train XGBoost Classifier
xgb_clf = XGBClassifier(n_estimators=1000, learning_rate=0.05, max_depth=5,
                        use_label_encoder=False, eval_metric='logloss',
                        scale_pos_weight=scale_pos_weight_value, random_state=42, n_jobs=-1)
xgb_clf.fit(X_train.values, y_train.values)

# Predict on the validation set
xgb_val = xgb_clf.predict(X_val.values)
xgb_val_proba = xgb_clf.predict_proba(X_val.values)[:, 1]

print("XGBoost Classifier training and prediction on validation set completed.")
```

```python
xgb_val_confusion = confusion_matrix(y_val, xgb_val)
xgb_val_accuracy = accuracy_score(y_val, xgb_val)
xgb_val_precision = precision_score(y_val, xgb_val)
xgb_val_recall = recall_score(y_val, xgb_val)
xgb_val_f1 = f1_score(y_val, xgb_val)
xgb_val_roc_score = roc_auc_score(y_val, xgb_val_proba)

print("XGBoost 검증 데이터 혼동행렬:")
print(xgb_val_confusion)
print("XGBoost 검증 데이터 정확도: ", xgb_val_accuracy)
print("XGBoost 검증 데이터 정밀도: ", xgb_val_precision)
print("XGBoost 검증 데이터 재현율: ", xgb_val_recall)
print("XGBoost 검증 데이터 F1 점수: ", xgb_val_f1)
print("XGBoost 검증 데이터 AUC: ", xgb_val_roc_score)
```

```python
from hyperopt import hp, fmin, tpe, Trials

# Define the objective function for hyperopt
def objective_lgbm(params):
    model = LGBMClassifier(
        n_estimators=int(params['n_estimators']),
        learning_rate=params['learning_rate'],
        num_leaves=int(params['num_leaves']),
        max_depth=int(params['max_depth']),
        scale_pos_weight=scale_pos_weight_value,
        n_jobs=-1,
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    f1 = f1_score(y_val, y_pred)
    # Hyperopt minimizes the objective, so return negative f1-score
    return -f1

# Define the search space for LightGBM hyperparameters
space_lgbm = {
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 50),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'num_leaves': hp.quniform('num_leaves', 20, 150, 1),
    'max_depth': hp.quniform('max_depth', 5, 20, 1),
}

# Run hyperparameter tuning
trials_lgbm = Trials()
best_lgbm = fmin(
    fn=objective_lgbm,
    space=space_lgbm,
    algo=tpe.suggest,
    max_evals=50, # Number of evaluations
    trials=trials_lgbm,
    rstate=np.random.default_rng(42) # For reproducible results
)

print("Hyperparameter tuning for LightGBM completed.")
print("Best LightGBM hyperparameters found:", best_lgbm)
```

```python
from lightgbm import LGBMClassifier

# Extract best hyperparameters from best_lgbm dictionary
best_n_estimators = int(best_lgbm['n_estimators'])
best_learning_rate = best_lgbm['learning_rate']
best_num_leaves = int(best_lgbm['num_leaves'])
best_max_depth = int(best_lgbm['max_depth'])

# Initialize and train LightGBM Classifier with best hyperparameters
tuned_lgbm_clf = LGBMClassifier(
    n_estimators=best_n_estimators,
    learning_rate=best_learning_rate,
    num_leaves=best_num_leaves,
    max_depth=best_max_depth,
    scale_pos_weight=scale_pos_weight_value, # Reuse previously calculated value
    n_jobs=-1,
    boost_from_average=False,
    random_state=42
)
tuned_lgbm_clf.fit(X_train, y_train)

# Predict on the validation set using the tuned model
tuned_lgbm_val = tuned_lgbm_clf.predict(X_val)
tuned_lgbm_val_proba = tuned_lgbm_clf.predict_proba(X_val)[:, 1]

print("LightGBM Classifier trained with best hyperparameters and predictions made on validation set.")
```

```python
tuned_lgbm_val_confusion = confusion_matrix(y_val, tuned_lgbm_val)
tuned_lgbm_val_accuracy = accuracy_score(y_val, tuned_lgbm_val)
tuned_lgbm_val_precision = precision_score(y_val, tuned_lgbm_val)
tuned_lgbm_val_recall = recall_score(y_val, tuned_lgbm_val)
tuned_lgbm_val_f1 = f1_score(y_val, tuned_lgbm_val)
tuned_lgbm_val_roc_score = roc_auc_score(y_val, tuned_lgbm_val_proba)

print("Tuned LightGBM 검증 데이터 혼동행렬:")
print(tuned_lgbm_val_confusion)
print("Tuned LightGBM 검증 데이터 정확도: ",tuned_lgbm_val_accuracy)
print("Tuned LightGBM 검증 데이터 정밀도: ",tuned_lgbm_val_precision)
print("Tuned LightGBM 검증 데이터 재현율: ",tuned_lgbm_val_recall)
print("Tuned LightGBM 검증 데이터 F1 점수: ",tuned_lgbm_val_f1)
print("Tuned LightGBM 검증 데이터 AUC: ",tuned_lgbm_val_roc_score)
```

```python
test_copy = test_df.copy()
test_copy.drop('Time', axis=1, inplace=True)
test_copy['Amount'] = scaler.transform(test_copy[['Amount']])

print("'Time' column removed and 'Amount' column scaled in test_copy.")
```

```python
train_copy = train_df.copy()
train_copy.drop('Time', axis=1, inplace=True)
train_copy.drop('id', axis=1, inplace=True) # Drop 'id' from training features
X_features = train_copy.iloc[:, :-1]
y_target = train_copy.iloc[:, -1]

print("X_features and y_target have been updated with cleaned data.")
```

```python
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_features, y_target, test_size=0.2, random_state=42, stratify=y_target)

print("Data successfully split into training and validation sets.")
```

```python
X_train['Amount'] = scaler.fit_transform(X_train[['Amount']])
X_val['Amount'] = scaler.transform(X_val[['Amount']])

print("StandardScaler reapplied to 'Amount' feature in X_train and X_val.")
```

```python
from lightgbm import LGBMClassifier

# Extract best hyperparameters from best_lgbm dictionary
best_n_estimators = int(best_lgbm['n_estimators'])
best_learning_rate = best_lgbm['learning_rate']
best_num_leaves = int(best_lgbm['num_leaves'])
best_max_depth = int(best_lgbm['max_depth'])

# Initialize and train LightGBM Classifier with best hyperparameters
tuned_lgbm_clf = LGBMClassifier(
    n_estimators=best_n_estimators,
    learning_rate=best_learning_rate,
    num_leaves=best_num_leaves,
    max_depth=best_max_depth,
    scale_pos_weight=scale_pos_weight_value, # Reuse previously calculated value
    n_jobs=-1,
    boost_from_average=False,
    random_state=42
)
tuned_lgbm_clf.fit(X_train, y_train)

# Predict on the validation set using the tuned model
tuned_lgbm_val = tuned_lgbm_clf.predict(X_val)
tuned_lgbm_val_proba = tuned_lgbm_clf.predict_proba(X_val)[:, 1]

print("LightGBM Classifier re-trained with best hyperparameters and predictions made on validation set.")
```

```python
test_predictions_proba = tuned_lgbm_clf.predict_proba(test_copy.drop('id', axis=1))[:, 1]
test_predictions_class = (test_predictions_proba > 0.5).astype(int)

print("Predicted probabilities and class labels generated for the test data.")
```

```python
submission_df = pd.DataFrame({'id': test_df['id'], 'Class': test_predictions_class})
submission_df.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully.")
```

In [1]:
!pip uninstall -y xgboost lightgbm

Found existing installation: xgboost 3.1.1
Uninstalling xgboost-3.1.1:
  Successfully uninstalled xgboost-3.1.1
Found existing installation: lightgbm 4.6.0
Uninstalling lightgbm-4.6.0:
  Successfully uninstalled lightgbm-4.6.0


In [63]:
!pip install xgboost==1.5.0 lightgbm==3.3.2 numpy==1.26.4 scikit-learn==1.3.0

Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn==1.3.0
  Using cached scikit_learn-1.3.0-cp312-cp312-linux_x86_64.whl
Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy, scikit-learn
  Attempting uninstall: numpy
    Found existing installation: numpy 2.3.4
    Uninstalling numpy-2.3.4:
      Successfully uninstalled numpy-2.3.4
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
[31mERROR: pip's dependency resolver does not currently t

In [3]:
!pip install hyperopt



In [4]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# 훈련 데이터와 테스트 데이터는 미리 준비되어 있습니다.

train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')

# 데이터셋 정보

2013년 9월 유럽 신용카드 사용자들의 거래 내역을 포함하고 있습니다.
이 데이터셋은 이틀 간의 거래 내역으로 데이터셋은 크게 불균형하며, 사기 거래(양성 클래스)는 전체 거래의 약 0.17%에 불과합니다.

이 데이터셋은 PCA 변환을 거친 수치형 변수들만 포함하고 있습니다. 안타깝게도 기밀 유지 문제로 인해 원본 특징과 데이터에 대한 자세한 배경 정보는 제공할 수 없습니다. V1, V2, …, V28은 PCA로 얻은 주성분이며, PCA 변환을 거치지 않은 유일한 특징은 'Time'과 'Amount'입니다. 'Time'은 각 거래와 데이터셋의 첫 거래 사이의 경과 시간(초)을 나타내며, 'Amount'는 거래 금액입니다. 'Amount' 특징은 예시 의존적 비용 민감 학습 등에 사용될 수 있습니다. 'Class'는 응답 변수로, 사기 거래인 경우 1, 그렇지 않은 경우 0의 값을 가집니다.

클래스 불균형 비율을 고려하여, 정밀도-재현율 곡선 아래 영역(AUPRC)을 사용하여 정확도를 측정할 것을 권장합니다. 불균형 분류에서는 혼동 행렬 정확도가 큰 의미가 없습니다.

이 데이터셋은 Worldline과 ULB(브뤼셀 자유 대학교) 머신러닝 그룹(http://mlg.ulb.ac.be) 간의 빅데이터 마이닝 및 사기 탐지 관련 연구 협력을 통해 수집 및 분석되었습니다.
관련 주제에 대한 현재 및 과거 프로젝트에 대한 자세한 내용은 https://www.researchgate.net/project/Fraud-detection-5 및 DefeatFraud 프로젝트 페이지에서 확인할 수 있습니다.

In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115645 entries, 0 to 115644
Data columns (total 32 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      115645 non-null  int64  
 1   Time    115645 non-null  float64
 2   V1      115645 non-null  float64
 3   V2      115645 non-null  float64
 4   V3      115645 non-null  float64
 5   V4      115645 non-null  float64
 6   V5      115645 non-null  float64
 7   V6      115645 non-null  float64
 8   V7      115645 non-null  float64
 9   V8      115645 non-null  float64
 10  V9      115645 non-null  float64
 11  V10     115645 non-null  float64
 12  V11     115645 non-null  float64
 13  V12     115645 non-null  float64
 14  V13     115645 non-null  float64
 15  V14     115645 non-null  float64
 16  V15     115645 non-null  float64
 17  V16     115645 non-null  float64
 18  V17     115645 non-null  float64
 19  V18     115645 non-null  float64
 20  V19     115645 non-null  float64
 21  V20     11

In [7]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113923 entries, 0 to 113922
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      113923 non-null  int64  
 1   Time    113923 non-null  float64
 2   V1      113923 non-null  float64
 3   V2      113923 non-null  float64
 4   V3      113923 non-null  float64
 5   V4      113923 non-null  float64
 6   V5      113923 non-null  float64
 7   V6      113923 non-null  float64
 8   V7      113923 non-null  float64
 9   V8      113923 non-null  float64
 10  V9      113923 non-null  float64
 11  V10     113923 non-null  float64
 12  V11     113923 non-null  float64
 13  V12     113923 non-null  float64
 14  V13     113923 non-null  float64
 15  V14     113923 non-null  float64
 16  V15     113923 non-null  float64
 17  V16     113923 non-null  float64
 18  V17     113923 non-null  float64
 19  V18     113923 non-null  float64
 20  V19     113923 non-null  float64
 21  V20     11

# 뼈대 만들기

머신러닝 파이프라인은

데이터 준비 → 모델 정의 → 학습 → 예측 → 평가의 형태를 띕니다.

최소한의 형태로 파이프라인만 구성해 보겠습니다.

In [8]:
train_copy = train_df.copy()
train_copy.drop('Time', axis=1, inplace=True)

In [9]:
X_features = train_copy.iloc[:, :-1]
y_target = train_copy.iloc[:, -1]

In [12]:
train_df.dropna(subset=['Class'], inplace=True)
print("Rows with NaN in 'Class' column removed from train_df.")

Rows with NaN in 'Class' column removed from train_df.


In [13]:
train_copy = train_df.copy()
train_copy.drop('Time', axis=1, inplace=True)
X_features = train_copy.iloc[:, :-1]
y_target = train_copy.iloc[:, -1]

print("X_features and y_target have been updated with cleaned data.")

X_features and y_target have been updated with cleaned data.


In [14]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_features, y_target, test_size=0.2, random_state=42, stratify=y_target)

print("Data successfully split into training and validation sets.")

Data successfully split into training and validation sets.


In [15]:
print('학습 데이터 간 레이블 값 비율')
print(y_train.value_counts()/y_train.shape[0] * 100)
print('검증 데이터 간 레이블 값 비율')
print(y_val.value_counts()/y_val.shape[0] * 100)

학습 데이터 간 레이블 값 비율
Class
0.0    99.790304
1.0     0.209696
Name: count, dtype: float64
검증 데이터 간 레이블 값 비율
Class
0.0    99.792468
1.0     0.207532
Name: count, dtype: float64


In [16]:
from sklearn.preprocessing import StandardScaler

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform 'Amount' column in X_train
X_train['Amount'] = scaler.fit_transform(X_train[['Amount']])

# Transform 'Amount' column in X_val using the scaler fitted on X_train
X_val['Amount'] = scaler.transform(X_val[['Amount']])

print("StandardScaler applied to 'Amount' feature in X_train and X_val.")

StandardScaler applied to 'Amount' feature in X_train and X_val.


# lr

In [17]:
from sklearn.linear_model import LogisticRegression

# 학습
lr_clf = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
lr_clf.fit(X_train, y_train)

# 검증
lr_val = lr_clf.predict(X_val)
lr_val_proba = lr_clf.predict_proba(X_val)[:,1]

print('Logistic Regression model training and prediction on validation set completed.')

Logistic Regression model training and prediction on validation set completed.


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

lr_val_confusion = confusion_matrix(y_val, lr_val)
lr_val_accuracy = accuracy_score(y_val, lr_val)
lr_val_precision = precision_score(y_val, lr_val)
lr_val_recall = recall_score(y_val, lr_val)
lr_val_f1 = f1_score(y_val, lr_val)

lr_val_roc_score = roc_auc_score(y_val, lr_val_proba)

print("검증 데이터 혼동행렬:")
print(lr_val_confusion)
print("검증 데이터 정확도: ",lr_val_accuracy)
print("검증 데이터 정밀도: ",lr_val_precision)
print("검증 데이터 재현율: ",lr_val_recall)
print("검증 데이터 F1 점수: ",lr_val_f1)
print("검증 데이터 AUC: ",lr_val_roc_score)


검증 데이터 혼동행렬:
[[22650   431]
 [    6    42]]
검증 데이터 정확도:  0.9811059708590947
검증 데이터 정밀도:  0.08879492600422834
검증 데이터 재현율:  0.875
검증 데이터 F1 점수:  0.16122840690978887
검증 데이터 AUC:  0.9865735525612698


# LGBM

In [19]:
from lightgbm import LGBMClassifier

# Calculate scale_pos_weight
neg_count = y_train.value_counts()[0.0]
pos_count = y_train.value_counts()[1.0]
scale_pos_weight_value = neg_count / pos_count

# Initialize and train LightGBM Classifier
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=-1, boost_from_average=False,
                          scale_pos_weight=scale_pos_weight_value, random_state=42)
lgbm_clf.fit(X_train, y_train)

# Predict on the validation set
lgbm_val = lgbm_clf.predict(X_val)
lgbm_val_proba = lgbm_clf.predict_proba(X_val)[:, 1]

print("LightGBM Classifier training and prediction on validation set completed.")

LightGBM Classifier training and prediction on validation set completed.


In [20]:
lgbm_val_confusion = confusion_matrix(y_val, lgbm_val)
lgbm_val_accuracy = accuracy_score(y_val, lgbm_val)
lgbm_val_precision = precision_score(y_val, lgbm_val)
lgbm_val_recall = recall_score(y_val, lgbm_val)
lgbm_val_f1 = f1_score(y_val, lgbm_val)
lgbm_val_roc_score = roc_auc_score(y_val, lgbm_val_proba)

print("LightGBM 검증 데이터 혼동행렬:")
print(lgbm_val_confusion)
print("LightGBM 검증 데이터 정확도: ",lgbm_val_accuracy)
print("LightGBM 검증 데이터 정밀도: ",lgbm_val_precision)
print("LightGBM 검증 데이터 재현율: ",lgbm_val_recall)
print("LightGBM 검증 데이터 F1 점수: ",lgbm_val_f1)
print("LightGBM 검증 데이터 AUC: ",lgbm_val_roc_score)

LightGBM 검증 데이터 혼동행렬:
[[23081     0]
 [   11    37]]
LightGBM 검증 데이터 정확도:  0.9995244065891306
LightGBM 검증 데이터 정밀도:  1.0
LightGBM 검증 데이터 재현율:  0.7708333333333334
LightGBM 검증 데이터 F1 점수:  0.8705882352941177
LightGBM 검증 데이터 AUC:  0.9809348959461607


# XGBoost

In [1]:
!pip install pandas>=2.2.0 xgboost>=2.1.0 scikit-learn>=1.5.0

In [21]:
from xgboost import XGBClassifier

# Calculate scale_pos_weight (already calculated for LightGBM, can reuse)
# neg_count = y_train.value_counts()[0.0]
# pos_count = y_train.value_counts()[1.0]
# scale_pos_weight_value = neg_count / pos_count

# Initialize and train XGBoost Classifier
xgb_clf = XGBClassifier(n_estimators=1000, learning_rate=0.05, max_depth=5,
                        use_label_encoder=False, eval_metric='logloss',
                        scale_pos_weight=scale_pos_weight_value, random_state=42, n_jobs=-1)
xgb_clf.fit(X_train, y_train)

# Predict on the validation set
xgb_val = xgb_clf.predict(X_val)
xgb_val_proba = xgb_clf.predict_proba(X_val)[:, 1]

print("XGBoost Classifier training and prediction on validation set completed.")



AttributeError: module 'pandas' has no attribute 'Int64Index'

In [22]:
from xgboost import XGBClassifier

# Calculate scale_pos_weight (already calculated for LightGBM, can reuse)
# neg_count = y_train.value_counts()[0.0]
# pos_count = y_train.value_counts()[1.0]
# scale_pos_weight_value = neg_count / pos_count

# Initialize and train XGBoost Classifier
xgb_clf = XGBClassifier(n_estimators=1000, learning_rate=0.05, max_depth=5,
                        use_label_encoder=False, eval_metric='logloss',
                        scale_pos_weight=scale_pos_weight_value, random_state=42, n_jobs=-1)
xgb_clf.fit(X_train.values, y_train.values)

# Predict on the validation set
xgb_val = xgb_clf.predict(X_val.values)
xgb_val_proba = xgb_clf.predict_proba(X_val.values)[:, 1]

print("XGBoost Classifier training and prediction on validation set completed.")

XGBoost Classifier training and prediction on validation set completed.


In [23]:
xgb_val_confusion = confusion_matrix(y_val, xgb_val)
xgb_val_accuracy = accuracy_score(y_val, xgb_val)
xgb_val_precision = precision_score(y_val, xgb_val)
xgb_val_recall = recall_score(y_val, xgb_val)
xgb_val_f1 = f1_score(y_val, xgb_val)
xgb_val_roc_score = roc_auc_score(y_val, xgb_val_proba)

print("XGBoost 검증 데이터 혼동행렬:")
print(xgb_val_confusion)
print("XGBoost 검증 데이터 정확도: ", xgb_val_accuracy)
print("XGBoost 검증 데이터 정밀도: ", xgb_val_precision)
print("XGBoost 검증 데이터 재현율: ", xgb_val_recall)
print("XGBoost 검증 데이터 F1 점수: ", xgb_val_f1)
print("XGBoost 검증 데이터 AUC: ", xgb_val_roc_score)

XGBoost 검증 데이터 혼동행렬:
[[23078     3]
 [   13    35]]
XGBoost 검증 데이터 정확도:  0.999308227766008
XGBoost 검증 데이터 정밀도:  0.9210526315789473
XGBoost 검증 데이터 재현율:  0.7291666666666666
XGBoost 검증 데이터 F1 점수:  0.813953488372093
XGBoost 검증 데이터 AUC:  0.9791982583076988


# Hyperparameter

In [24]:
from hyperopt import hp, fmin, tpe, Trials

# Define the objective function for hyperopt
def objective_lgbm(params):
    model = LGBMClassifier(
        n_estimators=int(params['n_estimators']),
        learning_rate=params['learning_rate'],
        num_leaves=int(params['num_leaves']),
        max_depth=int(params['max_depth']),
        scale_pos_weight=scale_pos_weight_value,
        n_jobs=-1,
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    f1 = f1_score(y_val, y_pred)
    # Hyperopt minimizes the objective, so return negative f1-score
    return -f1

# Define the search space for LightGBM hyperparameters
space_lgbm = {
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 50),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'num_leaves': hp.quniform('num_leaves', 20, 150, 1),
    'max_depth': hp.quniform('max_depth', 5, 20, 1),
}

# Run hyperparameter tuning
trials_lgbm = Trials()
best_lgbm = fmin(
    fn=objective_lgbm,
    space=space_lgbm,
    algo=tpe.suggest,
    max_evals=50, # Number of evaluations
    trials=trials_lgbm,
    rstate=np.random.default_rng(42) # For reproducible results
)

print("Hyperparameter tuning for LightGBM completed.")
print("Best LightGBM hyperparameters found:", best_lgbm)

100%|██████████| 50/50 [04:13<00:00,  5.08s/trial, best loss: -0.7326732673267327]
Hyperparameter tuning for LightGBM completed.
Best LightGBM hyperparameters found: {'learning_rate': np.float64(0.4358068115219468), 'max_depth': np.float64(12.0), 'n_estimators': np.float64(250.0), 'num_leaves': np.float64(118.0)}


In [25]:
from lightgbm import LGBMClassifier

# Extract best hyperparameters from best_lgbm dictionary
best_n_estimators = int(best_lgbm['n_estimators'])
best_learning_rate = best_lgbm['learning_rate']
best_num_leaves = int(best_lgbm['num_leaves'])
best_max_depth = int(best_lgbm['max_depth'])

# Initialize and train LightGBM Classifier with best hyperparameters
tuned_lgbm_clf = LGBMClassifier(
    n_estimators=best_n_estimators,
    learning_rate=best_learning_rate,
    num_leaves=best_num_leaves,
    max_depth=best_max_depth,
    scale_pos_weight=scale_pos_weight_value, # Reuse previously calculated value
    n_jobs=-1,
    boost_from_average=False,
    random_state=42
)
tuned_lgbm_clf.fit(X_train, y_train)

# Predict on the validation set using the tuned model
tuned_lgbm_val = tuned_lgbm_clf.predict(X_val)
tuned_lgbm_val_proba = tuned_lgbm_clf.predict_proba(X_val)[:, 1]

print("LightGBM Classifier trained with best hyperparameters and predictions made on validation set.")

LightGBM Classifier trained with best hyperparameters and predictions made on validation set.


In [26]:
tuned_lgbm_val_confusion = confusion_matrix(y_val, tuned_lgbm_val)
tuned_lgbm_val_accuracy = accuracy_score(y_val, tuned_lgbm_val)
tuned_lgbm_val_precision = precision_score(y_val, tuned_lgbm_val)
tuned_lgbm_val_recall = recall_score(y_val, tuned_lgbm_val)
tuned_lgbm_val_f1 = f1_score(y_val, tuned_lgbm_val)
tuned_lgbm_val_roc_score = roc_auc_score(y_val, tuned_lgbm_val_proba)

print("Tuned LightGBM 검증 데이터 혼동행렬:")
print(tuned_lgbm_val_confusion)
print("Tuned LightGBM 검증 데이터 정확도: ",tuned_lgbm_val_accuracy)
print("Tuned LightGBM 검증 데이터 정밀도: ",tuned_lgbm_val_precision)
print("Tuned LightGBM 검증 데이터 재현율: ",tuned_lgbm_val_recall)
print("Tuned LightGBM 검증 데이터 F1 점수: ",tuned_lgbm_val_f1)
print("Tuned LightGBM 검증 데이터 AUC: ",tuned_lgbm_val_roc_score)

Tuned LightGBM 검증 데이터 혼동행렬:
[[23081     0]
 [   12    36]]
Tuned LightGBM 검증 데이터 정확도:  0.999481170824506
Tuned LightGBM 검증 데이터 정밀도:  1.0
Tuned LightGBM 검증 데이터 재현율:  0.75
Tuned LightGBM 검증 데이터 F1 점수:  0.8571428571428571
Tuned LightGBM 검증 데이터 AUC:  0.9816046387360455


## 테스트 데이터 전처리

### Subtask:
`test_df`에서 'Time' 컬럼을 제거하고, 훈련 데이터에 사용했던 `StandardScaler`를 사용하여 'Amount' 컬럼을 스케일링합니다.


In [55]:
test_copy = test_df.copy()
test_copy.drop('Time', axis=1, inplace=True)
test_copy['Amount'] = scaler.transform(test_copy[['Amount']])

print("'Time' column removed and 'Amount' column scaled in test_copy.")

'Time' column removed and 'Amount' column scaled in test_copy.


## 테스트 데이터 예측

### Subtask:
하이퍼파라미터 튜닝을 통해 최적화된 LightGBM 모델 (`tuned_lgbm_clf`)을 사용하여 전처리된 테스트 데이터에 대한 예측 확률을 생성하고, 예측 확률을 기반으로 최종 클래스 (`Class`)를 결정합니다.


In [56]:
test_predictions_proba = tuned_lgbm_clf.predict_proba(test_copy.drop('id', axis=1))[:, 1]
test_predictions_class = (test_predictions_proba > 0.5).astype(int)

print("Predicted probabilities and class labels generated for the test data.")

ValueError: Number of features of the model must match the input. Model n_features_ is 30 and input n_features is 29

In [57]:
train_copy = train_df.copy()
train_copy.drop('Time', axis=1, inplace=True)
train_copy.drop('id', axis=1, inplace=True) # Drop 'id' from training features
X_features = train_copy.iloc[:, :-1]
y_target = train_copy.iloc[:, -1]

print("X_features and y_target have been updated with cleaned data.")

X_features and y_target have been updated with cleaned data.


In [58]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_features, y_target, test_size=0.2, random_state=42, stratify=y_target)

print("Data successfully split into training and validation sets.")

Data successfully split into training and validation sets.


In [59]:
X_train['Amount'] = scaler.fit_transform(X_train[['Amount']])
X_val['Amount'] = scaler.transform(X_val[['Amount']])

print("StandardScaler reapplied to 'Amount' feature in X_train and X_val.")

StandardScaler reapplied to 'Amount' feature in X_train and X_val.


In [60]:
from lightgbm import LGBMClassifier

# Extract best hyperparameters from best_lgbm dictionary
best_n_estimators = int(best_lgbm['n_estimators'])
best_learning_rate = best_lgbm['learning_rate']
best_num_leaves = int(best_lgbm['num_leaves'])
best_max_depth = int(best_lgbm['max_depth'])

# Initialize and train LightGBM Classifier with best hyperparameters
tuned_lgbm_clf = LGBMClassifier(
    n_estimators=best_n_estimators,
    learning_rate=best_learning_rate,
    num_leaves=best_num_leaves,
    max_depth=best_max_depth,
    scale_pos_weight=scale_pos_weight_value, # Reuse previously calculated value
    n_jobs=-1,
    boost_from_average=False,
    random_state=42
)
tuned_lgbm_clf.fit(X_train, y_train)

# Predict on the validation set using the tuned model
tuned_lgbm_val = tuned_lgbm_clf.predict(X_val)
tuned_lgbm_val_proba = tuned_lgbm_clf.predict_proba(X_val)[:, 1]

print("LightGBM Classifier re-trained with best hyperparameters and predictions made on validation set.")

LightGBM Classifier re-trained with best hyperparameters and predictions made on validation set.


In [61]:
test_predictions_proba = tuned_lgbm_clf.predict_proba(test_copy.drop('id', axis=1))[:, 1]
test_predictions_class = (test_predictions_proba > 0.5).astype(int)

print("Predicted probabilities and class labels generated for the test data.")

Predicted probabilities and class labels generated for the test data.


In [62]:
submission_df = pd.DataFrame({'id': test_df['id'], 'Class': test_predictions_class})
submission_df.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully.")

Submission file 'submission.csv' created successfully.
