In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

In [3]:
import pandas as pd

# 저장된 피처 튜닝 데이터 불러오기
import pandas as pd
train = pd.read_csv('/content/drive/MyDrive/악성 URL/DATA/train_FE')

In [4]:
train_df = train.loc[:,['URL', 'label', 'length', 'subdomain_count', 'special_char_count',
       'number_of_meaning_words', 'tld_malicious', 'path_depth',
       'max_numeric_sequence','https', 'blacklist_word_count',
       'digit_count', 'suspicious_keyword_flag']]


In [5]:
X,y = train_df.drop(columns=['URL','label']),train_df['label']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [6]:
# LightGBM 하이퍼파라미터 그리드 설정
param_grid = {
    'n_estimators': [100, 200],  # 트리 개수
    'learning_rate': [0.01, 0.1],  # 학습률
    'max_depth': [20, 30],  # 최대 깊이
    'num_leaves': [31, 50],  # 리프 노드 수
    'min_child_samples': [25, 50, 100],  # 리프 노드의 최소 샘플 수
    'colsample_bytree': [0.8, 1.0]  # 트리별 특성 샘플링 비율
}

# GridSearchCV 실행
grid_search = GridSearchCV(
    lgb.LGBMClassifier(),
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train[:100000], y_train[:100000])

# 최적의 파라미터 출력
print('최적의 파라미터 조합:', grid_search.best_params_)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
[LightGBM] [Info] Number of positive: 22394, number of negative: 77606
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014511 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 549
[LightGBM] [Info] Number of data points in the train set: 100000, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.223940 -> initscore=-1.242852
[LightGBM] [Info] Start training from score -1.242852
최적의 파라미터 조합: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 30, 'min_child_samples': 25, 'n_estimators': 200, 'num_leaves': 50}


In [7]:
# 최적의 LightGBM 모델 가져오기
LGBM_model = grid_search.best_estimator_

# 테스트 데이터에 대한 예측 확률
y_pred_proba = LGBM_model.predict_proba(X_test)[:, 1]

# ROC AUC 점수 계산
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f'ROC AUC Score: {roc_auc}')

ROC AUC Score: 0.9276794504586703
