In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from scipy.sparse import hstack
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import urllib.request
from urllib.parse import urlparse
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Embedding, Dropout, MaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [21]:
data = pd.read_csv('model_data.csv')
val_data = pd.read_csv('valid_data.csv')
val_data['url'] = val_data['url'].apply(str)

LightGBM
=============

In [36]:
# 훈련용 데이터
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(data['Tokenized_url'])

other_features = data.drop(columns=['url', 'Label', 'Tokenized_url'])
X_other = other_features.values

X = hstack([X_text, X_other])
y = data['Label']

GBM_X_train, GBM_X_test, GBM_y_train, GBM_y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# 검증용 데이터
shuffled_val_data = val_data.sample(frac=1, random_state=0).reset_index(drop=True)

X_val_text = vectorizer.transform(shuffled_val_data['Tokenized_url'])

val_other_features = shuffled_val_data.drop(columns=['url', 'Label', 'Tokenized_url'])
X_val_other = val_other_features.values

GBM_X_val = hstack([X_val_text, X_val_other])

GBM_y_val = shuffled_val_data['Label']

In [7]:
pipeline = Pipeline([
    ('scaler', StandardScaler(with_mean=False)),  # with_mean=False is necessary for sparse matrix
    ('classifier', LGBMClassifier())
])

param_grid = {
    'classifier__num_leaves': [31, 63],
    'classifier__learning_rate': [0.01, 0.1],
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [-1, 15, 30]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 89647, number of negative: 95949
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.094715 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3783
[LightGBM] [Info] Number of data points in the train set: 185596, number of used features: 1515
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.483022 -> initscore=-0.067937
[LightGBM] [Info] Start training from score -0.067937
[LightGBM] [Info] Number of positive: 89647, number of negative: 95949
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.120397 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3816
[LightGBM] [Info] Number of data points in the train set: 185596, number of used features: 1543
[LightGBM] [

In [37]:
# 최적의 하이퍼파라미터로 모델 설정
best_params = {
    'learning_rate': 0.1,
    'max_depth': -1,
    'n_estimators': 100,
    'num_leaves': 31
}

# 최적의 모델 훈련
LGBM_best_model = LGBMClassifier(**best_params)
LGBM_best_model.fit(GBM_X_train, GBM_y_train)

[LightGBM] [Info] Number of positive: 112058, number of negative: 119937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.176230 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4772
[LightGBM] [Info] Number of data points in the train set: 231995, number of used features: 1971
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.483019 -> initscore=-0.067950
[LightGBM] [Info] Start training from score -0.067950


CNN
=======

In [24]:
#독립 변수
X_data = data['url']

# 종속 변수
y_data = data['Label']

# 검증용 데이터
shuffled_val_data = val_data.sample(frac=1, random_state=0).reset_index(drop=True)
X_val = shuffled_val_data['url']
y_val = shuffled_val_data['Label']

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=0, stratify=y_data)

In [13]:
# URL을 구성 요소로 분리하는 함수
def tokenize_url(url):
    parsed_url = urlparse(url)
    scheme = parsed_url.scheme
    netloc = parsed_url.netloc
    path = parsed_url.path

    url_parts = [scheme] + netloc.split('.') + path.split('/')

    url_parts = [part for part in url_parts if part]

    return url_parts

X_train_list = [tokenize_url(url) for url in X_train]

X_train_list = [' '.join(parts) for parts in X_train_list]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_list)

X_train_encoded = tokenizer.texts_to_sequences(X_train_list)

word_to_index = tokenizer.word_index

In [16]:
threshold = 2
total_cnt = len(word_to_index) 
rare_cnt = 0 
total_freq = 0 
rare_freq = 0 

for key, value in tokenizer.word_counts.items():
    total_freq = total_freq + value

    if(value < threshold):
        rare_cnt = rare_cnt + 1
        rare_freq = rare_freq + value

vocab_size = len(word_to_index) + 1

max_len = 560
X_train_padded = pad_sequences(X_train_encoded, maxlen = max_len)

In [17]:
embedding_dim = 32
dropout_ratio = 0.3
num_filters = 32
kernel_size = 5

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(Dropout(dropout_ratio))
model.add(Conv1D(num_filters, kernel_size, padding='valid', activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(dropout_ratio))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
mc = ModelCheckpoint('best_model.keras', monitor = 'val_acc', mode='max', verbose=1, save_best_only=True)

history = model.fit(X_train_padded, y_train, epochs=10, batch_size=64, validation_split=0.2, callbacks=[es, mc])

Epoch 1/10
[1m2900/2900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - acc: 0.9277 - loss: 0.1964
Epoch 1: val_acc improved from -inf to 0.97937, saving model to best_model.keras
[1m2900/2900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 27ms/step - acc: 0.9277 - loss: 0.1964 - val_acc: 0.9794 - val_loss: 0.0656
Epoch 2/10
[1m2899/2900[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 27ms/step - acc: 0.9882 - loss: 0.0355
Epoch 2: val_acc improved from 0.97937 to 0.98791, saving model to best_model.keras
[1m2900/2900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 27ms/step - acc: 0.9882 - loss: 0.0355 - val_acc: 0.9879 - val_loss: 0.0452
Epoch 3/10
[1m2899/2900[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 27ms/step - acc: 0.9977 - loss: 0.0081
Epoch 3: val_acc improved from 0.98791 to 0.98862, saving model to best_model.keras
[1m2900/2900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 27ms/step - acc: 0.9977 - loss: 0.0081

In [41]:
from keras.models import load_model

# 저장된 최상의 모델 로드
best_model = load_model('best_model.keras')

# 검증 데이터 전처리
X_val_list = [tokenize_url(url) for url in X_val]
X_val_list = [' '.join(parts) for parts in X_val_list]
X_val_encoded = tokenizer.texts_to_sequences(X_val_list)
X_val_padded = pad_sequences(X_val_encoded, maxlen=max_len)

# 예측 수행
cnn_predictions = best_model.predict(X_val_padded)

# 예측 결과 확인
rounded_predictions = np.round(cnn_predictions).flatten()  # 예측값을 이진 클래스로 변환 (0 또는 1)
accuracy = np.mean(rounded_predictions == y_val)  # 정확도 계산
print(f'검증 데이터 정확도: {accuracy}')

# 예측 클래스 확인 (예시로 처음 10개 데이터에 대해 출력)
for i in range(10):
    print(f'실제값: {y_val.iloc[i]}, 예측값: {rounded_predictions[i]}')

[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
검증 데이터 정확도: 0.9375571428571429
실제값: 0, 예측값: 0.0
실제값: 1, 예측값: 1.0
실제값: 0, 예측값: 0.0
실제값: 1, 예측값: 1.0
실제값: 0, 예측값: 0.0
실제값: 1, 예측값: 1.0
실제값: 1, 예측값: 0.0
실제값: 0, 예측값: 0.0
실제값: 0, 예측값: 0.0
실제값: 0, 예측값: 0.0


In [40]:
# 검증 데이터에 대한 예측
lgb_predictions = LGBM_best_model.predict(GBM_X_val)
rounded_lgb_predictions = np.round(y_val_pred).astype(int)  # 이진 클래스로 변환

# 정확도 계산
lgb_accuracy = np.mean(rounded_lgb_predictions == GBM_y_val)
print(f'LightGBM 모델 검증 데이터 정확도: {lgb_accuracy}')

LightGBM 모델 검증 데이터 정확도: 0.9999857142857143




In [42]:
# 앙상블 예측: CNN 모델과 LightGBM 모델 예측값 평균
ensemble_predictions = (cnn_predictions.flatten() + lgb_predictions) / 2

# 예측값을 이진 클래스로 변환
rounded_ensemble_predictions = np.round(ensemble_predictions).astype(int)

# 정확도 계산
ensemble_accuracy = np.mean(rounded_ensemble_predictions == y_val)
print(f'앙상블 모델 검증 데이터 정확도: {ensemble_accuracy}')

앙상블 모델 검증 데이터 정확도: 0.9999857142857143
