## **실습 4. AI 모델링 최적화**
## 본 실습파일은 <u><b>학습자용</b></u> 입니다.
* 본 과정에서는 웹페이지에서 추출한 Feature(특징) 기반으로 악성사이트를 탐지하는 머신러닝 분류문제를 예제코드를 통해서 해결할 것입니다.
---


### **[실습 프로세스]**
### 0. 데이터 불러오기
### 1. 데이터 전처리
### 2. train_test_split을 이용하여, x_train, x_test, y_train, y_test로 데이터 분리
### 3. GridSearch 활용 AI모델링



# <b>Step 0. 라이브러리 import 및 데이터 불러오기</b>
### **가. 라이브러리 import**

* 데이터 프레임 관련 라이브러리

In [64]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

### **나.  학습데이터 불러오기**

In [65]:
# Load the training set. The "label" column is the target; every other
# column is kept as a feature.
df = pd.read_csv("df_train.csv")
y = df["label"]
x = df.drop(columns="label")

### **다.  데이터 전처리**

### **라. train_test_split을 이용하여 train/test  데이터 분리**

In [66]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2021,
)

In [67]:
# 불러오기
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report

### max_depth 튜닝 전

In [68]:
# Baseline before tuning: XGBoost with only the seed fixed
model_xgb = XGBClassifier(random_state=2021)

# Estimate performance with 10-fold cross-validation on the training split
cv_score = cross_val_score(estimator=model_xgb, X=x_train, y=y_train, cv=10)

# Report the mean score over the folds
print("평균 : ", cv_score.mean())

평균 :  0.9600810289986302


In [69]:
# Search space: tree depths 1 through 20 (the range end is exclusive)
param = {"max_depth": range(1, 21)}

# Grid search over `param`, scored by accuracy with 10-fold cross-validation
model = GridSearchCV(
    estimator=model_xgb,
    param_grid=param,
    scoring="accuracy",
    cv=10,
)

In [70]:
# Fit on the training split. In file order `model` is the GridSearchCV
# declared in the preceding cell, so this evaluates the candidates in `param`.
model.fit(x_train, y_train)

In [71]:
# Summarize the search: the best parameter setting and the score it achieved
# (best_params_ / best_score_ of the fitted search object).
heavy_rule, light_rule = '=' * 80, '-' * 80
print("test_size = 0.2, random_state = 2021")
print(heavy_rule)
print('최적파라미터:', model.best_params_)
print(light_rule)
print('최고성능:', model.best_score_)
print(heavy_rule)

test_size = 0.2, random_state = 2021
최적파라미터: {'max_depth': 11}
--------------------------------------------------------------------------------
최고성능: 0.9617886745141743


### max_depth 하이퍼 파라미터

In [72]:
# Rebuild the classifier with max_depth fixed at 11
model = XGBClassifier(max_depth=11, random_state=2021)

# 10-fold cross-validation on the training split
cv_score = cross_val_score(estimator=model, X=x_train, y=y_train, cv=10)

# Report the mean score over the folds
print("평균 : ", cv_score.mean())

평균 :  0.9617886745141743


### 정확도

In [73]:
# Declare the model with max_depth = 11.
# NOTE(review): the seed here is 1 while the cross-validation cells use 2021;
# confirm this difference is intentional.
model = XGBClassifier(max_depth=11, random_state=1)

# Step 3: train on the training split
model.fit(x_train, y_train)

# Step 4: predict on the held-out split
y_pred = model.predict(x_test)

# Step 5: evaluate against the held-out labels
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

[[349   9]
 [ 10 365]]
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       358
           1       0.98      0.97      0.97       375

    accuracy                           0.97       733
   macro avg       0.97      0.97      0.97       733
weighted avg       0.97      0.97      0.97       733



### n_estimators 하이퍼 파라미터

In [74]:
# Base estimator for the n_estimators search: max_depth fixed at 11
model_xgb = XGBClassifier(max_depth=11, random_state=2021)

# 10-fold cross-validation on the training split
cv_score = cross_val_score(estimator=model_xgb, X=x_train, y=y_train, cv=10)

# Report the mean score over the folds
print("평균 : ", cv_score.mean())

평균 :  0.9617886745141743


In [75]:
# Search space: 100, 120, ..., 500 boosting rounds (step of 20)
param = {"n_estimators": range(100, 501, 20)}

# Grid search over `param`, scored by accuracy with 10-fold cross-validation
model = GridSearchCV(
    estimator=model_xgb,
    param_grid=param,
    scoring="accuracy",
    cv=10,
)

In [76]:
# Fit on the training split. In file order `model` is the GridSearchCV
# declared in the preceding cell, so this evaluates the candidates in `param`.
model.fit(x_train, y_train)

In [77]:
# Summarize the search: the best parameter setting and the score it achieved
# (best_params_ / best_score_ of the fitted search object).
heavy_rule, light_rule = '=' * 80, '-' * 80
print("test_size = 0.2, max_depth = 11, random_state = 2021")
print(heavy_rule)
print('최적파라미터:', model.best_params_)
print(light_rule)
print('최고성능:', model.best_score_)
print(heavy_rule)

test_size = 0.2, max_depth = 11, random_state = 2021
최적파라미터: {'n_estimators': 100}
--------------------------------------------------------------------------------
최고성능: 0.9617886745141743


### learning_rate 하이퍼 파라미터

In [78]:
# Base estimator for the learning_rate search: max_depth fixed at 11
model_xgb = XGBClassifier(max_depth=11, random_state=2021)

# 10-fold cross-validation on the training split
cv_score = cross_val_score(estimator=model_xgb, X=x_train, y=y_train, cv=10)

# Report the mean score over the folds
print("평균 : ", cv_score.mean())

평균 :  0.9617886745141743


In [79]:
# Search space: learning rates 0.01, 0.04, ..., 0.58 (step of 0.03).
# Each product is rounded to two decimals because `i * 0.01` can carry binary
# floating-point noise in its last digits, which would otherwise show up in
# the candidate list and in the reported best_params_.
param = {"learning_rate": [round(i * 0.01, 2) for i in range(1, 60, 3)]}

# Grid search over `param`, scored by accuracy with 10-fold cross-validation
model = GridSearchCV(
    estimator=model_xgb,
    param_grid=param,
    scoring="accuracy",
    cv=10,
)

In [80]:
# Fit on the training split. In file order `model` is the GridSearchCV
# declared in the preceding cell, so this evaluates the candidates in `param`.
model.fit(x_train, y_train)

In [81]:
# Summarize the search: the best parameter setting and the score it achieved
# (best_params_ / best_score_ of the fitted search object).
heavy_rule, light_rule = '=' * 80, '-' * 80
print("test_size = 0.2, max_depth = 11, random_state = 2021")
print(heavy_rule)
print('최적파라미터:', model.best_params_)
print(light_rule)
print('최고성능:', model.best_score_)
print(heavy_rule)

test_size = 0.2, max_depth = 11, random_state = 2021
최적파라미터: {'learning_rate': 0.31}
--------------------------------------------------------------------------------
최고성능: 0.9624701074969237


In [82]:
# Classifier with max_depth = 11 and learning_rate = 0.31
model = XGBClassifier(max_depth=11, learning_rate=0.31, random_state=2021)

# 10-fold cross-validation on the training split
cv_score = cross_val_score(estimator=model, X=x_train, y=y_train, cv=10)

# Report the mean score over the folds
print("평균 : ", cv_score.mean())

평균 :  0.9624701074969237


### 정확도

In [83]:
# Declare the model with max_depth = 11 and learning_rate = 0.31.
# NOTE(review): the seed here is 1 while the cross-validation cells use 2021;
# confirm this difference is intentional.
model = XGBClassifier(max_depth=11, learning_rate=0.31, random_state=1)

# Step 3: train on the training split
model.fit(x_train, y_train)

# Step 4: predict on the held-out split
y_pred = model.predict(x_test)

# Step 5: evaluate against the held-out labels
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

[[348  10]
 [  9 366]]
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       358
           1       0.97      0.98      0.97       375

    accuracy                           0.97       733
   macro avg       0.97      0.97      0.97       733
weighted avg       0.97      0.97      0.97       733



---

### 제출용

In [84]:
# Inspect the training features: column names, non-null counts and dtypes
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2931 entries, 679 to 1140
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   url_len                  2931 non-null   float64
 1   url_num_hyphens_dom      2931 non-null   float64
 2   url_path_len             2931 non-null   float64
 3   url_domain_len           2931 non-null   float64
 4   url_hostname_len         2931 non-null   float64
 5   url_num_dots             2931 non-null   float64
 6   url_num_underscores      2931 non-null   float64
 7   url_query_len            2931 non-null   float64
 8   url_num_query_para       2931 non-null   float64
 9   url_ip_present           2931 non-null   int64  
 10  url_entropy              2931 non-null   float64
 11  url_port                 2931 non-null   int64  
 12  html_num_tags('iframe')  2931 non-null   float64
 13  html_num_tags('script')  2931 non-null   float64
 14  html_num_tags('div')  

In [87]:
# Load the test features before inspecting them. In this file the cell that
# reads df_test.csv appears after this one, so running the cells from top to
# bottom would otherwise raise a NameError on `df_test`.
df_test = pd.read_csv("df_test.csv", sep = ",")

# Inspect the test features: column names, non-null counts and dtypes
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2441 entries, 0 to 2440
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   url_len                  2441 non-null   int64  
 1   url_num_hyphens_dom      2441 non-null   int64  
 2   url_path_len             2441 non-null   float64
 3   url_domain_len           2441 non-null   float64
 4   url_hostname_len         2441 non-null   float64
 5   url_num_dots             2441 non-null   int64  
 6   url_num_underscores      2441 non-null   int64  
 7   url_query_len            2441 non-null   int64  
 8   url_num_query_para       2441 non-null   int64  
 9   url_ip_present           2441 non-null   int64  
 10  url_entropy              2441 non-null   float64
 11  url_port                 2441 non-null   int64  
 12  html_num_tags('iframe')  2441 non-null   int64  
 13  html_num_tags('script')  2441 non-null   float64
 14  html_num_tags('div')    

In [85]:
# Load the features to predict on for the submission
df_test = pd.read_csv("df_test.csv")

In [101]:
# Load the submission template; its first CSV column becomes the index
sub = pd.read_csv("sample_submission.csv", index_col=0)

In [97]:
# Predict labels for the submission features.
# NOTE(review): `model` is whatever was last bound to that name. In file order
# that is the XGBClassifier (max_depth = 11, learning_rate = 0.31,
# random_state = 1) fitted on x_train in the evaluation cell; verify against
# the actual execution order.
y_pred = model.predict(df_test)

In [102]:
# Store the predictions in the "label" column of the submission frame.
# Assumes `sub` has one row per row of df_test, in the same order (TODO confirm).
sub["label"] = y_pred

In [103]:
# Convert the numeric class ids in the prediction column to string labels
# (0 -> 'benign', 1 -> 'malicious'). The replacement is restricted to the
# "label" column so a 0 or 1 in any other column of `sub` is left untouched;
# values other than 0 and 1 pass through unchanged.
sub["label"] = sub["label"].replace({0: 'benign', 1: 'malicious'})

In [105]:
# Write the predictions to pred.csv without the index.
# NOTE(review): `sub` was read with index_col = 0, so index = False leaves that
# first column out of the output file; confirm the submission format expects
# only the remaining column(s).
sub.to_csv("pred.csv", index = False)