<a href="https://colab.research.google.com/github/BAEEUNMANGDUCK/ObjectDetection_Streamlit_Deployment/blob/main/web_page_dataset_phishing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler


In [8]:
df_data = pd.read_csv('./dataset_phishing.csv')
df_data.shape

(11430, 89)

In [9]:
df_data['status']

Unnamed: 0,status
0,legitimate
1,phishing
2,phishing
3,legitimate
4,legitimate
...,...
11425,legitimate
11426,phishing
11427,legitimate
11428,legitimate


#### 범주형 변수를 수치형 변수로 변환

In [10]:
pd.get_dummies(df_data['status'])

Unnamed: 0,legitimate,phishing
0,True,False
1,False,True
2,False,True
3,True,False
4,True,False
...,...,...
11425,True,False
11426,False,True
11427,True,False
11428,True,False


In [11]:
df_data['target'] = pd.get_dummies(df_data['status'])['legitimate'].astype('int')
df_data.drop('status', axis=1, inplace=True)
df_data[['url', 'target']].head(5)

Unnamed: 0,url,target
0,http://www.crestonwood.com/router.php,1
1,http://shadetreetechnology.com/V4/validation/a...,0
2,https://support-appleld.com.secureupdate.duila...,0
3,http://rgipt.ac.in,1
4,http://www.iracing.com/tracks/gateway-motorspo...,1


In [16]:
from sklearn.model_selection import train_test_split

# target 이전까지의 feature
X = df_data.iloc[:, 1: -1]

# target 데이터 가져오기
y = df_data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(9144, 87) (9144,)
(2286, 87) (2286,)


In [17]:
X_train.head()

Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,empty_title,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank
116,90,19,1,4,0,0,0,0,0,0,...,0,1,1,0,79,2112,0,0,1,1
11429,477,14,1,24,0,1,1,9,0,9,...,0,1,1,1,0,-1,0,1,1,0
2101,34,26,0,2,0,0,0,0,0,0,...,0,1,0,0,113,3539,0,0,0,3
7135,164,17,1,3,5,1,0,2,0,1,...,0,1,1,0,540,-1,0,0,1,0
7327,99,15,0,2,0,0,0,0,0,0,...,0,1,1,0,2975,8347,1,0,1,10


In [18]:
y_train.head()

Unnamed: 0,target
116,0
11429,0
2101,1
7135,0
7327,0


#### feature 정규화 (범위가 다른 fature의 값 범위를 평균은 0, 표준편차는 1로 하여 만듬)

In [19]:
std_scaler = StandardScaler()
std_scaler.fit(X_train)
X_train_tensor = torch.from_numpy(std_scaler.transform(X_train)).float()
X_test_tensor = torch.from_numpy(std_scaler.transform(X_test)).float()
y_train_tensor = torch.from_numpy(y_train.values).float()
y_train_tensor = y_train_tensor.unsqueeze(1) # 열벡터를 행렬로 변환
y_test_tensor = torch.from_numpy(y_test.values).float()
y_test_tensor = y_test_tensor.unsqueeze(1) # 열벡터를 행렬로 변환

In [20]:
nb_epochs = 1000
minibatch_size = 256

In [21]:
class FunModel(nn.Module):
  def __init__(self, input_dim, output_dim):
    super().__init__()
    self.linear_layers = nn.Sequential(
        nn.Linear(input_dim, 200),
        nn.LeakyReLU(0.1),
        nn.Linear(200, 100),
        nn.LeakyReLU(0.1),
        nn.Linear(100, 20),
        nn.LeakyReLU(0.1),
        nn.Linear(20, 5),
        nn.LeakyReLU(0.1),
        nn.Linear(5, output_dim),
        nn.Sigmoid()
    )

  def forward(self, x):
    y = self.linear_layers(x)
    return y


In [22]:
X_train_tensor.size()

torch.Size([9144, 87])

In [25]:
input_dim = X_train_tensor.size(-1)
output_dim = y_train_tensor.size(-1)
print(input_dim, output_dim)
model = FunModel(input_dim, output_dim)
loss_func = nn.BCELoss() # BCELoss 사용을 위해 마지막은 반드시 sigmoid
optimizer = torch.optim.Adam(model.parameters())


87 1


In [None]:
for index in range(nb_epochs):
  indices = torch.randperm(X_train_tensor.size(0))

  x_batch_list = torch.index_select(X_train_tensor, 0, index=indices)
  y_batch_list = torch.index_select(y_train_tensor, 0, index=indices)

  x_batch_list = x_batch_list.split(minibatch_size, 0)
  y_batch_list = y_batch_list.split(minibatch_size, 0)
  epoch_loss = list()
  for x_minibatch, y_minibatch in zip(x_batch_list, y_batch_list):
    y_minibatch_pred = model(x_minibatch)

    loss = loss_func(y_minibatch_pred, y_minibatch)
    epoch_loss.append(loss)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
  if (index % 100) == 0:
    print(index, sum(epoch_loss)/ len(epoch_loss))

print(loss)

0 tensor(0.6097, grad_fn=<DivBackward0>)
100 tensor(0.0977, grad_fn=<DivBackward0>)
200 tensor(0.0977, grad_fn=<DivBackward0>)
300 tensor(0.0977, grad_fn=<DivBackward0>)
400 tensor(0.0109, grad_fn=<DivBackward0>)
500 tensor(0.0109, grad_fn=<DivBackward0>)
600 tensor(0.0109, grad_fn=<DivBackward0>)
700 tensor(0.0109, grad_fn=<DivBackward0>)
800 tensor(0.0109, grad_fn=<DivBackward0>)
900 tensor(0.0109, grad_fn=<DivBackward0>)


In [None]:
y_pred_list = []
model.eval()
with torch.no_grad():
  y_test_pred_sigmoid = model(X_test_tensor)
  y_test_pred = torch.round(y_test_pred_sigmoid)

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

print("Confusion Matrix\n", str(confusion_matrix(y_test_tensor, y_test_pred)))
print("Precision:\t", str(precision_score(y_test_tensor, y_test_pred)))
print("Recall:\t", str(recall_score(y_test_tensor, y_test_pred)))
print("F1 Score:\t", str(f1_score(y_test_tensor, y_test_pred)))