# Redes Neurais Artificiais - Identificação de Websites de *Phishing*
Atividade Avaliativa 3 - *Hands-On*

#### Problema:
- Identificação de Websites *Phishing* por uso de Redes Neurais Artificiais e AutoML

#### Dataset:
- "[*Website Phishing*](https://archive.ics.uci.edu/dataset/379/website+phishing)" por Neda Abdelhamid

#### Equipe:
- Daniele Simas Guimaraes - 2015310060
- Felipe Muniz Amorim - 2115080033
- Jose Manuel Coelho Dos Santos - 2115080052

## Importação de bibliotecas

In [36]:
import pickle

import arff
import numpy as np
import pandas as pd

from autosklearn.classification import AutoSklearnClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

## Carregando os dados

Classes dos atributos:

| Legítimo | Suspeito | Phishing |
|  ------  |  ------  |  ------  |
|    1     |     0    |    -1    |

In [20]:
# Load the ARFF file and turn its rows into a pandas DataFrame.
dataset = None  # remains None if the file cannot be opened or parsed

with open('data/PhishingData.arff', 'r') as file:
    data = arff.load(file)

    # Each entry of data['attributes'] starts with the attribute name.
    column_names = [name for name, *_ in data['attributes']]
    dataset = pd.DataFrame(data['data'], columns=column_names)

dataset

Unnamed: 0,SFH,popUpWidnow,SSLfinal_State,Request_URL,URL_of_Anchor,web_traffic,URL_Length,age_of_domain,having_IP_Address,Result
0,1,-1,1,-1,-1,1,1,1,0,0
1,-1,-1,-1,-1,-1,0,1,1,1,1
2,1,-1,0,0,-1,0,-1,1,0,1
3,1,0,1,-1,-1,0,1,1,0,0
4,-1,-1,1,-1,0,0,-1,1,0,1
...,...,...,...,...,...,...,...,...,...,...
1348,-1,-1,-1,-1,-1,-1,0,1,0,1
1349,-1,0,1,0,-1,0,0,1,0,-1
1350,-1,0,-1,-1,-1,0,-1,-1,0,1
1351,0,0,1,0,0,0,-1,1,0,1


In [21]:
# Report the dtypes as loaded, then cast every column to int and report again.
print('# Tipos das colunas:', dataset.dtypes, sep='\n', end='\n\n')

print("# Após conversão:")
dataset = dataset.astype(int)
print(dataset.dtypes)

# Tipos das colunas:
SFH                  object
popUpWidnow          object
SSLfinal_State       object
Request_URL          object
URL_of_Anchor        object
web_traffic          object
URL_Length           object
age_of_domain        object
having_IP_Address    object
Result               object
dtype: object

# Após conversão:
SFH                  int64
popUpWidnow          int64
SSLfinal_State       int64
Request_URL          int64
URL_of_Anchor        int64
web_traffic          int64
URL_Length           int64
age_of_domain        int64
having_IP_Address    int64
Result               int64
dtype: object


In [2]:
# NOTE: this cell reads `dataset`, so it must run after the loading and
# int-conversion cells; on a fresh kernel it raises NameError, and before the
# int conversion the integer 0 would not match the values loaded as objects.
#
# Recode `having_IP_Address` from 0 to -1 and fix the misspelled
# `popUpWidnow` column name. A new frame is built (no `inplace=True`) so the
# cell never mutates an object that another cell has already displayed.
dataset = (
    dataset
    .assign(having_IP_Address=lambda frame: frame['having_IP_Address'].replace(0, -1))
    .rename(columns={'popUpWidnow': 'popUp_Window'})
)
dataset

NameError: name 'dataset' is not defined

## Holdout

In [22]:
# Predictor attributes: every column except the target.
X = dataset.drop(columns="Result")

# Target attribute.
y = dataset["Result"]

# 70/30 holdout partition with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [23]:
# Preview the first five rows of the training predictors.
X_train.head(n=5)

Unnamed: 0,SFH,popUpWidnow,SSLfinal_State,Request_URL,URL_of_Anchor,web_traffic,URL_Length,age_of_domain,having_IP_Address
393,-1,-1,-1,-1,-1,1,-1,1,0
636,1,-1,0,0,-1,0,1,-1,0
640,1,0,1,-1,1,1,1,1,0
53,-1,-1,1,-1,-1,0,-1,-1,0
745,1,-1,1,1,0,-1,0,1,0


In [24]:
# Preview the first five training labels.
y_train.head(n=5)

393    1
636    1
640   -1
53     1
745   -1
Name: Result, dtype: int64

In [25]:
# Time budget for the whole AutoML task, in seconds (60 * 60 = 1 hour).
search_time_seconds = 60 * 60
# Memory limit handed to the classifier: 6144, i.e. 6 GB.
memory_limit = 6144

automl = AutoSklearnClassifier(
    time_left_for_this_task=search_time_seconds,
    memory_limit=memory_limit,
)

automl.fit(X_train, y_train)

AutoSklearnClassifier(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                      memory_limit=6144, per_run_time_limit=360)

In [28]:
# Show the models found by the AutoML search.
leaderboard = automl.leaderboard()
leaderboard

Unnamed: 0_level_0,rank,ensemble_weight,type,cost,duration
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
359,2,0.04,gradient_boosting,0.083067,1.449062
443,1,0.02,gradient_boosting,0.083067,1.501583
444,3,0.02,gradient_boosting,0.083067,1.388006
345,7,0.02,gradient_boosting,0.086262,1.345865
358,5,0.02,gradient_boosting,0.086262,1.374081
446,8,0.04,gradient_boosting,0.086262,1.405422
900,6,0.04,libsvm_svc,0.086262,0.660658
946,4,0.02,libsvm_svc,0.086262,0.602786
328,11,0.04,gradient_boosting,0.089457,1.559526
367,10,0.04,gradient_boosting,0.089457,1.292766


In [33]:
def scores(test, pred, average='macro'):
    """Compute, print and return classification metrics for a set of predictions.

    Args:
        test: True labels.
        pred: Predicted labels, aligned with ``test``.
        average: Averaging strategy forwarded to the precision, recall and
            F1 computations. Defaults to ``'macro'``.

    Returns:
        dict: The metric values under the keys ``"accuracy"``,
        ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    # Apply one averaging / zero-division policy to every averaged metric.
    # Previously only precision received zero_division=0, so recall and F1
    # fell back to the library default for undefined cases.
    averaged = {"average": average, "zero_division": 0}

    # Compute the evaluation metrics
    accuracy = metrics.accuracy_score(test, pred)
    precision = metrics.precision_score(test, pred, **averaged)
    recall = metrics.recall_score(test, pred, **averaged)
    f1 = metrics.f1_score(test, pred, **averaged)

    # Display the evaluation metrics
    print(f"Acurácia: {accuracy:.2f}")
    print(f"Precisão: {precision:.2f}")
    print(f"Revocação: {recall:.2f}")
    print(f"F1-score: {f1:.2f}")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# Predict on the held-out test split and report the metrics.
pred = automl.predict(X_test)
auto_results = scores(test=y_test, pred=pred)

Acurácia: 0.90
Precisão: 0.86
Revocação: 0.90
F1-score: 0.87


In [37]:
# Persist the fitted AutoML model. The context manager closes the file handle
# even if serialization fails; the bare open(...) call left it unclosed.
with open('automl.pickle', 'wb') as model_file:
    pickle.dump(automl, model_file)