In [1]:
from transformers import BertTokenizer
import pandas as pd
from tensorflow.keras import layers, models
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, f1_score, classification_report

In [2]:
def split_data(df):
    """Split the 'text' and 'label' columns of `df` into train and test parts.

    Args:
        df: frame providing a 'text' column (inputs) and a 'label' column
            (targets).

    Returns:
        The result of `train_test_split`: train inputs, test inputs,
        train targets, test targets, in that order. 20% of the rows are
        held out and the shuffle is seeded with 42.
    """
    from sklearn.model_selection import train_test_split

    features, targets = df['text'], df['label']
    parts = train_test_split(features, targets, test_size=0.2, random_state=42)
    return parts

def token_data(texts, tokenizer, max_length=512):
    """Run `tokenizer` over `texts` and hand back whatever it returns.

    Args:
        texts: the text input, passed to the tokenizer as its only
            positional argument.
        tokenizer: callable accepting the keyword options listed below.
        max_length: value forwarded as the tokenizer's `max_length`.

    Returns:
        The tokenizer's return value, unchanged.
    """
    # Fixed call options: truncate, pad, and request TensorFlow tensors.
    options = {
        'truncation': True,
        'padding': True,
        'max_length': max_length,
        'return_tensors': 'tf',
    }
    return tokenizer(texts, **options)

def performance(y_true,y_pred):
    """Score predictions against the ground truth and print a class report.

    Args:
        y_true: the actual (reference) labels.
        y_pred: the labels predicted by the model.

    Returns:
        dict with the keys 'accuracy', 'recall' and 'f1'. Recall and F1 are
        computed with average='weighted'.
    """
    scores = {
        'accuracy': accuracy_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred, average='weighted'),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }
    # The per-class breakdown is only printed, not returned.
    print(classification_report(y_true, y_pred))
    return scores


def create_model(inputs, num,  num_filters=128, kernel_size=5, pool_size=2):
    """Build an (uncompiled) two-stage 1D convolutional classifier.

    Architecture: two Conv1D -> MaxPooling1D -> BatchNormalization stages
    (the second with twice as many filters), global average pooling,
    dropout, a 128-unit dense layer and a softmax output layer.

    Args:
        inputs: per-sample input shape (batch dimension excluded), used as
            the shape of the model's Input layer.
        num: number of units in the softmax output layer.
        num_filters: filters in the first convolution; the second uses
            `num_filters * 2`.
        kernel_size: kernel width of both convolutions.
        pool_size: window of both max-pooling layers.

    Returns:
        The assembled `Sequential` model. Compiling is left to the caller.
    """
    return models.Sequential([
        # Declare the input with an explicit Input layer: passing
        # `input_shape=` to the first layer is deprecated in current Keras
        # and triggers a warning on every build.
        layers.Input(shape=inputs),
        layers.Conv1D(num_filters, kernel_size, activation='gelu'),
        layers.MaxPooling1D(pool_size=pool_size),
        layers.BatchNormalization(),
        layers.Conv1D(num_filters * 2, kernel_size, activation='gelu'),
        layers.MaxPooling1D(pool_size=pool_size),
        layers.BatchNormalization(),
        # Global pooling already collapses the sequence axis, so the
        # Flatten layer that used to follow it was a no-op and is omitted.
        layers.GlobalAveragePooling1D(),
        layers.Dropout(0.5),
        layers.Dense(128, activation='gelu'),
        layers.Dense(num, activation='softmax'),
    ])

In [7]:
def categorize(text):
    """Map a text to a threat category by case-insensitive keyword search.

    Keywords are tried in the order they are listed below and the first one
    found as a substring of the lower-cased text decides the category.

    Args:
        text: the text to classify. Anything that is not a `str` (for
            example the NaN pandas produces for an empty cell, or None)
            is treated as having no keywords.

    Returns:
        The category name of the first matching keyword, or "Other" when
        nothing matches or `text` is not a string.
    """
    categories = {
        "malware": "Malware",
        "phishing": "Phishing",
        "ransomware": "Ransomware",
        "trojan": "Trojan",
        "worm": "Worm",
        "spyware": "Spyware",
        "ddos": "DDoS",
        "distributed denial of service": "DDoS",
        "zero day": "Zero Days",
        "data breach": "Data Breach",
        "social engineering": "Social Engineering"
    }
    # Missing values reach this function as float NaN / None when it is
    # applied to a DataFrame column; calling .lower() on them would raise.
    if not isinstance(text, str):
        return "Other"
    lower_text = text.lower()
    for keyword, category in categories.items():
        if keyword in lower_text:
            return category
    return "Other"

def labeler(df):
    """Add a "label" column computed from the "text" column.

    Every value of df["text"] is passed through `categorize` and the
    results are stored in df["label"]. The frame is modified in place and
    the same object is returned.
    """
    labels = df["text"].apply(categorize)
    df["label"] = labels
    return df

In [8]:
def main(data_path='/clean_data.csv', max_length=512, epochs=10, batch_size=32):
    """Run the whole pipeline: load, label, split, tokenize, train, evaluate.

    Args:
        data_path: CSV file providing a 'text' column. Defaults to the path
            that used to be hard-coded here.
        max_length: maximum token length forwarded to `token_data`.
        epochs: number of training epochs.
        batch_size: mini-batch size used for training.

    Returns:
        The metrics dict produced by `performance` on the held-out split.
    """
    # LOAD
    # NOTE(review): the CSV is presumably the saved output of the cleaning
    # step (clean_data.limpieza over the ldata sources) - confirm.
    df = pd.read_csv(data_path)
    print("data loaded")
    # LABEL
    df = labeler(df)
    print("data labeled")
    # SPLIT
    x_train, x_test, y_train, y_test = split_data(df)
    print("data splitted")
    # TOKENIZE
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    train_encodings = token_data(x_train.tolist(), tokenizer, max_length=max_length)
    val_encodings = token_data(x_test.tolist(), tokenizer, max_length=max_length)
    print("data tokenized")
    # NOTE(review): token_data pads with padding=True, so each split is
    # padded on its own; if neither split contains a text that reaches
    # max_length the two id matrices can end up with different widths.

    # ONE-HOT LABELS
    # Both splits are encoded against the same ordered class list. Encoding
    # them independently drops the column of any class that happens to be
    # absent from one split, which shifts or resizes the label matrix.
    class_names = sorted(df['label'].unique())
    y_train = pd.get_dummies(
        pd.Categorical(y_train, categories=class_names)
    ).to_numpy(dtype='float32')
    y_test = pd.get_dummies(
        pd.Categorical(y_test, categories=class_names)
    ).to_numpy(dtype='float32')

    # TRAIN
    inputs = (train_encodings['input_ids'].shape[1], 1)
    num = len(class_names)
    print("creando modelo")
    model = create_model(inputs, num)

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(
        train_encodings['input_ids'],
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(val_encodings['input_ids'], y_test),
    )

    # EVALUATE
    # Turn softmax rows / one-hot rows back into class names so the printed
    # report is readable.
    names = np.asarray(class_names)
    probabilities = model.predict(val_encodings['input_ids'])
    y_pred = names[np.argmax(probabilities, axis=1)]
    y_true = names[np.argmax(y_test, axis=1)]
    metrics = performance(y_true, y_pred)
    print(metrics)
    return metrics


# Run the pipeline only when this code is executed directly (as a script or
# a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

data loaded
creando modelo
Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 87ms/step - accuracy: 0.4700 - loss: 1.7194 - val_accuracy: 0.4890 - val_loss: 1.8989
Epoch 2/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5417 - loss: 1.3191 - val_accuracy: 0.5136 - val_loss: 1.4630
Epoch 3/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5796 - loss: 1.2156 - val_accuracy: 0.4021 - val_loss: 1.6372
Epoch 4/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.5886 - loss: 1.1702 - val_accuracy: 0.5149 - val_loss: 1.4629
Epoch 5/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5862 - loss: 1.1715 - val_accuracy: 0.3930 - val_loss: 1.5434
Epoch 6/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.6050 - loss: 1.1519 - val_accuracy: 0.3632 - val_loss: 1.6699
Epoch 7/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━