In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Flatten, Dense, GlobalMaxPooling1D, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [None]:
# Load the labelled URL dataset from the mounted Drive folder.
url_data = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/project/data/malicious_phish.csv',
)

In [None]:
# Encode the string class labels in the 'type' column as integer codes and
# store them next to the URLs in a new 'type_code' column.
lb_make = LabelEncoder()
url_data["type_code"] = lb_make.fit_transform(url_data["type"])

urls = url_data['url']
labels = url_data['type_code']

# Class distribution check. A notebook only displays a cell's final expression,
# so this has to be the last statement for the counts to actually be shown.
url_data["type_code"].value_counts()

In [None]:
# New frame without the original string label column 'type';
# url_data itself is left unchanged.
data = url_data.drop(labels=['type'], axis='columns')

In [None]:
# Keep only the first row for each distinct URL. De-duplicating whole rows
# (rather than the 'url' column on its own) keeps every URL paired with its own
# 'type_code'. Assigning a de-duplicated, re-indexed 'url' Series back into the
# frame aligns by index and would shift the surviving URLs onto other rows' labels.
data = data.drop_duplicates(subset='url').reset_index(drop=True)

In [None]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis='index', how='any')

In [None]:
# Print a summary of the cleaned frame: index range, columns,
# non-null counts and dtypes.
data.info(verbose=None)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 641119 entries, 0 to 641118
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   url        641119 non-null  object
 1   type_code  641119 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 14.7+ MB


In [None]:
# Build the modelling frame straight from url_data: keep the first row for each
# distinct URL and drop rows with a missing URL or label. Filtering whole rows
# keeps every URL paired with its own label, and removing repeated URLs prevents
# the same URL from ending up in both the training and the test split.
model_frame = (
    url_data
    .drop_duplicates(subset='url')
    .dropna(subset=['url', 'type_code'])
)
urls = model_frame['url']
labels = model_frame['type_code']

# Character-level tokenisation: each distinct character gets its own integer id.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(urls)
url_sequences = tokenizer.texts_to_sequences(urls)

# Pad every sequence at the end so all of them are as long as the longest URL.
# NOTE(review): one very long URL sets the width for the whole matrix; consider
# capping max_len if memory or training time becomes a problem.
max_len = max(len(seq) for seq in url_sequences)
padded_sequences = pad_sequences(url_sequences, maxlen=max_len, padding='post')
num_classes = len(np.unique(labels))

# One-hot targets, matching the 'categorical_crossentropy' loss used by the model.
labels_one_hot = to_categorical(labels, num_classes)

# Stratify on the integer labels so the train and test splits keep the same
# class proportions; random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels_one_hot,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [None]:
# Checkpoint file name pattern; the {epoch} and {val_accuracy} placeholders are
# filled in for each file that gets written.
best_model_path='/content/drive/MyDrive/javas_project/data/best_model2/dm_{epoch:02d}_{val_accuracy:0.2f}.hdf5'

# Write a checkpoint only when validation accuracy beats the best value so far.
bmc = ModelCheckpoint(
    filepath=best_model_path,
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
)

# Stop training after 10 epochs without an improvement in validation accuracy.
bes = EarlyStopping(
    monitor='val_accuracy',
    patience=10,
    verbose=1,
)

In [None]:
# Character-level URL classifier:
# embedding -> LSTM (full sequence output) -> max-pool over time -> dense head.
# The embedding table has one row per tokenizer index plus one extra row,
# because Keras tokenizer indices start at 1 and 0 is the padding value.
vocab_size = len(tokenizer.word_index) + 1

model = Sequential([
    Embedding(vocab_size, output_dim=128, input_length=max_len),
    # return_sequences=True so the pooling layer sees the output of every timestep.
    LSTM(units=100, return_sequences=True),
    GlobalMaxPooling1D(),
    Dense(units=32, activation='relu'),
    Dense(units=64, activation='relu'),
    Dense(units=32, activation='relu'),
    # One softmax output per class, matching the one-hot encoded targets.
    Dense(num_classes, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()

In [None]:
# Train with 20% of the training data held out for validation. The checkpoint
# and early-stopping callbacks are passed in so they actually run; both monitor
# 'val_accuracy', which exists because of validation_split and the 'accuracy' metric.
model.fit(
    X_train,
    y_train,
    epochs=1,
    validation_split=0.2,
    batch_size=32,
    callbacks=[bmc, bes],
)

In [None]:
# Score the trained model on the held-out test split (loss and accuracy).
model.evaluate(x=X_test, y=y_test)