In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the dataset; only the 'url' and 'type' columns are used below
dataset = pd.read_csv("/kaggle/input/malicious-urls-dataset/malicious_phish.csv")

# Extract the URL and label columns as NumPy arrays
urls = dataset['url'].values
labels = dataset['type'].values

# Convert labels to integer class ids. The fitted encoder is kept so that
# predicted ids can be mapped back to the original label values later.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

# Split the dataset into training (80%) and testing (20%) sets.
# stratify=labels preserves the per-class proportions in both splits, so the
# test metrics are not skewed by a class being over- or under-represented.
# NOTE: stratification requires every class to have at least 2 samples.
train_urls, test_urls, train_labels, test_labels = train_test_split(
    urls,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Character-level tokenization of the URLs
max_len = 100  # Maximum sequence length

# The vocabulary is learned from the training URLs only
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(train_urls)

# Encode each split (train first, then test) as sequences of integer ids
train_sequences, test_sequences = [
    tokenizer.texts_to_sequences(split_urls)
    for split_urls in (train_urls, test_urls)
]

# Pad/truncate every sequence to exactly max_len entries
train_data, test_data = [
    pad_sequences(split_sequences, maxlen=max_len)
    for split_sequences in (train_sequences, test_sequences)
]

# Build the CNN model

# +1 because Tokenizer indices start at 1; index 0 is the padding value
vocab_size = len(tokenizer.word_index) + 1

# One softmax unit per class known to the label encoder. Deriving this from
# the encoder (instead of hard-coding 4) keeps the output layer consistent
# with the integer targets expected by sparse_categorical_crossentropy.
num_classes = len(label_encoder.classes_)

model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=max_len))
model.add(Conv1D(256, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model; targets are integer class ids, hence the sparse loss
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model.
# Per-epoch validation uses a slice held out of the training data rather than
# the test set, so the test set stays unseen until the final evaluation and
# the reported test metrics are not influenced by decisions made while
# watching validation scores. Keras takes the last 10% of the provided
# training arrays (before shuffling) as the validation slice.
model.fit(train_data, train_labels, validation_split=0.1, epochs=10, batch_size=128)

# Evaluate the model once on the untouched test set
loss, accuracy = model.evaluate(test_data, test_labels, batch_size=128)
print("Test loss:", loss)
print("Test accuracy:", accuracy)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.07434657961130142
Test accuracy: 0.9786008596420288


In [2]:

# URLs to classify with the trained model
new_urls = ["web.whatsapp.com/"]  # List of new URLs

# Encode with the tokenizer fitted during training, then pad to max_len
new_sequences = tokenizer.texts_to_sequences(new_urls)
new_data = pad_sequences(new_sequences, maxlen=max_len)

# Per-class scores for each URL
predictions = model.predict(new_data)

# Take the highest-scoring class index per row and map it back to its label
predicted_class_ids = np.argmax(predictions, axis=1)
predicted_labels = label_encoder.inverse_transform(predicted_class_ids)

# Report one line per URL
for position, url in enumerate(new_urls):
    label = predicted_labels[position]
    print(f"URL: {url} --> Predicted Label: {label}")


URL: web.whatsapp.com/ --> Predicted Label: benign
