In [10]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder



In [3]:
# Read the URL dataset from the working directory into a DataFrame.
# Later cells use its 'urls' column as model input and 'type' as the label.
DATASET_CSV = "shuffled_combined_dataset.csv"
df = pd.read_csv(DATASET_CSV)



In [13]:
# Convert URLs to character level encoding and pad to length of 200
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(df['urls'])
encoded_urls = tokenizer.texts_to_sequences(df['urls'])
padded_urls = pad_sequences(encoded_urls, maxlen=200)




In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. The fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
url_labels = df['type']
X_train, X_test, y_train, y_test = train_test_split(
    padded_urls,
    url_labels,
    test_size=0.2,
    random_state=42,
)



In [21]:
# Define the CNN model:
# embedding -> 1D convolution -> max pooling -> flatten -> dense -> dropout
# -> single sigmoid output unit.
model = Sequential()
# input_dim is vocabulary size + 1 because the tokenizer's ids start at 1 and
# 0 is the padding value.
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=32, input_length=200))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
# Output layer. Without it the binary cross-entropy loss was being computed on
# the 128 unbounded ReLU activations instead of on one probability per URL.
# NOTE: with this layer in place, `model.layers[:-2]` drops Dropout and this
# output layer, so that slice yields the 128-dimensional Dense activations.
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Turn the string class labels into integers for Keras. The encoder learns the
# label set from the training split and the same mapping is applied to the
# test split.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
# classes_[i] is the original label that was encoded as integer i.
print(label_encoder.classes_)


['Phish' 'safe']


In [16]:
# Fit the CNN on the integer-encoded labels; 20% of the training data is set
# aside by Keras for validation.
model.fit(
    X_train,
    y_train_encoded,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

# Build the feature extractor from everything except the model's last two
# layers, then run both splits through it.
backbone_layers = model.layers[:-2]
feature_extractor = Sequential(backbone_layers)
X_train_features, X_test_features = [
    feature_extractor.predict(split) for split in (X_train, X_test)
]

# Fit a Random Forest on the CNN features. It is trained on the original
# string labels (y_train), so its predictions are strings as well.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_features, y_train)

# Score the Random Forest on the held-out split.
y_pred = rf_classifier.predict(X_test_features)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.99


In [28]:
# Classify one hard-coded URL with the CNN + Random Forest pipeline.
# The leading "\t" reproduces the tab character at the start of the literal.
single_url = "\thttp://qw.nelidacoassssio.repl.co/"

# 1. Preprocess: character ids, then pad/truncate to length 200
encoded_single_url = tokenizer.texts_to_sequences([single_url])
padded_single_url = pad_sequences(encoded_single_url, maxlen=200)

# 2. CNN feature extraction
single_url_features = feature_extractor.predict(padded_single_url)

# 3. Random Forest prediction; the batch has one row, so take element 0
predicted_label = rf_classifier.predict(single_url_features)[0]
print(predicted_label)


Phish


In [29]:

import joblib

# Persist both trained models to the working directory.
# NOTE(review): the fitted tokenizer is not written here, although the
# prediction cells need it to encode URLs.
cnn_model_path = "cnn_model.h5"
rf_model_path = "rf_model.pkl"

# CNN first (HDF5 file), then the Random Forest via joblib. joblib.dump stays
# the last expression so the cell still displays the written file name(s).
model.save(cnn_model_path)
joblib.dump(rf_classifier, rf_model_path)


  saving_api.save_model(


['rf_model.pkl']

In [30]:
from tensorflow.keras.models import load_model

# Load the CNN model from the saved file
cnn_model = load_model("cnn_model.h5")

# Rebuild the feature extractor from the reloaded model so that the weights
# read from disk are the ones used for prediction. predict_url() reads the
# global `feature_extractor`; without this rebinding `cnn_model` was loaded
# but never used. The slice matches the one used when the extractor was first
# built (everything except the model's last two layers).
feature_extractor = Sequential(cnn_model.layers[:-2])


In [31]:
import joblib

# Restore the Random Forest classifier written by joblib.dump.
# joblib.load unpickles the file, so only load files you created yourself.
with open("rf_model.pkl", "rb") as rf_model_file:
    rf_classifier = joblib.load(rf_model_file)


In [40]:
def predict_url(single_url):
    """Classify one URL with the CNN feature extractor and the Random Forest.

    Uses the notebook-level ``tokenizer``, ``feature_extractor`` and
    ``rf_classifier`` objects, which must already be defined.

    Args:
        single_url: The URL text to classify.

    Returns:
        The first (and only) element of ``rf_classifier.predict`` for this URL.
    """
    # Wrap the URL in a one-element batch, encode it as character ids and
    # pad/truncate to length 200.
    url_batch = pad_sequences(
        tokenizer.texts_to_sequences([single_url]),
        maxlen=200,
    )

    # CNN features for the batch, then the Random Forest's label for row 0.
    url_features = feature_extractor.predict(url_batch)
    return rf_classifier.predict(url_features)[0]

# Example: classify one URL and print the predicted label
url_to_predict = "https://www.google.com"
example_label = predict_url(url_to_predict)
print(example_label)



safe
