In [1]:
#ThreatSense - malicious_url training
#Necessary packages imported for training the model and saving the results
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import re
import joblib 

In [2]:
# Read the labelled URL dataset and discard every row that contains a
# null/NA value in any column
df_phish = pd.read_csv('malicious_phish.csv').dropna()

In [3]:
# Hold out 20% of the rows for evaluation; the fixed random_state seeds the
# random number generator so the split is reproducible
X_train_phish, X_test_phish, y_train_phish, y_test_phish = train_test_split(
    df_phish['url'],
    df_phish['type'],
    test_size=0.2,
    random_state=42,
)

# Turn each URL string into a numeric TF-IDF vector (at most 5000 features).
# The vectorizer is fitted on the training URLs only and then reused, as-is,
# to transform the held-out URLs
vectorizer_phish = TfidfVectorizer(max_features=5000)
X_train_tfidf_phish = vectorizer_phish.fit_transform(X_train_phish)
X_test_tfidf_phish = vectorizer_phish.transform(X_test_phish)

# Fit a random forest of 100 decision trees on the vectorized training URLs
rf_classifier_phish = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_tfidf_phish,
    y_train_phish,
)

# Predict labels for the held-out URLs and report the accuracy score
y_pred_phish = rf_classifier_phish.predict(X_test_tfidf_phish)
accuracy_phish = accuracy_score(y_true=y_test_phish, y_pred=y_pred_phish)
print("Accuracy:", accuracy_phish)

Accuracy: 0.9539385283977918


In [4]:
# Persist the fitted classifier and the fitted TF-IDF vectorizer to disk so
# both can be reloaded for future predictions
joblib.dump(value=rf_classifier_phish, filename='rf_classifier_url.pkl')
joblib.dump(value=vectorizer_phish, filename='vectorizer_url.pkl')

['vectorizer_url.pkl']

In [None]:
# Convenience helper for trying out the trained model straight after training,
# without integrating it into any platform
def predict_url(input_url):
    """Return the label the trained classifier predicts for one URL.

    The URL is wrapped in a one-element list, transformed with the
    module-level ``vectorizer_phish`` and passed to the module-level
    ``rf_classifier_phish``; both must already exist when this is called.

    Args:
        input_url: The URL to classify.

    Returns:
        The first (and only) element of the classifier's prediction output.
    """
    return rf_classifier_phish.predict(vectorizer_phish.transform([input_url]))[0]

# Prompt the user for a URL, classify it with predict_url, and print the
# predicted label.
# NOTE(review): input() waits for the user, so this cell pauses an unattended
# "Run All" until a URL is entered.
input_url = input("Enter the URL: ")
prediction_phish = predict_url(input_url)
print("Prediction:", prediction_phish)