In [1]:
# Mount Google Drive (Colab only) so the dataset and saved model files under /content/drive are reachable.

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd

# Location of the training CSV on the mounted Drive.
file_path = '/content/drive/MyDrive/CyberThreatAnalysisAndMitigation/dataset/Training.csv'

# Read the whole CSV into a DataFrame; later cells refer to it as `data`.
data = pd.read_csv(file_path)

# Preview the first rows to sanity-check the columns that were loaded.
data.head()

Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,https://www.todayshomeowner.com/how-to-make-ho...,82,23,0,2,7,0,0,0,0,...,1,1,0,240,8892,67860,0,1,4,legitimate
1,http://thapthan.ac.th/information/confirmation...,93,14,1,2,0,0,0,0,0,...,1,0,1,0,2996,4189860,0,1,2,phishing
2,http://app.dialoginsight.com/T/OFC4/L2S/3888/B...,121,21,1,3,0,0,0,0,0,...,1,1,0,30,2527,346022,0,1,3,phishing
3,https://www.bedslide.com,24,16,0,2,0,0,0,0,0,...,0,0,0,139,7531,1059151,0,0,4,legitimate
4,https://tabs.ultimate-guitar.com/s/sex_pistols...,73,24,0,3,1,0,0,0,0,...,0,0,0,3002,7590,635,0,1,5,legitimate


In [4]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Encode the target column as integers. LabelEncoder assigns codes in sorted class
# order, so for the two labels visible in the preview this gives
# 'legitimate' -> 0 and 'phishing' -> 1.
label_encoder = LabelEncoder()
data['status'] = label_encoder.fit_transform(data['status'])

# Split the data into features and labels.
# NOTE(review): 'Unnamed: 0' (it looks like a row index written into the CSV) is still
# part of X and is therefore used as a feature -- confirm whether it should be dropped.
X = data.drop(columns=['url', 'status'])
y = data['status']

# Split BEFORE scaling: the scaler must learn its mean/std from the training rows only,
# otherwise statistics of the held-out rows leak into training and the evaluation is
# optimistic. random_state is unchanged, so the same rows land in each split as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole feature matrix scaled with the train-only statistics; keeps the X_scaled name
# available for any cell that still reads it.
X_scaled = scaler.transform(X)

X_train.shape, X_test.shape, y_train.shape, y_test.shape


((6126, 87), (1532, 87), (6126,), (1532,))

In [16]:
import pandas as pd
import numpy as np
import re
from urllib.parse import urlparse
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from joblib import dump


# Encode the 'status' column, but only while it still holds the raw labels.
# An earlier cell may already have replaced it with integer codes; refitting a
# LabelEncoder on those codes would overwrite label_encoder.classes_ with the integers
# and lose the original label names, so the existing encoding is left untouched.
if not pd.api.types.is_numeric_dtype(data['status']):
    label_encoder = LabelEncoder()
    data['status'] = label_encoder.fit_transform(data['status'])

def extract_features(url):
    """Turn a URL string into the 33 lexical features used by the classifier.

    Args:
        url: The URL as a string. URLs without a scheme (e.g. 'example.com/login')
            are accepted; urlparse reports no hostname for them, so the
            hostname-based features are 0.

    Returns:
        A list of 33 numbers ordered as in `feature_order` below. The order must stay
        identical between training and inference because the scaler and the model
        consume the values by position.

    Notes:
        - 'nb_redirection' and 'nb_dslash' are both the count of '//' in the URL.
        - 'https_token' is 1 when the URL *path* contains 'https'.
        - The digit ratios are 0.0 when their denominator would be zero (empty URL or
          no hostname) instead of raising ZeroDivisionError.
    """
    parsed_url = urlparse(url)
    hostname = parsed_url.hostname or ''
    path_lower = (parsed_url.path or '').lower()

    # Tokenise once; every word-based feature below is derived from these lengths.
    word_lengths = [len(word) for word in re.findall(r'\w+', url)]

    def digit_ratio(text):
        # Share of digit characters in `text`; 0.0 for empty text to avoid dividing by zero.
        return sum(c.isdigit() for c in text) / len(text) if text else 0.0

    features = {
        'length_url': len(url),
        'length_hostname': len(hostname),
        'ip': int(bool(re.match(r'\d+\.\d+\.\d+\.\d+', hostname))),
        'nb_dots': url.count('.'),
        'nb_hyphens': url.count('-'),
        'nb_at': url.count('@'),
        'nb_qm': url.count('?'),
        'nb_and': url.count('&'),
        'nb_or': url.count('|'),
        'nb_eq': url.count('='),
        'nb_underscore': url.count('_'),
        'nb_tilde': url.count('~'),
        'nb_percent': url.count('%'),
        'nb_slash': url.count('/'),
        'nb_star': url.count('*'),
        'nb_colon': url.count(':'),
        'nb_comma': url.count(','),
        'nb_semicolon': url.count(';'),
        'nb_dollar': url.count('$'),
        'nb_space': url.count(' '),
        'nb_www': url.count('www'),
        'nb_com': url.count('.com'),
        'nb_dslash': url.count('//'),
        'http_in_path': int('http' in path_lower),
        'https_token': int('https' in path_lower),
        'ratio_digits_url': digit_ratio(url),
        'ratio_digits_host': digit_ratio(hostname),
        'nb_redirection': url.count('//'),
        'length_words_raw': len(word_lengths),
        # Length of the longest run of one repeated character.
        'char_repeat': max([len(m.group(0)) for m in re.finditer(r'(.)\1*', url)], default=0),
        'shortest_word_length': min(word_lengths, default=0),
        'longest_word_length': max(word_lengths, default=0),
        'avg_word_length': np.mean(word_lengths) if word_lengths else 0
    }

    # Explicit output order, independent of how the dict above is written.
    feature_order = [
        'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens', 'nb_at', 'nb_qm', 'nb_and', 'nb_or', 'nb_eq',
        'nb_underscore', 'nb_tilde', 'nb_percent', 'nb_slash', 'nb_star', 'nb_colon', 'nb_comma', 'nb_semicolon', 'nb_dollar',
        'nb_space', 'nb_www', 'nb_com', 'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url', 'ratio_digits_host',
        'nb_redirection', 'length_words_raw', 'char_repeat', 'shortest_word_length', 'longest_word_length', 'avg_word_length'
    ]

    return [features[feature] for feature in feature_order]

# Extract features from the dataset (one 33-value list per URL)
data['features'] = data['url'].apply(extract_features)
features = np.array(data['features'].tolist())

# Split the data into features and labels
X = features
y = data['status']

# Split the data into training and testing sets BEFORE scaling, so the scaler's
# mean/std come from the training rows only. Fitting it on the full matrix would leak
# held-out statistics into training -- and into the scaler that is saved for inference.
# random_state is unchanged, so the same rows land in each split as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the feature values using training-set statistics
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole feature matrix scaled with the train-only statistics; keeps the X_scaled name
# available for any cell that still reads it.
X_scaled = scaler.transform(X)

# Define the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Save the model and scaler (the inference cell loads both from these paths)
model_path = '/content/drive/MyDrive/CyberThreatAnalysisAndMitigation/dataset/model/url_classifier_model.joblib'
scaler_path = '/content/drive/MyDrive/CyberThreatAnalysisAndMitigation/dataset/model/scaler.joblib'
dump(rf_model, model_path)
dump(scaler, scaler_path)

print(f"Model and scaler saved successfully at {model_path} and {scaler_path}")


Model and scaler saved successfully at /content/drive/MyDrive/CyberThreatAnalysisAndMitigation/dataset/model/url_classifier_model.joblib and /content/drive/MyDrive/CyberThreatAnalysisAndMitigation/dataset/model/scaler.joblib


In [20]:
import pandas as pd
import numpy as np
import re
from urllib.parse import urlparse
from joblib import load

# Load the trained model and scaler
# These are the same two paths the training cell writes with joblib.dump.
# NOTE: joblib.load unpickles arbitrary Python objects, so only load files you trust.
model_path = '/content/drive/MyDrive/CyberThreatAnalysisAndMitigation/dataset/model/url_classifier_model.joblib'
scaler_path = '/content/drive/MyDrive/CyberThreatAnalysisAndMitigation/dataset/model/scaler.joblib'
model = load(model_path)
scaler = load(scaler_path)

def extract_features(url):
    """Turn a URL string into the 33 lexical features expected by the loaded model.

    This mirrors the feature extraction used when the model was trained; the two must
    produce the same values in the same order because the scaler and the model consume
    them by position.

    Args:
        url: The URL as a string. URLs without a scheme (e.g. 'example.com/login')
            are accepted; urlparse reports no hostname for them, so the
            hostname-based features are 0.

    Returns:
        A list of 33 numbers ordered as in `feature_order` below.

    Notes:
        - 'nb_redirection' and 'nb_dslash' are both the count of '//' in the URL.
        - 'https_token' is 1 when the URL *path* contains 'https'.
        - The digit ratios are 0.0 when their denominator would be zero (empty URL or
          no hostname) instead of raising ZeroDivisionError.
    """
    parsed_url = urlparse(url)
    hostname = parsed_url.hostname or ''
    path_lower = (parsed_url.path or '').lower()

    # Tokenise once; every word-based feature below is derived from these lengths.
    word_lengths = [len(word) for word in re.findall(r'\w+', url)]

    def digit_ratio(text):
        # Share of digit characters in `text`; 0.0 for empty text to avoid dividing by zero.
        return sum(c.isdigit() for c in text) / len(text) if text else 0.0

    features = {
        'length_url': len(url),
        'length_hostname': len(hostname),
        'ip': int(bool(re.match(r'\d+\.\d+\.\d+\.\d+', hostname))),
        'nb_dots': url.count('.'),
        'nb_hyphens': url.count('-'),
        'nb_at': url.count('@'),
        'nb_qm': url.count('?'),
        'nb_and': url.count('&'),
        'nb_or': url.count('|'),
        'nb_eq': url.count('='),
        'nb_underscore': url.count('_'),
        'nb_tilde': url.count('~'),
        'nb_percent': url.count('%'),
        'nb_slash': url.count('/'),
        'nb_star': url.count('*'),
        'nb_colon': url.count(':'),
        'nb_comma': url.count(','),
        'nb_semicolon': url.count(';'),
        'nb_dollar': url.count('$'),
        'nb_space': url.count(' '),
        'nb_www': url.count('www'),
        'nb_com': url.count('.com'),
        'nb_dslash': url.count('//'),
        'http_in_path': int('http' in path_lower),
        'https_token': int('https' in path_lower),
        'ratio_digits_url': digit_ratio(url),
        'ratio_digits_host': digit_ratio(hostname),
        'nb_redirection': url.count('//'),
        'length_words_raw': len(word_lengths),
        # Length of the longest run of one repeated character.
        'char_repeat': max([len(m.group(0)) for m in re.finditer(r'(.)\1*', url)], default=0),
        'shortest_word_length': min(word_lengths, default=0),
        'longest_word_length': max(word_lengths, default=0),
        'avg_word_length': np.mean(word_lengths) if word_lengths else 0
    }

    # Explicit output order, independent of how the dict above is written.
    feature_order = [
        'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens', 'nb_at', 'nb_qm', 'nb_and', 'nb_or', 'nb_eq',
        'nb_underscore', 'nb_tilde', 'nb_percent', 'nb_slash', 'nb_star', 'nb_colon', 'nb_comma', 'nb_semicolon', 'nb_dollar',
        'nb_space', 'nb_www', 'nb_com', 'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url', 'ratio_digits_host',
        'nb_redirection', 'length_words_raw', 'char_repeat', 'shortest_word_length', 'longest_word_length', 'avg_word_length'
    ]

    return [features[feature] for feature in feature_order]

# Test URL
url = 'https://en.wikipedia.org/wiki/NBC_Nightly_News'

# Turn the URL into its feature vector and report how many values it holds
features = extract_features(url)
print(f"Feature count: {len(features)}")

# The scaler is given a one-row batch, so wrap the single vector in a list
features_scaled = scaler.transform([features])

# Classify the row; class 1 is reported as malicious, anything else as legitimate
prediction = model.predict(features_scaled)
if prediction[0] == 1:
    output = 'malicious'
else:
    output = 'legitimate'

print(f'The URL "{url}" is {output}')


Feature count: 33
The URL "https://en.wikipedia.org/wiki/NBC_Nightly_News" is legitimate
