In [7]:
import re
from urllib.parse import urlparse
import pickle

In [8]:

# E-mail-like token anywhere in the URL; compiled once and reused on every call.
_EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")


def extract_url_features(url):
    """Compute the 14 lexical features of ``url`` used as model input.

    The URL is split with ``urllib.parse.urlparse``. When the string carries
    neither a scheme nor a ``//`` authority marker (e.g. ``"www.site.com/x"``),
    ``urlparse`` would put the whole string into ``.path`` and every domain
    feature would be 0; in that case the string is re-parsed as
    ``"//" + url`` so the host is recognised. URL-wide counts (length,
    slashes, hyphens, TLD occurrences, e-mail check) always use the string
    exactly as passed in.

    Args:
        url: The URL string to analyse.

    Returns:
        A list of 14 ints, in this order: qty_slash_url, length_url,
        qty_dot_domain, domain_length, qty_dot_directory,
        qty_hyphen_directory, qty_slash_directory, directory_length,
        qty_dot_file, qty_questionmark_file, file_length, email_in_url,
        qty_tld_url, qty_hyphen_url.
    """
    parsed_url = urlparse(url)
    if not parsed_url.scheme and not parsed_url.netloc:
        # Scheme-less input: retry as a network-path reference so that the
        # host ends up in .netloc instead of .path.
        try:
            parsed_url = urlparse("//" + url)
        except ValueError:
            # The retry can reject malformed bracketed hosts that the plain
            # parse accepted; fall back to the first parse in that case.
            pass

    # NOTE(review): .netloc still includes any "user@" prefix and ":port"
    # suffix, so those characters count towards the domain features.
    domain = parsed_url.netloc
    path = parsed_url.path
    tld = domain.rsplit('.', 1)[-1] if '.' in domain else ""

    # Last path segment; rsplit returns the whole path when it has no '/'.
    file_name = path.rsplit('/', 1)[-1]

    features = [
        url.count('/'),                               # qty_slash_url
        len(url),                                     # length_url
        domain.count('.'),                            # qty_dot_domain
        len(domain),                                  # domain_length
        path.count('.'),                              # qty_dot_directory
        path.count('-'),                              # qty_hyphen_directory
        path.count('/'),                              # qty_slash_directory
        len(path),                                    # directory_length
        file_name.count('.'),                         # qty_dot_file
        # NOTE(review): urlparse moves everything after the first '?' into
        # .query, so the path-derived file name never contains '?'.
        file_name.count('?'),                         # qty_questionmark_file
        len(file_name),                               # file_length
        1 if _EMAIL_PATTERN.search(url) else 0,       # email_in_url
        # NOTE(review): with an empty tld this counts every '.' in the URL;
        # kept as-is to stay compatible with the saved model - confirm.
        url.lower().count(f".{tld.lower()}"),         # qty_tld_url
        url.count('-')                                # qty_hyphen_url
    ]

    return features

# Example usage: one ordinary URL and one irregular URL that has no "//" after the scheme.
url = "https://www.youtube.com"
url2 = "https:kismat@---///---//gmail.com"
# Each feature vector is wrapped in an outer list, giving a one-row 2-D list.
features, features1 = ([extract_url_features(sample)] for sample in (url, url2))


In [9]:
# Show the one-row feature list computed for `url`.
print(features)

[[2, 23, 2, 15, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]]


In [10]:
# Show the one-row feature list computed for the irregular `url2`.
print(features1)

[[5, 33, 0, 0, 1, 6, 5, 27, 1, 0, 9, 0, 1, 6]]


In [11]:
# Deserialize the saved model object; the relative path is resolved against
# the current working directory.
# NOTE(security): pickle.load can execute arbitrary code during
# deserialization - only load model files that come from a trusted source.
with open("phishing_detection_model.pkl", "rb") as file:
    loaded_model = pickle.load(file)
print("Model loaded successfully!")

Model loaded successfully!


In [12]:
# Run the loaded model on the one-row feature list built from `url`.
# NOTE(review): assumes the model was trained on these 14 features in this
# exact order - confirm against the training pipeline.
result=loaded_model.predict(features)

In [13]:
# Print the verdict for the prediction, then echo the raw prediction value.
print("Phishing website." if result == 1 else "Not a phishing website.")
print(result)

Not a phishing website.
[0.]
