<a href="https://colab.research.google.com/github/2003UJAN/Phising-Website/blob/main/url_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_curve, auc, precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from urllib.parse import urlparse,parse_qs
import csv
from google.colab import files

In [49]:
#loading the dataset
data = pd.read_csv("/Phishing_Legitimate_full.csv")

In [50]:
#data pre-processing
data = data.drop(data.columns[0], axis=1)
data = shuffle(data)
threshold = 0.9 * len(data)
data = data.dropna(thresh=threshold, axis=1)

In [51]:
first_row = data.iloc[0]

In [52]:
print(first_row)

NumDots                                1.000000
SubdomainLevel                         0.000000
PathLevel                              4.000000
UrlLength                             74.000000
NumDash                                3.000000
NumDashInHostname                      0.000000
AtSymbol                               0.000000
TildeSymbol                            0.000000
NumUnderscore                          0.000000
NumPercent                             0.000000
NumQueryComponents                     1.000000
NumAmpersand                           0.000000
NumHash                                0.000000
NumNumericChars                        7.000000
NoHttps                                1.000000
RandomString                           1.000000
IpAddress                              0.000000
DomainInSubdomains                     0.000000
DomainInPaths                          1.000000
HttpsInHostname                        0.000000
HostnameLength                        10

In [53]:
urls = [
    'https://www.nitk.ac.in/',
    'https://www.meity.gov.in/',
    'https://www.srmist.edu.in/'
]

In [54]:
def extract_features(url):
    parsed_url = urlparse(url)

    domain = parsed_url.netloc
    domain_parts = domain.split('.')
    domain_length = len(domain)
    num_subdomains = len(domain_parts) - 2 if len(domain_parts) > 2 else 0

    path = parsed_url.path
    path_length = len(path)
    path_segments = path.split('/')
    num_path_segments = len(path_segments) - 1 if path != '/' else 0
    has_num_dash = any(char.isdigit() for char in parsed_url.path)


    num_dash_in_hostname = any(char.isdigit() for char in parsed_url.hostname if char == '-')
    has_at_symbol = '@' in url
    has_tilde_symbol = '~' in url
    num_underscore = url.count('_')
    num_percent = url.count('%')
    num_hash = url.count('#')
    num_numeric_chars = sum(c.isdigit() for c in url)
    query_components = parse_qs(parsed_url.query)
    num_query_components = len(query_components)
    num_ampersand = 0
    for param in query_components:
        num_ampersand += len(query_components[param]) - 1
    no_https = not parsed_url.scheme == 'https'
    random_string = parsed_url.path
    ip_address = parsed_url.hostname if parsed_url.hostname.replace('.', '').isdigit() else None
    subdomains = parsed_url.hostname.split('.')[:-2] if ip_address is None else None
    path_segments = parsed_url.path.split('/')
    domain_in_paths = None
    for segment in path_segments:
        if segment.count('.') >= 1:
            domain_in_paths = segment.split('.')[0]
            break
    is_https = parsed_url.scheme == 'https'
    hostname_length = len(parsed_url.hostname)

    return {
        'Domain': domain,
        'DomainLength': domain_length,
        'NumSubdomains': num_subdomains,
        'Path': path,
        'PathLength': path_length,
        'NumPathSegments': num_path_segments,
        'NumDash': has_num_dash,
        'NumDashInHostname': num_dash_in_hostname,
        'AtSymbol': has_at_symbol,
        'TildeSymbol': has_tilde_symbol,
        'NumUnderscore': num_underscore,
        'NumPercent': num_percent,
        'NumQueryComponents': num_query_components,
        'NumAmpersand': num_ampersand,
        'NumHash': num_hash,
        'NumNumericChars': num_numeric_chars,
        'NoHttps': no_https,
        'RandomString': random_string,
        'IpAddress': ip_address,
        'DomainInSubdomains': subdomains,
        'DomainInPaths': domain_in_paths,
        'HttpsInHostname': is_https,
        'HostnameLength': hostname_length,
    }

extracted_features = []
for url in urls:
    features = extract_features(url)
    extracted_features.append(features)

In [55]:
df = pd.DataFrame(extracted_features)

In [56]:
print(df)

              Domain  DomainLength  NumSubdomains Path  PathLength  \
0     www.nitk.ac.in            14              2    /           1   
1   www.meity.gov.in            16              2    /           1   
2  www.srmist.edu.in            17              2    /           1   

   NumPathSegments  NumDash  NumDashInHostname  AtSymbol  TildeSymbol  ...  \
0                0    False              False     False        False  ...   
1                0    False              False     False        False  ...   
2                0    False              False     False        False  ...   

   NumAmpersand  NumHash  NumNumericChars  NoHttps  RandomString  IpAddress  \
0             0        0                0    False             /       None   
1             0        0                0    False             /       None   
2             0        0                0    False             /       None   

   DomainInSubdomains DomainInPaths HttpsInHostname HostnameLength  
0         [www, nitk

In [57]:
def save_to_csv(data, filename):
    with open(filename, 'w', newline='') as csvfile:
        fieldnames = ['Domain','DomainLength','NumSubdomains', 'Path', 'PathLength', 'NumPathSegments', 'NumDash','NumDashInHostname','AtSymbol',
                      'TildeSymbol','NumUnderscore','NumPercent','NumQueryComponents', 'NumAmpersand', 'NumHash','NumNumericChars','NoHttps',
                      'RandomString','IpAddress', 'DomainInSubdomains','DomainInPaths', 'HttpsInHostname','HostnameLength']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

In [58]:
csv_filename = 'url_features.csv'
save_to_csv(extracted_features, csv_filename)
print(f"Features saved to '{csv_filename}'")
files.download(csv_filename)

Features saved to 'url_features.csv'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>