In [3]:
import pandas as pd
from urllib.parse import urlsplit
from urllib.parse import urlparse
import re

In [5]:
# Reading Data

df = pd.read_csv("phishing_site_urls.csv")

In [6]:
# Checking first 5 rows

df.head()

Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


In [7]:
# Number of rows and columns

df.shape

(549346, 2)

In [8]:
# Describe the dataset

df.describe()

Unnamed: 0,URL,Label
count,549346,549346
unique,507195,2
top,jhomitevd2abj3fk.tor2web.org/,good
freq,52,392924


In [9]:
# Checking datatypes of the columns

df.dtypes

URL      object
Label    object
dtype: object

In [10]:
# Columns in the dataset

df.columns

Index(['URL', 'Label'], dtype='object')

# Extracting data from the URL

In [11]:
df.columns

Index(['URL', 'Label'], dtype='object')

In [12]:
# Extracting domain name from the URL
def extract_domain(url):
    match = re.search("^[^/]+", url)
    return match.group() if match else None

In [13]:
# Extracting path from the URL

def extract_path(url):
    match = re.search("/[^?]+", url)
    return match.group() if match else None

In [14]:
# Extracting query string from the URL

def extract_query_string(url):
    match = re.search("\?[^#]+", url)
    return match.group() if match else None

In [15]:
# Extracting fragment from the URL

def extract_fragment(url):
    match = re.search("#[^/]+", url)
    return match.group()[1:] if match else None

In [16]:
# Creating domain name column

df["domain_name"] = df["URL"].apply(lambda x: extract_domain(x))

In [17]:
# Creating path column

df["path"] = df["URL"].apply(lambda x: extract_path(x))

In [18]:
# Creating query string column

df["query_string"] = df["URL"].apply(lambda x: extract_query_string(x))

In [19]:
# Creating fragment column

df["fragment"] = df["URL"].apply(lambda x: extract_fragment(x))

In [20]:
df.isnull().sum()

URL                  0
Label                0
domain_name          1
path             62960
query_string    467489
fragment        548838
dtype: int64

In [24]:
frag_null = (df["fragment"].isnull().sum()/df.shape[0])*100

In [25]:
domain_null = (df["domain_name"].isnull().sum()/df.shape[0])*100

In [26]:
path_null = (df["path"].isnull().sum()/df.shape[0])*100

In [27]:
query_string_null = (df["query_string"].isnull().sum()/df.shape[0])*100

In [28]:
df.shape

(549346, 6)

# Extracting other features from the URL

'qty_dot_url', 'qty_hyphen_url', 'qty_underline_url', 'qty_slash_url', 'qty_questionmark_url', 'qty_equal_url', 'qty_at_url', 'qty_and_url', 'qty_exclamation_url', 'qty_space_url', 'qty_tilde_url', 'qty_comma_url', 'qty_plus_url', 'qty_asterisk_url', 'qty_hashtag_url', 'qty_dollar_url', 'qty_percent_url', 'qty_tld_url', 'length_url'

In [21]:
def count_dots(url):
    return url.count(".")

url = "https://plu.mx/plum/a?mendeley_data_id=72ptz43s9v&theme=plum-bigben-theme"
dot_count = count_dots(url)
print("Number of dots in the URL:", dot_count)

Number of dots in the URL: 1


In [31]:
def count_hyphen(url):
    return url.count("-")

hyphen_count = count_hyphen(url)
print("Number of hyphen in the URL:", hyphen_count)

Number of dots in the URL: 2


In [40]:
def count_underline(url):
    return url.count("_")

underline_count = count_underline(url)
print("Number of underline in the URL:", underline_count)

Number of dots in the URL: 2


In [42]:
def count_slash(url):
    return url.count("/")

slash_count = count_slash(url)
print("Number of slash in the URL:", slash_count)

Number of slash in the URL: 4


In [44]:
df["qty_dot_url"] = df["URL"].apply(lambda x: count_dots(x))

In [45]:
df["qty_dot_url"]

0         6
1         5
2         7
3         6
4         1
         ..
549341    3
549342    1
549343    1
549344    1
549345    1
Name: qty_dot_url, Length: 549346, dtype: int64

In [46]:
df["URL"][0]

'nobell.it/70ffb52d079109dca5664cce6f317373782/login.SkyPe.com/en/cgi-bin/verification/login/70ffb52d079109dca5664cce6f317373/index.php?cmd=_profile-ach&outdated_page_tmpl=p/gen/failed-to-load&nav=0.5.1&login_access=1322408526'