In [1]:
import pandas as pd
from urllib.parse import urlsplit
from urllib.parse import urlparse
import re

In [2]:
# Reading Data

df = pd.read_csv("phishing_site_urls.csv")

In [3]:
# Checking first 5 rows

df.head()

Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


In [4]:
# Number of rows and columns

df.shape

(549346, 2)

In [5]:
# Describe the dataset

df.describe()

Unnamed: 0,URL,Label
count,549346,549346
unique,507195,2
top,jhomitevd2abj3fk.tor2web.org/,good
freq,52,392924


In [6]:
# Checking datatypes of the columns

df.dtypes

URL      object
Label    object
dtype: object

In [7]:
# Columns in the dataset

df.columns

Index(['URL', 'Label'], dtype='object')

# Extracting data from the URL

In [8]:
df.columns

Index(['URL', 'Label'], dtype='object')

In [9]:
# Extracting domain name from the URL
def extract_domain(url):
    match = re.search("^[^/]+", url)
    return match.group() if match else None

In [10]:
# Extracting path from the URL

def extract_path(url):
    match = re.search("/[^?]+", url)
    return match.group() if match else None

In [11]:
# Extracting query string from the URL

def extract_query_string(url):
    match = re.search("\?[^#]+", url)
    return match.group() if match else None

In [12]:
# Extracting fragment from the URL

def extract_fragment(url):
    match = re.search("#[^/]+", url)
    return match.group()[1:] if match else None

In [13]:
# Creating domain name column

df["domain_name"] = df["URL"].apply(lambda x: extract_domain(x))

In [14]:
# Creating path column

df["path"] = df["URL"].apply(lambda x: extract_path(x))

In [15]:
# Creating query string column

df["query_string"] = df["URL"].apply(lambda x: extract_query_string(x))

In [16]:
# Creating fragment column

df["fragment"] = df["URL"].apply(lambda x: extract_fragment(x))

In [17]:
df.isnull().sum()

URL                  0
Label                0
domain_name          1
path             62960
query_string    467489
fragment        548838
dtype: int64

In [None]:
def count_dots(url):
    return url.count(".")

url = "https://plu.mx/plum/a?mendeley_data_id=72ptz43s9v&theme=plum-bigben-theme"
dot_count = count_dots(url)
print("Number of dots in the URL:", dot_count)

In [18]:
def count_slash(url):
    return url.count("/")

In [19]:
slash_count = count_slash(url)
print("Number of dots in the URL:", slash_count)

NameError: name 'url' is not defined

In [20]:
df.head()

Unnamed: 0,URL,Label,domain_name,path,query_string,fragment
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad,nobell.it,/70ffb52d079109dca5664cce6f317373782/login.Sky...,?cmd=_profile-ach&outdated_page_tmpl=p/gen/fai...,
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad,www.dghjdgf.com,/paypal.co.uk/cycgi-bin/webscrcmd=_home-custom...,,
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad,serviciosbys.com,/paypal.cgi.bin.get-into.herf.secure.dispatch3...,,
3,mail.printakid.com/www.online.americanexpress....,bad,mail.printakid.com,/www.online.americanexpress.com/index.html,,
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad,thewhiskeydregs.com,/wp-content/themes/widescreen/includes/temp/pr...,?84784787824HDJNDJDSJSHD//2724782784/,


In [21]:
frag_null = (df["fragment"].isnull().sum()/df.shape[0])*100

In [22]:
frag_null

99.90752640412418

In [24]:
(df["domain_name"].isnull().sum()/df.shape[0])*100

0.00018203463755083317

In [25]:
(df["path"].isnull().sum()/df.shape[0])*100

11.460900780200456

In [26]:
(df["query_string"].isnull().sum()/df.shape[0])*100

85.09919067400145

In [27]:
df.shape

(549346, 6)

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549346 entries, 0 to 549345
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   URL           549346 non-null  object
 1   Label         549346 non-null  object
 2   domain_name   549345 non-null  object
 3   path          486386 non-null  object
 4   query_string  81857 non-null   object
 5   fragment      508 non-null     object
dtypes: object(6)
memory usage: 25.1+ MB


In [30]:
df.describe()

Unnamed: 0,URL,Label,domain_name,path,query_string,fragment
count,549346,549346,549345,486386,81857,508
unique,507195,2,190302,366823,64498,257
top,jhomitevd2abj3fk.tor2web.org/,good,en.wikipedia.org,/watch,?m=login,n=1252899642&fid=1&fav=1
freq,52,392924,13206,8552,1157,187
