In [4]:
import os
from dotenv import load_dotenv

In [5]:
# Load variables from a .env file into the process environment so the later
# os.getenv lookup can see them; the True shown below is the call's return value.
load_dotenv()

True

In [6]:
# Location of the dataset CSV, taken from the PATH_TO_DATASET environment variable.
path_to_csv_file = os.getenv("PATH_TO_DATASET")

# os.getenv returns None when the variable is unset, and os.path.isfile(None)
# raises TypeError rather than returning False, so report that case explicitly
# before running the file check.
if path_to_csv_file is None:
    print('PATH_TO_DATASET is not set')
elif not os.path.isfile(path_to_csv_file):
    print('not a file')

In [9]:
import pandas as pd
import numpy as np 
from matplotlib import pyplot as plt
import seaborn as sns 

In [10]:
# Read the dataset from the configured CSV path and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=path_to_csv_file)
df.head(n=5)

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [12]:
# Gather the summary statistics first, then report them together.
total_urls = df.shape[0]
number_of_types = df['type'].nunique()
types_count = df['type'].value_counts()
type_percentage = df['type'].value_counts(normalize=True) * 100

print(f'Number of total urls: {total_urls}')
print(f'Number of unique types: {number_of_types}')
print(f'Type counts: {types_count}')
# Render each class share as a string with two decimal places.
print(type_percentage.map(lambda share: f'{share:.2f}'))

Number of total urls: 651191
Number of unique types: 4
Type counts: type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64
type
benign        65.74
defacement    14.81
phishing      14.45
malware        4.99
Name: proportion, dtype: object


In [24]:
import plotly.express as px
import plotly.io as pio

# Select plotly's "browser" renderer: per the plotly renderers documentation,
# figures open in a web browser tab instead of being embedded in the notebook output.
pio.renderers.default = "browser"

In [28]:
# Build the per-type URL count chart and apply the axis styling in one chain;
# update_layout returns the figure it was called on.
fig_bar = px.histogram(
    data_frame=df,
    x='type',
    color='type',
    color_discrete_sequence=px.colors.qualitative.Bold,
    labels={'type': 'URL Type', 'count': 'Number of URLs'},
    title='Number of URLs for each type',
).update_layout(
    xaxis_title='URL Type',
    yaxis_title='Number of URLs',
    bargap=0.2,
)

fig_bar.show()

In [33]:
# Keep only the rows labelled as phishing and display the resulting subset.
df_phishing = df.loc[df['type'] == 'phishing']
df_phishing

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
21,signin.eby.de.zukruygxctzmmqi.civpro.co.za,phishing
28,http://www.marketingbyinternet.com/mo/e56508df...,phishing
40,https://docs.google.com/spreadsheet/viewform?f...,phishing
72,retajconsultancy.com,phishing
...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing
651188,www.gamespot.com/xbox360/action/deadspace/,phishing
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing


In [35]:
# Split the remaining classes into their own frames via boolean-mask selection.
df_malware = df.loc[df['type'] == 'malware']
df_defacement = df.loc[df['type'] == 'defacement']
df_benign = df.loc[df['type'] == 'benign']