# Downloading and importing the dataset

In [None]:
# Dataset DL
!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo
# Phishing Websites Data Set as shown in the project assignment
phiusiil_phishing_url_website = fetch_ucirepo(id=967)

# split x/y data for further use
X = phiusiil_phishing_url_website.data.features
y = phiusiil_phishing_url_website.data.targets


Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


## Printing the standard metadata and variable info to check that the dataset was imported correctly

In [None]:

# Print the descriptions that ship with the dataset object, in order:
# first the dataset-level metadata, then the per-variable information.
for section_name in ("metadata", "variables"):
    print(getattr(phiusiil_phishing_url_website, section_name))


{'uci_id': 967, 'name': 'PhiUSIIL Phishing URL (Website)', 'repository_url': 'https://archive.ics.uci.edu/dataset/967/phiusiil+phishing+url+dataset', 'data_url': 'https://archive.ics.uci.edu/static/public/967/data.csv', 'abstract': 'PhiUSIIL Phishing URL Dataset is a substantial dataset comprising 134,850 legitimate and 100,945 phishing URLs. Most of the URLs we analyzed, while constructing the dataset, are the latest URLs. Features are extracted from the source code of the webpage and URL. Features such as CharContinuationRate, URLTitleMatchScore, URLCharProb, and TLDLegitimateProb are derived from existing features.', 'area': 'Computer Science', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 235795, 'num_features': 54, 'feature_types': ['Real', 'Categorical', 'Integer'], 'demographics': [], 'target_col': ['label'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2024, 'last_updated': 'Sun May 12 

## Showing the dataset

In [None]:
# Show X, just to make sure the import looks good.
# X is the last expression in the cell, so the notebook renders it as the
# cell's output.
X

Unnamed: 0,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,URLCharProb,...,Bank,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef
0,https://www.southbankmosaics.com,31,www.southbankmosaics.com,24,0,com,100.000000,1.000000,0.522907,0.061933,...,1,0,0,1,34,20,28,119,0,124
1,https://www.uni-mainz.de,23,www.uni-mainz.de,16,0,de,100.000000,0.666667,0.032650,0.050207,...,0,0,0,1,50,9,8,39,0,217
2,https://www.voicefmradio.co.uk,29,www.voicefmradio.co.uk,22,0,uk,100.000000,0.866667,0.028555,0.064129,...,0,0,0,1,10,2,7,42,2,5
3,https://www.sfnmjournal.com,26,www.sfnmjournal.com,19,0,com,100.000000,1.000000,0.522907,0.057606,...,0,1,1,1,3,27,15,22,1,31
4,https://www.rewildingargentina.org,33,www.rewildingargentina.org,26,0,org,100.000000,1.000000,0.079963,0.059441,...,1,1,0,1,244,15,34,72,1,85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235790,https://www.skincareliving.com,29,www.skincareliving.com,22,0,com,100.000000,1.000000,0.522907,0.058739,...,0,1,0,1,51,7,21,187,2,191
235791,https://www.winchester.gov.uk,28,www.winchester.gov.uk,21,0,uk,100.000000,0.785714,0.028555,0.053834,...,0,1,0,0,50,1,7,88,0,31
235792,https://www.nononsensedesign.be,30,www.nononsensedesign.be,23,0,be,100.000000,1.000000,0.003319,0.063093,...,0,0,0,1,27,10,30,58,2,67
235793,https://patient-cell-40f5.updatedlogmylogin.wo...,55,patient-cell-40f5.updatedlogmylogin.workers.dev,47,0,dev,28.157537,0.465116,0.000961,0.050211,...,0,0,0,0,0,0,3,0,0,0


# Inspecting the dataset


## Checking for N/A values


In [None]:

# Count the missing entries per column in the features (X) and the target (y),
# so we know whether any rows/columns need cleaning before going further.
import pandas as pd

na_counts_x = X.isnull().sum(axis=0)
na_counts_y = y.isnull().sum(axis=0)

print("N/A counts in X:\n", na_counts_x)
print("\nN/A counts in y:\n", na_counts_y)


N/A counts in X:
 URL                           0
URLLength                     0
Domain                        0
DomainLength                  0
IsDomainIP                    0
TLD                           0
URLSimilarityIndex            0
CharContinuationRate          0
TLDLegitimateProb             0
URLCharProb                   0
TLDLength                     0
NoOfSubDomain                 0
HasObfuscation                0
NoOfObfuscatedChar            0
ObfuscationRatio              0
NoOfLettersInURL              0
LetterRatioInURL              0
NoOfDegitsInURL               0
DegitRatioInURL               0
NoOfEqualsInURL               0
NoOfQMarkInURL                0
NoOfAmpersandInURL            0
NoOfOtherSpecialCharsInURL    0
SpacialCharRatioInURL         0
IsHTTPS                       0
LineOfCode                    0
LargestLineLength             0
HasTitle                      0
Title                         0
DomainTitleMatchScore         0
URLTitleMatchScore    

## Checking for anomalies (values more than 3 standard deviations above/below the mean) and their relation to the labels

In [None]:
# No N/A values in X and y, so no need to clean them off.
# Let's check for anomalies in the data, and check which classes they relate to.
import numpy as np

# A value is flagged as anomalous when it lies more than this many standard
# deviations away from its column mean. Loop-invariant, so defined once.
threshold = 3

for col in X.columns:
    # Mean / standard deviation only make sense for numeric columns.
    if not pd.api.types.is_numeric_dtype(X[col]):
        continue

    mean = np.mean(X[col])
    std = np.std(X[col])  # called with NumPy's default ddof=0 (population std)

    # Build the outlier mask once and reuse it, instead of recomputing it for
    # the emptiness check, the label listing and the label counts.
    is_anomalous = np.abs(X[col] - mean) > threshold * std
    if not is_anomalous.any():
        continue

    print(f"Anomalous values in column '{col}':")
    anomalous_labels = y[is_anomalous]
    print(anomalous_labels)
    # How many 0/1 labels are there among the anomalous rows?
    label_counts = anomalous_labels.value_counts()
    print(label_counts)



Anomalous values in column 'URLLength':
        label
401         0
458         0
939         0
1093        0
1205        0
...       ...
234393      0
234994      0
235468      0
235634      0
235722      0

[2198 rows x 1 columns]
label
0        2198
Name: count, dtype: int64
Anomalous values in column 'DomainLength':
        label
259         0
351         0
378         0
513         0
600         0
...       ...
235268      0
235403      0
235425      0
235487      0
235621      0

[3826 rows x 1 columns]
label
0        3825
1           1
Name: count, dtype: int64
Anomalous values in column 'IsDomainIP':
        label
125         0
401         0
1205        0
1362        0
2481        0
...       ...
230582      0
233569      0
233640      0
233868      0
233955      0

[638 rows x 1 columns]
label
0        638
Name: count, dtype: int64
Anomalous values in column 'CharContinuationRate':
        label
810         0
1125        0
1910        0
3150        0
4535        0
...       ..

## Checking string column formats

In [None]:
# Few anomalies and no clear patterns for most of the big ones, so we can keep the data as is.
# Next, look at the string columns to see whether they need any special handling.
string_columns = [name for name in X.columns if pd.api.types.is_string_dtype(X[name])]

for col in string_columns:
    # Preview the first 50 values of each string-typed column.
    print(f"Column: {col}")
    print(X[col].head(50))
    print("-" * 20)

Column: URL
0                      https://www.southbankmosaics.com
1                              https://www.uni-mainz.de
2                        https://www.voicefmradio.co.uk
3                           https://www.sfnmjournal.com
4                    https://www.rewildingargentina.org
5                       https://www.globalreporting.org
6                            https://www.saffronart.com
7                            https://www.nerdscandy.com
8                        https://www.hyderabadonline.in
9                                   https://www.aap.org
10                   https://www.religionenlibertad.com
11                              http://www.teramill.com
12                         https://www.socialpolicy.org
13                                https://www.aoh61.com
14                          https://www.bulgariaski.com
15                            https://www.brightika.com
16                                https://www.motley.ie
17                               htt

## Checking URL validity (since it's easily verifiable via regex)

In [None]:
# URLs can easily be checked for validity using a regex, so let's do that
import re

# Compiled once at definition time instead of on every call, since the
# validator is applied to every URL in the dataset.
# Structure: scheme (http/https/ftp/ftps), then either a dotted domain name or
# a dotted-quad IPv4 address, then an optional path/query with no whitespace.
_URL_PATTERN = re.compile(
    r'^(?:http|ftp)s?://'
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)


def is_valid_url(url):
    """Return True if ``url`` matches the notebook's simple URL pattern.

    The pattern accepts ``http``/``https``/``ftp``/``ftps`` URLs whose host is
    a dotted domain name or an IPv4-style address, optionally followed by a
    path or query string.

    Note that the host must be followed directly by the path (or the end of
    the string), so URLs carrying user-info (``user@host``), an explicit port
    (``host:8080``) or a fragment right after the host are reported as invalid
    even though they can be legitimate URLs.

    Parameters
    ----------
    url : object
        The value to check. Anything that is not a ``str`` (e.g. ``None`` or
        ``NaN``) is treated as invalid instead of raising ``TypeError``.

    Returns
    -------
    bool
        True when the whole string matches the pattern, False otherwise.
    """
    if not isinstance(url, str):
        return False
    return _URL_PATTERN.match(url) is not None

# Report every URL that the regex check rejects; URLs that pass print nothing.
if 'URL' in X.columns:
    # Lazy filter: each URL is checked right before it would be printed.
    rejected_urls = (candidate for candidate in X["URL"] if not is_valid_url(candidate))
    for url in rejected_urls:
        print(f"'{url}' is NOT a valid URL.")

'http://www.paypal.com:911@xn--80aaahorunoimsf3p.xn--p1ai/fix-account/' is NOT a valid URL.
'http://1975.canadawaterfilters.ca#example@example.com' is NOT a valid URL.
'http://netflixaccount.node:911@env-2949886.node.cloudlets.zone/' is NOT a valid URL.
'https://vystar..0rg@client.iebetanialaargentina.edu.co' is NOT a valid URL.
'http://account.venmo.com.sign-in:911@xn--80aaahorunoimsf3p.xn--p1ai/v/' is NOT a valid URL.
'https://support.ionos.de@safebikehelmet.com/n#redacted@abuse.ionos.com' is NOT a valid URL.
'https://mail.lonos.com:authentification-release@dheed.com/media' is NOT a valid URL.
'http://www.accntnetflix.com:911@s954760881.onlinehome.us/aq' is NOT a valid URL.
'http://4433.canadawaterfilters.ca#example@example.com' is NOT a valid URL.
'https://valoresareceber@aux1.ru/go/' is NOT a valid URL.
'http://restart-membership.com:911@s955723798.onlinehome.us/ven' is NOT a valid URL.
'http://danaxtell.perroandante.cl#danaxtell@danaxtell.com' is NOT a valid URL.
'http://exu0wgk02

# Conclusion


We were able to download the dataset and do some initial data-cleaning checks on it. The data is already well cleaned, as a result of it being used in research before publication, which is a good thing.

Regardless, we noted some anomalies above, but there are not enough of them to justify cleaning, and there is no clear pattern in the columns where cleaning might be needed, so we left the data as it is.

Feature extraction is not really required, as the dataset already includes a number of extracted features, including ones derived from the URL, so unless it becomes necessary we won't do it right now.

We tested the URLs for validity since it's easy to do. We found some supposedly invalid ones with an @ in the middle, but if you hover over them you realise that the link only leads to the part after the @. This is a phishing technique, so these URLs need to stay in the dataset.