# Datasets available


## 1. Kaggle  
https://www.kaggle.com/datasets/harisudhan411/phishing-and-legitimate-urls/data

In [1]:
import pandas as pd
# Load the Kaggle phishing/legitimate URL export (source linked above) and show (rows, columns).
data0 = pd.read_csv("data/1.URL-Kaggle-800k.csv")
data0.shape

(822010, 2)

In [2]:
# Preview the first rows: a `url` column plus a numeric `status` label.
data0.head()

Unnamed: 0,url,status
0,0000111servicehelpdesk.godaddysites.com,0
1,000011accesswebform.godaddysites.com,0
2,00003.online,0
3,0009servicedeskowa.godaddysites.com,0
4,000n38p.wcomhost.com,0


## 2. Hugging Face - bgspaditya  
https://huggingface.co/datasets/bgspaditya/phishing-dataset

In [2]:
# Load the Hugging Face (bgspaditya) phishing dataset export and show (rows, columns).
data1 = pd.read_csv("data/2.hf-aditya.csv")
data1.shape

(651191, 2)

In [4]:
# Preview the first rows: a `url` column plus a string `type` label.
data1.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


### 1. Merging this dataset with previous one

In [3]:
# Full outer join of both sources on `url`: every URL from either frame is kept, with
# `status` (data0) and `type` (data1) side by side and NaN where a source lacks the URL.
# NOTE: neither input is de-duplicated first, so a URL repeated in the inputs yields
# one output row per left/right pairing.
# reset_index() turns the row position into an explicit `index` column used by later cells.
merge01 = pd.merge(data0, data1, on="url", how="outer").reset_index()
merge01.head()

Unnamed: 0,index,url,status,type
0,0,\t¯7Lø.á>V>4z¢¶Mù2<¯LOâ¿31È/µ$*)Då©WÄcg...,1.0,phishing
1,1,@ÒÊ\t¹Ë¨öí,1.0,phishing
2,2,HÖË]t¹[ÈöýE,1.0,phishing
3,3,YìêkoãÕ»Î§DéÎl½ñ¡ââqtò¸/à; Í,1.0,phishing
4,4,^oð]Â|¬|hõElòdy^Å~fb_jH0TRËR¯ÆÏa...,1.0,phishing


In [23]:
# What are all these random symbols? Surely they are not actual URLs.
merge01.shape
# This shows there are 602 new records that are not supposed to be there.
# NOTE(review): the baseline the "602" figure is measured against is not visible here — TODO confirm.

(1029017, 3)

In [39]:
# Inner-join the merged frame back onto data1 by URL, keeping only URLs present in both.
# The equivalent join against data0 is left disabled:
#pd.merge(merge01, data0, on=['url'], how = "inner").shape
test = pd.merge(merge01, data1, how="inner", on="url")



(887638, 5)

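Issues with merging:  
1. Duplicate URLs.
2. URLs with contradicting labels, i.e. one dataset marks the URL as phishing while the other marks it as benign (in the 0 = benign / 1 = phishing coding used after the label switch below: `1` with benign, or `0` with phishing).
3. data1 has values other than phishing and benign. - solved
4. urls (TODO: this item was left unfinished — possibly the garbled, non-URL strings seen in the merged preview above.)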

### data1 has values other than phishing and benign - solved

In [71]:
# Debugging: boolean mask marking the rows whose data1 `type` is "defacement".
merge01["type"] == "defacement"

0          False
1          False
2          False
3          False
4          False
           ...  
1029012    False
1029013    False
1029014    False
1029015    False
1029016    False
Name: type, Length: 1029017, dtype: bool

In [4]:
# "defacement" is not a class we need: drop those rows.
# Only rows that also lack a data0 `status` are removed; a defacement row that
# did match a data0 URL would be kept.
is_unmatched_defacement = merge01["status"].isna() & (merge01["type"] == "defacement")
merge01 = merge01.drop(index=merge01[is_unmatched_defacement].index)

In [5]:
# Treat data1's "malware" class as "phishing".
# Only the `type` column is rewritten: a frame-wide replace would also change any
# other cell (for example a `url`) whose entire value is the string "malware".
# assign() returns a new frame, so no chained-assignment warning is triggered.
merge01 = merge01.assign(type=merge01["type"].replace("malware", "phishing"))
merge01["type"].unique()

array(['phishing', nan, 'benign'], dtype=object)

### creating label column - issue unsolved

In [8]:
# Renaming columns positionally: `status` (data0) becomes A and `type` (data1) becomes B.
# The list must match the current column count exactly, otherwise pandas raises.
merge01.columns = ["index", "url", "A", "B"]
merge01.head()


Unnamed: 0,index,url,A,B
0,0,\t¯7Lø.á>V>4z¢¶Mù2<¯LOâ¿31È/µ$*)Då©WÄcg...,1.0,phishing
1,1,@ÒÊ\t¹Ë¨öí,1.0,phishing
2,2,HÖË]t¹[ÈöýE,1.0,phishing
3,3,YìêkoãÕ»Î§DéÎl½ñ¡ââqtò¸/à; Í,1.0,phishing
4,4,^oð]Â|¬|hõElòdy^Å~fb_jH0TRËR¯ÆÏa...,1.0,phishing


In [12]:
# Percentage of rows whose two labels contradict each other.
# A still holds the raw data0 status here (it is flipped to 0 = benign / 1 = phishing
# further down), so A == 1 with "phishing" and A == 0 with "benign" are the
# disagreeing pairs. The two cases are mutually exclusive, so one OR-ed mask counts
# the same rows as summing them separately.
contradicts = ((merge01["A"] == 1) & (merge01["B"] == "phishing")) | (
    (merge01["A"] == 0) & (merge01["B"] == "benign")
)
# Vectorised count instead of the builtin sum(), which iterates row by row in Python.
n_error = int(contradicts.sum())
# Denominator is every row, including rows where one of the labels is missing.
n_total = len(merge01)
per_error = n_error/n_total
print(per_error*100)

10.293922106888564


### Duplicates

In [109]:
# Distinct values per column; fewer distinct `url` values than `index` values means repeated URLs.
merge01.nunique()

index    932560
url      909668
A             2
B             2
dtype: int64

In [13]:
# Keep a single row per URL. The first occurrence wins, so if the repeated rows
# carried different labels only the first row's labels survive.
merge01 = merge01.drop_duplicates(subset=["url"], keep="first")
# After this, the distinct counts of `index` and `url` should be equal.
merge01.nunique()

index    909668
url      909668
A             2
B             2
dtype: int64

In [100]:
# Raw row count of data0, for comparison with the distinct counts below.
data0.shape

(822010, 2)

In [102]:
# Distinct URLs and distinct status values in data0.
data0.nunique()

url       808042
status         2
dtype: int64

In [101]:
# Rows left after removing fully identical (url, status) rows. If this equals the
# distinct-URL count, no URL appears with two different statuses.
data0.drop_duplicates().shape

(808042, 2)

In [103]:
# Raw row count of data1, for comparison with the distinct counts below.
data1.shape

(651191, 2)

In [104]:
# Distinct URLs and distinct type values in data1.
data1.nunique()

url     641119
type         4
dtype: int64

In [105]:
# Rows left after removing fully identical (url, type) rows. If this exceeds the
# distinct-URL count, some URLs occur with more than one `type`.
data1.drop_duplicates().shape

(641125, 2)

## *merge01* now has no duplicate URLs, and each row holds the labels from the two datasets in columns A and B (NaN where a dataset lacks the URL).

### Create new label "A" column switching labels to 0 for benign, 1 for phishing

In [133]:
# Preview merge01 before recoding column A.
merge01.head()

Unnamed: 0,index,url,B,A
0,0,\t¯7Lø.á>V>4z¢¶Mù2<¯LOâ¿31È/µ$*)Då©WÄcg...,phishing,1.0
1,1,@ÒÊ\t¹Ë¨öí,phishing,1.0
2,2,HÖË]t¹[ÈöýE,phishing,1.0
3,3,YìêkoãÕ»Î§DéÎl½ñ¡ââqtò¸/à; Í,phishing,1.0
4,4,^oð]Â|¬|hõElòdy^Å~fb_jH0TRËR¯ÆÏa...,phishing,1.0


In [14]:
# Flip column A so it uses 0 = benign, 1 = phishing (see the heading above).
# This overwrites A rather than adding a column, because later cells rename the
# four columns by position.
# WARNING: not idempotent — running this cell a second time flips the labels back.
import numpy as np
# assign() builds a new frame; writing merge01["A"] = ... on the result of
# drop_duplicates() raises pandas' SettingWithCopyWarning.
merge01 = merge01.assign(A=merge01["A"].map({1: 0, 0: 1}))
merge01.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merge01["A"] = merge01["A"].map({1:0 , 0:1})


Unnamed: 0,index,url,A,B
0,0,\t¯7Lø.á>V>4z¢¶Mù2<¯LOâ¿31È/µ$*)Då©WÄcg...,0.0,phishing
1,1,@ÒÊ\t¹Ë¨öí,0.0,phishing
2,2,HÖË]t¹[ÈöýE,0.0,phishing
3,3,YìêkoãÕ»Î§DéÎl½ñ¡ââqtò¸/à; Í,0.0,phishing
4,4,^oð]Â|¬|hõElòdy^Å~fb_jH0TRËR¯ÆÏa...,0.0,phishing


In [15]:
# Work on an independent deep copy from here on, so that re-running the cells below
# cannot corrupt merge01 again (copy() is deep by default).
mergeAB = merge01.copy()

In [21]:
# Positional rename: A becomes label_A (recoded data0 label), B becomes label_B (data1 label).
mergeAB.columns = ["index", "url", "label_A", "label_B"]
mergeAB.head()

Unnamed: 0,index,url,label_A,label_B
0,0,\t¯7Lø.á>V>4z¢¶Mù2<¯LOâ¿31È/µ$*)Då©WÄcg...,0.0,phishing
1,1,@ÒÊ\t¹Ë¨öí,0.0,phishing
2,2,HÖË]t¹[ÈöýE,0.0,phishing
3,3,YìêkoãÕ»Î§DéÎl½ñ¡ââqtò¸/à; Í,0.0,phishing
4,4,^oð]Â|¬|hõElòdy^Å~fb_jH0TRËR¯ÆÏa...,0.0,phishing


### Split into "contradicting data" & labelled data  
1. Contradicting rows: `label_A == 0` with `label_B == "phishing"`, and `label_A == 1` with `label_B == "benign"`.

#### Get and save unlabelled data

In [22]:
# Rows usable as labelled data:
#   * both sources agree (label_A == 1 with "phishing", or label_A == 0 with "benign"), or
#   * at least one of the two labels is missing.
labels_agree = ((mergeAB["label_A"] == 1) & (mergeAB["label_B"] == "phishing")) | (
    (mergeAB["label_A"] == 0) & (mergeAB["label_B"] == "benign")
)
label_missing = mergeAB["label_A"].isna() | mergeAB["label_B"].isna()
labelled_url1 = mergeAB[labels_agree]
labelled_url2 = mergeAB[label_missing]
# The two subsets are disjoint slices of the same frame, so their union is one boolean
# selection. This avoids a four-key outer merge (including NaN keys) used only to stack
# them. Row order follows mergeAB, and reset_index(drop=True) gives the fresh 0..n-1
# row index the merge used to produce.
labelled_url = mergeAB[labels_agree | label_missing].reset_index(drop=True)
labelled_url.shape

(813679, 4)

In [23]:
# Rows whose two labels disagree: label_B says "phishing" while label_A is 0,
# or label_B says "benign" while label_A is 1.
phishing_with_a0 = (mergeAB["label_B"] == "phishing") & (mergeAB["label_A"] == 0)
benign_with_a1 = (mergeAB["label_B"] == "benign") & (mergeAB["label_A"] == 1)
unlabelled_url = mergeAB[phishing_with_a0 | benign_with_a1]

In [24]:
# The contradicting labels are discarded; these URLs are to be re-tested.
unlabelled_url = unlabelled_url.drop(columns= ["label_A", "label_B"])

In [None]:
# Renumber the rows 0..n-1 and save the contradicting URLs.
# drop=True discards the old row labels. A plain reset_index() would push them into
# an extra column and raise on a later re-run once that column name is taken.
unlabelled_url = unlabelled_url.reset_index(drop=True)
unlabelled_url.to_csv('URL_to_test.csv')

In [33]:
# Write the contradicting URLs to disk (the row index is written as the first,
# unnamed CSV column) and preview them.
unlabelled_url.to_csv("URL_to_test.csv")
unlabelled_url.head()

Unnamed: 0,url
0,\t¯7Lø.á>V>4z¢¶Mù2<¯LOâ¿31È/µ$*)Då©WÄcg...
1,@ÒÊ\t¹Ë¨öí
2,HÖË]t¹[ÈöýE
3,YìêkoãÕ»Î§DéÎl½ñ¡ââqtò¸/à; Í
4,^oð]Â|¬|hõElòdy^Å~fb_jH0TRËR¯ÆÏa...


#### Get labelled data and clean up

In [35]:
# Report the size of each split next to the source frame; the two splits should
# add up to the row count of mergeAB.
for caption, frame in (
    ("Labelled url:", labelled_url),
    ("Unlabelled_url: ", unlabelled_url),
    ("mergeAB: ", mergeAB),
):
    print(caption, frame.shape)

Labelled url: (813679, 4)
Unlabelled_url:  (95989, 1)
mergeAB:  (909668, 4)


In [36]:
# Preview the labelled split before converting label_B to numbers.
labelled_url.head()

Unnamed: 0,index,url,label_A,label_B
0,10,69.162.100.198/,1.0,
1,11,babicz123.ddns.net/,1.0,
2,12,highpowerresources.com,1.0,
3,13,intent.nofrillspace.com/users/web11_focus/380...,1.0,
4,14,intent.nofrillspace.com/users/web11_focus/430...,1.0,


In [37]:
# Encode the data1 string labels with the same 0/1 coding as label_A.
# NOTE: values missing from the mapping become NaN, so re-running this cell on the
# already-encoded column would wipe it.
label_b_codes = {"phishing": 1, "benign": 0}
labelled_url["label_B"] = labelled_url["label_B"].map(label_b_codes)
labelled_url.head()

Unnamed: 0,index,url,label_A,label_B
0,10,69.162.100.198/,1.0,
1,11,babicz123.ddns.net/,1.0,
2,12,highpowerresources.com,1.0,
3,13,intent.nofrillspace.com/users/web11_focus/380...,1.0,
4,14,intent.nofrillspace.com/users/web11_focus/430...,1.0,


In [38]:
# Move the current row index into a column. Because a column named "index" already
# exists, pandas names the new one "level_0" (later cells rely on that name).
# NOTE: running this twice raises, since "level_0" would already be present.
labelled_url = labelled_url.reset_index()

In [41]:
# The old `index` column carried over from the merge is no longer needed.
labelled_url = labelled_url.drop(columns = ["index"])

In [96]:
# Spot-check: the row whose `level_0` value is 0.
labelled_url.loc[labelled_url['level_0'] == 0]

Unnamed: 0,level_0,url,label_A,label_B
0,0,69.162.100.198/,1.0,


In [112]:
def labelling(row):
    """Choose the final label for one row.

    Returns ``row.label_A`` when it equals 0 or 1; for anything else (for
    example NaN) it returns ``row.label_B``.
    """
    has_binary_a = row.label_A in (0, 1)
    return row.label_A if has_binary_a else row.label_B


def labelling1(row):
    """Return the row's ``label_A`` value unchanged."""
    return row["label_A"]


# Final label: keep label_A where it is 0 or 1, otherwise fall back to label_B.
# This is the vectorised equivalent of applying `labelling` to every row, without
# the per-row Python call of DataFrame.apply(axis=1).
labelled_url["label"] = labelled_url["label_A"].where(
    labelled_url["label_A"].isin([0, 1]), labelled_url["label_B"]
)

In [113]:
# With the combined `label` in place, the two per-source label columns are dropped.
labelled_url = labelled_url.drop(columns=["label_A", "label_B"])
print(labelled_url.shape)

(813679, 3)


Unnamed: 0,level_0,url,label
0,0,69.162.100.198/,1.0
1,1,babicz123.ddns.net/,1.0
2,2,highpowerresources.com,1.0
3,3,intent.nofrillspace.com/users/web11_focus/380...,1.0
4,4,intent.nofrillspace.com/users/web11_focus/430...,1.0


In [100]:
# Number of rows with no label_A.
# NOTE(review): label_A is dropped by an earlier cell in notebook order, so this
# check only works when run before that drop — TODO move it up or remove it.
# Vectorised count; int() keeps the displayed value a plain Python integer.
int(labelled_url["label_A"].isna().sum())

101626

In [75]:
# Missing-value flags for both label columns, then the rows where label_B is missing.
# NOTE(review): this needs label_A/label_B, which an earlier cell in notebook order
# drops — TODO confirm the intended position of this check.
check = labelled_url[["label_A", "label_B"]].isna()
check[check["label_B"]]

Unnamed: 0,label_A,label_B


In [101]:
# Display the labelled frame (pandas truncates long frames in the rendered output).
labelled_url

Unnamed: 0,level_0,url,label_A,label_B
0,0,69.162.100.198/,1.0,
1,1,babicz123.ddns.net/,1.0,
2,2,highpowerresources.com,1.0,
3,3,intent.nofrillspace.com/users/web11_focus/380...,1.0,
4,4,intent.nofrillspace.com/users/web11_focus/430...,1.0,
...,...,...,...,...
813674,813674,zzz.co.uk,1.0,
813675,813675,zzzoolight.co.za,1.0,
813676,813676,zzzoolight.co.za0-i-fdik.000webhostapp.com,1.0,
813677,813677,zzzort10xtest123.com/nin5k3bwo,1.0,


In [115]:
# Drop the helper `level_0` column and save the labelled URLs. The row index is
# written as the first, unnamed CSV column.
labelled_url = labelled_url.drop( columns= ["level_0"])
labelled_url.to_csv("Phishing_URL_0.csv")