Training ML models to detect phishing URLs

In [18]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score
import math
from sklearn.svm import SVC

In [2]:
# Load the labelled URL dataset, then show its row count and class balance.
df = pd.read_csv('phishing_site_urls.csv')
print(len(df))
df['Label'].value_counts()

549346


good    392924
bad     156422
Name: Label, dtype: int64

In [3]:
# Replace the string labels with integer codes. LabelEncoder numbers the
# classes in sorted order, so 'bad' becomes 0 and 'good' becomes 1.
Label = LabelEncoder()
encoded_labels = Label.fit_transform(df['Label'])
df['Label'] = encoded_labels
df.head(10)

Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,0
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,0
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,0
3,mail.printakid.com/www.online.americanexpress....,0
4,thewhiskeydregs.com/wp-content/themes/widescre...,0
5,smilesvoegol.servebbs.org/voegol.php,0
6,premierpaymentprocessing.com/includes/boleto-2...,0
7,myxxxcollection.com/v1/js/jih321/bpd.com.do/do...,0
8,super1000.info/docs,0
9,horizonsgallery.com/js/bin/ssl1/_id/www.paypal...,0


In [4]:
# Hand-crafted lexical features derived from the raw URL string.
urls = df['URL']
df['url_length'] = urls.map(len)

# Occurrence counts of fixed substrings (plain, non-overlapping str.count).
substring_counts = {
    'num_dots': '.',
    'num_slash': '/',
    'num_redir': '//',
    'num_dash': '-',
}
for column, substring in substring_counts.items():
    # Bind the substring as a default argument so each lambda keeps its own value.
    df[column] = urls.map(lambda url, needle=substring: url.count(needle))

# Boolean flags.
# NOTE(review): has_https is True when "https" appears anywhere in the URL
# (e.g. inside the path), not only as the scheme -- confirm this is intended.
df['contains_anchor'] = urls.str.contains('#')
df['has_https'] = urls.str.contains("https")
df.head()

Unnamed: 0,URL,Label,url_length,num_dots,num_slash,num_redir,num_dash,contains_anchor,has_https
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,0,225,6,10,0,4,False,False
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,0,81,5,4,0,2,False,False
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,0,177,7,11,0,1,False,False
3,mail.printakid.com/www.online.americanexpress....,0,60,6,2,0,0,False,False
4,thewhiskeydregs.com/wp-content/themes/widescre...,0,116,1,10,1,1,False,False


In [5]:
# Bring the rows flagged has_https=True to the top to inspect what the flag matches.
sorted_df = df.sort_values('has_https', ascending=False)
sorted_df.head()

Unnamed: 0,URL,Label,url_length,num_dots,num_slash,num_redir,num_dash,contains_anchor,has_https
135630,solversa.com/components/https/dropbox/login/,0,44,1,5,0,0,False,True
25071,'us.battle.met.woribofwaroraft.com/login/en/in...,0,139,8,3,0,0,False,True
16490,'us.battlle.net.htm.bwfx.info/battle_net_accou...,0,147,9,1,0,0,False,True
121659,beauty-plus.co.uk/cli/https:/atendimento/chama...,0,541,3,4,0,1,False,True
121655,beauty-plus.co.uk/cli/https:/atendimento,0,40,2,3,0,1,False,True


In [6]:
def contains_unicode(url):
    """Return True if ``url`` has at least one character outside 7-bit ASCII.

    A character counts as non-ASCII when its code point is greater than 127.
    An empty string yields False.
    """
    return any(ord(symbol) > 127 for symbol in url)

# Flag every URL that contains at least one non-ASCII character.
df['contains_unicode'] = df['URL'].map(contains_unicode)
df.head()

Unnamed: 0,URL,Label,url_length,num_dots,num_slash,num_redir,num_dash,contains_anchor,has_https,contains_unicode
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,0,225,6,10,0,4,False,False,False
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,0,81,5,4,0,2,False,False,False
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,0,177,7,11,0,1,False,False,False
3,mail.printakid.com/www.online.americanexpress....,0,60,6,2,0,0,False,False,False
4,thewhiskeydregs.com/wp-content/themes/widescre...,0,116,1,10,1,1,False,False,False


In [7]:
# Feature matrix: every column except the raw URL text and the label.
# NOTE(review): the name `input` shadows Python's built-in input(); it is kept
# here because later cells refer to it.
input = df.drop(columns=['URL', 'Label'])
target = df['Label']
print(target.head())
input.head(10)

0    0
1    0
2    0
3    0
4    0
Name: Label, dtype: int32


Unnamed: 0,url_length,num_dots,num_slash,num_redir,num_dash,contains_anchor,has_https,contains_unicode
0,225,6,10,0,4,False,False,False
1,81,5,4,0,2,False,False,False
2,177,7,11,0,1,False,False,False
3,60,6,2,0,0,False,False,False
4,116,1,10,1,1,False,False,False
5,36,3,1,0,0,False,False,False
6,61,2,2,0,3,False,False,False
7,60,5,6,0,0,False,False,False
8,19,1,1,0,0,False,False,False
9,193,4,10,0,3,False,False,False


In [8]:
# Sanity check: number of rows in the feature matrix.
len (input)

549346

In [9]:
# Hold out 40% of the rows for evaluation. random_state pins the shuffle so
# the split -- and every score computed from it -- is identical on each
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    input, target, test_size=0.4, random_state=42
)
print(len(X_test))
print(len(X_train))

219739
329607


Decision Tree

In [10]:
# Decision tree with default hyperparameters. random_state fixes the random
# feature permutation used when choosing splits, so the fitted tree (and its
# score) is reproducible across runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

In [11]:
# Mean accuracy of the fitted model on the held-out test split.
model.score(X_test, y_test)

0.7890679396920892

Random Forests

In [12]:
# Random forest of 30 trees. random_state seeds the bootstrap sampling and
# per-split feature selection so the ensemble is reproducible across runs.
# Note that this rebinds `model`, replacing the classifier fitted earlier.
model = RandomForestClassifier(n_estimators=30, random_state=42)
model.fit(X_train, y_train)

In [13]:
# Mean accuracy of the fitted model on the held-out test split.
model.score(X_test, y_test)

0.790637984153928

In [14]:
# Per-class precision, recall and F1 for the current model on the test split.
y_pred = model.predict(X_test)
model_report = classification_report(y_test, y_pred)
print(model_report)

              precision    recall  f1-score   support

           0       0.79      0.36      0.50     62530
           1       0.79      0.96      0.87    157209

    accuracy                           0.79    219739
   macro avg       0.79      0.66      0.68    219739
weighted avg       0.79      0.79      0.76    219739



K-Nearest Neighbors

In [15]:
# Re-split with only 10% held out; this overwrites the earlier 60/40 split, so
# models evaluated after this cell use a different test set than those above.
# random_state pins the shuffle so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    input, target, test_size=0.1, random_state=42
)

In [34]:
# k-nearest-neighbours classifier that votes among the 7 closest training rows.
# NOTE(review): the features are passed in unscaled, so url_length (tens to
# hundreds) presumably dominates the distance over the boolean flags --
# consider standardizing the features first; confirm with an experiment.
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

In [35]:
# Predict labels for the held-out rows; this rebinds y_pred from earlier cells.
y_pred = knn.predict(X_test)

In [36]:
# Fraction of test rows whose predicted label matches the true label,
# printed rounded to two decimals.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.76
