<a href="https://colab.research.google.com/github/AbhishekM100/Mastering-Machine-Learning-for-Penetration-Testing/blob/master/Detecting_Phishing_URL_by_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import random

#Machine Learning packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [0]:
# Load URL Data
data = pd.read_csv("data.csv")

In [50]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420464 entries, 0 to 420463
Data columns (total 2 columns):
url      420464 non-null object
label    420464 non-null object
dtypes: object(2)
memory usage: 6.4+ MB


In [51]:
data.head()

Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad


**Data Tokenizer**

In [0]:
def makeTokens(f):
  tkns_BySlash = str(f.encode('utf-8')).split('/')  #Make tokens after splitting by slash
  total_tokens = []
  for i in tkns_BySlash:
    tokens = str(i).split('-')                      #Splitting by dash
    tkns_ByDot = []
    for j in range(0, len(tokens)):
      temp_tokens = str(tokens[j]).split('.')       #Splitting by dot
    total_tokens = total_tokens + tokens + temp_tokens
  total_tokens = list(set(total_tokens))            #Removing redundencies
  if 'com' in total_tokens:
    total_tokens.remove('com')                      #Removing 'com'
  return total_tokens

In [0]:
#Labels
y = data['label']

In [54]:
y

0         bad
1         bad
2         bad
3         bad
4         bad
5         bad
6         bad
7         bad
8         bad
9         bad
10        bad
11        bad
12        bad
13        bad
14        bad
15        bad
16        bad
17        bad
18        bad
19        bad
20        bad
21        bad
22        bad
23        bad
24        bad
25        bad
26        bad
27        bad
28        bad
29        bad
         ... 
420434    bad
420435    bad
420436    bad
420437    bad
420438    bad
420439    bad
420440    bad
420441    bad
420442    bad
420443    bad
420444    bad
420445    bad
420446    bad
420447    bad
420448    bad
420449    bad
420450    bad
420451    bad
420452    bad
420453    bad
420454    bad
420455    bad
420456    bad
420457    bad
420458    bad
420459    bad
420460    bad
420461    bad
420462    bad
420463    bad
Name: label, Length: 420464, dtype: object

In [0]:
#Features
url_list = data['url']

In [56]:
url_list

0                                    diaryofagameaddict.com
1                                          espdesign.com.au
2                                        iamagameaddict.com
3                                             kalantzis.net
4                                     slightlyoffcenter.net
5                                          toddscarwash.com
6                                            tubemoviez.com
7                                                    ipl.hk
8               crackspider.us/toolbar/install.php?pack=exe
9                                           pos-kupang.com/
10                                               rupor.info
11        svision-online.de/mgfi/administrator/component...
12        officeon.ch.ma/office.js?google_ad_format=728x...
13                                              sn-gzzx.com
14                            sunlux.net/company/about.html
15                                              outporn.com
16                                   tim

In [0]:
#Using Defualt Tokenizer
#vectorizer = TfidfVectorizer()

#Using Custom Tokenizer

vectorizer = TfidfVectorizer(tokenizer=makeTokens)

In [0]:
X = vectorizer.fit_transform(url_list)

In [59]:
X

<420464x715384 sparse matrix of type '<class 'numpy.float64'>'
	with 2945002 stored elements in Compressed Sparse Row format>

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [61]:
# Model Building
# Using Logistic Regression

logit = LogisticRegression()
logit.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [62]:
print('Accuracy = ', logit.score(X_test, y_test))

Accuracy =  0.9613760955133007


**Predicting With Model**

In [0]:
X_predict = ["google.com/search=jcharistech",
"google.com/search=faizanahmad",
"pakistanifacebookforever.com/getpassword.php/", 
"www.radsport-voggel.de/wp-admin/includes/log.exe", 
"ahrenhei.without-transfer.ru/nethost.exe ",
"www.itidea.it/centroesteticosothys/img/_notes/gum.exe"]

In [0]:
X_predict = vectorizer.transform(X_predict)
New_predict = logit.predict(X_predict)

In [65]:
print(New_predict)

['good' 'good' 'good' 'bad' 'bad' 'bad']
