In [1]:
import pandas as pd # use for data manipulation and analysis
import numpy as np # use for multi-dimensional array and matrix

import seaborn as sns # use for high-level interface for drawing attractive and informative statistical graphics 
import matplotlib.pyplot as plt # It provides an object-oriented API for embedding plots into applications
%matplotlib inline 
# It sets the backend of matplotlib to the 'inline' backend:
import plotly.express as px
import time # calculate time 

from sklearn.linear_model import LogisticRegression # algo use to predict good or bad
from sklearn.naive_bayes import MultinomialNB # nlp algo use to predict good or bad

from sklearn.model_selection import train_test_split # spliting the data between feature and target
from sklearn.metrics import classification_report # gives whole report about metrics (e.g, recall,precision,f1_score,c_m)
from sklearn.metrics import confusion_matrix # gives info about actual and predict
from nltk.tokenize import RegexpTokenizer # regexp tokenizers use to split words from text  
from nltk.stem.snowball import SnowballStemmer # stemmes words
from sklearn.feature_extraction.text import CountVectorizer # create sparse matrix of words using regexptokenizes  
from sklearn.pipeline import make_pipeline # use for combining all prerocessors techniuqes and algos

from PIL import Image # getting images in notebook
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator# creates words colud

from bs4 import BeautifulSoup # use for scraping the data from website
from selenium import webdriver # use for automation chrome 
import networkx as nx # for the creation, manipulation, and study of the structure, dynamics, and functions of complex networks.


import pickle# use to dump model 

import warnings # ignores pink warnings 
warnings.filterwarnings('ignore')

In [2]:
phish_data = pd.read_csv('url.csv')

In [3]:
phish_data.head()

Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


In [4]:
phish_data.tail()

Unnamed: 0,URL,Label
549341,23.227.196.215/,bad
549342,apple-checker.org/,bad
549343,apple-iclods.org/,bad
549344,apple-uptoday.org/,bad
549345,apple-search.info,bad


In [5]:
phish_data.isnull().sum()

URL      0
Label    0
dtype: int64

In [6]:
#preprocessing
#A tokenizer that splits a string using a regular expression, which matches either the tokens or the separators between tokens.
tokenizer = RegexpTokenizer(r'[A-Za-z]+')#to getting alpha only

In [7]:
phish_data.URL[0]

'nobell.it/70ffb52d079109dca5664cce6f317373782/login.SkyPe.com/en/cgi-bin/verification/login/70ffb52d079109dca5664cce6f317373/index.php?cmd=_profile-ach&outdated_page_tmpl=p/gen/failed-to-load&nav=0.5.1&login_access=1322408526'

In [8]:
tokenizer.tokenize(phish_data.URL[0]) # using first row

['nobell',
 'it',
 'ffb',
 'd',
 'dca',
 'cce',
 'f',
 'login',
 'SkyPe',
 'com',
 'en',
 'cgi',
 'bin',
 'verification',
 'login',
 'ffb',
 'd',
 'dca',
 'cce',
 'f',
 'index',
 'php',
 'cmd',
 'profile',
 'ach',
 'outdated',
 'page',
 'tmpl',
 'p',
 'gen',
 'failed',
 'to',
 'load',
 'nav',
 'login',
 'access']

In [9]:
print('Getting words tokenized ...')
t0= time.perf_counter()
phish_data['text_tokenized'] = phish_data.URL.map(lambda t: tokenizer.tokenize(t)) # doing with all rows
t1 = time.perf_counter() - t0
print('Time taken',t1 ,'sec')

Getting words tokenized ...
Time taken 1.6393673000000035 sec


In [10]:
phish_data.sample(5)

Unnamed: 0,URL,Label,text_tokenized
463581,whopopular.com/Bulgaria,good,"[whopopular, com, Bulgaria]"
176722,en.wikipedia.org/wiki/KFOX_(FM),good,"[en, wikipedia, org, wiki, KFOX, FM]"
282840,archive.org/stream/proceedingstrans39roya/proc...,good,"[archive, org, stream, proceedingstrans, roya,..."
260894,wn.com/Loyalist__Belfast__UDA_UFF,good,"[wn, com, Loyalist, Belfast, UDA, UFF]"
547205,web-shuttle.in/eeo9oc,bad,"[web, shuttle, in, eeo, oc]"


In [11]:
#SNOWBALL STEMMER
#Snowball IS  a small string processing language, gives root words
stemmer = SnowballStemmer("english") # choose a language

In [12]:
print('Getting words stemmed ...')
t0= time.perf_counter()
phish_data['text_stemmed'] = phish_data['text_tokenized'].map(lambda l: [stemmer.stem(word) for word in l])
t1= time.perf_counter() - t0
print('Time taken',t1 ,'sec')

Getting words stemmed ...
Time taken 26.420369100000016 sec


In [13]:
phish_data.sample(5)

Unnamed: 0,URL,Label,text_tokenized,text_stemmed
9168,pastehtml.com/view/bbs1q62t7.html,bad,"[pastehtml, com, view, bbs, q, t, html]","[pastehtml, com, view, bbs, q, t, html]"
505042,natoint.com/,bad,"[natoint, com]","[natoint, com]"
418483,pul.se/search/Ryan%20Tucker,good,"[pul, se, search, Ryan, Tucker]","[pul, se, search, ryan, tucker]"
82827,www.asahi-net.or.jp/~gv9k-setg/,good,"[www, asahi, net, or, jp, gv, k, setg]","[www, asahi, net, or, jp, gv, k, setg]"
289700,be.warnerbros.com/,good,"[be, warnerbros, com]","[be, warnerbro, com]"


In [14]:
print('Getting joiningwords ...')
t0= time.perf_counter()
phish_data['text_sent'] = phish_data['text_stemmed'].map(lambda l: ' '.join(l))
t1= time.perf_counter() - t0
print('Time taken',t1 ,'sec')

Getting joiningwords ...
Time taken 0.1617029999999886 sec


In [15]:
phish_data.sample(5)

Unnamed: 0,URL,Label,text_tokenized,text_stemmed,text_sent
543171,avon2you.ru/ayz1waqm,bad,"[avon, you, ru, ayz, waqm]","[avon, you, ru, ayz, waqm]",avon you ru ayz waqm
464174,whosdatedwho.com/tpx_5724/paul-thomas-anderson/,good,"[whosdatedwho, com, tpx, paul, thomas, anderson]","[whosdatedwho, com, tpx, paul, thoma, anderson]",whosdatedwho com tpx paul thoma anderson
301418,celebrina.com/jean-harlow.html,good,"[celebrina, com, jean, harlow, html]","[celebrina, com, jean, harlow, html]",celebrina com jean harlow html
138889,1304.yippie.biz/mo/kansas_city/,good,"[yippie, biz, mo, kansas, city]","[yippi, biz, mo, kansa, citi]",yippi biz mo kansa citi
395197,mylife.com/104743147,good,"[mylife, com]","[mylif, com]",mylif com


In [16]:
#COUNT VECTORIZER
#CountVectorizer is used to transform a corpora of text to a vector of term / token counts.
cv = CountVectorizer()

In [17]:
feature = cv.fit_transform(phish_data.text_sent) #transform all text which we tokenize and stemed
feature[:5].toarray() # convert sparse matrix into array to print transformed features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [18]:
#SPLITTING THE DATA
trainX, testX, trainY, testY = train_test_split(feature, phish_data.Label)

In [19]:
#LOGISTIC REGRESSION
lr = LogisticRegression()

In [20]:
lr.fit(trainX,trainY)

In [21]:
lr.score(testX,testY)

0.9641538696782368

In [22]:
#MULTINOMINAL NAIVE BAYES
#Applying Multinomial Naive Bayes to NLP Problems. Naive Bayes Classifier Algorithm is a family of probabilistic algorithms based on applying Bayes' theorem with the “naive” assumption of conditional independence between every pair of a feature.
mnb = MultinomialNB()

In [23]:
mnb.fit(trainX,trainY)

In [24]:
mnb.score(testX,testY)

0.9580812162782062

In [25]:
# So, Logistic Regression is the best fit model, Now we make sklearn pipeline using Logistic Regression

In [26]:
pipeline_ls = make_pipeline(CountVectorizer(tokenizer = RegexpTokenizer(r'[A-Za-z]+').tokenize,stop_words='english'), LogisticRegression())
##(r'\b(?:http|ftp)s?://\S*\w|\w+|[^\w\s]+') ([a-zA-Z]+)([0-9]+)  -- these tolenizers giving me low accuray 

In [27]:
trainX, testX, trainY, testY = train_test_split(phish_data.URL, phish_data.Label)

In [28]:
pipeline_ls.fit(trainX,trainY)

In [29]:
pipeline_ls.fit(trainX,trainY)

0.9665130299919177

In [30]:
pickle.dump(pipeline_ls,open('phishing.pkl','wb'))

In [31]:
loaded_model = pickle.load(open('phishing.pkl', 'rb'))
result = loaded_model.score(testX,testY)
print(result)

0.9665130299919177
