In [2]:
import pandas as pd
import pickle

from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

In [154]:
phishing_v1 = pd.read_csv('../datasets/phishing/phishing.csv')

phishing_v1 = phishing_v1.rename(columns={'URL': 'url', 'Label': 'label'})
phishing_v1.head(5)

Unnamed: 0,url,label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


In [155]:
phishing_v2 = pd.read_csv('../datasets/phishing/phishing-v1.csv')

phishing_v2.head(5)

Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kaspi.kz,good
4,avtobys.kz,good


In [156]:
phishing_v3 = pd.read_csv('../datasets/phishing/phishing-v2.csv')

phishing_v3 = phishing_v3.drop(columns=['result'])
phishing_v3['label'] = phishing_v3['label'].replace({'malicious': 'bad', 'benign': 'good'})
phishing_v3.head(5)

Unnamed: 0,url,label
0,https://www.google.com,good
1,https://www.youtube.com,good
2,https://www.facebook.com,good
3,https://www.baidu.com,good
4,https://www.wikipedia.org,good


In [157]:
phishing_v4 = pd.read_csv('../datasets/malicious.csv')

phishing_v4 = phishing_v4.rename(columns={'type': 'label'})
phishing_v4['label'] = phishing_v4['label'].replace({'phishing': 'bad', 'benign': 'good'})
benign_subset = phishing_v4[phishing_v4['label'] == 'benign'].sample(frac=0.25)
phsihing_subset = phishing_v4[(phishing_v4['label'] == 'bad')]

phishing_v4 = pd.concat([benign_subset, phsihing_subset])

phishing_v4.head(5)

Unnamed: 0,url,label
0,br-icloud.com.br,bad
21,signin.eby.de.zukruygxctzmmqi.civpro.co.za,bad
28,http://www.marketingbyinternet.com/mo/e56508df...,bad
40,https://docs.google.com/spreadsheet/viewform?f...,bad
72,retajconsultancy.com,bad


In [158]:
df = pd.concat([phishing_v1, phishing_v2, phishing_v3, phishing_v4], ignore_index=True)

df = df.drop_duplicates(subset='url')
df.head(5)

Unnamed: 0,url,label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


In [159]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1091761 entries, 0 to 1648250
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   url     1091761 non-null  object
 1   label   1091761 non-null  object
dtypes: object(2)
memory usage: 25.0+ MB


In [160]:
tokenizer = RegexpTokenizer(r'[A-Za-z]+')

tokenizer.tokenize(df.url[0])

['nobell',
 'it',
 'ffb',
 'd',
 'dca',
 'cce',
 'f',
 'login',
 'SkyPe',
 'com',
 'en',
 'cgi',
 'bin',
 'verification',
 'login',
 'ffb',
 'd',
 'dca',
 'cce',
 'f',
 'index',
 'php',
 'cmd',
 'profile',
 'ach',
 'outdated',
 'page',
 'tmpl',
 'p',
 'gen',
 'failed',
 'to',
 'load',
 'nav',
 'login',
 'access']

In [161]:
df['text_tokenized'] = df.url.map(lambda t: tokenizer.tokenize(t))

In [162]:
stemmer = SnowballStemmer('english')

df['text_stemmed'] = df['text_tokenized'].map(lambda l: [stemmer.stem(word = word) for word in l])

df.head(5)

Unnamed: 0,url,label,text_tokenized,text_stemmed
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad,"[nobell, it, ffb, d, dca, cce, f, login, SkyPe...","[nobel, it, ffb, d, dca, cce, f, login, skype,..."
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad,"[www, dghjdgf, com, paypal, co, uk, cycgi, bin...","[www, dghjdgf, com, paypal, co, uk, cycgi, bin..."
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad,"[serviciosbys, com, paypal, cgi, bin, get, int...","[serviciosbi, com, paypal, cgi, bin, get, into..."
3,mail.printakid.com/www.online.americanexpress....,bad,"[mail, printakid, com, www, online, americanex...","[mail, printakid, com, www, onlin, americanexp..."
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad,"[thewhiskeydregs, com, wp, content, themes, wi...","[thewhiskeydreg, com, wp, content, theme, wide..."


In [163]:
df['text_sent'] = df['text_stemmed'].map(lambda l: ' '.join(l))

df.head(5)

Unnamed: 0,url,label,text_tokenized,text_stemmed,text_sent
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad,"[nobell, it, ffb, d, dca, cce, f, login, SkyPe...","[nobel, it, ffb, d, dca, cce, f, login, skype,...",nobel it ffb d dca cce f login skype com en cg...
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad,"[www, dghjdgf, com, paypal, co, uk, cycgi, bin...","[www, dghjdgf, com, paypal, co, uk, cycgi, bin...",www dghjdgf com paypal co uk cycgi bin webscrc...
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad,"[serviciosbys, com, paypal, cgi, bin, get, int...","[serviciosbi, com, paypal, cgi, bin, get, into...",serviciosbi com paypal cgi bin get into herf s...
3,mail.printakid.com/www.online.americanexpress....,bad,"[mail, printakid, com, www, online, americanex...","[mail, printakid, com, www, onlin, americanexp...",mail printakid com www onlin americanexpress c...
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad,"[thewhiskeydregs, com, wp, content, themes, wi...","[thewhiskeydreg, com, wp, content, theme, wide...",thewhiskeydreg com wp content theme widescreen...


In [164]:
cv = CountVectorizer()

feature = cv.fit_transform(df.text_sent)

feature[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [165]:
trainX, testX, trainY, testY = train_test_split(feature, df.label)

In [166]:
lr = LogisticRegression()

lr.fit(trainX, trainY)

lr.score(testX, testY)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9721111888649928

In [167]:
mnb = MultinomialNB()

mnb.fit(trainX, trainY)

mnb.score(testX, testY)

0.970422179152271

In [168]:
pipeline_ls = make_pipeline(CountVectorizer(tokenizer = RegexpTokenizer(r'[A-Za-z]+').tokenize, stop_words = 'english'), LogisticRegression())

trainX, testX, trainY, testY = train_test_split(df.url, df.label)

pipeline_ls.fit(trainX, trainY)

pipeline_ls.score(testX, testY)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9730930860515643

In [169]:
pickle.dump(pipeline_ls, open('../models/phishing.pkl', 'wb'))

loaded_model = pickle.load(open('../models/phishing.pkl', 'rb'))

result = loaded_model.score(testX, testY)

print(result)

0.9730930860515643


In [4]:
loaded_model = pickle.load(open('../models/phishing.pkl', 'rb'))

tokenizer = RegexpTokenizer(r'[A-Za-z]+')
stemmer = SnowballStemmer('english')

test = ['mercadolivre.com.br.premiosfidelidade2012.com.br/', 'phlebolog.com.ua/libraries/joomla/results.php', 'ydomain4pro.com/online/account/secure-upgard/net-verify/', 
        'linkedin.com', 'https://kaspi.kz', 'https://egov.kz']

tokenizer = RegexpTokenizer(r'[A-Za-z]+')
stemmer = SnowballStemmer('english')

tokenized = [tokenizer.tokenize(t) for t in test]
stemmed = [[stemmer.stem(word=word) for word in l] for l in tokenized]
sent = [' '.join(l) for l in stemmed] 


print(loaded_model.predict(sent))

['bad' 'bad' 'bad' 'good' 'good' 'good']
