In [1]:
# EDA Packages
import pandas as pd
import numpy as np
import random
import joblib

# Machine Learning Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [12]:
import joblib

In [13]:
# Load Url Data 
urls_data = pd.read_csv("urldata.csv")

In [14]:
type(urls_data)

pandas.core.frame.DataFrame

In [15]:
urls_data.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [16]:
def makeTokens(f):
    tkns_BySlash = str(f.encode('utf-8')).split('/')	# make tokens after splitting by slash
    total_Tokens = []
    for i in tkns_BySlash:
        tokens = str(i).split('-')	# make tokens after splitting by dash
        tkns_ByDot = []
        for j in range(0,len(tokens)):
            temp_Tokens = str(tokens[j]).split('.')	# make tokens after splitting by dot
            tkns_ByDot = tkns_ByDot + temp_Tokens
        total_Tokens = total_Tokens + tokens + tkns_ByDot
    total_Tokens = list(set(total_Tokens))	#remove redundant tokens
    if 'com' in total_Tokens:
        total_Tokens.remove('com')	#removing .com since it occurs a lot of times and it should not be included in our features
    return total_Tokens

In [17]:
url = "https://www.example.com/page-1/sub-page.example.html"
tokens = makeTokens(url)
print(tokens)

['', 'www', '1', "page.example.html'", "html'", 'www.example.com', "b'https:", 'page', 'example', 'sub']


In [18]:
# Labels
y = urls_data["type"]

In [19]:
y.head()

0      phishing
1        benign
2        benign
3    defacement
4    defacement
Name: type, dtype: object

In [20]:
# Features
url_list = urls_data["url"]

In [21]:
url_list.head()

0                                     br-icloud.com.br
1                  mp3raid.com/music/krizz_kaliko.html
2                      bopsecrets.org/rexroth/cr/1.htm
3    http://www.garage-pirenne.be/index.php?option=...
4    http://adventure-nicaragua.net/index.php?optio...
Name: url, dtype: object

In [22]:
# Using Default Tokenizer
#vectorizer = TfidfVectorizer()

# Using Custom Tokenizer
vectorizer = TfidfVectorizer(tokenizer=makeTokens, token_pattern=None)


In [23]:
# Store vectors into X variable as Our XFeatures
X = vectorizer.fit_transform(url_list)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)	

In [60]:
# Model Building
#using logistic regression
logit = LogisticRegression(max_iter=10000)  # Increased from the default 1000
logit.fit(X_train, y_train)


In [61]:
# Accuracy of Our Model
print("Accuracy ",logit.score(X_test, y_test))

Accuracy  0.973993964941377


## Save Model

In [62]:
joblib.dump(logit, "model_url.pkl")

['model_url.pkl']

# Run the model using saving data

In [25]:
classifier = joblib.load("model_url.pkl")

In [4]:
# save model
filename = 'D:/Projects/MachineLearning/url/url.model'
joblib.dump(logit, filename)

NameError: name 'logit' is not defined

In [5]:
#load model from the saved location
filename = 'D:/Projects/MachineLearning/url/url.model'
loaded_model = joblib.load(filename)
result = loaded_model.score(X_test, y_test)
print(result)

NameError: name 'X_test' is not defined

In [31]:
filename = 'D:/Projects/MachineLearning/url/url.model'
loaded_model = joblib.load(filename)

### Predicting 

In [30]:
X_predict = ["courseweb.sliit.lk/",
"www.connexit.biz/",
"www.facebook.com", 
"www.google.lk", 
"ahrenhei.without-transfer.ru/nethost.exe ",
"www.itidea.it/centroesteticosothys/img/_notes/gum.exe"]

In [31]:
# Getting data from the save location
X_predict = vectorizer.transform(X_predict)
New_predict = classifier.predict(X_predict)

In [70]:
X_predict = vectorizer.transform(X_predict)
New_predict = logit.predict(X_predict)

AttributeError: lower not found

In [32]:
print(New_predict)

['benign' 'phishing' 'phishing' 'phishing' 'benign' 'malware']


In [39]:
# https://db.aa419.org/fakebankslist.php
X_predict1 = ["www.buyfakebillsonlinee.blogspot.com", 
"www.unitedairlineslogistics.com",
"www.stonehousedelivery.com",
"www.silkroadmeds-onlinepharmacy.com" ]

In [40]:
X_predict1 = vectorizer.transform(X_predict1)
New_predict1 = logit.predict(X_predict1)
print(New_predict1)

['phishing' 'phishing' 'phishing' 'phishing']


In [21]:
# Using Default Tokenizer
vectorizer = TfidfVectorizer()

In [22]:
# Store vectors into X variable as Our XFeatures
X = vectorizer.fit_transform(url_list)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)	

In [24]:
# Model Building

logit = LogisticRegression(max_iter=10000)	#using logistic regression
logit.fit(X_train, y_train)


In [25]:
# Accuracy of Our Model with our Custom Token
print("Accuracy ",logit.score(X_test, y_test))

Accuracy  0.9489553820284247


In [None]:
# Thanks For Watching
#J-Secur1ty
#Jesus Saves @ JCharisTech