In [17]:
import ipaddress
import re
import requests
from datetime import date
from dateutil.parser import parse as date_parse

# Whole-month distance between two dates (day of month is ignored)
def diff_month(d1, d2):
    """Return how many calendar months ``d1`` lies after ``d2``.

    Only the ``year`` and ``month`` attributes of the arguments are read, so
    any object exposing them (``date``, ``datetime``) works. The result is
    negative when ``d1`` is earlier than ``d2``.
    """
    years_apart = d1.year - d2.year
    months_apart = d1.month - d2.month
    return years_apart * 12 + months_apart

# Seconds to wait on each outbound HTTP request before giving up, so that an
# unresponsive host cannot hang feature extraction indefinitely.
_REQUEST_TIMEOUT = 10


def _safe_request(send, *args, **kwargs):
    """Call ``send`` (``requests.get``/``requests.post``); return None on failure."""
    try:
        return send(*args, timeout=_REQUEST_TIMEOUT, **kwargs)
    except (requests.RequestException, ValueError):
        # ValueError covers malformed-URL errors that are not wrapped by requests.
        return None


def _host_of(domain):
    """Return the bare host from a ``[user@]host[:port]`` authority string."""
    host_port = domain.rsplit("@", 1)[-1]
    if host_port.startswith("["):
        # Bracketed IPv6 literal, e.g. "[::1]:8080".
        return host_port[1:].split("]", 1)[0]
    return host_port.split(":", 1)[0]


# Generate data set by extracting the features from the URL
def generate_data_set(url):
    """Extract the 30-element feature vector for ``url``.

    The function fetches the page itself, a whois page and a page-rank page
    for the URL's domain, then derives one integer feature (-1, 0 or 1) per
    entry, in the fixed order shown by the inline comments below. Features
    that are not implemented are filled with a constant. Any lookup that
    fails (network error, timeout, unparsable page) degrades to the same
    value the feature takes when the information is simply absent, instead
    of raising.

    NOTE(review): the sign convention of each feature is kept exactly as it
    was; confirm it against the encoding used in the training dataset.

    Args:
        url: The URL to analyse. ``http://`` is prepended when the value
            does not already start with ``http://`` or ``https://``.

    Returns:
        A list of 30 ints. The list is also printed to stdout.

    Raises:
        ValueError: If no domain can be extracted from ``url`` (for example
            an empty string).
    """
    data_set = []

    # Converts the given URL into standard format
    if not re.match(r"^https?://", url):
        url = "http://" + url

    # Extracts domain (authority part, may include user info and port)
    authority = re.search(r"://([^/]+)", url)
    if authority is None:
        raise ValueError("Cannot extract a domain from URL: {!r}".format(url))
    domain = authority.group(1)
    host = _host_of(domain)

    # Stores the response of the given URL; an unreachable page is treated
    # as an empty page with no redirects.
    response = _safe_request(requests.get, url)
    page_text = response.text if response is not None else ""
    redirects = response.history if response is not None else []

    # Requests all the information about the domain
    whois_response = _safe_request(
        requests.get, "https://www.whois.com/whois/" + domain)
    whois_text = whois_response.text if whois_response is not None else ""

    rank_checker_response = _safe_request(
        requests.post, "https://www.checkpagerank.net/index.php", {
            "name": domain
        })
    rank_text = (rank_checker_response.text
                 if rank_checker_response is not None else "")

    # Extracts global rank of the website (-1 when unknown)
    rank_match = re.search(r"Global Rank: ([0-9]+)", rank_text)
    global_rank = int(rank_match.group(1)) if rank_match else -1

    # having_IP_Address: test the host only; the full URL (with scheme and
    # path) can never parse as an IP address.
    try:
        ipaddress.ip_address(host)
        data_set.append(1)
    except ValueError:
        data_set.append(-1)

    # URL_Length
    if len(url) < 54:
        data_set.append(1)
    elif len(url) <= 75:
        data_set.append(0)
    else:
        data_set.append(-1)

    # Shortining_Service
    if re.search(r"goo\.gl|bit\.ly", url):
        data_set.append(1)
    else:
        data_set.append(-1)

    # having_At_Symbol
    if "@" in url:
        data_set.append(1)
    else:
        data_set.append(-1)

    # double_slash_redirecting: any "//" after the scheme separator
    if url.find("//", url.index("://") + 3) != -1:
        data_set.append(-1)
    else:
        data_set.append(1)

    # Prefix_Suffix: hyphen in the host name (path hyphens do not count)
    if "-" in host:
        data_set.append(-1)
    else:
        data_set.append(1)

    # having_Sub_Domain: count dots in the host, not in the path
    dot_count = host.count(".")
    if dot_count == 1:
        data_set.append(-1)
    elif dot_count == 2:
        data_set.append(0)
    else:
        data_set.append(1)

    # SSLfinal_State (not implemented)
    data_set.append(-1)

    # Domain_registeration_length (not implemented)
    data_set.append(-1)

    # Favicon (not implemented)
    data_set.append(-1)

    # port: 1 when the authority carries a non-empty ":<port>" part
    authority_parts = domain.split(":")
    if len(authority_parts) > 1 and authority_parts[1]:
        data_set.append(1)
    else:
        data_set.append(-1)

    # HTTPS_token
    if domain.startswith("https-"):
        data_set.append(-1)
    else:
        data_set.append(1)

    # Request_URL (not implemented)
    data_set.append(-1)

    # URL_of_Anchor (not implemented)
    data_set.append(-1)

    # Links_in_tags (not implemented)
    data_set.append(-1)

    # SFH (not implemented)
    data_set.append(0)

    # Submitting_to_email: literal "mail()" or "mailto:" in the page. A
    # character class here would match nearly every page.
    if re.search(r"mail\(\)|mailto:", page_text):
        data_set.append(1)
    else:
        data_set.append(-1)

    # Abnormal_URL: empty (or unreachable) page
    if page_text == "":
        data_set.append(1)
    else:
        data_set.append(-1)

    # Redirect
    if len(redirects) <= 1:
        data_set.append(-1)
    elif len(redirects) <= 4:
        data_set.append(0)
    else:
        data_set.append(1)

    # on_mouseover
    if re.findall("<script>.+onmouseover.+</script>", page_text):
        data_set.append(1)
    else:
        data_set.append(-1)

    # RightClick
    if re.search(r"event\.button ?== ?2", page_text):
        data_set.append(1)
    else:
        data_set.append(-1)

    # popUpWidnow
    if re.search(r"alert\(", page_text):
        data_set.append(1)
    else:
        data_set.append(-1)

    # Iframe: an actual "<iframe" tag or a "frameBorder" attribute
    if re.search(r"<iframe|frameBorder", page_text):
        data_set.append(1)
    else:
        data_set.append(-1)

    # age_of_domain: -1 when registered at least 6 months ago, 1 when
    # younger or when the registration date cannot be determined.
    age_feature = 1
    registration = re.search(
        r'Registration Date:</div><div class="df-value">([^<]+)</div>',
        whois_text)
    if registration:
        try:
            registered_on = date_parse(registration.group(1))
            if diff_month(date.today(), registered_on) >= 6:
                age_feature = -1
        except (ValueError, OverflowError):
            # Unparsable date text: keep the "unknown" value.
            pass
    data_set.append(age_feature)

    # DNSRecord (not implemented)
    data_set.append(-1)

    # web_traffic, Page_Rank and Google_Index are all derived from the same
    # global rank threshold.
    rank_feature = -1 if 0 < global_rank < 100000 else 1

    # web_traffic
    data_set.append(rank_feature)

    # Page_Rank
    data_set.append(rank_feature)

    # Google_Index
    data_set.append(rank_feature)

    # Links_pointing_to_page
    number_of_links = len(re.findall(r"<a href=", page_text))
    if number_of_links == 0:
        data_set.append(1)
    elif number_of_links <= 2:
        data_set.append(0)
    else:
        data_set.append(-1)

    # Statistical_report (not implemented)
    data_set.append(-1)

    print(data_set)

    return data_set


from sklearn import tree
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split


import numpy as np
import sys

def load_data(path='dataset.csv', test_size=0.33, random_state=None):
    '''
    Load the labelled dataset from a CSV file and split it for evaluation.

    Every column except the last is treated as an input feature; the last
    column is treated as the output label. Values are read as 32-bit ints.

    Args:
        path: CSV file to read (comma separated).
        test_size: Fraction of rows held out for testing, forwarded to
            ``train_test_split``. The default keeps the 67/33 split this
            code has always used.
        random_state: Optional seed forwarded to ``train_test_split`` for a
            reproducible split; ``None`` gives a different shuffle each call.

    Returns:
        A 4-tuple in the order produced by ``train_test_split``:
        ``(train_inputs, test_inputs, train_outputs, test_outputs)``.
    '''
    # Load the full dataset from the CSV file
    dataset = np.genfromtxt(path, delimiter=',', dtype=np.int32)

    # All columns but the last are inputs; the last column is the output
    inputs = dataset[:, :-1]
    outputs = dataset[:, -1]

    # train_test_split returns inputs first (train, test), then outputs
    # (train, test); keep that order, since callers unpack it that way.
    train_inputs, test_inputs, train_outputs, test_outputs = train_test_split(
        inputs, outputs, test_size=test_size, random_state=random_state)

    return train_inputs, test_inputs, train_outputs, test_outputs

def run(classifier, name):
    '''
    Fit ``classifier`` on a fresh split from ``load_data`` and print its
    accuracy on the held-out rows as a percentage, labelled with ``name``.
    '''
    # load_data yields inputs (train, test) followed by outputs (train, test)
    fit_inputs, eval_inputs, fit_outputs, eval_outputs = load_data()

    # Train on the training portion, then predict the held-out portion
    classifier.fit(fit_inputs, fit_outputs)
    predicted = classifier.predict(eval_inputs)

    # Share of held-out rows whose predicted label equals the true label
    percent_correct = 100.0 * accuracy_score(eval_outputs, predicted)
    print("Accuracy score using {} is: {}\n".format(name, percent_correct))


if __name__ == '__main__':
    # Main script.
    # Several models were tried for detecting phishing websites; only the
    # best and the worst classifier are actually run and reported. The
    # alternatives are kept below, commented out, for reference.

    # Decision tree
    # classifier = tree.DecisionTreeClassifier()
    # run(classifier, "Decision tree")

    # Random forest classifier (low accuracy)
    # classifier = RandomForestClassifier()
    # run(classifier, "Random forest")

    # Custom random forest classifier 1
    print("Best classifier for detecting phishing websites.")
    classifier = RandomForestClassifier(n_estimators=500, max_depth=15, max_leaf_nodes=10000)
    run(classifier, "Random forest")

    # Linear SVC classifier
    # classifier = svm.SVC(kernel='linear')
    # run(classifier, "SVC with linear kernel")

    # RBF SVC classifier
    # classifier = svm.SVC(kernel='rbf')
    # run(classifier, "SVC with rbf kernel")

    # Custom SVC classifier 1
    # classifier = svm.SVC(decision_function_shape='ovo', kernel='linear')
    # run(classifier, "SVC with ovo shape")

    # Custom SVC classifier 2
    # classifier = svm.SVC(decision_function_shape='ovo', kernel='rbf')
    # run(classifier, "SVC with ovo shape")

    # NuSVC classifier
    # classifier = svm.NuSVC()
    # run(classifier, "NuSVC")

    # OneClassSVM classifier
    print("Worst classifier for detecting phishing websites.")
    classifier = svm.OneClassSVM()
    run(classifier, "One Class SVM")

    # K nearest neighbours
    # print("K nearest neighbours algorithm.")
    # nbrs = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree')
    # run(nbrs, "K nearest neighbours")

    # Gradient boosting classifier
    # classifier = GradientBoostingClassifier()
    # run(classifier, "GradientBoostingClassifier")

    # URL to classify. Named `url` rather than `str` so the built-in is not
    # rebound for the rest of the session.
    url = "https://colab.research.google.com/drive/1Iynd-w-oeSLGL8CSELuTauZZlxWVxqfc"
    if len(url) > 1:
        data_set = generate_data_set(url)

        # Reshape the feature list into a single-row 2-D array
        data_set = np.array(data_set).reshape(1, -1)

        # Load the data
        train_inputs, test_inputs, train_outputs, test_outputs = load_data()

        # Create and train the classifier
        classifier = RandomForestClassifier(n_estimators=500, max_depth=15, max_leaf_nodes=10000)
        classifier.fit(train_inputs, train_outputs)

        # Predict once and reuse the result for both the raw print and the
        # human-readable verdict.
        prediction = classifier.predict(data_set)
        print(prediction)
        # NOTE(review): label 1 is reported as phishing here; confirm this
        # matches the label encoding of the training dataset.
        if prediction[-1] == 1:
            print("Phishing")
        else:
            print("Not pHishing")

Best classifier for detecting phishing websites.
Accuracy score using Random forest is: 96.7662373252946

Worst classifier for detecting phishing websites.
Accuracy score using One Class SVM is: 48.09536859413538

[-1, 0, -1, -1, 1, 1, 1, -1, -1, -1, -1, 1, -1, -1, -1, 0, 1, -1, -1, -1, -1, -1, 1, 1, -1, 1, 1, 1, 1, -1]
[-1]
Not pHishing
