# Detecting malicious URLs

In [19]:
# GENERAL
import os
import math
import re
import datetime
import time
import pandas as pd
import matplotlib as plt
import seaborn as sns
from tqdm import tqdm

For this experiment, we will be using the Sklearn library along with it's tools to assist us along the data science process.

In [20]:
# SKLEARN
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

Here we import some libraries to assist us in pulling, parsing, and transforming our domain information.

In [21]:
# DOMAIN SPECIFIC LIBRARIES
import tldextract
from datetime import datetime
import warnings
from urllib.parse import urlparse
from socket import gethostbyname, gaierror, timeout
import whois

In [22]:
# iPython and Notebook config
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" #display all results

%config InlineBackend.figure_format = 'retina' #see plots in retina displays
%reload_ext autoreload
%autoreload 2
%matplotlib inline

## Data Ingestion

### The bad stuff

Exploring the Internet, we find a nice list of maliciousness, provided by http://www.malwaredomainlist.com/

Link: http://www.malwaredomainlist.com/hostslist/hosts.txt

In [23]:
# Ingest malicious urls from malwaredomainlist
columns = ['ip', 'url']

mal_df = pd.read_csv('../data/hosts.txt', 
                     delimiter = '\s+', 
                     encoding = "ISO-8859-1", 
                     skiprows=6,
                     names = columns)[['url']]

In [24]:
# Randomly sample 500 elements from your dataframe
mal_sample_df = mal_df.sample(n=1000)

In [25]:
mal_sample_df['class'] = 'malicious'

In [26]:
mal_sample_df.head()

Unnamed: 0,url,class
862,vitalityxray.com,malicious
755,traff1.com,malicious
979,www.joomlalivechat.com,malicious
221,f.gj555.net,malicious
758,treventuresonline.com,malicious


### The mostly benign stuff

In [None]:
# Ingest Alex top 1 million urls 
columns = ['url']

benign_df = pd.read_csv('../data/top-1m.csv', 
                     encoding = "ISO-8859-1", 
                     names = columns)

In [None]:
benign_sample_df = benign_df.iloc[0:1000]

In [None]:
benign_sample_df['class'] = 'benign'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [None]:
benign_sample_df.head()

Unnamed: 0,url,class
1,google.com,benign
2,youtube.com,benign
3,baidu.com,benign
4,facebook.com,benign
5,qq.com,benign


In [None]:
# Frames we would like to merge
full_data = [mal_sample_df, benign_sample_df]
print(mal_sample_df.shape, benign_sample_df.shape)

(1000, 2) (1000, 2)


In [None]:
tqdm.pandas()

def creation_date(domain_name):
    """
    Gets creation date of domain from whois
    """
    
    # Get creation date of Domain
    currentDT = datetime.now()
    default_date = currentDT.strftime("%Y-%m-%d %H:%M:%S")
    
    try:
        creation_date = whois.whois(domain_name).creation_date
        if type(creation_date) is list:
            return creation_date[0]
#         elif str(creation_date).find('Aug'):
#             creation_date = "1996-07-01 00:00:01"
#             return creation_date
        elif creation_date == None:
            return default_date
        else:
            return creation_date
    except whois.parser.PywhoisError:
        return default_date
    except gaierror:
        return default_date
    except socket.timeout:
        return default_date
    except socket.error:
        return default_date

# Generate creation date
for df in full_data:
    df['domain_creation'] = df['url'].progress_apply(lambda x: creation_date(x))

  1%|▏         | 13/1000 [00:16<13:17,  1.24it/s]

In [None]:
tqdm.pandas()

def last_update_date(domain_name):
    """
    Gets creation date of domain from whois
    """
    
    # Get creation date of Domain
    currentDT = datetime.now()
    default_date = currentDT.strftime("%Y-%m-%d %H:%M:%S")
    
    try:
        last_updated = whois.whois(domain_name).updated_date
        if type(last_updated) is list:
            return last_updated[0]
        elif last_updated == None:
            return default_date
        else:
            return last_updated
    except whois.parser.PywhoisError:
        return default_date
    except gaierror:
        return default_date
    except socket.timeout:
        return default_date
    except socket.error:
        return default_date

# Generate creation date
for df in full_data:
    df['domain_update'] = df['url'].progress_apply(lambda x: last_update_date(x))

In [None]:
def host_ip(domain):
    """
    Gets Host IP of Domain
    """

    # Get HOST IP     
    try:
        host = gethostbyname(domain)
        return host
    except gaierror:
        return 'missing'

# Generate host ip
for df in full_data:
    df['host_ip'] = df['url'].apply(lambda x: host_ip(x))

In [None]:
# Generate number of special characters
for df in full_data:
    df['specials'] = df['url'].apply(lambda x: len(re.sub('[\w]+' ,'', x)))

In [None]:
# Get domain
for df in full_data:
    df['domain'] = df['url'].apply(lambda x: tldextract.extract(x).domain)    

In [None]:
# Get URL suffix
for df in full_data:
    df['suffix'] = df['url'].apply(lambda x: tldextract.extract(x).suffix)

In [None]:
# Generate domain name
for df in full_data:
    df['domain_name'] = df['url'].apply(lambda x: tldextract.extract(x).registered_domain)

In [None]:
# Get IP first octet
for df in full_data:
    df['prefix'] = df['host_ip'].str.extract('(\d+)\.').astype(int, errors='ignore').astype(str)
    df['prefix'] = df['prefix'].fillna('missing')
    df['prefix'] = df['prefix'].replace('nan', 'missing')

In [None]:
def entropy(string):
    """
    Calculates the Shannon entropy of a string
    """

    # Get probability of chars in string
    prob = [ float(string.count(c)) / len(string) for c in dict.fromkeys(list(string)) ]

    # Calculate the entropy
    entropy = - sum([ p * math.log(p) / math.log(2.0) for p in prob ])

    return entropy

# Calculate entropy
for df in full_data:
    df['entropy'] = df['url'].apply(lambda x: entropy(str(x)))

In [None]:
benign_sample_df['domain_creation'] = pd.to_datetime(benign_sample_df['domain_creation'], errors='coerce')
benign_sample_df['domain_creation'].groupby([benign_sample_df["domain_creation"].dt.year, benign_sample_df["domain_creation"].dt.month]).count().plot(kind="bar")

In [None]:
mal_sample_df['domain_creation'] = pd.to_datetime(mal_sample_df['domain_creation'], errors='coerce')
mal_sample_df['domain_creation'].groupby([mal_sample_df["domain_creation"].dt.year, mal_sample_df["domain_creation"].dt.month]).count().plot(kind="bar")

In [None]:
mal_sample_df['domain_update'] = pd.to_datetime(mal_sample_df['domain_update'], errors='coerce')
mal_sample_df['domain_update'].groupby([mal_sample_df["domain_update"].dt.year, mal_sample_df["domain_update"].dt.month]).count().plot(kind="bar")

In [None]:
benign_sample_df['domain_update'] = pd.to_datetime(benign_sample_df['domain_update'], errors='coerce')
benign_sample_df['domain_update'].groupby([benign_sample_df["domain_update"].dt.year, benign_sample_df["domain_update"].dt.month]).count().plot(kind="bar")