# Detecting malicious URLs

In [91]:
# GENERAL
import os
import math
import re
import datetime
import time
import pandas as pd
import matplotlib as plt
import seaborn as sns
from tqdm import tqdm

For this experiment, we will be using the Sklearn library along with it's tools to assist us along the data science process.

In [92]:
# SKLEARN
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

Here we import some libraries to assist us in pulling, parsing, and transforming our domain information.

In [93]:
# DOMAIN SPECIFIC LIBRARIES
import tldextract
from datetime import datetime
import warnings
from urllib.parse import urlparse
from socket import gethostbyname, gaierror, timeout
import whois

In [94]:
# iPython and Notebook config
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" #display all results

%config InlineBackend.figure_format = 'retina' #see plots in retina displays
%reload_ext autoreload
%autoreload 2
%matplotlib inline

## Data Ingestion

### The bad stuff

Exploring the Internet, we find a nice list of maliciousness, provided by http://www.malwaredomainlist.com/

Link

In [95]:
# Ingest malicious urls from malwaredomainlist
columns = ['ip', 'domain']

mal_df = pd.read_csv('../data/hosts.txt', 
                     delimiter = '\s+', 
                     encoding = "ISO-8859-1", 
                     skiprows=6,
                     names = columns)[['domain']]

In [96]:
# Randomly sample 500 elements from your dataframe
mal_sample_df = mal_df.sample(n=500)

In [97]:
mal_sample_df['class'] = 'malicious'

In [98]:
mal_sample_df.head()

Unnamed: 0,domain,class
984,www.litra.com.mk,malicious
720,strangeduckfilms.com,malicious
490,pacan.gofreedom.info,malicious
587,seet10.jino.ru,malicious
23,ads.wikipartes.com,malicious


### The mostly benign stuff

In [99]:
# Ingest Alex top 1 million urls 
columns = ['domain']

benign_df = pd.read_csv('../data/top-1m.csv', 
                     encoding = "ISO-8859-1", 
                     names = columns)

In [100]:
benign_sample_df = benign_df.sample(n=500)

In [101]:
benign_sample_df['class'] = 'benign'

In [102]:
benign_sample_df.head()

Unnamed: 0,domain,class
250424,downloadnow-1.com,benign
694009,lesroches.edu,benign
246429,yasni.co.uk,benign
663159,youtubeing.com,benign
767944,cryptorange.info,benign


In [103]:
# Frames we would like to merge
full_data = [mal_sample_df, benign_sample_df]
print(mal_sample_df.shape, benign_sample_df.shape)

(500, 2) (500, 2)


In [105]:
tqdm.pandas()

def creation_date(domain_name):
    """
    Gets creation date of domain from whois
    """
    
    # Get creation date of Domain
    currentDT = datetime.now()
    default_date = currentDT.strftime("%Y-%m-%d %H:%M:%S")
    
    try:
        creation_date = whois.whois(domain_name).creation_date
        if type(creation_date) is list:
            return creation_date[0]
        elif str(creation_date).find('Aug'):
            creation_date = "1996-07-01 00:00:01"
            return creation_date
        elif creation_date == np.nan:
            return default_date
        else:
            return creation_date
    except whois.parser.PywhoisError:
        return default_date
    except gaierror:
        return default_date
    except socket.timeout:
        return default_date

# Generate creation date
for df in full_data:
    df['domain_creation'] = df['domain'].progress_apply(lambda x: creation_date(x))

  5%|▍         | 24/500 [00:33<11:58,  1.51s/it]

ConnectionResetError: [Errno 54] Connection reset by peer

In [None]:
# Generate number of special characters
for df in full_data:
    df['specials'] = df['domain'].apply(lambda x: re.sub('[\w]+' ,'', x))