# ML Project

## Setup

### Install Packages

In [None]:
%%capture
pip install pandas numpy matplotlib tensorflow tqdm bs4 IP2Location chardet scikit-learn ipywidgets widgetsnbextension

In [None]:
%%capture
# Conda-based install of ipywidgets and tqdm, followed by enabling the
# widgetsnbextension notebook extension (output is hidden by %%capture).
# NOTE(review): ipywidgets and tqdm are also listed in the pip install cell —
# presumably only one of the two install routes is needed; confirm.
!conda install -c conda-forge -y ipywidgets
!conda install -c conda-forge -y tqdm
!jupyter nbextension enable --py widgetsnbextension

### Import Modules

In [None]:
import requests
import urllib3
from urllib.parse import urlparse, urljoin
import socket

from bs4 import BeautifulSoup
import chardet
import re
import json
import ast


import math
import statistics
from matplotlib import pyplot

import os
import IP2Location

import pandas as pd
import numpy as np

import random
import itertools

from tqdm.notebook import tqdm
from IPython.display import display

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
print(tf.__version__)

from sklearn.model_selection import train_test_split, KFold
from sklearn.utils.class_weight import compute_class_weight

from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

import warnings
warnings.filterwarnings(action="ignore", module="urllib3")

## Download More Data / Feature Engineering

### Set Constants

In [None]:
# Default number of rows download_chunk() fetches per call. load_data() skips
# downloading when data.csv already exists and this value is not > 0.
DOWNLOAD_CHUNK_SIZE = 100
# Passed as `timeout` to requests.get in download_one(); requests interprets a
# 2-tuple as (connect timeout, read timeout) in seconds.
TIMEOUT = (10, 10)
# Columns used as model inputs. The order matters: gen()/genx() index the
# feature tensor by position in this list.
feature_names = ["words", "aux", "city", "region", "country", "redirects", "latitude", "longitude"]

### Define Helper Functions

In [None]:
# Open the local IP2Location database file once at module level;
# get_location() queries this handle for every lookup.
ip2location_database = IP2Location.IP2Location()
ip2location_database.open(os.path.join("location_data", "IP2LOCATION-LITE-DB11.BIN"))
def get_location(ip_addr=None, hostname=None):
    """Return location data for an IP address (or for a hostname resolved to one).

    The local IP2Location database is consulted first. When it has no country
    for the address, the ipinfo.io JSON API is queried and the reply is cached
    in the module-level ``location_database`` dict (persisted by load_data()).

    Args:
        ip_addr: IP address string. When None, ``hostname`` is resolved instead.
        hostname: Hostname to resolve when ``ip_addr`` is not given.

    Returns:
        A dict of location fields, or None when no address could be determined.
        Replies from ipinfo.io that carry no "country"/"loc" (e.g. bogon
        addresses) are returned as-is, without "country_short"/"latitude"/
        "longitude"; callers handle the resulting KeyError themselves.

    Raises:
        Exception: if ipinfo.io answers with an "error" payload.
    """
    if ip_addr is None:
        if hostname is None:
            # Nothing to look up.
            return None
        try:
            ip_addr = socket.gethostbyname(hostname)
        except (socket.gaierror, UnicodeError):
            # UnicodeError is raised for labels the IDNA codec rejects
            # (empty or over-long); treat it like any other resolution failure.
            print("Skipped Location Download (Hostname Resolution Error for '"+hostname+"')")
            return None

    location_data = ip2location_database.get_all(ip_addr)
    if location_data.country_short != "-":
        # Presumably the record's string form is a dict literal, which is
        # turned into a plain dict here — verify against the IP2Location package.
        return ast.literal_eval(str(location_data))

    if ip_addr not in location_database:
        # Bound the request so one unresponsive lookup cannot stall a whole chunk.
        location_data = requests.get("https://ipinfo.io/"+ip_addr+"/json", timeout=TIMEOUT).json()
        if "error" in location_data:
            raise Exception("Failed because error with download (probably api quota exceded)")
        # Only derive the normalised keys when the reply actually carries them;
        # previously a reply without them raised KeyError from inside this function
        # after the partial entry had already been cached.
        if "country" in location_data and "loc" in location_data:
            location_data["country_short"] = location_data["country"]
            location_data["latitude"], location_data["longitude"] = tuple(location_data["loc"].split(","))
        location_database[ip_addr] = location_data
    return location_database[ip_addr]

In [None]:
def get_absolute_url_from_relative(original_url, new_url):
    """Resolve a link found on a page into an absolute URL.

    Links starting with "http" are returned unchanged and links starting with
    "javascript" yield None. Anything else is joined onto the scheme and host
    of ``original_url`` ("http://" is assumed when it has no "http" prefix);
    the path of ``original_url`` is not used.
    """
    if new_url.startswith("http"):
        return new_url
    if new_url.startswith("javascript"):
        return None

    base_url = original_url if original_url.startswith("http") else "http://" + original_url
    base_parts = urlparse(base_url)
    site_root = "{}://{}".format(base_parts.scheme, base_parts.netloc)
    return urljoin(site_root, new_url)

In [None]:
def download_aux_data(content, original_url):
    """Build the "aux" feature string from the links found in a page.

    Every ``<a href>`` in ``content`` is resolved to an absolute URL, its host
    is geolocated with get_location(), and tokens describing the host, IP,
    truncated latitude/longitude, city, region and country are collected.

    Args:
        content: HTML text of the downloaded page.
        original_url: URL the page was downloaded from (base for relative links).

    Returns:
        All tokens joined by single spaces ("" when nothing could be collected).
        Runs of non-word characters and underscores inside a token are replaced by "_".
    """
    # Compiled once per page; raw string avoids the invalid "\W" escape warning.
    non_word = re.compile(r'[\W_]+')
    tokens = []
    soup = BeautifulSoup(content, 'html.parser')
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href is None:
            continue
        try:
            url = get_absolute_url_from_relative(original_url, href)
            if url is None:
                continue
            hostname = urlparse(url).netloc
        except ValueError:
            # urllib rejects some malformed hrefs (e.g. unbalanced "[" in the
            # host); skip that link instead of losing the whole page.
            continue

        location = get_location(hostname=hostname)
        if location is None:
            continue
        try:
            columns = [
                "hostname_"+hostname,
                "ip_addr_"+location["ip"],
                "latitude_"+str(int(float(location["latitude"]))),
                "longitude_"+str(int(float(location["longitude"]))),
                "city_"+location["city"],
                "region_"+location["region"],
                "country_"+location["country_short"],
            ]
        except KeyError:
            if("bogon" in location and location["bogon"]):
                print("Skipped Aux Data Download (Bogon IP)")
            else:
                print("Problem with JSON: ", location)
            continue

        tokens.extend(non_word.sub('_', column) for column in columns)

    return " ".join(tokens)

In [None]:
def download_one(original_url, timeout=TIMEOUT):
    """Download one URL and extract the raw feature values for it.

    Args:
        original_url: URL to fetch; "http://" is prepended when it does not
            start with "http".
        timeout: Passed through to requests.get.

    Returns:
        A dict that always contains "attempted_download" (True) and, depending
        on how far the download got, "hostname", "ip_addr", "status_code",
        "content", "aux", "words", "redirects", "latitude", "longitude",
        "city", "region" and "country". Failures are reported with print()
        and leave the corresponding keys out.
    """
    row = {"attempted_download": True}

    url_with_scheme = original_url if original_url.startswith("http") else "http://"+original_url
    try:
        hostname = urlparse(url_with_scheme).netloc
    except ValueError:
        # urlparse raises for some malformed URLs (e.g. an unbalanced "[" in the host).
        print("Skipped (Malformed URL '"+url_with_scheme+"')")
        return row

    try:
        row["hostname"] = hostname

        ip_addr = socket.gethostbyname(hostname)
        row["ip_addr"] = ip_addr

        # verify=False: certificate validation is deliberately disabled here,
        # so TLS errors do not prevent a page from being fetched.
        response = requests.get(url_with_scheme, verify=False, timeout=timeout)
        row["status_code"] = response.status_code

        # Try the detected encoding first, then UTF-8. chardet may report None
        # (TypeError) or an encoding Python does not know (LookupError).
        detected_encoding = chardet.detect(response.content)['encoding']
        content = None
        for candidate in (detected_encoding, "utf"):
            try:
                content = response.content.decode(candidate)
                break
            except (LookupError, TypeError, UnicodeDecodeError):
                continue

        if content is None:
            print("Skipped Content Download (Decoding Error)")
        else:
            # Stored for both decode paths (the fallback path used to omit it).
            row["content"] = content
            row["aux"] = download_aux_data(content, original_url)
            page_text = BeautifulSoup(content, 'html.parser').get_text()
            row["words"] = re.sub(r'[\W_]+', ' ', page_text)

        # Only permanent (301) redirects in the response history are counted.
        row["redirects"] = sum(1 for hop in response.history if hop.status_code == 301)

        location = get_location(ip_addr=ip_addr)
        if location is None:
            return row
        try:
            # Build the values first so a missing key leaves no partial location in the row.
            row.update({
                "latitude": location["latitude"],
                "longitude": location["longitude"],
                "city": location["city"],
                "region": location["region"],
                "country": location["country_short"],
            })
        except KeyError:
            if("bogon" in location and location["bogon"]):
                print("Skipped Location Download (Bogon IP)")
            else:
                print("Problem with JSON: ", location)

    except socket.gaierror:
        print("Skipped (Hostname Resolution Error for '"+hostname+"')")

    except socket.error:
        # socket.error is OSError, which also covers the exceptions raised by requests.
        print("Skipped (Content Download Error for '"+url_with_scheme+"')")

    except UnicodeError:
        print("Skipped (Unicode Error for '"+url_with_scheme+"')")

    return row

In [None]:
def download_chunk(chunk_size=DOWNLOAD_CHUNK_SIZE, timeout=TIMEOUT):
    """Download the next ``chunk_size`` not-yet-attempted rows of ``raw_data``.

    Rows are processed in index order, starting right after the last row whose
    "attempted_download" flag is True (or at 0 when nothing was attempted yet).
    The values returned by download_one() are written into ``raw_data`` in place.

    Args:
        chunk_size: Maximum number of rows to download in this call.
        timeout: Passed through to download_one().
    """
    global raw_data
    attempted_index = raw_data[raw_data["attempted_download"] == True].index
    # Previously `start` was left unassigned on a fresh dataset, which made the
    # very first download fail.
    start = attempted_index[-1] + 1 if len(attempted_index) != 0 else 0
    end = min(start + chunk_size, len(raw_data["url"]))

    # Report the number of rows that will really be processed (the last chunk may be short).
    print("Downloading %d more rows ([%d:%d])" % (max(end - start, 0), start, end))
    for row_index in tqdm(range(start, end)):
        row = download_one(raw_data.loc[row_index, "url"], timeout=timeout)
        if row:
            # Materialise the values: a dict view is not a proper sequence for .loc assignment.
            raw_data.loc[row_index, list(row)] = list(row.values())

In [None]:
def load_data():
    """Load the dataset, download one more chunk of pages, and return clean rows.

    Side effects:
        * Sets the globals ``raw_data`` (all rows, including download state)
          and ``location_database`` (cache of ipinfo.io replies).
        * Resumes from ./data.csv when it exists, otherwise starts from
          ./raw_data.csv with every row marked as not attempted.
        * Runs download_chunk() (always on a fresh start, otherwise only when
          DOWNLOAD_CHUNK_SIZE > 0) and writes ./data.csv and
          ./location_data.json afterwards, even if the download raised.

    Returns:
        A DataFrame with the columns in ``feature_names`` plus "type" and
        "url", restricted to rows with status code 200 and no missing values.
        "redirects", "latitude" and "longitude" are converted to strings.
    """
    global raw_data, location_database

    try:
        with open("./location_data.json", 'r') as file:
            location_database = json.load(file)
    except FileNotFoundError:
        location_database = {}

    # Only the read is guarded, so a FileNotFoundError raised while downloading
    # is no longer mistaken for "data.csv does not exist".
    try:
        raw_data = pd.read_csv("./data.csv", index_col=0, low_memory=False)
        resumed = True
    except FileNotFoundError:
        raw_data = pd.read_csv("./raw_data.csv")
        raw_data["attempted_download"] = False
        resumed = False

    if not resumed or DOWNLOAD_CHUNK_SIZE > 0:
        try:
            download_chunk()
        finally:
            # Persist whatever was downloaded, so an error in the middle of a
            # chunk (e.g. the ipinfo quota exception) does not discard it.
            raw_data.to_csv("./data.csv")
            with open("./location_data.json", 'w') as file:
                json.dump(location_database, file)

    data = raw_data[raw_data["status_code"] == 200]
    data = data[feature_names + ["type", "url"]].dropna().reset_index(drop=True)
    # Redirect counts come back from the CSV as floats ("1.0") but are ints in
    # predict() ("1"); normalise to the integer spelling so both sides produce
    # the same categorical token.
    data["redirects"] = data["redirects"].astype(float).astype(int).astype(str)
    data["latitude"] = data["latitude"].astype(str)
    data["longitude"] = data["longitude"].astype(str)
    return data

### Load/Download Data and display data

In [None]:
# Each load_data() call runs download_chunk() (when DOWNLOAD_CHUNK_SIZE > 0 or
# data.csv does not exist yet), so two calls fetch up to two chunks per run.
# NOTE(review): the reason for calling it exactly twice is not visible here — confirm.
for _ in range(2):
    data = load_data()
# Display the cleaned dataset returned by the last call.
data

## Model Building

### Define Hyperparameter Constants

In [None]:
# Vocabulary size: max_tokens of the TextVectorization layers and input
# dimension of the Embedding layers in get_model().
max_features = 10000
# output_sequence_length of the TextVectorization layers; also used as
# max_tokens of the StringLookup layers in get_model().
sequence_length = 1000
# Maximum number of training epochs (early stopping in train_model() may end sooner).
epochs = 25
# Only referenced by commented-out KFold code in the visible notebook.
folds = 5
# Only referenced by a commented-out model.fit argument in the visible notebook.
batch_size = 1
# Fraction of the dataset held out as the test split in get_dataset().
test_size = 0.33
validation_size = 0.20 # intended: ratio after test has been taken out
# Seed used for the dataset splits and by reset_random_seed().
seed = 123

### Ensure Reproducibility (important when comparing feature subsets)

In [None]:
def reset_random_seed():
    """Seed Python, NumPy and TensorFlow with ``seed`` and enable deterministic TF ops."""
    os.environ['PYTHONHASHSEED'] = '0'

    seeders = (
        random.seed,
        np.random.seed,
        tf.random.set_seed,
        tf.keras.utils.set_random_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    tf.config.experimental.enable_op_determinism()

#reset_random_seed()
# Adapted from:
# https://stackoverflow.com/questions/45230448/how-to-get-reproducible-result-when-running-keras-with-tensorflow-backend
# AND
# https://stackoverflow.com/questions/61078946/how-to-get-reproducible-results-keras-tensorflow

### Define Metrics

In [None]:
# Metrics passed to model.compile() in get_model(). Only binary accuracy is
# active; its name 'accuracy' is what plot_epochs_vs_accuracy_and_loss() reads
# from the training history ('accuracy' / 'val_accuracy').
# The commented-out entries are alternative metrics that are currently disabled.
METRICS = [
      #tf.keras.metrics.TruePositives(name='tp'),
      #tf.keras.metrics.FalsePositives(name='fp'),
      #tf.keras.metrics.TrueNegatives(name='tn'),
      #tf.keras.metrics.FalseNegatives(name='fn'), 
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      #tf.keras.metrics.Precision(name='precision'),
      #tf.keras.metrics.Recall(name='recall'),
      #tf.keras.metrics.AUC(name='auc'),
      #tf.keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]

### Define string_lookup for One Hot Encoding the labels/type column

In [None]:
def get_y_string_lookup(dataset=None):
    """Create a one-hot StringLookup layer for the label ("type") column.

    Args:
        dataset: Optional tf.data dataset of (X, y) pairs to adapt on. When
            omitted, the layer is adapted on the global ``data["type"]`` column.

    Returns:
        The adapted tf.keras.layers.StringLookup layer. A new layer is created
        and adapted on every call.
    """
    def labels_only(X, y):
        # Same reshape as the module-level geny(); kept local so this function
        # does not depend on the order in which notebook cells were run.
        return tf.reshape(y, (1,5,))

    string_lookup = tf.keras.layers.StringLookup(output_mode='one_hot')
    # Identity check: `== None` would go through the argument's __eq__.
    if dataset is None:
        string_lookup.adapt(data["type"])
    else:
        # NOTE(review): this branch assumes the dataset's y values are the raw
        # label strings — no caller in the visible notebook uses it; confirm.
        string_lookup.adapt(dataset.map(labels_only))
    return string_lookup

### Define Gen Functions

In [None]:
def gen(X, y):
    """Map one (X, y) dataset element to (feature dict, label) for Keras.

    Each entry of ``feature_names`` becomes a key whose value is the element of
    ``X`` at the same position, reshaped to (1,). ``y`` is reshaped to (1, 5).
    """
    features = {
        name: tf.reshape(X[position], (1,))
        for position, name in enumerate(feature_names)
    }
    label = tf.reshape(y, (1,5,))
    return features, label

def genx(X, y=None):
    """Map one dataset element to the feature dict only (``y`` is ignored).

    Keys follow ``feature_names``; each value is the element of ``X`` at the
    same position, reshaped to (1,).
    """
    return {
        name: tf.reshape(X[position], (1,))
        for position, name in enumerate(feature_names)
    }
    
def geny(X, y):
    """Map one (X, y) dataset element to its label, reshaped to (1, 5); ``X`` is ignored."""
    label_shape = (1,5,)
    return tf.reshape(y, label_shape)

def genx_custom_feature_name(X, y, feature_name, dtype=str):
    """Extract a single feature from one dataset element.

    Args:
        X: Feature tensor of the element, indexed by position in the
            module-level ``feature_names`` list.
        y: Label of the element (unused).
        feature_name: Name of the feature to extract.
        dtype: ``str`` (default) returns the value as-is; anything else
            converts it to float32 with tf.strings.to_number.

    Returns:
        The selected feature reshaped to (1,).

    Raises:
        ValueError: if ``feature_name`` is not in ``feature_names``.
    """
    if feature_name not in feature_names:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError("Unknown feature name: " + repr(feature_name))

    value = tf.reshape(X[feature_names.index(feature_name)], (1,))
    if dtype != str:
        return tf.strings.to_number(value, out_type=tf.dtypes.float32)
    return value

### Define Get Model Function

In [None]:
def get_model(feature_names=feature_names, train=None):
    """Build and compile the Keras model for the given features.

    One string input of shape (1,) is created per feature name. "latitude" and
    "longitude" are parsed to float32 and normalised; "words" and "aux" are
    vectorised into integer sequences; every other feature goes through a
    StringLookup. All non-numeric branches are followed by an Embedding layer.
    The flattened branches are concatenated and fed to a small dense head that
    outputs 5 logits.

    Args:
        feature_names: Names of the features to build inputs for.
        train: Training dataset of (X, y) elements used to adapt the
            preprocessing layers. Required in practice: the default None
            fails on ``train.map``.

    Returns:
        The compiled tf.keras.Model. Both the full model and the dense head
        print their summary as a side effect.
    """
    
    def custom_standardization(input_data):
        # Lower-casing only (no punctuation stripping).
        return tf.strings.lower(input_data)

    def get_normalization_layer(feature_name):
        """Return a Normalization layer adapted on the float32 values of one feature."""
        normalization_layer = tf.keras.layers.Normalization(axis=None)
        #print(np.array(train.map(genx_custom_feature_name), dtype=np.float32))
        # Any dtype other than str makes genx_custom_feature_name return float32 numbers.
        # TODO: ensure this doesn't effectively use the test dataset for training
        normalization_layer.adapt(train.map(lambda X, y: genx_custom_feature_name(X, y, feature_name, np.float32))) #.astype(np.float32))
        return normalization_layer

    def get_vectorize_layer(feature_name):
        """Return a TextVectorization layer adapted on the text of one feature."""
        vectorize_layer = tf.keras.layers.TextVectorization(
            standardize=custom_standardization,
            max_tokens=max_features,
            output_mode='int',
            output_sequence_length=sequence_length
        )
        # TODO: ensure this doesn't effectively use the test dataset for training
        vectorize_layer.adapt(train.map(lambda X, y: genx_custom_feature_name(X, y, feature_name)))
        return vectorize_layer

    def get_string_lookup(feature_name):
        """Return a one-hot StringLookup layer adapted on the values of one feature."""
        # NOTE(review): max_tokens reuses sequence_length here — presumably just
        # a convenient cap on the vocabulary size; confirm.
        lookup = tf.keras.layers.StringLookup(
            output_mode='one_hot',
            max_tokens=sequence_length,
            pad_to_max_tokens=True,
        )
        # TODO: ensure this doesn't effectively use the test dataset for training
        lookup.adapt(train.map(lambda X, y: genx_custom_feature_name(X, y, feature_name)))
        return lookup
    
    
    
    # inputs[i] and outputs[i] belong to feature_names[i].
    inputs = []
    outputs = []
    for i in range(len(feature_names)):
        
        feature_name = feature_names[i]
        flatten = tf.keras.layers.Flatten()
        if(feature_name == "latitude" or feature_name == "longitude"):
            # Numeric branch: string -> float32 -> normalised value.
            #outputs.append(flatten(get_normalization_layer(feature_name)(tf.strings.to_number(inputs[:,i], out_type=tf.dtypes.float32))))
            layer = get_normalization_layer(feature_name)
            inputs.append(tf.keras.Input(shape=(1,), dtype=tf.string, name=feature_name))
            outputs.append(flatten(layer(tf.strings.to_number(inputs[i], out_type=tf.dtypes.float32))))
            
            
        else:
            # Text/categorical branch: preprocessing layer followed by a 5-dimensional embedding.
            # NOTE(review): for the StringLookup case the embedding receives a
            # one-hot vector rather than integer indices — looks unintended; confirm.
            embedding_layer = tf.keras.layers.Embedding(max_features, 5)
            if(feature_name == "words" or feature_name == "aux"):
                layer = get_vectorize_layer(feature_name)
            else:
                layer = get_string_lookup(feature_name)
            #outputs.append(tf.keras.layers.Flatten()(embedding_layer(layer(flatten(inputs[:,i])))))
            
            inputs.append(tf.keras.Input(shape=(1,), dtype=tf.string, name=feature_name))
            outputs.append(flatten(embedding_layer(layer(tf.keras.layers.Flatten()(inputs[i])))))
    
    # Join all per-feature branches along the last axis.
    outputs = tf.concat(outputs, axis=-1)
    

    # Classification head; the final Dense layer emits 5 raw logits (no activation).
    sequential_model = tf.keras.Sequential([
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(10, activation='relu'),
        #tf.keras.layers.Dropout(0.5),
        #tf.keras.layers.Dense(5, activation='relu'),
        tf.keras.layers.Dense(5)
    ])

    
    model = tf.keras.Model(inputs=inputs, outputs=sequential_model(outputs) )

    
    # NOTE(review): binary cross-entropy on logits is applied to one-hot
    # multi-class labels, while prediction code uses softmax/argmax — confirm
    # this is intended rather than categorical cross-entropy.
    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=METRICS
    )
    
    model.summary()
    sequential_model.summary()
    
    return model

## Train Model

### Define Dataset Getter Functions

In [None]:
def get_dataset(data=data, feature_names=feature_names):
    """Split ``data`` into disjoint train, validation and test datasets.

    Args:
        data: DataFrame holding the feature columns and a "type" label column.
        feature_names: Feature columns to include in X.

    Returns:
        (train, validation, test) tf.data datasets of (X, one-hot y) elements.
        ``test_size`` of all rows go to test; ``validation_size`` of the
        remaining rows go to validation.
    """
    X = data[feature_names]
    # NOTE: the label lookup is adapted on the global ``data["type"]`` column
    # (see get_y_string_lookup), not on the ``data`` argument.
    y = get_y_string_lookup()(list(data["type"]))

    dataset = tf.data.Dataset.from_tensor_slices((X, y))

    train_and_validation, test = tf.keras.utils.split_dataset(
        dataset, right_size=test_size, shuffle=True, seed=seed
    )

    # Split the remainder, not the full dataset: splitting `dataset` again put
    # test rows into the training and validation sets.
    train, validation = tf.keras.utils.split_dataset(
        train_and_validation, right_size=validation_size, shuffle=True, seed=seed
    )

    return train, validation, test

### Define Train Model Function

In [None]:
def train_model(model, train, validation):
    """Fit ``model`` with balanced class weights and early stopping.

    Args:
        model: Compiled Keras model (see get_model()).
        train: Training dataset of raw (X, y) elements; mapped with gen().
        validation: Validation dataset of raw (X, y) elements; mapped with gen().

    Returns:
        A list containing the single History object returned by model.fit.
    """
    #kfold = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train_examples = train.map(gen)

    # Recover integer class indices from the one-hot labels.
    one_hot_labels = np.concatenate([y for _, y in train_examples])
    y_classes = np.argmax(one_hot_labels, axis=1)
    y_labels = np.unique(y_classes)
    balanced_weights = compute_class_weight(class_weight='balanced', classes=y_labels, y=y_classes)

    # Start with an entry for every label index so the mapping has no gaps even
    # when a class happens to be absent from the training split.
    class_weights = {class_index: 0.0 for class_index in range(one_hot_labels.shape[1])}
    class_weights.update(zip(y_labels.tolist(), balanced_weights.tolist()))
    # Index 0 always gets weight 0, as before.
    # Presumably this is the lookup's out-of-vocabulary slot — confirm.
    class_weights[0] = 0

    history = model.fit(
        train_examples,
        validation_data=validation.map(gen),
        epochs=epochs,
        #batch_size=batch_size,
        callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)], #TODO: Try commenting this out
        class_weight=class_weights
    )

    return [history]

### Generate Feature Combination subsets

In [None]:
def generate_feature_combination_subsets(names=None):
    """Return every non-empty combination of feature names.

    Args:
        names: Names to combine. Defaults to the module-level ``feature_names``.

    Returns:
        A list of lists, ordered by subset size and, within one size, in the
        order produced by itertools.combinations.
    """
    if names is None:
        names = feature_names
    names = list(names)
    # Start at size 1 so the empty combination is never generated.
    # adapted from: https://stackoverflow.com/questions/464864/get-all-possible-2n-combinations-of-a-list-s-elements-of-any-length
    return [
        list(subset)
        for size in range(1, len(names) + 1)
        for subset in itertools.combinations(names, size)
    ]

### Train and test all feature combinations

In [None]:
# Directory for saved models (only needed once the commented-out model.save below is enabled).
!mkdir -p saved_models
# One entry per trained model; consumed by the evaluation cell and by predict().
models = []
# Disabled: loop over every feature combination. For now a single model is
# trained on the full feature list.
# NOTE(review): the commented-out range stops before index 0, so the first subset would be skipped — confirm.
#feature_names_subsets = generate_feature_combination_subsets()
#for i in tqdm(range(len(feature_names_subsets)-1, 0, -1)):
#reset_random_seed()
#feature_names_subset = feature_names_subsets[i]
feature_names_subset = feature_names

train, validation, test = get_dataset(data, feature_names_subset)

model = get_model(feature_names_subset, train)
histories = train_model(model, train, validation)
# Report loss and metrics on the held-out test split.
model.evaluate(test.map(gen))
models.append({"features_names_subset":feature_names_subset, "model":model, "histories":histories, "train": train, "validation":validation, "test":test})
# NOTE(review): this path says "saved_model/" while the directory created above is "saved_models" — align before enabling.
#model.save("saved_model/"+",".join(feature_names_subset))

    

## Evaluate Trained Model

### Define helper functions

In [None]:
def one_hot_encoding_to_type_string(one_hot_encoding, test):
    """Translate a one-hot label or a vector of logits into its type name.

    ``test`` is accepted for call compatibility but not used. A fresh label
    lookup is created (and adapted) on every call to obtain the vocabulary.
    """
    vocabulary = get_y_string_lookup().get_vocabulary()
    probabilities = tf.nn.softmax(one_hot_encoding)
    winning_index = np.argmax(probabilities)
    return vocabulary[winning_index]

### Define visualization functions

In [None]:
def plot_epochs_vs_accuracy_and_loss(model, histories):
    """Plot training/validation accuracy and loss per epoch, side by side.

    The curves of all History objects in ``histories`` are concatenated in
    order. ``model`` is accepted for call compatibility but not used.
    """
    curves = {'accuracy': [], 'val_accuracy': [], 'loss': [], 'val_loss': []}
    for history in histories:
        for metric_name, collected in curves.items():
            collected.extend(history.history[metric_name])

    epochs_range = range(len(curves['accuracy']))

    # (subplot position, training key, validation key, training label,
    #  validation label, legend location, title)
    panels = (
        (1, 'accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy',
         'lower right', 'Training and Validation Accuracy'),
        (2, 'loss', 'val_loss', 'Training Loss', 'Validation Loss',
         'upper right', 'Training and Validation Loss'),
    )

    pyplot.figure(figsize=(8, 8))
    for position, train_key, val_key, train_label, val_label, legend_loc, title in panels:
        pyplot.subplot(1, 2, position)
        pyplot.plot(epochs_range, curves[train_key], label=train_label)
        pyplot.plot(epochs_range, curves[val_key], label=val_label)
        pyplot.legend(loc=legend_loc)
        pyplot.title(title)
    pyplot.show()

In [None]:
def show_prediction_vs_actual_dataframe_table(model, test):
    """Display correct and incorrect test predictions and return them as a DataFrame.

    Args:
        model: Trained Keras model.
        test: Test dataset of raw (X, y) elements.

    Returns:
        DataFrame with the columns "url", "actual" and "predicted", one row per
        test element.
    """
    # Adapt the label lookup once instead of once per row.
    vocabulary = get_y_string_lookup().get_vocabulary()

    def to_type_string(scores):
        # argmax of the raw scores equals argmax of their softmax.
        return vocabulary[int(np.argmax(scores))]

    predictions = [ to_type_string(scores) for scores in model.predict(test.map(genx)) ]
    actuals = [ to_type_string(label) for label in test.map(geny) ]

    # NOTE(review): the URLs are the first len(test) rows of the global `data`,
    # but `test` is a shuffled split, so they most likely do not belong to the
    # rows shown next to them. Fixing this needs the URL carried in the dataset.
    urls = [ data.loc[i, "url"] for i in range(len(predictions)) ]

    df = pd.DataFrame({"url": urls, "actual": actuals, "predicted": predictions})

    with pd.option_context('display.max_rows', 10, 'display.max_columns', 10):
        display(df[df["actual"] == df["predicted"]])
        display(df[df["actual"] != df["predicted"]])
    return df

In [None]:
# For every trained model: plot its learning curves and show the table of
# test predictions versus actual labels.
# Note: this loop rebinds the notebook-level names `model`, `histories` and
# `test` to those of the last entry in `models`.
for m in models:
    features_names_subset = m["features_names_subset"]
    model = m["model"]
    histories = m["histories"]
    test = m["test"]
    plot_epochs_vs_accuracy_and_loss(model, histories)
    show_prediction_vs_actual_dataframe_table(model, test)

## Predict More (Test) Data

In [None]:
def predict(url):
    """Download ``url`` and classify it with every model in ``models``.

    Args:
        url: URL to download and classify.

    Returns:
        The predicted type of each model, one per line, or a
        "Failed to get data for ..." message when the page could not be
        downloaded with status 200 or any feature is missing.
    """
    one = download_one(url)
    # Every feature is required below, so all of them must be present
    # (checking for "any" let incomplete rows through and failed with KeyError).
    has_all_features = one is not None and all(feature_name in one for feature_name in feature_names)
    if(one is None or ("status_code" in one and one["status_code"] != 200) or not has_all_features):
        return "Failed to get data for '" + url + "'"

    one = pd.DataFrame([one])
    for column in ("redirects", "latitude", "longitude"):
        one[column] = one[column].astype(str)

    tfds = tf.data.Dataset.from_tensor_slices(one[feature_names])

    outputs = []
    for m in models:
        model = m["model"]
        #outputs.append(", ".join(features_names_subset)+": "+one_hot_encoding_to_type_string(model.predict(tfds.map(genx)), test))
        # verbose=0 silences the progress bar. The helper ignores its second
        # argument, so None is passed instead of relying on a global `test`.
        outputs.append(one_hot_encoding_to_type_string(model.predict(tfds.map(genx), verbose=0), None))
    return "\n".join(outputs)

### Test URLs not in dataset

In [None]:
# Classify a handful of URLs that are not part of the dataset (requires network access).
for sample_url in (
    "https://cnn.com/",
    "https://google.com/",
    "https://disneyplus.com/",
    "https://uvm.edu/",
    "https://en.wikipedia.org/",
):
    print(predict(sample_url))

In [None]:
# Classify a second batch of URLs with long tracking-style paths and query strings (requires network access).
for sample_url in (
    "http://irever.live/r.php?q=NzkyNjM0NDs2MTg3OTsxMDA2OzM0OzI7MjAyMy0wMy0yMSAxNTozMDowNTs5OzE7bDs7",
    "https://click.yescaloriedietplan.com/?t=c&ids=NDM5NjExOTY3__NDIxMg%3D%3D__OTg0ODE1ODc%3D__OTAx__1102&url=aHR0cHMlM0ElMkYlMkZ0cmFja2luZy5oZWFsdGhpZXJsaWZ0LmNvbSUyRnpwdGk=",
    "http://www.folifort.email/l/lt1CK11667E122SG/5494NX7579O10146YB349N80481934SK3221472804",
):
    print(predict(sample_url))

# TODO
1. Train and test a model on every combination of features to see what combinations are most important. - In Progress
2. Create a constant seed to reduce random noise-based accuracy changing between models - Done
3. Create a confusion matrix

## Notes for Writeup
### Limitation
1. Redirects are treated as a categorical value rather than as a numeric count