# Malverticus Model Build V2 - Kaggle Build

## Setup

### Imports

In [1]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
from IPython.display import display, Markdown
sns.set_style("darkgrid")
pd.set_option('display.max_columns', None)


sns.set_style("darkgrid")

SEED = 1612

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
import string
from urllib.parse import urlparse

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

## Helper Functions

### Data Prep

In [2]:
df = pd.read_csv('data/malicious_phish.csv', header=0,)
df.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [3]:
print(df.columns)


Index(['url', 'type'], dtype='object')


### Creating 1 Target

In [4]:
def label_type(row):
    if row['type'] == 'benign':
        return 0
    else:
        return 1

# Create a new column
df['Label'] = df.apply(label_type, axis=1)
df.head()

Unnamed: 0,url,type,Label
0,br-icloud.com.br,phishing,1
1,mp3raid.com/music/krizz_kaliko.html,benign,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0
3,http://www.garage-pirenne.be/index.php?option=...,defacement,1
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1


### Delete the type column

In [5]:
df = df.drop(columns=['type'])

In [6]:
df.head()

Unnamed: 0,url,Label
0,br-icloud.com.br,1
1,mp3raid.com/music/krizz_kaliko.html,0
2,bopsecrets.org/rexroth/cr/1.htm,0
3,http://www.garage-pirenne.be/index.php?option=...,1
4,http://adventure-nicaragua.net/index.php?optio...,1


## Feature Creation

### Get URL Length

In [7]:
def get_url_length(url):
    # Remove common prefixes
    prefixes = ['http://', 'https://']
    for prefix in prefixes:
        if url.startswith(prefix):
            url = url[len(prefix):]

    # Remove 'www.' if present
    url = url.replace('www.', '')

    # Return the length of the remaining URL
    return len(url)

In [8]:
df['url_len'] = df['url'].apply(lambda x: get_url_length(str(x)))

In [9]:
df.head()


Unnamed: 0,url,Label,url_len
0,br-icloud.com.br,1,16
1,mp3raid.com/music/krizz_kaliko.html,0,35
2,bopsecrets.org/rexroth/cr/1.htm,0,31
3,http://www.garage-pirenne.be/index.php?option=...,1,77
4,http://adventure-nicaragua.net/index.php?optio...,1,228


### Counting different characters

In [10]:
def count_letters(url):
    num_letters = sum(char.isalpha() for char in url)
    return num_letters

In [11]:
df['letters_count'] = df['url'].apply(lambda x: count_letters(x))

In [12]:
def count_digits(url):
    num_digits = sum(char.isdigit() for char in url)
    return num_digits

In [13]:
df['digits_count'] = df['url'].apply(lambda x: count_digits(x))

In [14]:
df.head()

Unnamed: 0,url,Label,url_len,letters_count,digits_count
0,br-icloud.com.br,1,16,13,0
1,mp3raid.com/music/krizz_kaliko.html,0,35,29,1
2,bopsecrets.org/rexroth/cr/1.htm,0,31,25,1
3,http://www.garage-pirenne.be/index.php?option=...,1,77,63,7
4,http://adventure-nicaragua.net/index.php?optio...,1,228,199,22


In [15]:
def count_special_chars(url):
    special_chars = set(string.punctuation)
    num_special_chars = sum(char in special_chars for char in url)
    return num_special_chars

In [16]:
df['special_chars_count']  = df['url'].apply(lambda x: count_special_chars(x))

In [17]:
df.head()

Unnamed: 0,url,Label,url_len,letters_count,digits_count,special_chars_count
0,br-icloud.com.br,1,16,13,0,3
1,mp3raid.com/music/krizz_kaliko.html,0,35,29,1,5
2,bopsecrets.org/rexroth/cr/1.htm,0,31,25,1,5
3,http://www.garage-pirenne.be/index.php?option=...,1,77,63,7,18
4,http://adventure-nicaragua.net/index.php?optio...,1,228,199,22,14


### Get If https?

In [18]:
def secure_http(url):
    return int(urlparse(url).scheme == 'https')

In [19]:
df['secure_http']  = df['url'].apply(lambda x: secure_http(x))

In [20]:
df['secure_http'].value_counts()

secure_http
0    635511
1     15680
Name: count, dtype: int64

### Get if has IP address

In [21]:
import ipaddress
def have_ip_address(url):
    try:
        parsed_url = urlparse(url)
        if parsed_url.hostname:
            ip = ipaddress.ip_address(parsed_url.hostname)
            if isinstance(ip, (ipaddress.IPv4Address, ipaddress.IPv6Address)):
                return 1
    except ValueError:
        pass  # Invalid hostname or IP address

    return 0

In [22]:
df['have_ip']  = df['url'].apply(lambda x: have_ip_address(x))

In [23]:
df.head()

Unnamed: 0,url,Label,url_len,letters_count,digits_count,special_chars_count,secure_http,have_ip
0,br-icloud.com.br,1,16,13,0,3,0,0
1,mp3raid.com/music/krizz_kaliko.html,0,35,29,1,5,0,0
2,bopsecrets.org/rexroth/cr/1.htm,0,31,25,1,5,0,0
3,http://www.garage-pirenne.be/index.php?option=...,1,77,63,7,18,0,0
4,http://adventure-nicaragua.net/index.php?optio...,1,228,199,22,14,0,0


In [24]:
df['have_ip'].value_counts()

have_ip
0    639102
1     12089
Name: count, dtype: int64

In [25]:
df.isnull().sum()

url                    0
Label                  0
url_len                0
letters_count          0
digits_count           0
special_chars_count    0
secure_http            0
have_ip                0
dtype: int64

### Has Shortening service

In [26]:
import re
def has_shortening_service(url):
    pattern = re.compile(r'https?://(?:www\.)?(?:\w+\.)*(\w+)\.\w+')
    match = pattern.search(url)
    
    if match:
        domain = match.group(1)
        common_shortening_services = ['bit', 'goo', 'tinyurl', 'ow', 't', 'is',
                                      'cli', 'yfrog', 'migre', 'ff', 'url4', 'twit',
                                      'su', 'snipurl', 'short', 'BudURL', 'ping', 
                                      'post', 'Just', 'bkite', 'snipr', 'fic', 
                                      'loopt', 'doiop', 'short', 'kl', 'wp', 
                                      'rubyurl', 'om', 'to', 'bit', 't', 'lnkd', 
                                      'db', 'qr', 'adf', 'goo', 'bitly', 'cur', 
                                      'tinyurl', 'ow', 'bit', 'ity', 'q', 'is', 
                                      'po', 'bc', 'twitthis', 'u', 'j', 'buzurl', 
                                      'cutt', 'u', 'yourls', 'x', 'prettylinkpro', 
                                      'scrnch', 'filoops', 'vzturl', 'qr', '1url', 
                                      'tweez', 'v', 'tr', 'link', 'zip']
        
        if domain.lower() in common_shortening_services:
            return 1
    return 0 

In [27]:
df['shortened']  = df['url'].apply(lambda x: has_shortening_service(x))

In [28]:
df.head()

Unnamed: 0,url,Label,url_len,letters_count,digits_count,special_chars_count,secure_http,have_ip,shortened
0,br-icloud.com.br,1,16,13,0,3,0,0,0
1,mp3raid.com/music/krizz_kaliko.html,0,35,29,1,5,0,0,0
2,bopsecrets.org/rexroth/cr/1.htm,0,31,25,1,5,0,0,0
3,http://www.garage-pirenne.be/index.php?option=...,1,77,63,7,18,0,0,0
4,http://adventure-nicaragua.net/index.php?optio...,1,228,199,22,14,0,0,0


In [29]:
df.duplicated().sum()

10066

In [30]:
df.drop_duplicates(inplace=True)

In [31]:
df.duplicated().sum()

0

In [32]:
data = df.drop(columns=['url'])

In [33]:
data.columns

Index(['Label', 'url_len', 'letters_count', 'digits_count',
       'special_chars_count', 'secure_http', 'have_ip', 'shortened'],
      dtype='object')

### Splitting Data

In [34]:
target = 'Label'

In [35]:
cols = [target] + [col for col in data if col != target]
data = data[cols]

In [36]:
print(data.head())

   Label  url_len  letters_count  digits_count  special_chars_count  \
0      1       16             13             0                    3   
1      0       35             29             1                    5   
2      0       31             25             1                    5   
3      1       77             63             7                   18   
4      1      228            199            22                   14   

   secure_http  have_ip  shortened  
0            0        0          0  
1            0        0          0  
2            0        0          0  
3            0        0          0  
4            0        0          0  


In [37]:
print(data.shape)

X = data.drop("Label", axis=1)
Y = data[target]
X.head()

(641125, 8)


Unnamed: 0,url_len,letters_count,digits_count,special_chars_count,secure_http,have_ip,shortened
0,16,13,0,3,0,0,0
1,35,29,1,5,0,0,0
2,31,25,1,5,0,0,0
3,77,63,7,18,0,0,0
4,228,199,22,14,0,0,0


In [38]:
# Split the data into training and testing sets
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=SEED)

In [39]:
print(f"Xtrain shape =",{Xtrain.shape})
print(f"Xtest shape =",{Xtest.shape})
print(f"Ytrain shape =",{Ytrain.shape})
print(f"Ytest shape =",{Ytest.shape})

train_df = pd.merge(Xtrain, Ytrain, left_index=True, right_index=True) 
print(f"Merged Training data shape =",{train_df.shape})

test_df = pd.merge(Xtest, Ytest, left_index=True, right_index=True) 
print(f"Merged Test data shape =",{test_df.shape})

Xtrain shape = {(512900, 7)}
Xtest shape = {(128225, 7)}
Ytrain shape = {(512900,)}
Ytest shape = {(128225,)}
Merged Training data shape = {(512900, 8)}
Merged Test data shape = {(128225, 8)}


In [40]:
Ytrain.head()

504615    0
453642    0
298583    0
21719     0
221015    0
Name: Label, dtype: int64

In [41]:
cols = [target] + [col for col in test_df if col != target]
test_df = test_df[cols]

In [42]:
cols = [target] + [col for col in train_df if col != target]
train_df = train_df[cols]

In [43]:
train_df.head()

Unnamed: 0,Label,url_len,letters_count,digits_count,special_chars_count,secure_http,have_ip,shortened
504615,0,18,14,0,4,0,0,0
453642,0,14,12,0,2,0,0,0
298583,0,61,48,5,8,0,0,0
21719,0,47,40,0,7,0,0,0
221015,0,25,21,0,4,0,0,0


### Hiding Scaler

In [44]:
#scaler = StandardScaler()

In [45]:
#Xtrain = scaler.fit_transform(Xtrain)
#Xtest = scaler.transform(Xtest)

In [46]:
# Convert Xtrain and Xtest to DataFrame
#Xtrain_df = pd.DataFrame(Xtrain, columns=[f'feature_{i}' for i in range(Xtrain.shape[1])])
#Xtest_df = pd.DataFrame(Xtest, columns=[f'feature_{i}' for i in range(Xtest.shape[1])])

# Convert Ytrain and Ytest to Series directly without setting index
#Ytrain_series = pd.Series(Ytrain, name='target')
#Ytest_series = pd.Series(Ytest, name='target')

# Ensure indices are consistent by resetting if necessary
#Xtrain_df.reset_index(drop=True, inplace=True)
#Ytrain_series.reset_index(drop=True, inplace=True)
#Xtest_df.reset_index(drop=True, inplace=True)
#Ytest_series.reset_index(drop=True, inplace=True)

# Merge the DataFrames and Series using the same index
#train_df = pd.concat([Xtrain_df, Ytrain_series], axis=1)
#test_df = pd.concat([Xtest_df, Ytest_series], axis=1)

# Print shapes as a sanity check
#print(f"Xtrain shape = {Xtrain_df.shape}")
#print(f"Xtest shape = {Xtest_df.shape}")
#print(f"Ytrain shape = {Ytrain_series.shape}")
#print(f"Ytest shape = {Ytest_series.shape}")
#print(f"Merged Training data shape = {train_df.shape}")
#print(f"Merged Test data shape = {test_df.shape}")


## Move data to S3

In [47]:
#Ytrain_series.head()

In [48]:
test_df.isnull().sum()

Label                  0
url_len                0
letters_count          0
digits_count           0
special_chars_count    0
secure_http            0
have_ip                0
shortened              0
dtype: int64

In [49]:
#train_df = pd.concat([Ytrain_series, Xtrain_df], axis=1)
#test_df = pd.concat([Ytest_series, Xtest_df], axis=1)

# Verify the new arrangement
print("New training DataFrame head:")
print(train_df.head())
print("\nNew testing DataFrame head:")
print(test_df.head())

New training DataFrame head:
        Label  url_len  letters_count  digits_count  special_chars_count  \
504615      0       18             14             0                    4   
453642      0       14             12             0                    2   
298583      0       61             48             5                    8   
21719       0       47             40             0                    7   
221015      0       25             21             0                    4   

        secure_http  have_ip  shortened  
504615            0        0          0  
453642            0        0          0  
298583            0        0          0  
21719             0        0          0  
221015            0        0          0  

New testing DataFrame head:
        Label  url_len  letters_count  digits_count  special_chars_count  \
40901       0       76             51            10                   15   
333602      0       52             47             0                    5   
47094

In [50]:
import boto3
bucket_name = 'sagemaker-malverticus-build-deploy-modelv2'

train_df.to_csv('data.csv', header = False, index = False)
key = 'data/train/data'

url = 's3://{}/{}'.format(bucket_name , key)
boto3.Session().resource('s3').Bucket(bucket_name).Object(key).upload_file('data.csv')

test_df.to_csv('data.csv', header = False, index = False)
key = 'data/test/data'

url = 's3://{}/{}'.format(bucket_name , key)
boto3.Session().resource('s3').Bucket(bucket_name).Object(key).upload_file('data.csv')

## Model Creation

In [51]:
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker import get_execution_role

key = 'model/xgboost_model'

s3_output_location = url = 's3://{}/{}'.format(bucket_name , key) 

xgb_model = sagemaker.estimator.Estimator(
    get_image_uri(boto3.Session().region_name, 'xgboost'),
    get_execution_role(),
    train_instance_count =1,
    train_instance_type = 'ml.m4.xlarge',
    train_volume_size = 5,
    output_path = s3_output_location,
    sagemaker_session = sagemaker.Session()

)

xgb_model.set_hyperparameters(
    objective='binary:logistic', 
    num_round=50,  
    max_depth=0, 
    eta=0.2, 
    subsample=0.7,  #
    colsample_bytree=0.8  
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_volume_size has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


## Model Training

In [52]:
train_data_location = 's3://{}/{}'.format(bucket_name, 'data/train')
validation_data_location = 's3://{}/{}'.format(bucket_name, 'data/test')

train_channel = sagemaker.session.s3_input(train_data_location, content_type = 'text/csv') 
validation_channel =  sagemaker.session.s3_input(validation_data_location, content_type = 'text/csv') 

data_channels = {'train': train_channel, 'validation': validation_channel}

xgb_model.fit(inputs = data_channels)

The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Creating training-job with name: xgboost-2024-04-23-15-57-39-865


2024-04-23 15:57:40 Starting - Starting the training job...
2024-04-23 15:57:54 Starting - Preparing the instances for training......
2024-04-23 15:59:01 Downloading - Downloading input data......
2024-04-23 15:59:47 Downloading - Downloading the training image...
2024-04-23 16:00:32 Training - Training image download completed. Training in progress...[34mArguments: train[0m
[34m[2024-04-23:16:00:43:INFO] Running standalone xgboost training.[0m
[34m[2024-04-23:16:00:43:INFO] File size need to be processed in the node: 11.39mb. Available memory size in the node: 8516.08mb[0m
[34m[2024-04-23:16:00:43:INFO] Determined delimiter of CSV input is ','[0m
[34m[16:00:43] S3DistributionType set as FullyReplicated[0m
[34m[16:00:43] 512900x7 matrix with 3590300 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2024-04-23:16:00:43:INFO] Determined delimiter of CSV input is ','[0m
[34m[16:00:43] S3DistributionType set as FullyReplicated[0m
[3

In [53]:
from sagemaker.inputs import TrainingInput

# Assuming bucket_name and xgb_model have been defined correctly

# Define the paths to your train and test datasets stored in S3
train_data_location = f's3://{bucket_name}/data/train'
validation_data_location = f's3://{bucket_name}/data/test'

# Set up the channels for training and test data
train_channel = TrainingInput(train_data_location, content_type='text/csv')
validation_channel = TrainingInput(validation_data_location, content_type='text/csv')

# Prepare the data channels dictionary
data_channels = {'train': train_channel, 'validation': validation_channel}

# Fit the model
xgb_model.fit(inputs=data_channels)


INFO:sagemaker:Creating training-job with name: xgboost-2024-04-23-16-01-52-306


2024-04-23 16:01:52 Starting - Starting the training job...
2024-04-23 16:02:07 Starting - Preparing the instances for training......
2024-04-23 16:03:14 Downloading - Downloading input data......
2024-04-23 16:03:54 Downloading - Downloading the training image...
2024-04-23 16:04:40 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2024-04-23:16:04:52:INFO] Running standalone xgboost training.[0m
[34m[2024-04-23:16:04:52:INFO] File size need to be processed in the node: 11.39mb. Available memory size in the node: 8490.8mb[0m
[34m[2024-04-23:16:04:52:INFO] Determined delimiter of CSV input is ','[0m
[34m[16:04:52] S3DistributionType set as FullyReplicated[0m
[34m[16:04:53] 512900x7 matrix with 3590300 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2024-04-23:16:04:53:INFO] Determined delimiter of CSV input is ','[0m
[34m[16:04:53] S3DistributionType set as FullyReplicated[0m
[34m

## Deploy Model

In [54]:
xgb_classifier = xgb_model.deploy(initial_instance_count = 1,
                                 instance_type = 'ml.m4.xlarge')

INFO:sagemaker:Creating model with name: xgboost-2024-04-23-16-06-04-759
INFO:sagemaker:Creating endpoint-config with name xgboost-2024-04-23-16-06-04-759
INFO:sagemaker:Creating endpoint with name xgboost-2024-04-23-16-06-04-759


-----!