In [11]:
# Data management
import pandas as pd

# For stratified 10-fold cross validation
from sklearn.model_selection import StratifiedKFold

# Scikit-Learn ML Models
from sklearn.ensemble import RandomForestClassifier

# Normalization
from keras.utils import normalize, to_categorical

print('Imports complete.')

ModuleNotFoundError: No module named 'keras'

In [12]:
def train_and_eval_on(X, y, feature_set):
    """
    train_and_eval_on function
        Description: This function will train all the models on the given feature set of the X (data) for predicting y (target)

        Args: 
            X => pd.DataFrame object containing the data
            y => pd.Series object containings the target classifications
            feature_set => list of features in X to use for training

        Returns:
            metrics => dictionary where the model names are the key and a list of accuracies across all folds is the value
                    Keys:
                        Random Forest => rf
                        Decision Tree => dt
                        k-Nearest Neighbors => knn
                        Support Vector Machine => svm
                        Logistic Regression => lr
                        Linear Discriminant Analysis => lda
                        AdaBoost => ab
                        Naive Bayes => nb
                        Keras-TensorFlow => keras
                        Fast.ai => fastai
    """
    metrics = {'rf':[],
                'dt':[],
                'knn':[],
                'svm':[],
                'lr':[],
                'lda':[],
                'ab':[],
                'nb':[],
                'keras':[],
                'fastai':[]}

    # Select the given features within the data
    X = X[feature_set]

    # Create stratified, 10-fold cross validation object
    random_state = 0
    sss = StratifiedKFold(n_splits=10, shuffle=True,                                                random_state=random_state)

    # Experiment with 10-fold cross validation
    for train_idx, test_idx in sss.split(X, y):
        # Split the data into the training and testing sets
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        # Random Forest Model
        rf = RandomForestClassifier(random_state=random_state)
        rf.fit(X_train, y_train)
        score = rf.score(X_test, y_test)
        metrics['rf'].append(score)

    return metrics

## Data Preparation

In [13]:
# These are the best features for multi-class experiment provided from the feature selection jupyter notebook
best_features_multiclass = ['Entropy_Afterpath', 'argPathRatio', 'NumberRate_AfterPath', 'NumberRate_Domain', 'ArgUrlRatio', 'Extension_DigitCount', 'dld_getArg', 'ldl_getArg', 'Query_DigitCount', 'LongestVariableValue', 'Querylength', 'Query_LetterCount', 'ldl_path', 'ArgLen', 'ldl_url', 'dld_path', 'Extension_LetterCount', 'argDomanRatio', 'dld_url', 'URL_DigitCount', 'LongestPathTokenLength', 'URLQueries_variable', 'fileNameLen', 'delimeter_Count', 'NumberRate_URL', 'SymbolCount_Domain', 'domain_token_count', 'tld', 'SymbolCount_Extension', 'this.fileExtLen', 'pathLength', 'subDirLen', 'urlLen', 'charcompace', 'host_DigitCount', 'SymbolCount_Afterpath', 'URL_Letter_Count', 'pathDomainRatio', 'SymbolCount_FileName', 'domainUrlRatio', 'NumberRate_Extension', 'NumberRate_FileName', 'SymbolCount_URL', 'Entropy_Filename', 'Entropy_DirectoryName', 'delimeter_path', 'Arguments_LongestWordLength', 'charcompvowels', 'CharacterContinuityRate', 'spcharUrl', 'executable', 'pathurlRatio', 'Filename_LetterCount', 'Entropy_Extension', 'dld_filename', 'ldl_filename', 'SymbolCount_Directoryname', 'avgdomaintokenlen', 'path_token_count', 'File_name_DigitCount', 'NumberRate_DirectoryName', 'delimeter_Domain', 'Domain_LongestWordLength', 'NumberofDotsinURL', 'Directory_DigitCount', 'Directory_LetterCount', 'URL_sensitiveWord', 'longdomaintokenlen', 'ldl_domain', 'domainlength', 'Entropy_Domain', 'host_letter_count', 'avgpathtokenlen', 'isPortEighty', 'sub-Directory_LongestWordLength', 'dld_domain', 'Path_LongestWordLength', 'Entropy_URL']

## Data Preparation

In [14]:
# Import the data
path = './FinalDataset/'
fille = 'All.csv'
df = pd.read_csv(path + fille)
print('Data Read:')
print(df.head())

Data Read:
   Querylength  domain_token_count  path_token_count  avgdomaintokenlen  \
0            0                   4                 5                5.5   
1            0                   4                 5                5.5   
2            0                   4                 5                5.5   
3            0                   4                12                5.5   
4            0                   4                 6                5.5   

   longdomaintokenlen  avgpathtokenlen  tld  charcompvowels  charcompace  \
0                  14         4.400000    4               8            3   
1                  14         6.000000    4              12            4   
2                  14         5.800000    4              12            5   
3                  14         5.500000    4              32           16   
4                  14         7.333334    4              18           11   

   ldl_url  ...  SymbolCount_FileName  SymbolCount_Extension  \
0        0  ...  

In [15]:
dep_var = 'URL_Type_obf_Type'

print('There are {} columns and {} rows in the provided data.'.format(len(df.columns), len(df)))

There are 80 columns and 36707 rows in the provided data.


In [16]:
print('Below is the dataset\'s composition')
print(df[dep_var].value_counts())

Below is the dataset's composition
Defacement    7930
benign        7781
phishing      7586
malware       6712
spam          6698
Name: URL_Type_obf_Type, dtype: int64


In [17]:
# Removes all rows if they contain NaN values
df.dropna(axis='index', inplace=True)

In [18]:
print('There are {} columns and {} rows in the provided data.'.format(len(df.columns), len(df)))

print('Below is the dataset\'s composition')
print(df[dep_var].value_counts())

There are 80 columns and 18982 rows in the provided data.
Below is the dataset's composition
spam          5342
malware       4440
phishing      4014
benign        2709
Defacement    2477
Name: URL_Type_obf_Type, dtype: int64


In [19]:
# Create the X (data) and y (labels)
X = normalize( df.loc[:, df.columns != dep_var] )
y = df[dep_var]

NameError: name 'normalize' is not defined

## Multi-class Experiments

In [20]:
for i in range(5, len(best_features_multiclass)):
    features = best_features_multiclass[:i]
    performance_metrics = train_and_eval_on(X=X, y=y, feature_set=features)

    print(performance_metrics)

NameError: name 'X' is not defined