In [None]:
# Mount Google Drive at /content/gdrive; the dataset read and the model dumps
# later in this notebook all use paths under that mount point.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from scipy.stats import randint
from joblib import load, dump
import joblib

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report, precision_score, recall_score
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

## Dataset Prep

In [None]:
# Load the pre-computed feature table (one row per domain, with the label and
# the seven lexical features used below).
dataset = pd.read_csv('/content/gdrive/MyDrive/dataset_7_features.csv')
# Seeded preview so the displayed rows are the same on every Restart & Run All.
dataset.sample(10, random_state=42)

Unnamed: 0,Domain,Label,DGA Family,Length,Relative Entropy,Numerical Percentage,Max Consecutive Consonants,Max Consecutive Vowels,Vowel Count,Vowel Rate
488740,christophermillicent,0,suppobox_3,20,0.932421,0.0,3,1,6,0.3
349726,uksakaxibe,0,symmi,10,1.753113,0.0,2,1,5,0.5
814729,grr,1,,3,3.676597,0.0,3,0,0,0.0
456369,markedasentpeopleand,0,rovnix,20,0.958229,0.0,3,2,8,0.4
174238,venkrana,0,legit,8,1.925757,0.0,3,1,3,0.375
537375,afn,1,,3,2.884556,0.0,2,1,1,0.333333
442939,9enw5urktu7g,0,qadars,12,2.001317,0.25,3,1,3,0.25
169475,sgyuyya,0,kraken_v2,7,2.809578,0.0,3,1,2,0.285714
779537,sparkscience,1,,12,1.276005,0.0,4,2,4,0.333333
619857,web-biz,1,,7,2.853986,0.0,1,1,2,0.285714


In [None]:
# Model inputs: the seven lexical feature columns, in a fixed order.
FEATURE_COLUMNS = [
    'Length',
    'Relative Entropy',
    'Numerical Percentage',
    'Max Consecutive Consonants',
    'Max Consecutive Vowels',
    'Vowel Count',
    'Vowel Rate',
]
X = np.array(dataset[FEATURE_COLUMNS])
# Target: the 0/1 class stored in the 'Label' column.
y = np.array(dataset['Label'])

In [None]:
# Hold out 20% of the rows for final evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report how many rows ended up in each split.
print(f'Training set samples - {len(X_train)}')
print(f'Testing set samples - {len(X_test)}')

Training set samples - 814401
Testing set samples - 203601


In [None]:
def metrics_report(y_test, predictions):
    """Print a classification quality summary for one set of predictions.

    Prints, in order: overall accuracy, macro-averaged precision / recall / F1
    (four decimals), scikit-learn's per-class classification report, and the
    confusion matrix.

    Args:
        y_test: True labels.
        predictions: Predicted labels to compare against ``y_test``.

    Returns:
        None. Everything is written to stdout.
    """
    print("Accuracy :", accuracy_score(y_test, predictions))

    # Macro averaging weights both classes equally regardless of support.
    macro_scores = [
        scorer(y_test, predictions, average='macro')
        for scorer in (precision_score, recall_score, f1_score)
    ]

    print("Macro-average quality numbers")
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(*macro_scores))

    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))

## Decision Tree

In [None]:
# Decision-tree search space: 6 depths x 2 criteria x 2 splitters = 24 combinations.
param_dist_DT = dict(
    max_depth=[2, 4, 8, 12, 16, 20],
    criterion=["gini", "entropy"],
    splitter=["best", "random"],
)

In [None]:
# 5-fold cross-validated search over every combination in param_dist_DT.
# The space is small (24 combinations), so it is searched exhaustively; a
# randomized search with n_iter=50 would try the same 24 and only add a
# "space smaller than n_iter" warning.
DecisionTree_grid = GridSearchCV(
    # Seeded: feature order is shuffled at each split (and splitter="random"
    # draws thresholds), so an unseeded tree is not reproducible.
    DecisionTreeClassifier(random_state=42),
    param_dist_DT,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
DecisionTree_grid.fit(X_train, y_train)



Fitting 5 folds for each of 24 candidates, totalling 120 fits


RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_iter=50,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [2, 4, 8, 12, 16, 20],
                                        'splitter': ['best', 'random']},
                   random_state=42, scoring='accuracy', verbose=2)

In [None]:
# Show the best mean cross-validated accuracy and the parameters that produced it.
print(f'Best score: {DecisionTree_grid.best_score_}')
print(f'Parameters: {DecisionTree_grid.best_params_}')

Best score: 0.7952814393471223
Parameters: {'splitter': 'best', 'max_depth': 12, 'criterion': 'gini'}


In [None]:
# Final decision tree, trained on the full training split with the parameters
# the search reported as best (max_depth=12, gini, best splitter).
# random_state is fixed so retraining gives the same tree, matching the seeded
# gradient-boosting model later in the notebook.
DecisionTree_clf = DecisionTreeClassifier(
    max_depth=12,
    criterion='gini',
    splitter='best',
    random_state=42,
)
DecisionTree_clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=12)

In [None]:
# Evaluate the final decision tree on the held-out test split.
# `preds` is reused by every model section, so it always holds the latest model's output.
preds = DecisionTree_clf.predict(X_test)
metrics_report(y_test, preds)

Accuracy : 0.7938517001389973
Macro-average quality numbers
Precision: 0.8019, Recall: 0.7939, F1-measure: 0.7925
              precision    recall  f1-score   support

           0       0.85      0.71      0.78    101844
           1       0.75      0.88      0.81    101757

    accuracy                           0.79    203601
   macro avg       0.80      0.79      0.79    203601
weighted avg       0.80      0.79      0.79    203601

[[72539 29305]
 [12667 89090]]


In [None]:
# Persist the fitted search object and the final tree to Google Drive.
# NOTE(review): the file names spell "Decison" (sic). Left unchanged because any
# code that loads these artifacts is not visible here; confirm before renaming.
dump(DecisionTree_grid, '/content/gdrive/MyDrive/Models_DGA/DecisonTree_grid.joblib')
dump(DecisionTree_clf, '/content/gdrive/MyDrive/Models_DGA/DecisonTree_clf.joblib')

['/content/gdrive/MyDrive/Models_DGA/DecisonTree_clf.joblib']

## Random Forest

In [None]:
# Random-forest search space: 2 depths x 2 forest sizes = 4 combinations.
param_dist_RF = dict(
    max_depth=[8, 12],
    n_estimators=[100, 200],
)

In [None]:
# 5-fold cross-validated search over every combination in param_dist_RF
# (4 combinations, so exhaustive search rather than n_iter=50 random draws).
RandomForest_grid = GridSearchCV(
    # Parallelism is left to the search (n_jobs=-1 below). Also setting
    # n_jobs=-1 on the forest would start a full set of threads inside every
    # search worker and oversubscribe the CPUs.
    # random_state makes the bootstrap/feature sampling reproducible.
    RandomForestClassifier(verbose=1, random_state=42),
    param_dist_RF,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
RandomForest_grid.fit(X_train, y_train)

In [None]:
# Show the best mean cross-validated accuracy and the parameters that produced it.
# Requires the random-forest search cell above to have been run first.
print(f'Best score: {RandomForest_grid.best_score_}')
print(f'Parameters: {RandomForest_grid.best_params_}')

In [None]:
# Final random forest, trained on the full training split.
# NOTE(review): max_depth=12 / n_estimators=200 are hard-coded; presumably taken
# from an earlier run of the search, whose output is not recorded here — confirm.
# random_state is fixed so retraining gives the same forest.
RandomForest_clf = RandomForestClassifier(
    max_depth=12,
    n_estimators=200,
    n_jobs=-1,
    random_state=42,
)
RandomForest_clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=12, n_estimators=200, n_jobs=-1)

In [None]:
# Evaluate the final random forest on the held-out test split.
preds = RandomForest_clf.predict(X_test)
metrics_report(y_test, preds)

Accuracy : 0.7948634829887868
Macro-average quality numbers
Precision: 0.8065, Recall: 0.7949, F1-measure: 0.7929
              precision    recall  f1-score   support

           0       0.87      0.70      0.77    101844
           1       0.75      0.89      0.81    101757

    accuracy                           0.79    203601
   macro avg       0.81      0.79      0.79    203601
weighted avg       0.81      0.79      0.79    203601

[[71056 30788]
 [10978 90779]]


In [None]:
# Persist the final random forest to Google Drive.
# NOTE(review): saving the search object is disabled. The search cell has no
# recorded output, so it may not have been run to completion — confirm before
# re-enabling the line below.
#dump(RandomForest_grid, '/content/gdrive/MyDrive/Models_DGA/RandomForest_grid.joblib')
dump(RandomForest_clf, '/content/gdrive/MyDrive/Models_DGA/RandomForest_clf.joblib')

['/content/gdrive/MyDrive/Models_DGA/RandomForest_clf.joblib']

## Gradient Boosting

In [None]:
# Gradient-boosting search space: 2 losses x 3 learning rates x 3 ensemble sizes
# = 18 combinations.
# The log-loss objective is named "deviance" before scikit-learn 1.1 and
# "log_loss" from 1.1 on ("deviance" was removed in 1.3). It is the estimator's
# default loss, so reading the default yields whichever spelling the installed
# version accepts.
param_dist_GB = {
    "loss": [GradientBoostingClassifier().loss, "exponential"],
    "learning_rate": [0.1, 0.4, 1],
    "n_estimators": [80, 100, 200]
}

In [None]:
# 5-fold cross-validated search over every combination in param_dist_GB
# (18 combinations, so exhaustive search rather than n_iter=50 random draws).
GradientBoost_grid = GridSearchCV(
    # Seeded so repeated searches score the same candidates identically.
    GradientBoostingClassifier(verbose=1, random_state=42),
    param_dist_GB,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
GradientBoost_grid.fit(X_train, y_train)



Fitting 5 folds for each of 18 candidates, totalling 90 fits
      Iter       Train Loss   Remaining Time 
         1           1.1594            4.99m
         2           1.0575            4.88m
         3           0.9980            4.89m
         4           0.9692            4.86m
         5           0.9418            4.86m
         6           0.9256            4.84m
         7           0.9132            4.84m
         8           0.9001            4.82m
         9           0.8914            4.81m
        10           0.8864            4.79m
        20           0.8582            4.50m
        30           0.8459            4.25m
        40           0.8386            3.99m
        50           0.8333            3.74m
        60           0.8311            3.48m
        70           0.8280            3.22m
        80           0.8245            2.97m
        90           0.8236            2.72m
       100           0.8220            2.47m
       200           0.8122           

RandomizedSearchCV(cv=5, estimator=GradientBoostingClassifier(verbose=1),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'learning_rate': [0.1, 0.4, 1],
                                        'loss': ['deviance', 'exponential'],
                                        'n_estimators': [80, 100, 200]},
                   random_state=42, scoring='accuracy', verbose=2)

In [None]:
# Show the best mean cross-validated accuracy and the parameters that produced it.
print(f'Best score: {GradientBoost_grid.best_score_}')
print(f'Parameters: {GradientBoost_grid.best_params_}')

Best score: 0.796417243197147
Parameters: {'n_estimators': 200, 'loss': 'deviance', 'learning_rate': 0.4}


In [None]:
# Final gradient-boosting model, trained on the full training split with the
# parameters the search reported as best (200 estimators, learning rate 0.4).
# The loss is left at the library default: that is the log-loss objective the
# search selected ("deviance" in older scikit-learn, "log_loss" in newer).
# Passing loss='deviance' explicitly fails on scikit-learn >= 1.3.
GradientBoost_clf = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.4,
    random_state=42,
)
GradientBoost_clf.fit(X_train, y_train)

GradientBoostingClassifier(learning_rate=0.4, n_estimators=200, random_state=42)

In [None]:
# Evaluate the final gradient-boosting model on the held-out test split.
preds = GradientBoost_clf.predict(X_test)
metrics_report(y_test, preds)

Accuracy : 0.7958556195696485
Macro-average quality numbers
Precision: 0.8026, Recall: 0.7959, F1-measure: 0.7947
              precision    recall  f1-score   support

           0       0.85      0.72      0.78    101844
           1       0.76      0.87      0.81    101757

    accuracy                           0.80    203601
   macro avg       0.80      0.80      0.79    203601
weighted avg       0.80      0.80      0.79    203601

[[73472 28372]
 [13192 88565]]


In [None]:
# Persist the fitted search object and the final gradient-boosting model to Google Drive.
dump(GradientBoost_grid, '/content/gdrive/MyDrive/Models_DGA/GradientBoost_grid.joblib')
dump(GradientBoost_clf, '/content/gdrive/MyDrive/Models_DGA/GradientBoost_clf.joblib')

['/content/gdrive/MyDrive/Models_DGA/GradientBoost_clf.joblib']

## Gaussian Naive Bayes

In [None]:
# Naive Bayes search space: three variance-smoothing values.
param_dist_NB = dict(
    var_smoothing=[1e-9, 2e-9, 3e-9],
)

In [None]:
# 5-fold cross-validated search over every value in param_dist_NB.
# Only 3 candidates exist, so the search is exhaustive; asking a randomized
# search for n_iter=50 draws would evaluate the same 3 and emit a warning.
GaussianNB_grid = GridSearchCV(
    GaussianNB(),
    param_dist_NB,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
GaussianNB_grid.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits




RandomizedSearchCV(cv=5, estimator=GaussianNB(), n_iter=50, n_jobs=-1,
                   param_distributions={'var_smoothing': [1e-09, 2e-09, 3e-09]},
                   random_state=42, scoring='accuracy', verbose=2)

In [None]:
# Show the best mean cross-validated accuracy and the parameters that produced it.
print(f'Best score: {GaussianNB_grid.best_score_}')
print(f'Parameters: {GaussianNB_grid.best_params_}')

Best score: 0.7405135810773492
Parameters: {'var_smoothing': 1e-09}


In [None]:
# Final Gaussian Naive Bayes model, trained on the full training split with the
# smoothing value the search reported as best. fit() returns the estimator
# itself, so construction and training are chained.
GaussianNB_clf = GaussianNB(var_smoothing=1e-09).fit(X_train, y_train)
GaussianNB_clf

GaussianNB()

In [None]:
# Evaluate the final Naive Bayes model on the held-out test split.
preds = GaussianNB_clf.predict(X_test)
metrics_report(y_test, preds)

Accuracy : 0.7399226919317685
Macro-average quality numbers
Precision: 0.7554, Recall: 0.7400, F1-measure: 0.7360
              precision    recall  f1-score   support

           0       0.82      0.62      0.70    101844
           1       0.69      0.86      0.77    101757

    accuracy                           0.74    203601
   macro avg       0.76      0.74      0.74    203601
weighted avg       0.76      0.74      0.74    203601

[[62846 38998]
 [13954 87803]]


In [None]:
# Persist the fitted search object and the final Naive Bayes model to Google Drive.
dump(GaussianNB_grid, '/content/gdrive/MyDrive/Models_DGA/GaussianNB_grid.joblib')
dump(GaussianNB_clf, '/content/gdrive/MyDrive/Models_DGA/GaussianNB_clf.joblib')

['/content/gdrive/MyDrive/Models_DGA/GaussianNB_clf.joblib']

## Logistic Regression

In [None]:
# Logistic-regression search space: 2 solvers x 3 iteration caps = 6 combinations.
param_dist_LR = dict(
    solver=["lbfgs", "sag"],
    max_iter=[100, 200, 400],
)

In [None]:
# 5-fold cross-validated search over every combination in param_dist_LR
# (6 combinations, so exhaustive search rather than n_iter=50 random draws).
LogisticRegression_grid = GridSearchCV(
    # Parallelism is left to the search (n_jobs=-1 below) instead of being
    # requested at both levels.
    # random_state seeds the data shuffling done by the "sag" solver.
    LogisticRegression(verbose=1, random_state=42),
    param_dist_LR,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
LogisticRegression_grid.fit(X_train, y_train)



Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


convergence after 28 epochs took 12 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   12.4s finished


RandomizedSearchCV(cv=5, estimator=LogisticRegression(n_jobs=-1, verbose=1),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'max_iter': [100, 200, 400],
                                        'solver': ['lbfgs', 'sag']},
                   random_state=42, scoring='accuracy', verbose=2)

In [None]:
# Show the best mean cross-validated accuracy and the parameters that produced it.
print(f'Best score: {LogisticRegression_grid.best_score_}')
print(f'Parameters: {LogisticRegression_grid.best_params_}')

Best score: 0.7598750501604647
Parameters: {'solver': 'sag', 'max_iter': 100}


In [None]:
# Final logistic regression, trained on the full training split with the
# parameters the search reported as best (sag solver, max_iter=100).
# "sag" shuffles the data while optimising, so random_state is fixed to make
# retraining reproducible.
LogisticRegression_clf = LogisticRegression(
    max_iter=100,
    solver='sag',
    random_state=42,
)
LogisticRegression_clf.fit(X_train, y_train)

LogisticRegression(solver='sag')

In [None]:
# Evaluate the final logistic regression on the held-out test split.
preds = LogisticRegression_clf.predict(X_test)
metrics_report(y_test, preds)

Accuracy : 0.7589058992834024
Macro-average quality numbers
Precision: 0.7644, Recall: 0.7589, F1-measure: 0.7577
              precision    recall  f1-score   support

           0       0.80      0.69      0.74    101844
           1       0.73      0.83      0.77    101757

    accuracy                           0.76    203601
   macro avg       0.76      0.76      0.76    203601
weighted avg       0.76      0.76      0.76    203601

[[69978 31866]
 [17221 84536]]


In [None]:
# Persist the fitted search object and the final logistic regression to Google Drive.
dump(LogisticRegression_grid, '/content/gdrive/MyDrive/Models_DGA/LogisticRegression_grid.joblib')
dump(LogisticRegression_clf, '/content/gdrive/MyDrive/Models_DGA/LogisticRegression_clf.joblib')

['/content/gdrive/MyDrive/Models_DGA/LogisticRegression_clf.joblib']