In [16]:
from time import perf_counter

import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tabulate import tabulate

In [17]:
# Read the CSV dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook elsewhere.
dataset_path = r'E:\BenignAndMaliciousDataset.csv'
df = pd.read_csv(dataset_path)


In [18]:
# Columns removed from the frame before any modelling is done
columns_to_drop = [
    'Domain',
    'CountryCode',
    'RegisteredCountry',
    'RegisteredOrg',
    'DNSRecordType',
    'Ip',
]
df = df.drop(columns=columns_to_drop)

In [19]:
# Keep only the rows that have a TLD value. The result is reassigned instead
# of using inplace=True, which pandas discourages and which hides the
# mutation from anyone reading the cell.
# NOTE(review): TLD is excluded from the feature matrix later in the notebook,
# so this filter only decides which rows survive — confirm that is intended.
df = df.dropna(subset=['TLD'])

In [20]:
# Count the missing entries in every column; the bare name on the last line
# makes the notebook render the resulting Series.
null_values_in_columns = df.isna().sum()
null_values_in_columns

MXDnsResponse          0
TXTDnsResponse         0
HasSPFInfo             0
HasDkimInfo            0
HasDmarcInfo           0
DomainInAlexaDB        0
CommonPorts            0
CreationDate           0
LastUpdateDate         0
ASN                    0
HttpResponseCode       0
SubdomainNumber        0
Entropy                0
EntropyOfSubDomains    0
StrangeCharacters      0
TLD                    0
IpReputation           0
DomainReputation       0
ConsoantRatio          0
NumericRatio           0
SpecialCharRatio       0
VowelRatio             0
ConsoantSequence       0
VowelSequence          0
NumericSequence        0
SpecialCharSequence    0
DomainLength           0
Class                  0
dtype: int64

In [21]:
# Display the dtype of each column currently in df
df.dtypes

MXDnsResponse             bool
TXTDnsResponse            bool
HasSPFInfo                bool
HasDkimInfo               bool
HasDmarcInfo              bool
DomainInAlexaDB           bool
CommonPorts               bool
CreationDate             int64
LastUpdateDate           int64
ASN                      int64
HttpResponseCode         int64
SubdomainNumber          int64
Entropy                  int64
EntropyOfSubDomains      int64
StrangeCharacters        int64
TLD                     object
IpReputation              bool
DomainReputation          bool
ConsoantRatio          float64
NumericRatio           float64
SpecialCharRatio       float64
VowelRatio             float64
ConsoantSequence         int64
VowelSequence            int64
NumericSequence          int64
SpecialCharSequence      int64
DomainLength             int64
Class                    int64
dtype: object

In [22]:
# Build the model inputs: y is the 'Class' label column, X is every other
# column except 'TLD' (the only non-numeric column left in the frame).
target_column = 'Class'
excluded_columns = [target_column, 'TLD']

y = df[target_column]
X = df.drop(columns=excluded_columns)


In [23]:
# Hold out 20% of the rows for testing; the fixed seed keeps the partition
# identical between runs.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [24]:
# Rank features by ANOVA F-value and keep the 10 best.
# The selector is fitted on the training split only and then applied to both
# splits, so the test rows never influence which columns are chosen.
selector = SelectKBest(score_func=f_classif, k=10)
selector.fit(X_train, y_train)

X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Boolean support mask -> labels of the retained columns
support_mask = selector.get_support()
selected_features = X.columns[support_mask]


In [25]:
# Display the array the selector produced for the training split
X_train_selected

array([[ 0. ,  0. ,  0. , ...,  0.3,  0. , 20. ],
       [ 1. ,  1. ,  4. , ...,  0.2,  3. , 30. ],
       [ 1. ,  1. ,  4. , ...,  0.2,  3. , 30. ],
       ...,
       [ 0. ,  0. ,  0. , ...,  0.3,  0. , 24. ],
       [ 0. ,  0. ,  4. , ...,  0.2,  0. , 13. ],
       [ 0. ,  0. ,  0. , ...,  0.3,  2. , 13. ]])

In [26]:
# Display the training split restricted to the selected columns, as a labelled DataFrame
X_train[selected_features]

Unnamed: 0,TXTDnsResponse,HasSPFInfo,CreationDate,LastUpdateDate,StrangeCharacters,ConsoantRatio,NumericRatio,VowelRatio,NumericSequence,DomainLength
81935,False,False,0,0,0,0.6,0.0,0.3,0,20
66051,True,True,4,4,6,0.3,0.3,0.2,3,30
61172,True,True,4,4,6,0.3,0.3,0.2,3,30
89049,False,False,0,0,18,0.6,0.2,0.1,2,132
9522,False,False,4,4,0,0.5,0.0,0.4,0,13
...,...,...,...,...,...,...,...,...,...,...
6267,True,True,4,4,7,0.3,0.3,0.2,3,31
54993,False,False,0,0,0,0.5,0.0,0.5,0,24
76987,False,False,0,0,0,0.6,0.0,0.3,0,24
860,False,False,4,4,0,0.6,0.0,0.2,0,13


In [27]:
# Display the labels of the columns retained by the selector
selected_features

Index(['TXTDnsResponse', 'HasSPFInfo', 'CreationDate', 'LastUpdateDate',
       'StrangeCharacters', 'ConsoantRatio', 'NumericRatio', 'VowelRatio',
       'NumericSequence', 'DomainLength'],
      dtype='object')

In [28]:
# Baseline models, keyed by the display name used in the results table.
# The estimators that draw random numbers while fitting are given the same
# seed as the train/test split (42) so the reported metrics can be reproduced;
# the remaining ones are left exactly as constructed before.
# NOTE(review): no feature-scaling step is visible in this notebook; SVC and
# K-Nearest Neighbors are fitted on the raw feature values.
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'AdaBoost': AdaBoostClassifier(random_state=42)
}


In [29]:
# Baseline comparison: fit every estimator in `classifiers` on the selected
# features and score it on the held-out test split.
# `tabulate` and `perf_counter` come from the import cell at the top of the
# notebook, so this cell no longer rebinds the name `time`.

# One row per classifier: [name, accuracy, precision, recall, f1, seconds]
results = []

# Elapsed seconds keyed by classifier name
times = {}

# Slice the selected columns once instead of on every loop iteration
X_train_top = X_train[selected_features]
X_test_top = X_test[selected_features]

for clf_name, clf in classifiers.items():
    # perf_counter is a monotonic clock intended for measuring intervals;
    # unlike the wall clock it cannot jump if the system time is adjusted.
    start_time = perf_counter()

    # Train on the training split, then predict the test split
    clf.fit(X_train_top, y_train)
    y_pred = clf.predict(X_test_top)

    # Compute evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # The measured interval covers fit, predict and the metric computation
    end_time = perf_counter()
    elapsed_time = end_time - start_time

    results.append([clf_name, accuracy, precision, recall, f1, elapsed_time])
    times[clf_name] = elapsed_time

# Print results in tabular format with github styling
print(tabulate(results, headers=["Algorithm", "Accuracy", "Precision", "Recall", "F1 Score", "Time (sec)"], tablefmt="github"))


| Algorithm              |   Accuracy |   Precision |   Recall |   F1 Score |   Time (sec) |
|------------------------|------------|-------------|----------|------------|--------------|
| Random Forest          |   0.97729  |    0.97687  | 0.977953 |   0.977411 |    2.54406   |
| Support Vector Machine |   0.952354 |    0.953486 | 0.951584 |   0.952534 |   74.4213    |
| Gradient Boosting      |   0.976344 |    0.975456 | 0.977509 |   0.976482 |    3.10277   |
| K-Nearest Neighbors    |   0.976122 |    0.97555  | 0.976955 |   0.976252 |    1.67654   |
| Decision Tree          |   0.974062 |    0.976615 | 0.971637 |   0.97412  |    0.0867608 |
| Naive Bayes            |   0.90816  |    0.952293 | 0.860292 |   0.903958 |    0.0488694 |
| AdaBoost               |   0.969999 |    0.971134 | 0.969089 |   0.97011  |    1.39383   |


In [30]:
# Estimators to tune, keyed by display name; each value is a pair of
# (unfitted estimator, parameter grid to search).
# NOTE: this rebinds `classifiers` from name -> estimator to
# name -> (estimator, grid). The baseline evaluation cell expects the former
# shape, so re-run the cell that defines the plain estimators before
# re-running the baseline evaluation.
# Estimators that draw random numbers while fitting use the same seed as the
# train/test split (42) so that the search and the final scores are repeatable.
classifiers = {
    'Random Forest': (
        RandomForestClassifier(random_state=42),
        {'n_estimators': [50, 100, 150], 'max_depth': [None, 10, 20, 30]},
    ),
    'Gradient Boosting': (
        GradientBoostingClassifier(random_state=42),
        {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.5]},
    ),
    'K-Nearest Neighbors': (
        KNeighborsClassifier(),
        {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']},
    ),
    'Decision Tree': (
        DecisionTreeClassifier(random_state=42),
        {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]},
    ),
    'AdaBoost': (
        AdaBoostClassifier(random_state=42),
        {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.5]},
    ),
}


In [32]:
# Hyperparameter tuning: run a 5-fold, accuracy-scored grid search for every
# (estimator, grid) pair in `classifiers` and remember the winning parameters.
# GridSearchCV is already imported in the notebook's import cell.

# Best parameter set per classifier name
best_params_dict = {}

for clf_name, (clf, param_grid) in classifiers.items():
    # n_jobs=-1 evaluates the grid candidates on all available cores; it only
    # affects how long the search takes, not which candidates are tried.
    grid_search = GridSearchCV(
        estimator=clf,
        param_grid=param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    grid_search.fit(X_train[selected_features], y_train)

    # Keep only the winning parameter combination
    best_params_dict[clf_name] = grid_search.best_params_

# Display the best parameters
best_params_dict


{'Random Forest': {'max_depth': 30, 'n_estimators': 50},
 'Gradient Boosting': {'learning_rate': 0.5, 'n_estimators': 50},
 'K-Nearest Neighbors': {'n_neighbors': 3, 'weights': 'uniform'},
 'Decision Tree': {'max_depth': 10, 'min_samples_split': 2},
 'AdaBoost': {'learning_rate': 0.5, 'n_estimators': 150}}

In [33]:
# Tuned comparison: rebuild every estimator with the parameters stored in
# `best_params_dict`, fit it on the training split and score it on the test
# split. The metric functions, `tabulate`, `clone` and `perf_counter` all come
# from the import cell at the top of the notebook.

# One row per classifier: [name, accuracy, precision, recall, f1, seconds]
results = []

# Slice the selected columns once instead of on every loop iteration
X_train_top = X_train[selected_features]
X_test_top = X_test[selected_features]

for clf_name, (clf, _unused_grid) in classifiers.items():
    # Monotonic clock intended for measuring intervals
    start_time = perf_counter()

    # Get the best parameters for the current classifier
    best_params = best_params_dict[clf_name]

    # set_params modifies an estimator in place and returns that same object.
    # Working on a clone leaves the template stored in `classifiers`
    # untouched, so re-running this or the tuning cell starts from the
    # estimator exactly as it was defined.
    best_clf = clone(clf).set_params(**best_params)

    # Train on the training split, then predict the test split
    best_clf.fit(X_train_top, y_train)
    y_pred = best_clf.predict(X_test_top)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # The measured interval covers cloning, fit, predict and the metrics
    end_time = perf_counter()
    elapsed_time = end_time - start_time

    results.append([clf_name, accuracy, precision, recall, f1, elapsed_time])

# Print results in tabular format
headers = ['Algorithm', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Time (sec)']
print(tabulate(results, headers=headers, tablefmt='grid'))


+---------------------+------------+-------------+----------+------------+--------------+
| Algorithm           |   Accuracy |   Precision |   Recall |   F1 Score |   Time (sec) |
| Random Forest       |   0.977235 |    0.976868 | 0.977842 |   0.977355 |      1.24546 |
+---------------------+------------+-------------+----------+------------+--------------+
| Gradient Boosting   |   0.977402 |    0.97373  | 0.981498 |   0.977599 |      1.62963 |
+---------------------+------------+-------------+----------+------------+--------------+
| K-Nearest Neighbors |   0.974953 |    0.974441 | 0.975737 |   0.975089 |      1.76791 |
+---------------------+------------+-------------+----------+------------+--------------+
| Decision Tree       |   0.977402 |    0.974356 | 0.980833 |   0.977584 |      0.06095 |
+---------------------+------------+-------------+----------+------------+--------------+
| AdaBoost            |   0.968218 |    0.965532 | 0.971416 |   0.968465 |      4.20593 |
+---------