In [1]:
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the dataset.
# The CSV location can be overridden through the DATASET_PATH environment variable so the
# notebook is not tied to a single machine; the original hard-coded path stays the default.
DATASET_PATH = os.environ.get(
    "DATASET_PATH",
    r"C:\Users\vijet\Desktop\BenignAndMaliciousDataset.csv",
)
df = pd.read_csv(DATASET_PATH)

In [3]:
# Count the distinct values found in each column
df.nunique(axis=0)

Domain                 90000
DNSRecordType              3
MXDnsResponse              2
TXTDnsResponse             2
HasSPFInfo                 2
HasDkimInfo                2
HasDmarcInfo               2
Ip                     16985
DomainInAlexaDB            2
CommonPorts                2
CountryCode              108
RegisteredCountry        113
CreationDate               5
LastUpdateDate             5
ASN                     1976
HttpResponseCode           4
RegisteredOrg           4377
SubdomainNumber          207
Entropy                    6
EntropyOfSubDomains        4
StrangeCharacters         47
TLD                      756
IpReputation               2
DomainReputation           2
ConsoantRatio             11
NumericRatio               9
SpecialCharRatio          10
VowelRatio                 9
ConsoantSequence          25
VowelSequence              8
NumericSequence           26
SpecialCharSequence        9
DomainLength             150
Class                      2
dtype: int64

In [4]:
# Tally the missing entries per column to see which columns need cleaning
null_values_in_columns = df.isna().sum()
null_values_in_columns

Domain                     0
DNSRecordType              0
MXDnsResponse              0
TXTDnsResponse             0
HasSPFInfo                 0
HasDkimInfo                0
HasDmarcInfo               0
Ip                         0
DomainInAlexaDB            0
CommonPorts                0
CountryCode            29052
RegisteredCountry      77774
CreationDate               0
LastUpdateDate             0
ASN                        0
HttpResponseCode           0
RegisteredOrg          35391
SubdomainNumber            0
Entropy                    0
EntropyOfSubDomains        0
StrangeCharacters          0
TLD                      170
IpReputation               0
DomainReputation           0
ConsoantRatio              0
NumericRatio               0
SpecialCharRatio           0
VowelRatio                 0
ConsoantSequence           0
VowelSequence              0
NumericSequence            0
SpecialCharSequence        0
DomainLength               0
Class                      0
dtype: int64

In [5]:
# Columns removed before modelling. Per the checks above, 'CountryCode',
# 'RegisteredCountry' and 'RegisteredOrg' contain many nulls, while 'Domain' and 'Ip'
# have a very large number of distinct values.
# NOTE(review): the reason for dropping 'DNSRecordType' is not evident here - confirm.
columns_to_drop = ['Domain', 'CountryCode', 'RegisteredCountry', 'RegisteredOrg', 'DNSRecordType', 'Ip']
df = df.drop(columns=columns_to_drop)

In [6]:
# Re-count the missing entries per column now that some columns have been dropped
null_values_in_columns = df.isna().sum()
null_values_in_columns

MXDnsResponse            0
TXTDnsResponse           0
HasSPFInfo               0
HasDkimInfo              0
HasDmarcInfo             0
DomainInAlexaDB          0
CommonPorts              0
CreationDate             0
LastUpdateDate           0
ASN                      0
HttpResponseCode         0
SubdomainNumber          0
Entropy                  0
EntropyOfSubDomains      0
StrangeCharacters        0
TLD                    170
IpReputation             0
DomainReputation         0
ConsoantRatio            0
NumericRatio             0
SpecialCharRatio         0
VowelRatio               0
ConsoantSequence         0
VowelSequence            0
NumericSequence          0
SpecialCharSequence      0
DomainLength             0
Class                    0
dtype: int64

In [7]:
# Discard the rows whose 'TLD' value is missing
df = df.dropna(subset=['TLD'])

In [8]:
# Confirm that no column has missing entries after removing the rows with a null 'TLD'
null_values_in_columns = df.isna().sum()
null_values_in_columns

MXDnsResponse          0
TXTDnsResponse         0
HasSPFInfo             0
HasDkimInfo            0
HasDmarcInfo           0
DomainInAlexaDB        0
CommonPorts            0
CreationDate           0
LastUpdateDate         0
ASN                    0
HttpResponseCode       0
SubdomainNumber        0
Entropy                0
EntropyOfSubDomains    0
StrangeCharacters      0
TLD                    0
IpReputation           0
DomainReputation       0
ConsoantRatio          0
NumericRatio           0
SpecialCharRatio       0
VowelRatio             0
ConsoantSequence       0
VowelSequence          0
NumericSequence        0
SpecialCharSequence    0
DomainLength           0
Class                  0
dtype: int64

In [9]:
# Dimensions of the frame after the cleaning steps, as (rows, columns)
df.shape

(89830, 28)

In [10]:
# Preview the first rows of the cleaned frame
df.head()

Unnamed: 0,MXDnsResponse,TXTDnsResponse,HasSPFInfo,HasDkimInfo,HasDmarcInfo,DomainInAlexaDB,CommonPorts,CreationDate,LastUpdateDate,ASN,...,ConsoantRatio,NumericRatio,SpecialCharRatio,VowelRatio,ConsoantSequence,VowelSequence,NumericSequence,SpecialCharSequence,DomainLength,Class
0,False,False,False,False,False,False,False,0,0,-1,...,0.6,0.1,0.0,0.2,10,4,1,0,134,1
1,False,False,False,False,False,False,False,0,0,-1,...,0.7,0.1,0.0,0.2,12,2,2,1,123,1
2,False,False,False,False,False,False,False,0,0,-1,...,0.6,0.2,0.0,0.2,16,2,3,0,150,1
3,False,False,False,False,False,False,False,0,0,-1,...,0.6,0.1,0.0,0.1,9,1,2,0,122,1
4,False,False,False,False,False,False,False,0,0,-1,...,0.6,0.2,0.0,0.1,10,2,2,0,151,1


In [11]:
# List the dtype of every remaining column
df.dtypes

MXDnsResponse             bool
TXTDnsResponse            bool
HasSPFInfo                bool
HasDkimInfo               bool
HasDmarcInfo              bool
DomainInAlexaDB           bool
CommonPorts               bool
CreationDate             int64
LastUpdateDate           int64
ASN                      int64
HttpResponseCode         int64
SubdomainNumber          int64
Entropy                  int64
EntropyOfSubDomains      int64
StrangeCharacters        int64
TLD                     object
IpReputation              bool
DomainReputation          bool
ConsoantRatio          float64
NumericRatio           float64
SpecialCharRatio       float64
VowelRatio             float64
ConsoantSequence         int64
VowelSequence            int64
NumericSequence          int64
SpecialCharSequence      int64
DomainLength             int64
Class                    int64
dtype: object

In [12]:
# Target variable (y): the 'Class' label
y = df['Class']

# Features (X): every column except the label and the non-numeric 'TLD' column
X = df.drop(columns=['Class', 'TLD'])

In [13]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [15]:
# Fit a 100-tree Random Forest on all candidate features so that its
# feature importances can be read off afterwards
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

In [16]:
# Pair every importance score from the fitted forest with its feature (column) name
feature_importances = pd.Series(
    data=rf_model.feature_importances_,
    index=X_train.columns,
)

In [17]:
# Show the importance score of every feature
feature_importances

MXDnsResponse          0.004436
TXTDnsResponse         0.009213
HasSPFInfo             0.028289
HasDkimInfo            0.000042
HasDmarcInfo           0.000653
DomainInAlexaDB        0.001591
CommonPorts            0.003141
CreationDate           0.012040
LastUpdateDate         0.013155
ASN                    0.038637
HttpResponseCode       0.008311
SubdomainNumber        0.053827
Entropy                0.007493
EntropyOfSubDomains    0.000103
StrangeCharacters      0.126183
IpReputation           0.001108
DomainReputation       0.000185
ConsoantRatio          0.094632
NumericRatio           0.231906
SpecialCharRatio       0.001875
VowelRatio             0.072527
ConsoantSequence       0.013912
VowelSequence          0.007916
NumericSequence        0.202205
SpecialCharSequence    0.001799
DomainLength           0.064821
dtype: float64

In [18]:
# Keep only the k features with the highest importance scores
k = 10  # number of features to retain; adjust as needed
top_k_importances = feature_importances.nlargest(n=k)
selected_features = top_k_importances.index

In [19]:
# Show the names of the selected features
selected_features

Index(['NumericRatio', 'NumericSequence', 'StrangeCharacters', 'ConsoantRatio',
       'VowelRatio', 'DomainLength', 'SubdomainNumber', 'ASN', 'HasSPFInfo',
       'ConsoantSequence'],
      dtype='object')

In [20]:
# Show the training rows restricted to the selected features
X_train[selected_features]

Unnamed: 0,NumericRatio,NumericSequence,StrangeCharacters,ConsoantRatio,VowelRatio,DomainLength,SubdomainNumber,ASN,HasSPFInfo,ConsoantSequence
81935,0.0,0,0,0.6,0.3,20,0,-1,False,3
66051,0.3,3,6,0.3,0.2,30,57,26228,True,2
61172,0.3,3,6,0.3,0.2,30,57,26228,True,2
89049,0.2,2,18,0.6,0.1,132,0,-1,False,9
9522,0.0,0,0,0.5,0.4,13,0,46606,False,2
...,...,...,...,...,...,...,...,...,...,...
6267,0.3,3,7,0.3,0.2,31,57,26228,True,2
54993,0.0,0,0,0.5,0.5,24,0,-1,False,2
76987,0.0,0,0,0.6,0.3,24,0,-1,False,3
860,0.0,0,0,0.6,0.2,13,0,46606,False,2


In [21]:
# Define classifiers (name -> unfitted estimator with default hyperparameters).
# The estimators that draw random numbers during fitting are seeded with the same
# value (42) used for the train/test split and the feature-importance forest, so the
# reported metrics are reproducible from one run to the next.
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'AdaBoost': AdaBoostClassifier(random_state=42)
}


In [22]:
from tabulate import tabulate

# Restrict both splits to the selected features once, outside the loop
train_features = X_train[selected_features]
test_features = X_test[selected_features]

# One row per classifier: [name, accuracy, precision, recall, f1]
results = []

# `classifiers` is expected to map a display name to an estimator here
for clf_name, clf in classifiers.items():
    # Fit on the training split, then predict the held-out test split
    clf.fit(train_features, y_train)
    y_pred = clf.predict(test_features)

    # Score the predictions against the true test labels
    accuracy, precision, recall, f1 = (
        metric(y_test, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    results.append([clf_name, accuracy, precision, recall, f1])

# Render the collected scores as a plain-text table
print(tabulate(results, headers=["Classifier", "Accuracy", "Precision", "Recall", "F1-score"]))


Classifier                Accuracy    Precision    Recall    F1-score
----------------------  ----------  -----------  --------  ----------
Random Forest             0.981966     0.979291  0.984932    0.982103
Support Vector Machine    0.799733     0.842245  0.739973    0.787804
Gradient Boosting         0.978515     0.976295  0.981055    0.978669
K-Nearest Neighbors       0.96293      0.978261  0.947263    0.962513
Decision Tree             0.97768      0.979646  0.975848    0.977743
Naive Bayes               0.885617     0.916179  0.8501      0.881903
AdaBoost                  0.969888     0.970187  0.969865    0.970026


## Hyperparameter Tuning

In [23]:
# Display the cleaned DataFrame (the notebook renders a truncated preview)
df

Unnamed: 0,MXDnsResponse,TXTDnsResponse,HasSPFInfo,HasDkimInfo,HasDmarcInfo,DomainInAlexaDB,CommonPorts,CreationDate,LastUpdateDate,ASN,...,ConsoantRatio,NumericRatio,SpecialCharRatio,VowelRatio,ConsoantSequence,VowelSequence,NumericSequence,SpecialCharSequence,DomainLength,Class
0,False,False,False,False,False,False,False,0,0,-1,...,0.6,0.1,0.0,0.2,10,4,1,0,134,1
1,False,False,False,False,False,False,False,0,0,-1,...,0.7,0.1,0.0,0.2,12,2,2,1,123,1
2,False,False,False,False,False,False,False,0,0,-1,...,0.6,0.2,0.0,0.2,16,2,3,0,150,1
3,False,False,False,False,False,False,False,0,0,-1,...,0.6,0.1,0.0,0.1,9,1,2,0,122,1
4,False,False,False,False,False,False,False,0,0,-1,...,0.6,0.2,0.0,0.1,10,2,2,0,151,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89995,False,False,False,False,False,False,False,0,0,-1,...,0.5,0.2,0.0,0.2,4,1,2,0,27,0
89996,False,False,False,False,False,False,False,0,0,-1,...,0.6,0.0,0.0,0.4,3,2,0,0,26,0
89997,False,False,False,False,False,False,False,0,0,-1,...,0.5,0.1,0.0,0.4,3,2,2,0,31,0
89998,True,True,True,False,False,False,False,4,4,46606,...,0.5,0.0,0.0,0.4,4,2,0,0,11,0


In [24]:
# Define classifiers with hyperparameter grids.
# Each entry maps a display name to an (estimator, param_grid) pair; the grid lists the
# values that the grid search will try for each hyperparameter.
# Estimators that draw random numbers during fitting are seeded with the same value (42)
# used for the train/test split, so the search and the final scores are reproducible.
classifiers = {
    'Random Forest': (RandomForestClassifier(random_state=42), {'n_estimators': [50, 100, 150], 'max_depth': [None, 10, 20, 30]}),
    'Gradient Boosting': (GradientBoostingClassifier(random_state=42), {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.5]}),
    'K-Nearest Neighbors': (KNeighborsClassifier(), {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']}),
    'Decision Tree': (DecisionTreeClassifier(random_state=42), {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]}),
    'AdaBoost': (AdaBoostClassifier(random_state=42), {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.5]})
}


In [25]:
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

# Create an empty list to store results
results = []

# Iterate over classifiers; each value is an (estimator, param_grid) pair
for clf_name, (clf, param_grid) in classifiers.items():
    # Perform GridSearchCV for hyperparameter tuning (5-fold CV, scored by accuracy).
    # n_jobs=-1 runs the candidate fits on all available CPU cores; the serial search
    # was slow enough that the recorded run of this cell was interrupted.
    grid_search = GridSearchCV(
        estimator=clf,
        param_grid=param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    grid_search.fit(X_train[selected_features], y_train)

    # Get the best model and its parameters. With the default refit=True, GridSearchCV
    # has already retrained best_estimator_ on the full training set, so it is used
    # directly without a second fit.
    best_clf = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Evaluate the model on the test set
    y_pred = best_clf.predict(X_test[selected_features])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Append results to the list
    results.append([clf_name, best_params, accuracy, precision, recall, f1])

# Print results in tabular format (`tabulate` is imported in an earlier cell)
headers = ['Classifier', 'Best Parameters', 'Accuracy', 'Precision', 'Recall', 'F1-Score']
print(tabulate(results, headers=headers, tablefmt='grid'))


KeyboardInterrupt: 

In [25]:
from sklearn.model_selection import GridSearchCV

# Maps each classifier name to the best hyperparameters found for it
best_params_dict = {}

# Each value in `classifiers` is an (estimator, param_grid) pair
for clf_name, (clf, param_grid) in classifiers.items():
    # Search the grid with 5-fold cross-validation, scored by accuracy
    grid_search = GridSearchCV(
        estimator=clf,
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
    )
    grid_search.fit(X_train[selected_features], y_train)

    # Record the winning hyperparameter combination under the classifier's name
    best_params_dict[clf_name] = grid_search.best_params_

# Show the best hyperparameters per classifier
best_params_dict


{'Random Forest': {'max_depth': 20, 'n_estimators': 150},
 'Gradient Boosting': {'learning_rate': 0.5, 'n_estimators': 150},
 'K-Nearest Neighbors': {'n_neighbors': 3, 'weights': 'distance'},
 'Decision Tree': {'max_depth': 10, 'min_samples_split': 5},
 'AdaBoost': {'learning_rate': 0.5, 'n_estimators': 150}}

In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Restrict both splits to the selected features once, outside the loop
train_features = X_train[selected_features]
test_features = X_test[selected_features]

# One row per classifier: [name, best params, accuracy, precision, recall, f1]
results = []

# Each value in `classifiers` is an (estimator, param_grid) pair; only the estimator is used
for clf_name, (clf, param_grid) in classifiers.items():
    # Look up the hyperparameters chosen by the grid search for this classifier
    best_params = best_params_dict[clf_name]

    # Configure the estimator with those hyperparameters.
    # NOTE(review): set_params appears to configure `clf` itself, so the estimator held
    # in `classifiers` is modified as well - confirm this is intended.
    best_clf = clf.set_params(**best_params)

    # Fit on the full training split, then predict the held-out test split
    best_clf.fit(train_features, y_train)
    y_pred = best_clf.predict(test_features)

    # Score the predictions against the true test labels
    accuracy, precision, recall, f1 = (
        metric(y_test, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    results.append([clf_name, best_params, accuracy, precision, recall, f1])

# Render the collected scores as a grid table (`tabulate` is imported in an earlier cell)
headers = ['Classifier', 'Best Parameters', 'Accuracy', 'Precision', 'Recall', 'F1-Score']
print(tabulate(results, headers=headers, tablefmt='grid'))


+---------------------+---------------------------------------------+------------+-------------+----------+------------+
| Classifier          | Best Parameters                             |   Accuracy |   Precision |   Recall |   F1-Score |
| Random Forest       | {'max_depth': 20, 'n_estimators': 150}      |   0.983469 |    0.979563 | 0.987702 |   0.983616 |
+---------------------+---------------------------------------------+------------+-------------+----------+------------+
| Gradient Boosting   | {'learning_rate': 0.5, 'n_estimators': 150} |   0.98269  |    0.979637 | 0.98604  |   0.982828 |
+---------------------+---------------------------------------------+------------+-------------+----------+------------+
| K-Nearest Neighbors | {'n_neighbors': 3, 'weights': 'distance'}   |   0.965824 |    0.974718 | 0.956791 |   0.965671 |
+---------------------+---------------------------------------------+------------+-------------+----------+------------+
| Decision Tree       | {'max_de