In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import pickle

In [None]:
# Colab-only: attach Google Drive so the dataset and the saved artifacts
# under /content/drive are reachable from this runtime.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Load the dataset CSV from the mounted Drive into a DataFrame and echo it.
dataset_path = '/content/drive/MyDrive/dga_dataset/dataset_balanced.csv'
df = pd.read_csv(dataset_path)
print(df)

                           domTLD domSLD  Ldom  LTLD  LSLD  LOLD  Ddom  HwP  \
0                         tavriav     ua  10.0   7.0   2.0   0.0   2.0  0.0   
1        fabulouseyebrowthreading    com  28.0  24.0   3.0   0.0   2.0  0.0   
2                        koutakia     gr  11.0   8.0   2.0   0.0   2.0  0.0   
3                 megatvonlinevip    com  19.0  15.0   3.0   0.0   2.0  0.0   
4                           roars     it   8.0   5.0   2.0   0.0   2.0  0.0   
...                           ...    ...   ...   ...   ...   ...   ...  ...   
1887161                iqsyrteiyy    com  14.0  10.0   3.0   0.0   2.0  0.0   
1887162              acfhjlakhnsm  bazar  18.0  12.0   5.0   0.0   2.0  0.0   
1887163                    cenada    biz  10.0   6.0   3.0   0.0   2.0  0.0   
1887164        typesnamesthiswere    com  22.0  18.0   3.0   0.0   2.0  0.0   
1887165                61d94e4f9f    net  14.0  10.0   3.0   0.0   2.0  0.0   

         HIP  LCc  ...    RAv    RAl  RAn    RAs   

In [None]:
# Replace the two string columns with integer codes, one LabelEncoder per
# column. The fitted encoders are kept under their original names because a
# later cell pickles them.
# NOTE(review): the encoders are fitted on the full frame (before any
# train/test split), and the printed codes for domTLD run into the millions,
# so that column looks close to a per-row identifier -- confirm it is a
# meaningful feature.
le_domTLD = LabelEncoder()
le_domSLD = LabelEncoder()
for column, encoder in (('domTLD', le_domTLD), ('domSLD', le_domSLD)):
    # fit_transform learns the classes and encodes the column in one pass.
    df[column] = encoder.fit_transform(df[column])
print(df)

          domTLD  domSLD  Ldom  LTLD  LSLD  LOLD  Ddom  HwP  HIP  LCc  ...  \
0        1396123     703  10.0   7.0   2.0   0.0   2.0  0.0  0.0  2.0  ...   
1         515674     153  28.0  24.0   3.0   0.0   2.0  0.0  0.0  4.0  ...   
2         823672     295  11.0   8.0   2.0   0.0   2.0  0.0  0.0  2.0  ...   
3         930963     153  19.0  15.0   3.0   0.0   2.0  0.0  0.0  2.0  ...   
4        1253406     363   8.0   5.0   2.0   0.0   2.0  0.0  0.0  2.0  ...   
...          ...     ...   ...   ...   ...   ...   ...  ...  ...  ...  ...   
1887161   734762     153  14.0  10.0   3.0   0.0   2.0  0.0  0.0  2.0  ...   
1887162    99251      54  18.0  12.0   5.0   0.0   2.0  0.0  0.0  5.0  ...   
1887163   297308      71  10.0   6.0   3.0   0.0   2.0  0.0  0.0  1.0  ...   
1887164  1476695     153  22.0  18.0   3.0   0.0   2.0  0.0  0.0  3.0  ...   
1887165    51818     478  14.0  10.0   3.0   0.0   2.0  0.0  0.0  1.0  ...   

           RAv    RAl  RAn    RAs     Rc     Rv     Rl     Rn  

In [None]:
# Persist the two fitted LabelEncoder objects to Drive as pickle files
# (one file per encoder).
for pkl_path, encoder in (
    ('/content/drive/MyDrive/Colab Notebooks/XGBoosT/le_domTLD.pkl', le_domTLD),
    ('/content/drive/MyDrive/Colab Notebooks/XGBoosT/le_domSLD.pkl', le_domSLD),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(encoder, f)

In [None]:
# Shuffle the rows in the dataframe and renumber the index from 0.
# A fixed random_state makes the row order -- and therefore the later
# train/test split, which selects rows by position -- identical on every
# Restart & Run All.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
print(df)

          domTLD  domSLD  Ldom  LTLD  LSLD  LOLD  Ddom  HwP  HIP  LCc  ...  \
0         830473     506  19.0  15.0   3.0   0.0   2.0  0.0  0.0  3.0  ...   
1         382711     478  31.0   4.0   3.0  22.0   3.0  0.0  0.0  4.0  ...   
2        1262339     579  17.0  14.0   2.0   0.0   2.0  0.0  0.0  4.0  ...   
3        1190279     223  14.0  11.0   2.0   0.0   2.0  0.0  0.0  1.0  ...   
4         328242     783  13.0   2.0   2.0   7.0   3.0  0.0  0.0  1.0  ...   
...          ...     ...   ...   ...   ...   ...   ...  ...  ...  ...  ...   
1887161  1648445     153  16.0  12.0   3.0   0.0   2.0  0.0  0.0  7.0  ...   
1887162   186999     153  15.0  11.0   3.0   0.0   2.0  0.0  0.0  2.0  ...   
1887163   167966     153  10.0   6.0   3.0   0.0   2.0  0.0  0.0  1.0  ...   
1887164  1351030     153  12.0   8.0   3.0   0.0   2.0  0.0  0.0  3.0  ...   
1887165  1512237     346  12.0   7.0   4.0   0.0   2.0  0.0  0.0  5.0  ...   

           RAv    RAl  RAn    RAs     Rc     Rv     Rl     Rn  

In [None]:
# Pull the target out as a NumPy array. The double-bracket selection keeps it
# two-dimensional, i.e. an (n_rows, 1) column vector.
y = df[['label']].to_numpy()

# Remove the target and the 'HIP' column so that only the model's input
# features remain in df.
df = df.drop(columns=['label', 'HIP'])

In [None]:
# Rescale every feature column with a MinMaxScaler. The fitted scaler is kept
# in `scaler` (a later cell pickles it) and the scaled data in `df_norm`.
# NOTE(review): the scaler is fitted on the whole dataset, before the
# train/test split further down, so test-row min/max values influence the
# scaling -- consider fitting on the training rows only.
scaler = MinMaxScaler()
df_norm = scaler.fit_transform(df)

In [None]:
# Persist the fitted MinMaxScaler to Drive as a pickle file.
scaler_path = '/content/drive/MyDrive/Colab Notebooks/XGBoosT/scaler.pkl'
with open(scaler_path, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
# Sanity check: report the dimensions of the scaled feature matrix.
print(tuple(df_norm.shape))

(1887166, 35)


In [None]:
# Hold out 20% of the rows as the test set. The split is stratified on the
# label and uses a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df_norm,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Carve a validation set out of the training data for early stopping.
# Monitoring the test set here would let test rows pick the number of trees,
# which biases every metric later reported on X_test.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    stratify=y_train,
    random_state=42,
)

# Define the XGBoost model.
# `eval_metric` and `early_stopping_rounds` are constructor arguments:
# passing them to fit() is deprecated since XGBoost 1.6 and rejected by 2.x.
xgb_model = xgb.XGBClassifier(
    max_depth=10,
    gamma=0.3,
    learning_rate=0.4,
    n_estimators=1000,          # upper bound; early stopping usually ends sooner
    objective='binary:logistic',
    eval_metric='aucpr',
    early_stopping_rounds=10,   # stop after 10 rounds without aucpr improvement
)

# Train on the fitting split, logging the validation aucpr each round.
xgb_model.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=True)



[0]	validation_0-aucpr:0.94200
[1]	validation_0-aucpr:0.94628
[2]	validation_0-aucpr:0.95185
[3]	validation_0-aucpr:0.95298
[4]	validation_0-aucpr:0.95588
[5]	validation_0-aucpr:0.95788
[6]	validation_0-aucpr:0.95873
[7]	validation_0-aucpr:0.95980
[8]	validation_0-aucpr:0.96057
[9]	validation_0-aucpr:0.96155
[10]	validation_0-aucpr:0.96196
[11]	validation_0-aucpr:0.96366
[12]	validation_0-aucpr:0.96455
[13]	validation_0-aucpr:0.96521
[14]	validation_0-aucpr:0.96650
[15]	validation_0-aucpr:0.96774
[16]	validation_0-aucpr:0.96858
[17]	validation_0-aucpr:0.96936
[18]	validation_0-aucpr:0.97014
[19]	validation_0-aucpr:0.97062
[20]	validation_0-aucpr:0.97082
[21]	validation_0-aucpr:0.97161
[22]	validation_0-aucpr:0.97198
[23]	validation_0-aucpr:0.97208
[24]	validation_0-aucpr:0.97230
[25]	validation_0-aucpr:0.97279
[26]	validation_0-aucpr:0.97308
[27]	validation_0-aucpr:0.97315
[28]	validation_0-aucpr:0.97322
[29]	validation_0-aucpr:0.97371
[30]	validation_0-aucpr:0.97400
[31]	validation_0-

In [None]:
# Compare accuracy on the data the model was trained on with accuracy on the
# held-out split; a large gap between the two indicates overfitting.

# Training split: hard class predictions and their accuracy.
y_train_pred = xgb_model.predict(X_train)
train_acc = accuracy_score(y_train, y_train_pred)

# Held-out split (X_test / y_test): same two steps.
y_test_pred = xgb_model.predict(X_test)
test_acc = accuracy_score(y_test, y_test_pred)

print("Training accuracy:", train_acc)
print("Validation accuracy:", test_acc)

Training accuracy: 0.9710922203410937
Validation accuracy: 0.9397219116454797


In [None]:
from sklearn.metrics import average_precision_score

# Evaluate the performance of the model on the held-out test set.
# The predictions are computed here so the cell does not depend on a `y_pred`
# left over from another cell (it would raise NameError on a fresh kernel).
y_pred = xgb_model.predict(X_test)
# Probability of the positive class (second column of predict_proba); ranking
# metrics such as AUC-PR need continuous scores, not thresholded 0/1 labels.
y_score = xgb_model.predict_proba(X_test)[:, 1]

# Threshold-based metrics use the hard class predictions.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Area under the precision-recall curve, computed from the probability scores.
auc_pr = average_precision_score(y_test, y_score)
print('AUC-PR:', auc_pr)

Accuracy: 0.9397219116454797
Precision: 0.9398011443876307
Recall: 0.9397219116454797
F1 Score: 0.9397191966608733
AUC-PR: 0.9158468804675903


In [None]:
# make predictions on the testing data
y_pred = xgb_model.predict([X_test[13900]])
print(y_pred , y_test[13900])

[0] [0]


In [None]:
import matplotlib.pyplot as plt

# Draw the first boosted tree (index 0) of the trained model, save it to
# tree.png and display it.
# NOTE(review): a 100x100 inch canvas at 300 dpi is 30000x30000 pixels, which
# is slow to render and memory-hungry -- reduce if the runtime struggles.
fig, ax = plt.subplots(figsize=(100, 100), dpi=300)
xgb.plot_tree(xgb_model, num_trees=0, ax=ax)
fig.savefig("tree.png", dpi=300)
plt.show()

In [None]:
# Persist the trained classifier to Drive as a pickle file.
# (`pickle` is already imported in the notebook's first cell.)
# NOTE(review): XGBoost's documentation recommends `save_model` for artifacts
# that must load across library versions -- consider exporting that as well.
model_path = '/content/drive/MyDrive/Colab Notebooks/XGBoosT/xgboost_model_v3.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(xgb_model, model_file)