# Hello ✨
### This notebook prepares the dataset from the "UTL_DGA22" original data for dga detection
### The "DGA_Botnets_Domains" folder contains 77 .txt files , each file conatains domain names generated by a certain dga family the 77th file'legit-1000000.txt' that stores 1000,000 benign domains powered by Alexa.
### For each domain name we extract 36 features using extract_domain_features() function , then put all the daomain names in a single dtaframe , clean the data .
### After this 4 .csv files are generated


*   **dataset.csv** : file contains the original data with the two first features not encoded (strings )
*   **dataset_balanced.csv** : file contains the data with the two first features not encoded (strings ) , and the data is balenced => benign domains = dga domains
*   **dataset_num.csv** : file contains the data with the two first features encoded    
*   **dataset_num_balanced.csv** : file contains the data with the two first features encoded , and the data is balenced => benign domains = dga domains

You can find the files in "Project_2CS/DATASET/Extracted" folder





In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import os
import numpy as np
import re
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Set the path to the directory containing the files
files_dir = '/content/drive/MyDrive/dga_dataset/DGA_Botnets_Domains'

In [None]:
# Get a list of all the txt files in the directory
txt_files = [os.path.join(files_dir, f) for f in os.listdir(files_dir) if f.endswith('.txt')]

In [None]:
print(txt_files)

['/content/drive/MyDrive/dga_dataset/DGA_Botnets_Domains/legit-1000000.txt', '/content/drive/MyDrive/dga_dataset/DGA_Botnets_Domains/bamital-50000.txt', '/content/drive/MyDrive/dga_dataset/DGA_Botnets_Domains/banjori-50000.txt', '/content/drive/MyDrive/dga_dataset/DGA_Botnets_Domains/bazarbackdoor-50000.txt', '/content/drive/MyDrive/dga_dataset/DGA_Botnets_Domains/bazarbackdoor_v3-20000.txt', '/content/drive/MyDrive/dga_dataset/DGA_Botnets_Domains/bazarbackdoor_v2-50000.txt', '/content/drive/MyDrive/dga_dataset/DGA_Botnets_Domains/bedep-50000.txt', '/content/drive/MyDrive/dga_dataset/DGA_Botnets_Domains/bigviktor-50000.txt', '/content/drive/MyDrive/dga_dataset/DGA_Botnets_Domains/ccleaner-10000.txt', '/content/drive/MyDrive/dga_dataset/DGA_Botnets_Domains/chinad-50000.txt', '/content/drive/MyDrive/dga_dataset/DGA_Botnets_Domains/corebot-50000.txt', '/content/drive/MyDrive/dga_dataset/DGA_Botnets_Domains/cryptolocker-50000.txt', '/content/drive/MyDrive/dga_dataset/DGA_Botnets_Domains/di

In [None]:
def extract_domain_features(domain_name):
    domain_name = domain_name.strip().lower()
    if not domain_name:
        return [0]
    try:
            tld, sld = domain_name.split(".")[-2:]
    except ValueError:
        print(f"Skipping invalid domain name: {domain_name}")
        return [0]  # Skip invalid domain names

    # Calculate lengths of top-level domain and second-level domain
    tld_length = len(tld)
    sld_length = len(sld)

    # Calculate length and level of the domain name
    domain_length = len(domain_name)
    domain_level = len(domain_name.split("."))
    other_level = max(domain_length - (tld_length + sld_length + 2),0)


    # Check if domain name includes "www" or an IP address
    includes_www = int(domain_name.startswith("www."))
    includes_ip = int(bool(re.match(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", domain_name)))

     # Calculate length of longest consonant string
    consonant_string = re.sub("[aeiouyAEIOUY\W\d]+", " ", domain_name)
    consonant_lengths = [len(word) for word in consonant_string.split()]
    longest_consonant = max(consonant_lengths, default=0)
    shortest_consonant =  min(consonant_lengths) if len(consonant_lengths) > 1 else 0
    diff_consonant = longest_consonant - shortest_consonant

    # Calculate length of longest vowel sequence
    vowel_string = re.sub("[^aeiouyAEIOUY]+", " ", domain_name)
    vowel_lengths = [len(word) for word in vowel_string.split()]
    longest_vowel = max(vowel_lengths, default=0)
    shortest_vowel =  min(vowel_lengths) if len(vowel_lengths) > 1 else 0
    diff_vowel = longest_vowel - shortest_vowel

    # Calculate length of longest character string
    char_lengths = [len(word) for word in re.findall(r"[a-zA-Z]+", domain_name)]
    longest_char = max(char_lengths, default=0)
    shortest_char =  min(char_lengths) if len(char_lengths) > 1 else 0
    diff_char = longest_char - shortest_char

    # Calculate length of longest string of numbers
    num_lengths = [len(word) for word in re.findall(r"\d+", domain_name)]
    longest_num = max(num_lengths, default=0)
    shortest_num = min(num_lengths) if len(num_lengths) > 1 else 0
    diff_num = longest_num - shortest_num

     # Calculate average length of consonant strings
    avg_consonant = sum(consonant_lengths) / len(consonant_lengths) if len(consonant_lengths) > 0 else 0

    # Calculate average length of vowel strings
    avg_vowel = sum(vowel_lengths) / len(vowel_lengths) if len(vowel_lengths) > 0 else 0

    # Calculate average length of character strings
    avg_char = sum(char_lengths) / len(char_lengths) if len(char_lengths) > 0 else 0

    # Calculate average length of alphanumeric strings
    avg_alnum = sum(num_lengths) / len(num_lengths) if len(num_lengths) > 0 else 0

    # Calculate the number of different consonants in the domain name
    consonants = set(re.sub("[aeiouyAEIOUY\W\d]+", "", domain_name))
    num_consonants = len(consonants)
    pct_consonants = round(num_consonants / 20 , 3)

    num_cons = len(re.findall(r"[bcdfghjklmnpqrstvwxz]", domain_name, re.IGNORECASE))
    ratio_consonants = round(num_cons / domain_length , 3)

    # Calculate the number of different vowels in the domain name
    vowels = set(re.sub("[^aeiouyAEIOUY]+", "", domain_name))
    num_vowels = len(vowels)
    pct_vowels = round(num_vowels / 6 , 3)

    num_vow = len(re.findall(r"[aeiouy]", domain_name, re.IGNORECASE))
    ratio_vowels = round(num_vow / domain_length , 3)

    # Calculate the number of different letters in the domain name
    num_letters = num_consonants + num_vowels
    pct_letters = round(num_letters / 26 , 3)

    num_alpha = len(re.findall(r"[a-z]", domain_name, re.IGNORECASE))
    ratio_letters = round(num_alpha / domain_length , 3)

    # Calculate the number of different numbers in the domain name
    numbers = set(re.findall(r"\d+", domain_name))
    num_numbers = len(numbers)
    pct_numbers = round(num_numbers / 10 , 3)

    num_num = len(re.findall(r"\d", domain_name))
    ratio_numbers = round(num_num / domain_length , 3)

    # Calculate the number of different special characters in the domain name
    #punctuation marks, symbols, whitespace, and underscores
    special_chars = set(re.findall(r"[\W_]+", domain_name))
    num_special_chars = len(special_chars)
    #There are 32 special characters in total !, @, #, $, %, ^, &, *, (, ), -, _, +, =, {, }, [, ], |, \, ;, :, ", ', ,, ., <, >, ?, /, ~, " ".
    pct_special_chars = round(num_special_chars / 32 , 3)

    num_sc = len(re.findall(r"[\W_]", domain_name))
    ratio_special_chars = round(num_sc / domain_length , 3)

    return  [tld, sld, domain_length ,tld_length, sld_length ,other_level,domain_level,includes_www ,includes_ip ,longest_consonant , longest_vowel, longest_char, longest_num , avg_consonant, avg_vowel, avg_char, avg_alnum , diff_consonant, diff_vowel, diff_char,diff_num , num_consonants , num_vowels, num_letters, num_numbers, num_special_chars,pct_consonants,pct_vowels,pct_letters,pct_numbers,pct_special_chars,ratio_consonants,ratio_vowels,ratio_letters,ratio_numbers,ratio_special_chars]

In [None]:
data_row = extract_domain_features("ojhpiwdmmyxneo88.com")
print(data_row)


['ojhpiwdmmyxneo88', 'com', 20, 16, 3, 0, 2, 0, 0, 4, 2, 14, 2, 2.2, 1.2, 8.5, 2.0, 3, 1, 11, 2, 9, 4, 13, 1, 1, 0.45, 0.667, 0.5, 0.1, 0.031, 0.55, 0.3, 0.85, 0.1, 0.05]


In [None]:
# Read each txt file into a dataframe and append it to a list
dfs = []

for path in txt_files:
  # Open the text file
 with open(path , 'r') as file:
    # Determine the label for this file
    label = 0 if os.path.basename(path) == 'legit-1000000.txt' else 1
    # Read the contents of the file and split into individual lines
    lines = file.read().split('\n')

    # Split each domain name into TLD and SLD
    data = []
    for line in lines:
        data_row = extract_domain_features(line)
        data.append(data_row)

    # Convert the list of TLD/SLD pairs into a Pandas dataframe
    df = pd.DataFrame(data, columns=['domTLD', 'domSLD', 'Ldom', 'LTLD', 'LSLD' ,"LOLD","Ddom" , "HwP" , "HIP" , "LCc", "LCv", "LCl" , "LCn", "ACc","ACv","ACl","ACn","DCc","DCv","DCl","DCn","NCc","NCv","NCl","NCn","NCs","RAc","RAv","RAl","RAn","RAs","Rc","Rv","Rl","Rn","Rs"])

        # Add your additional features here

        # Add the label column
    df['label'] = label

        # Append the dataframe to the list
    dfs.append(df)

# Concatenate all the dataframes into a single one
result = pd.concat(dfs, ignore_index=True)


result.shape
print(result)


                domTLD domSLD  Ldom  LTLD  LSLD  LOLD  Ddom  HwP  HIP  LCc  \
0               google    com  10.0   6.0   3.0   0.0   2.0  0.0  0.0  2.0   
1              youtube    com  11.0   7.0   3.0   0.0   2.0  0.0  0.0  1.0   
2             facebook    com  12.0   8.0   3.0   0.0   2.0  0.0  0.0  1.0   
3                baidu    com   9.0   5.0   3.0   0.0   2.0  0.0  0.0  1.0   
4            wikipedia    org  13.0   9.0   3.0   0.0   2.0  0.0  0.0  2.0   
...                ...    ...   ...   ...   ...   ...   ...  ...  ...  ...   
4300071   bringvarious    net  16.0  12.0   3.0   0.0   2.0  0.0  0.0  3.0   
4300072   escapesettle    net  16.0  12.0   3.0   0.0   2.0  0.0  0.0  3.0   
4300073   beautywonder    net  16.0  12.0   3.0   0.0   2.0  0.0  0.0  2.0   
4300074  actionwhether    net  17.0  13.0   3.0   0.0   2.0  0.0  0.0  3.0   
4300075              0   None   NaN   NaN   NaN   NaN   NaN  NaN  NaN  NaN   

         ...    RAv    RAl  RAn    RAs     Rc     Rv     Rl   R

# Clean the data

In [None]:
print(result.isnull().values.any())
print(result.isnull().sum())

# Drop rows with null values
result = result.dropna()
print(result.isnull().values.any())
print(result.isnull().sum())

True
domTLD     0
domSLD    76
Ldom      76
LTLD      76
LSLD      76
LOLD      76
Ddom      76
HwP       76
HIP       76
LCc       76
LCv       76
LCl       76
LCn       76
ACc       76
ACv       76
ACl       76
ACn       76
DCc       76
DCv       76
DCl       76
DCn       76
NCc       76
NCv       76
NCl       76
NCn       76
NCs       76
RAc       76
RAv       76
RAl       76
RAn       76
RAs       76
Rc        76
Rv        76
Rl        76
Rn        76
Rs        76
label      0
dtype: int64
False
domTLD    0
domSLD    0
Ldom      0
LTLD      0
LSLD      0
LOLD      0
Ddom      0
HwP       0
HIP       0
LCc       0
LCv       0
LCl       0
LCn       0
ACc       0
ACv       0
ACl       0
ACn       0
DCc       0
DCv       0
DCl       0
DCn       0
NCc       0
NCv       0
NCl       0
NCn       0
NCs       0
RAc       0
RAv       0
RAl       0
RAn       0
RAs       0
Rc        0
Rv        0
Rl        0
Rn        0
Rs        0
label     0
dtype: int64


In [None]:
#Verify if there is any duplicates
print(result.duplicated().any())
result = result.drop_duplicates()

True


In [None]:
result.shape

(4117751, 37)

In [None]:
print(result.duplicated().any())

False


In [None]:
# Shuffle the rows in the dataframe
result = result.sample(frac=1).reset_index(drop=True)

In [None]:
import gc
gc.collect()

0

### The feature HIP is always = 0 se maybe we don't need it !!!


In [None]:
print(result["HwP"].unique())
print(result["HIP"].unique())
print(result[result["HwP"]==1])

[0. 1.]
[0.]
                        domTLD    domSLD  Ldom  LTLD  LSLD  LOLD  Ddom  HwP  \
26724    suggestminimum-course       xyz  29.0  21.0   3.0   3.0   3.0  1.0   
35066         sendseveralquote       net  24.0  16.0   3.0   3.0   3.0  1.0   
56163                      gov        ky  10.0   3.0   2.0   3.0   3.0  1.0   
69505       pullformalexercise       com  26.0  18.0   3.0   3.0   3.0  1.0   
76528                      www   hotmail  11.0   3.0   7.0   0.0   2.0  1.0   
...                        ...       ...   ...   ...   ...   ...   ...  ...   
4061895         become-rarered        in  21.0  14.0   2.0   3.0   3.0  1.0   
4064946                    gob        ve  10.0   3.0   2.0   3.0   3.0  1.0   
4067277   heardecentreflection        nl  27.0  20.0   2.0   3.0   3.0  1.0   
4072279    wantdesignerprogram  pictures  32.0  19.0   8.0   3.0   3.0  1.0   
4113739                   gouv        bj  11.0   4.0   2.0   3.0   3.0  1.0   

         HIP  LCc  ...    RAv    RAl  

### Mix the save the original data




In [None]:
# Save the resulting dataframe to a CSV file
result.to_csv('/content/drive/MyDrive/dga_dataset/dataset.csv', index=False)

In [None]:
#data= result.copy()

# Separate the features and labels
#X = result.drop('label', axis=1)
y = result['label']

# Instantiate a random undersampler
rus = RandomUnderSampler(random_state=42)

# Undersample the majority class
X_resam, y_resam = rus.fit_resample(result.drop('label', axis=1), result['label'])

# Combine the features and labels back into a dataframe
data_resampled = pd.concat([X_resam, y_resam], axis=1)

print(data_resampled,data_resampled.shape)

                           domTLD domSLD  Ldom  LTLD  LSLD  LOLD  Ddom  HwP  \
0                         tavriav     ua  10.0   7.0   2.0   0.0   2.0  0.0   
1        fabulouseyebrowthreading    com  28.0  24.0   3.0   0.0   2.0  0.0   
2                        koutakia     gr  11.0   8.0   2.0   0.0   2.0  0.0   
3                 megatvonlinevip    com  19.0  15.0   3.0   0.0   2.0  0.0   
4                           roars     it   8.0   5.0   2.0   0.0   2.0  0.0   
...                           ...    ...   ...   ...   ...   ...   ...  ...   
1887161                iqsyrteiyy    com  14.0  10.0   3.0   0.0   2.0  0.0   
1887162              acfhjlakhnsm  bazar  18.0  12.0   5.0   0.0   2.0  0.0   
1887163                    cenada    biz  10.0   6.0   3.0   0.0   2.0  0.0   
1887164        typesnamesthiswere    com  22.0  18.0   3.0   0.0   2.0  0.0   
1887165                61d94e4f9f    net  14.0  10.0   3.0   0.0   2.0  0.0   

         HIP  LCc  ...    RAv    RAl  RAn    RAs   

In [None]:
# Count the number of instances in each category after undersampling
counts = data_resampled['label'].value_counts()
print(counts)

0    943583
1    943583
Name: label, dtype: int64


### Mix the save the balanced original data


In [None]:
# Save the resulting dataframe to a CSV file
data_resampled.to_csv('/content/drive/MyDrive/dga_dataset/dataset_balanced.csv', index=False)



---



In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
df=result.copy()
# Encode categorical columns
le = LabelEncoder()
df['domTLD'] = le.fit_transform(df['domTLD'])
df['domSLD'] = le.fit_transform(df['domSLD'])
print(df)

          domTLD  domSLD  Ldom  LTLD  LSLD  LOLD  Ddom  HwP  HIP  LCc  ...  \
0        1978198     687  11.0   7.0   3.0   0.0   2.0  0.0  0.0  3.0  ...   
1        1427608     153  28.0  24.0   3.0   0.0   2.0  0.0  0.0  2.0  ...   
2         158002     775  36.0  32.0   3.0   0.0   2.0  0.0  0.0  2.0  ...   
3        1636507     153  12.0   8.0   3.0   0.0   2.0  0.0  0.0  2.0  ...   
4        1720797     506  18.0  14.0   3.0   0.0   2.0  0.0  0.0  3.0  ...   
...          ...     ...   ...   ...   ...   ...   ...  ...  ...  ...  ...   
4117746   377842     153  20.0  16.0   3.0   0.0   2.0  0.0  0.0  5.0  ...   
4117747  1796452     223  14.0  11.0   2.0   0.0   2.0  0.0  0.0  1.0  ...   
4117748   443023      71  41.0  37.0   3.0   0.0   2.0  0.0  0.0  4.0  ...   
4117749   716128     478  12.0   8.0   3.0   0.0   2.0  0.0  0.0  5.0  ...   
4117750  3256464     645  20.0  17.0   2.0   0.0   2.0  0.0  0.0  6.0  ...   

           RAv    RAl  RAn    RAs     Rc     Rv     Rl     Rn  

In [None]:
df.shape

(4117751, 37)

In [None]:
# Count the number of instances in each category
counts = df['label'].value_counts()
print(counts)

1    3174168
0     943583
Name: label, dtype: int64


In [None]:
dff= df.copy()

# Separate the features and labels
X = dff.drop('label', axis=1)
y = dff['label']

# Instantiate a random undersampler
rus = RandomUnderSampler(random_state=42)

# Undersample the majority class
X_resampled, y_resampled = rus.fit_resample(X, y)

# Combine the features and labels back into a dataframe
df_resampled = pd.concat([X_resampled, y_resampled], axis=1)

print(df_resampled,df_resampled.shape)

          domTLD  domSLD  Ldom  LTLD  LSLD  LOLD  Ddom  HwP  HIP  LCc  ...  \
0        3082872     357  13.0  10.0   2.0   0.0   2.0  0.0  0.0  2.0  ...   
1         308238     153  14.0  10.0   3.0   0.0   2.0  0.0  0.0  2.0  ...   
2        2696104     153  15.0  11.0   3.0   0.0   2.0  0.0  0.0  2.0  ...   
3        1068629     153  19.0  15.0   3.0   0.0   2.0  0.0  0.0  4.0  ...   
4         427867     578  13.0  10.0   2.0   0.0   2.0  0.0  0.0  2.0  ...   
...          ...     ...   ...   ...   ...   ...   ...  ...  ...  ...  ...   
1887161  2906936     153  14.0  10.0   3.0   0.0   2.0  0.0  0.0  5.0  ...   
1887162  2584431     153  28.0  24.0   3.0   0.0   2.0  0.0  0.0  4.0  ...   
1887163  2145418     579  20.0  17.0   2.0   0.0   2.0  0.0  0.0  2.0  ...   
1887164  3550279     153  30.0  26.0   3.0   0.0   2.0  0.0  0.0  3.0  ...   
1887165  3396447     153  24.0  20.0   3.0   0.0   2.0  0.0  0.0  6.0  ...   

           RAv    RAl  RAn    RAs     Rc     Rv     Rl   Rn    

In [None]:
# Count the number of instances in each category after undersampling
counts = df_resampled['label'].value_counts()
print(counts)

0    943583
1    943583
Name: label, dtype: int64


### Mix the save the original data after encoding the first two clomuns

In [None]:
# Save the resulting dataframe to a CSV file
df.to_csv('/content/drive/MyDrive/dga_dataset/dataset_num.csv', index=False)

### Mix the save the original data after encoding the first two clomuns and balancing the dataset ( legit domains = dga domains)

In [None]:
# Save the resulting dataframe to a CSV file
df_resampled.to_csv('/content/drive/MyDrive/dga_dataset/dataset_num_balanced.csv', index=False)