[Malware Link](https://mcfp.felk.cvut.cz/publicDatasets/CTU-Malware-Capture-Botnet-44/)

Probable Name: Rbot

MD5: 2467b3c8b259cecd6ce2d5c31009df10

SHA1: 915934b43d63dc4040af3ea1ee6c80913288ff3b

SHA256: dcf50510efec16ff10c5aed91c8e386aba114e63842caa16ea40cac776c60816

Password of zip file: infected

Duration: 2 days, 18 hours, 49 minutes and 0 seconds

[VirusTotal](https://www.virustotal.com/gui/file/dcf50510efec16ff10c5aed91c8e386aba114e63842caa16ea40cac776c60816/detection)

HybridAnalysis

In [1]:
import pandas as pd

## Data Preparation

### Data Loading

In [3]:
# Location of the labelled NetFlow capture, relative to this notebook.
file_path = r"../../data/Rbot/capture20110812.pcap.netflow.labeled"

# The file is whitespace-delimited. Its first line is skipped and explicit
# column names are supplied instead (presumably that line is the file's own
# header row - confirm against the raw file).
df = pd.read_csv(
    file_path,
    # Raw string: "\s" inside a plain string literal is an invalid escape
    # sequence, which newer Python versions warn about at compile time.
    sep=r"\s+",
    skiprows=1,
    header=None,
    names=[
        "Date",
        "Flow Start",
        "Durat",
        "Prot",
        "Src_IP_Addr_Port",
        "Dir",
        "Dst_IP_Addr_Port",
        "Flags",
        "Tos",
        "Packets",
        "Bytes",
        "Flows",
        "Label",
    ],
)
df.head()

Unnamed: 0,Date,Flow Start,Durat,Prot,Src_IP_Addr_Port,Dir,Dst_IP_Addr_Port,Flags,Tos,Packets,Bytes,Flows,Label
0,2011-08-12,15:24:01.105,4.677,TCP,74.125.108.243:80,->,147.32.86.187:55707,PA_,0,452,682688,1,Background
1,2011-08-12,15:24:01.105,3.11,TCP,147.32.84.164:22,->,83.208.193.123:15173,PA_,0,5,506,1,LEGITIMATE
2,2011-08-12,15:24:01.105,4.678,TCP,147.32.86.187:55707,->,74.125.108.243:80,A_,0,106,6510,1,LEGITIMATE
3,2011-08-12,15:24:01.106,4.989,TCP,217.168.211.184:3953,->,147.32.84.14:22,PA_,0,1393,95812,1,Background
4,2011-08-12,15:24:01.113,4.988,UDP,173.9.132.155:54369,->,147.32.84.59:15046,INT,0,811,680898,1,Background


In [6]:
# Discard the columns that are not used in the rest of the notebook.
df = df.drop(columns=["Date", "Dir", "Tos", "Flow Start"])

### Data Balancing

In [7]:
# Count rows per traffic label to see how imbalanced the classes are.
df["Label"].value_counts()

Label
Background    14381899
LEGITIMATE      744270
Botnet           75891
Name: count, dtype: int64

In [8]:
from sklearn.utils import shuffle


def iteratively_balance_dataframe(df):
    """Down-sample over-represented ``Label`` classes toward the rarest class.

    On every pass each class larger than the rarest one loses roughly 20% of
    its surplus rows (chosen with a fixed seed), after which the whole frame
    is shuffled. Passes repeat until no class exceeds 110% of the rarest
    class's original size. The class distribution is printed at the start of
    every pass.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``"Label"`` column. Rows are removed by index label, so
        the index is assumed to be unique; with duplicate labels more rows
        than intended would be dropped.

    Returns
    -------
    pandas.DataFrame
        The down-sampled frame, in shuffled order and with the surviving
        original index labels. The input frame itself is not modified. A
        frame without any label values is returned unchanged.
    """
    initial_counts = df["Label"].value_counts()
    # Nothing to balance; min() over zero classes would raise.
    if initial_counts.empty:
        return df

    # Target size is fixed once, from the rarest class of the input.
    min_size = min(initial_counts)
    # A class is considered balanced once it is within 10% of the target.
    tolerance = min_size * 1.1

    while True:
        class_counts = df["Label"].value_counts()
        print("Current class distribution:")
        print(class_counts)

        if all(class_count <= tolerance for class_count in class_counts):
            break

        for label in class_counts.index:
            surplus = class_counts[label] - min_size
            if surplus <= 0:
                continue

            # Remove about 20% of the surplus on this pass.
            num_to_reduce = int(surplus * 0.2)
            # With a surplus below 5 the 20% step truncates to zero. If the
            # class is still outside the tolerance that would make the loop
            # spin forever, so always remove at least one row in that case.
            if num_to_reduce == 0 and class_counts[label] > tolerance:
                num_to_reduce = 1
            if num_to_reduce == 0:
                continue

            rows_to_drop = (
                df[df["Label"] == label]
                .sample(n=num_to_reduce, random_state=1)
                .index
            )
            df = df.drop(rows_to_drop)

        # Seeded shuffle of the whole frame before the next pass. Done with
        # pandas so the function needs nothing beyond the DataFrame itself.
        df = df.sample(frac=1, random_state=1)

    return df


# Down-sample the majority classes, then replace the shuffled index with a
# fresh 0..n-1 range.
df_balanced = iteratively_balance_dataframe(df).reset_index(drop=True)

Current class distribution:
Label
Background    14381899
LEGITIMATE      744270
Botnet           75891
Name: count, dtype: int64
Current class distribution:
Label
Background    11520698
LEGITIMATE      610595
Botnet           75891
Name: count, dtype: int64
Current class distribution:
Label
Background    9231737
LEGITIMATE     503655
Botnet          75891
Name: count, dtype: int64
Current class distribution:
Label
Background    7400568
LEGITIMATE     418103
Botnet          75891
Name: count, dtype: int64
Current class distribution:
Label
Background    5935633
LEGITIMATE     349661
Botnet          75891
Name: count, dtype: int64
Current class distribution:
Label
Background    4763685
LEGITIMATE     294907
Botnet          75891
Name: count, dtype: int64
Current class distribution:
Label
Background    3826127
LEGITIMATE     251104
Botnet          75891
Name: count, dtype: int64
Current class distribution:
Label
Background    3076080
LEGITIMATE     216062
Botnet          75891
Name: count,

### Column Naming and Separation

In [10]:
# Spell out the abbreviated column names.
df_balanced = df_balanced.rename(columns={"Durat": "Duration", "Prot": "Protocol"})

In [11]:
def split_ip_port(ip_port):
    """Split an ``"address:port"`` string into an ``(address, port)`` pair.

    Both elements are returned as strings.

    * Exactly one colon: the text before and after it.
    * No colon: the whole value as the address and ``"Unknown"`` as the port.
    * More than one colon: ``("Unknown", "Unknown")``, because the value
      cannot be split unambiguously.
    """
    pieces = ip_port.split(":")

    # More than one colon: format not understood.
    if len(pieces) > 2:
        return ("Unknown", "Unknown")

    # No colon at all: address only.
    if len(pieces) == 1:
        return pieces[0], "Unknown"

    address, port = pieces
    return address, port


# Source side: one (IP, port) pair per row, unzipped into two new columns.
df_balanced["Source IP"], df_balanced["Source Port"] = zip(
    *[split_ip_port(value) for value in df_balanced["Src_IP_Addr_Port"]]
)

# Destination side, handled the same way.
df_balanced["Destination IP"], df_balanced["Destination Port"] = zip(
    *[split_ip_port(value) for value in df_balanced["Dst_IP_Addr_Port"]]
)

# The combined "address:port" columns are redundant once they have been split.
df_balanced = df_balanced.drop(columns=["Src_IP_Addr_Port", "Dst_IP_Addr_Port"])

In [12]:
# Preview the first rows of the balanced frame.
df_balanced.head()

Unnamed: 0,Duration,Protocol,Flags,Packets,Bytes,Flows,Label,Source IP,Source Port,Destination IP,Destination Port
0,1.178,TCP,RA_,3,180,1,Botnet,208.100.41.178,22,147.32.84.165,2411
1,0.0,UDP,INT,1,81,1,Background,147.32.84.138,52506,147.32.80.9,53
2,0.0,UDP,INT,1,81,1,LEGITIMATE,147.32.84.170,42438,147.32.80.9,53
3,0.0,TCP,S_,1,62,1,Botnet,147.32.84.165,4508,209.14.30.71,22
4,0.0,UDP,INT,1,133,1,Background,147.32.80.9,53,147.32.84.138,44570


## Data Engineering

In [13]:
# Work on a copy so that changes made to engineering_df do not alter df_balanced.
engineering_df = df_balanced.copy()

In [14]:
# Durations recorded as exactly 0 are replaced by 0.001 so the per-second
# rates below are not divided by zero.
engineering_df["Duration"] = engineering_df["Duration"].replace(0.000, 0.001)

# Column-wise arithmetic instead of a Python-level call per row: the results
# are the same, but the whole column is computed in one vectorised operation.
packets = engineering_df["Packets"]
duration = engineering_df["Duration"]
total_bytes = engineering_df["Bytes"]

# Average payload size; rows without a positive packet count get 0.
engineering_df["Bytes per Packet"] = (total_bytes / packets).where(packets > 0, 0)

# Throughput features; rows whose duration is not positive get 0. Zero
# durations were already replaced above, so this guard only matters for
# negative or missing durations.
engineering_df["Packets per Second"] = (packets / duration).where(duration > 0, 0)
engineering_df["Bytes per Second"] = (total_bytes / duration).where(duration > 0, 0)

In [15]:
def clean_port(port):
    """Return ``port`` converted with ``int()``, or 0 if that raises ``ValueError``.

    Only ``ValueError`` is handled; any other exception raised by the
    conversion propagates to the caller.
    """
    try:
        numeric_port = int(port)
    except ValueError:
        # Not parseable as an integer (for example the "Unknown" placeholder).
        numeric_port = 0
    return numeric_port


# Convert both port columns to integers; values that cannot be parsed become 0.
engineering_df["Destination Port"] = engineering_df["Destination Port"].map(clean_port)
engineering_df["Source Port"] = engineering_df["Source Port"].map(clean_port)


def is_encrypted_protocol(port):
    """Return 1 if ``port`` is in the hard-coded list of ports this notebook
    treats as encrypted traffic, otherwise 0."""
    encrypted_ports = {22, 443, 465, 500, 587, 636, 989, 990, 992, 993, 995, 1194}
    if port in encrypted_ports:
        return 1
    return 0


def is_common_port(port):
    """Return 1 if ``port`` is in the hard-coded list of ports this notebook
    treats as commonly used, otherwise 0."""
    common_ports = {21, 22, 23, 25, 53, 80, 110, 143, 443, 3306, 3389, 5900}
    if port in common_ports:
        return 1
    return 0


# Make sure both port columns carry an integer dtype.
engineering_df["Source Port"] = engineering_df["Source Port"].astype(int)
engineering_df["Destination Port"] = engineering_df["Destination Port"].astype(int)

# Both flags are derived from the destination port only.
# 1 when the destination port is in the encrypted-port list, else 0.
engineering_df["Is Encrypted Traffic"] = engineering_df["Destination Port"].map(
    is_encrypted_protocol
)

# 1 when the destination port is in the common-port list, else 0.
engineering_df["Common Port Usage"] = engineering_df["Destination Port"].map(
    is_common_port
)

In [16]:
# Remove the raw address columns and the Flows column from the feature set.
engineering_df = engineering_df.drop(columns=["Destination IP", "Source IP", "Flows"])

In [17]:
# Relabel the classes: "LEGITIMATE" becomes "Normal" and "Botnet" becomes "Rbot";
# any other label value is left unchanged.
engineering_df["Label"] = engineering_df["Label"].replace({"LEGITIMATE": "Normal", "Botnet": "Rbot"})

In [20]:
# Snapshot the engineered frame under the name `df`. This rebinds `df`,
# which earlier in the notebook held the raw capture.
df = engineering_df.copy()


# Write the final dataset to CSV. The row index is written as well (pandas
# default), so the file starts with an unnamed index column.
# NOTE(review): pass index=False if downstream readers do not expect that
# column - confirm before changing.
df.to_csv(r"../../data/Rbot/Rbot.csv")