In [1]:
import pandas as pd

# File path
input_file = '/home/charlie/Desktop/Original_Data/Master_FIles/Master_Dataset.csv'
chunk_size = 10000
label_column = 'detailed-label'  # Column whose values are tallied

# Variables to hold results
unique_labels = set()  # Every distinct label value seen so far
label_counts = {}  # Maps label -> number of rows carrying that label


def _clean_column_name(name):
    """Return a header name without surrounding single quotes, then without surrounding whitespace."""
    return name.strip("'").strip()


def tally_chunk_labels(chunk, unique_labels, label_counts, label_column='detailed-label'):
    """Fold the labels of one chunk into the running results.

    Parameters
    ----------
    chunk : pandas.DataFrame
        One chunk of the dataset. Its column names are normalised in place
        (surrounding single quotes and whitespace are removed).
    unique_labels : set
        Updated in place with every distinct value found in `label_column`.
    label_counts : dict
        Updated in place; each label's row count in this chunk is added to
        the total already stored for it.
    label_column : str
        Name of the label column after normalisation.

    Raises
    ------
    KeyError
        If the chunk has no `label_column` after normalisation.

    Notes
    -----
    `value_counts()` skips missing values by default whereas `unique()`
    reports them, so a NaN can appear in `unique_labels` without a matching
    entry in `label_counts`.
    """
    chunk.columns = chunk.columns.map(_clean_column_name)
    labels = chunk[label_column]

    unique_labels.update(labels.unique())
    for label, count in labels.value_counts().items():
        label_counts[label] = label_counts.get(label, 0) + count


try:
    # Read file in chunks. Only the label column is used, so every other
    # column is skipped at parse time; the filter compares normalised names
    # because the raw header may carry quotes or padding.
    df_chunks = pd.read_csv(
        input_file,
        chunksize=chunk_size,
        usecols=lambda name: _clean_column_name(name) == label_column,
    )
    for i, chunk in enumerate(df_chunks):
        print(f"Processing Chunk: {i + 1}")
        tally_chunk_labels(chunk, unique_labels, label_counts, label_column)

except Exception as e:
    # Best-effort: report the failure and fall through, so the results
    # printed below may cover only the chunks processed before the error.
    print(f"CRITICAL ERROR: {e}")

print("CELL COMPLETE")

# Print the results
print(f"Detailed Labels: {unique_labels}")
print(f"Detailed Label Count: {label_counts}")


Processing Chunk: 1
Processing Chunk: 2
Processing Chunk: 3
Processing Chunk: 4
Processing Chunk: 5
Processing Chunk: 6
Processing Chunk: 7
Processing Chunk: 8
Processing Chunk: 9
Processing Chunk: 10
Processing Chunk: 11
Processing Chunk: 12
Processing Chunk: 13
Processing Chunk: 14
Processing Chunk: 15
Processing Chunk: 16
Processing Chunk: 17
Processing Chunk: 18
Processing Chunk: 19
Processing Chunk: 20
Processing Chunk: 21
Processing Chunk: 22
Processing Chunk: 23
Processing Chunk: 24
Processing Chunk: 25
Processing Chunk: 26
Processing Chunk: 27
Processing Chunk: 28
Processing Chunk: 29
Processing Chunk: 30
Processing Chunk: 31
Processing Chunk: 32
Processing Chunk: 33
Processing Chunk: 34
Processing Chunk: 35
Processing Chunk: 36
Processing Chunk: 37
Processing Chunk: 38
Processing Chunk: 39
Processing Chunk: 40
Processing Chunk: 41
Processing Chunk: 42
Processing Chunk: 43
Processing Chunk: 44
Processing Chunk: 45
Processing Chunk: 46
Processing Chunk: 47
Processing Chunk: 48
P

In [2]:
# Row totals per malicious label.
# NOTE(review): presumably transcribed by hand from an earlier counting pass —
# verify against the dataset before relying on these numbers.
Malicious_labels = {
    "C&C": 21995,
    "PartOfAHorizontalPortScan": 213852924,
    "DDoS": 19538713,
    "Okiru": 60990708,
    "C&C-HeartBeat": 33673,
    "PartOfAHorizontalPortScan-Attack": 5,
    "Attack": 9398,
    "C&C-HeartBeat-FileDownload": 11,
    "C&C-HeartBeat-Attack": 834,
    "C&C-PartOfAHorizontalPortScan": 888,
    "C&C-FileDownload": 53,
    "FileDownload": 18,
    "C&C-Mirai": 2,
    "C&C-Torii": 30,
    "Okiru-Attack": 3,
}

# Report, in insertion order, only the labels with more than 800 rows
for label, count in Malicious_labels.items():
    if count <= 800:
        continue  # too few rows; skip
    print(f"{label}: {count}")


C&C: 21995
PartOfAHorizontalPortScan: 213852924
DDoS: 19538713
Okiru: 60990708
C&C-HeartBeat: 33673
Attack: 9398
C&C-HeartBeat-Attack: 834
C&C-PartOfAHorizontalPortScan: 888


In [7]:
import pandas as pd
import os

# Input and output paths
input_file = '/home/charlie/Desktop/Original_Data/Master_FIles/Master_Dataset.csv'
output_dir = '/home/charlie/Desktop/Original_Data/Detailed_Label_Subset/'  # Adjust to your desired path
Detailed_Label = ["C&C", "PartOfAHorizontalPortScan", "DDoS", "Okiru",
                  "C&C-HeartBeat", "Attack", "C&C-HeartBeat-Attack",
                  "C&C-PartOfAHorizontalPortScan"]

chunk_size = 10000
# Maximum number of chunk-writes per label. Each write stores every matching
# row of one chunk, so this caps writes, not rows.
# NOTE(review): if the goal is a fixed number of ROWS per label, truncate
# filtered_chunk to the remaining quota instead — confirm intent.
write_limit = 800
# Writes performed so far, tracked separately for each label so that a
# frequent label cannot exhaust the quota of the others.
write_counts = {label: 0 for label in Detailed_Label}
write_count = 0  # Total number of writes performed across all labels

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

df_chunks = None
try:
    # Read file in chunks
    df_chunks = pd.read_csv(input_file, chunksize=chunk_size)
    for i, chunk in enumerate(df_chunks):
        print(f"Processing Chunk: {i + 1}")

        # Strip unwanted characters from column names
        chunk.columns = chunk.columns.str.strip("'").str.strip()

        # Check if 'detailed-label' exists in the chunk
        if 'detailed-label' not in chunk.columns:
            print(f"CRITICAL ERROR: 'detailed-label' column not found in chunk {i + 1}.")
            continue

        # Process each label
        for label in Detailed_Label:
            if write_counts[label] >= write_limit:
                continue  # This label's quota is already used up

            filtered_chunk = chunk[chunk['detailed-label'] == label]
            if filtered_chunk.empty:
                continue

            # Define output file path
            output_file = os.path.join(output_dir, f"{label}.csv")

            # The first write of this run (re)creates the file with a header,
            # so re-running the cell does not append duplicate rows to the
            # output of an earlier run; later writes append without a header.
            first_write = write_counts[label] == 0
            filtered_chunk.to_csv(
                output_file,
                mode='w' if first_write else 'a',
                header=first_write,
                index=False,
            )
            write_counts[label] += 1
            write_count += 1

            # Announce the limit once, at the moment it is reached
            if write_counts[label] == write_limit:
                print(f"Write limit of {write_limit} reached for label '{label}', moving onto next label.")

        # Nothing further can be written once every label is full, so stop
        # reading instead of scanning the remainder of the file.
        if all(done >= write_limit for done in write_counts.values()):
            print("Write limit reached for every label, stopping early.")
            break

except Exception as e:
    print(f"CRITICAL ERROR: {e}")
finally:
    # The early exit above leaves the reader un-exhausted; release its file handle.
    if df_chunks is not None:
        df_chunks.close()

print("CELL COMPLETE")


Processing Chunk: 1
Processing Chunk: 2
Processing Chunk: 3
Processing Chunk: 4
Processing Chunk: 5
Processing Chunk: 6
Processing Chunk: 7
Processing Chunk: 8
Processing Chunk: 9
Processing Chunk: 10
Processing Chunk: 11
Processing Chunk: 12
Processing Chunk: 13
Processing Chunk: 14
Processing Chunk: 15
Processing Chunk: 16
Processing Chunk: 17
Processing Chunk: 18
Processing Chunk: 19
Processing Chunk: 20
Processing Chunk: 21
Processing Chunk: 22
Processing Chunk: 23
Processing Chunk: 24
Processing Chunk: 25
Processing Chunk: 26
Processing Chunk: 27
Processing Chunk: 28
Processing Chunk: 29
Processing Chunk: 30
Processing Chunk: 31
Processing Chunk: 32
Processing Chunk: 33
Processing Chunk: 34
Processing Chunk: 35
Processing Chunk: 36
Processing Chunk: 37
Processing Chunk: 38
Processing Chunk: 39
Processing Chunk: 40
Processing Chunk: 41
Processing Chunk: 42
Processing Chunk: 43
Processing Chunk: 44
Processing Chunk: 45
Processing Chunk: 46
Processing Chunk: 47
Processing Chunk: 48
P