In [3]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# ---------------------------------------------------------------------------
# Split one CSV dataset into train / validation / test CSV files.
#
# The first split holds back (1 - train_ratio) of the rows; the second split
# divides that held-back part between validation and test. Both splits shuffle
# the rows and share the same random seed.
# ---------------------------------------------------------------------------

# --- Configuration ---------------------------------------------------------
input_file = './TON_IoT/TON_IoT_Train_Test_Network.csv'
output_folder = './TON_IoT/'
random_seed = 42

# Fraction of all rows kept for training.
train_ratio = 0.7
# Fraction of the held-back rows that goes to validation; the expression
# evaluates to 0.5, so validation and test each get half of the remainder.
val_test_ratio = 0.15 / (0.15 + 0.15)

# Destination paths for the three splits, all inside output_folder.
train_file, val_file, test_file = (
    os.path.join(output_folder, csv_name)
    for csv_name in ('train.csv', 'val.csv', 'test.csv')
)

# --- Preconditions ---------------------------------------------------------
# Create the output directory up front (no error if it already exists).
os.makedirs(output_folder, exist_ok=True)

# Stop with an explicit message when the input CSV is missing.
input_is_present = os.path.exists(input_file)
if not input_is_present:
    raise FileNotFoundError(f"Input file not found: {input_file}")

# --- Load ------------------------------------------------------------------
data = pd.read_csv(input_file)
print(f"Dataset loaded with {data.shape[0]} rows and {data.shape[1]} columns.")

# --- Split -----------------------------------------------------------------
# Keyword arguments common to both splitting steps.
split_options = {"random_state": random_seed, "shuffle": True}

# Step 1: training rows versus everything else.
train_data, remaining_data = train_test_split(
    data,
    test_size=1-train_ratio,
    **split_options,
)

# Step 2: the held-back rows become the validation and test sets.
val_data, test_data = train_test_split(
    remaining_data,
    test_size=1-val_test_ratio,
    **split_options,
)

# --- Report ----------------------------------------------------------------
print(f"Training set: {train_data.shape[0]} rows")
print(f"Validation set: {val_data.shape[0]} rows")
print(f"Test set: {test_data.shape[0]} rows")

# --- Save ------------------------------------------------------------------
# Write each split to its CSV file; index=False leaves the row index out.
for _split_frame, _split_path in zip(
    (train_data, val_data, test_data),
    (train_file, val_file, test_file),
):
    _split_frame.to_csv(_split_path, index=False)

print(f"Files saved to {output_folder}")


Dataset loaded with 461043 rows and 45 columns.
Training set: 322730 rows
Validation set: 69156 rows
Test set: 69157 rows
Files saved to ./TON_IoT/
