<a href="https://colab.research.google.com/github/Np2525/BudgetWise-Model/blob/main/GuideWire_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import zipfile
import os

zip_path = "/content/Horizontal Scaling in Kubernetes Dataset Using Artificial Neural Networks for Load Forecasting.zip"  # Update with your file path
extract_path = "/content/extracted_data"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# List extracted files
os.listdir(extract_path)


['Horizontal Scaling in Kubernetes Dataset Using Artificial Neural Networks for Load Forecasting']

In [7]:
import pandas as pd
import numpy as np


In [8]:
# Define file paths
jmeter_path = "/content/extracted_data/Horizontal Scaling in Kubernetes Dataset Using Artificial Neural Networks for Load Forecasting/TestJMeterData.csv"
k8s_path = "/content/extracted_data/Horizontal Scaling in Kubernetes Dataset Using Artificial Neural Networks for Load Forecasting/TestK8sData.csv"
train_path = "/content/extracted_data/Horizontal Scaling in Kubernetes Dataset Using Artificial Neural Networks for Load Forecasting/TrainData.csv"

# Load datasets
df_jmeter = pd.read_csv(jmeter_path)
df_k8s = pd.read_csv(k8s_path)
df_train = pd.read_csv(train_path)

# Show dataset summaries
print("JMeter Data Sample:\n", df_jmeter.head(), "\n")
print("Kubernetes Data Sample:\n", df_k8s.head(), "\n")
print("Training Data Sample:\n", df_train.head(), "\n")


JMeter Data Sample:
                  timeStamp  elapsed         label  responseCode  \
0  24-07-2023 22:13:10.914      709  HTTP Request         200.0   
1  24-07-2023 22:13:12.579        9  HTTP Request         200.0   
2  24-07-2023 22:13:14.703        4  HTTP Request         200.0   
3  24-07-2023 22:13:15.887        4  HTTP Request         200.0   
4  24-07-2023 22:13:16.390        4  HTTP Request         200.0   

  responseMessage                   threadName dataType  success  bytes  \
0              OK  Open Model Thread Group 1-1     text     True    201   
1              OK  Open Model Thread Group 1-2     text     True    201   
2              OK  Open Model Thread Group 1-3     text     True    201   
3              OK  Open Model Thread Group 1-4     text     True    201   
4              OK  Open Model Thread Group 1-5     text     True    201   

   sentBytes  ...  ErrorCount   Hostname IdleTime  Connect  InitialPodsNumber  \
0        146  ...           0  lg-server    

In [9]:
# Convert JMeter timestamp
df_jmeter['timeStamp'] = pd.to_datetime(df_jmeter['timeStamp'], errors='coerce')

# Convert Kubernetes timestamp
df_k8s.rename(columns={'Timestamp': 'timeStamp'}, inplace=True)  # Rename for merging
df_k8s['timeStamp'] = pd.to_datetime(df_k8s['timeStamp'], errors='coerce')

# Sort data by timestamps
df_jmeter = df_jmeter.sort_values(by='timeStamp')
df_k8s = df_k8s.sort_values(by='timeStamp')


  df_jmeter['timeStamp'] = pd.to_datetime(df_jmeter['timeStamp'], errors='coerce')


In [10]:
# Round timestamps to the nearest second for better merging
df_jmeter['timeStamp'] = df_jmeter['timeStamp'].dt.round('S')
df_k8s['timeStamp'] = df_k8s['timeStamp'].dt.round('S')

# Merge datasets on timeStamp
df_merged = pd.merge_asof(df_jmeter, df_k8s, on="timeStamp", direction='nearest')

# Show merged data
print("Merged Dataset Sample:\n", df_merged.head())


Merged Dataset Sample:
             timeStamp  elapsed         label  responseCode responseMessage  \
0 2023-07-24 15:18:03      914  HTTP Request         200.0              OK   
1 2023-07-24 15:18:03      800  HTTP Request         200.0              OK   
2 2023-07-24 15:18:03       46  HTTP Request         200.0              OK   
3 2023-07-24 15:18:04      108  HTTP Request         200.0              OK   
4 2023-07-24 15:18:04        4  HTTP Request         200.0              OK   

                    threadName dataType  success  bytes  sentBytes  ...  \
0  Open Model Thread Group 1-1     text     True    201        146  ...   
1  Open Model Thread Group 1-2     text     True    201        146  ...   
2  Open Model Thread Group 1-3     text     True    201        146  ...   
3  Open Model Thread Group 1-4     text     True    201        146  ...   
4  Open Model Thread Group 1-5     text     True    201        146  ...   

   CPUThreshold_x       CPU  PackRecv  PodsNumber  Durat

  df_jmeter['timeStamp'] = df_jmeter['timeStamp'].dt.round('S')
  df_k8s['timeStamp'] = df_k8s['timeStamp'].dt.round('S')


In [15]:
df_merged = pd.merge_asof(
    df_jmeter.sort_values("timeStamp"),
    df_k8s.sort_values("timeStamp"),
    on="timeStamp",
    direction="nearest"  # Finds the closest matching timestamp
)


In [16]:
print("Merged Data Columns:\n", df_merged.columns)


Merged Data Columns:
 Index(['timeStamp', 'elapsed', 'label', 'responseCode', 'responseMessage',
       'threadName', 'dataType', 'success', 'bytes', 'sentBytes', 'grpThreads',
       'allThreads', 'URL', 'Latency', 'SampleCount', 'ErrorCount', 'Hostname',
       'IdleTime', 'Connect', 'InitialPodsNumber_x', 'StressUpRate_x',
       'Experiment_x', 'Type_x', 'Duration_x', 'CPUThreshold_x', 'CPU',
       'PackRecv', 'PodsNumber', 'Duration_y', 'InitialPodsNumber_y',
       'StressUpRate_y', 'Experiment_y', 'Type_y', 'CPUThreshold_y'],
      dtype='object')


In [19]:
df_jmeter.columns = df_jmeter.columns.str.strip()
df_k8s.columns = df_k8s.columns.str.strip()


In [20]:
df_merged = pd.merge_asof(
    df_jmeter.sort_values("timeStamp"),
    df_k8s.sort_values("timeStamp"),
    on="timeStamp",
    direction="nearest"
)


In [21]:
print("Merged Data Columns:", df_merged.columns.tolist())


Merged Data Columns: ['timeStamp', 'elapsed', 'label', 'responseCode', 'responseMessage', 'threadName', 'dataType', 'success', 'bytes', 'sentBytes', 'grpThreads', 'allThreads', 'URL', 'Latency', 'SampleCount', 'ErrorCount', 'Hostname', 'IdleTime', 'Connect', 'InitialPodsNumber_x', 'StressUpRate_x', 'Experiment_x', 'Type_x', 'Duration_x', 'CPUThreshold_x', 'CPU', 'PackRecv', 'PodsNumber', 'Duration_y', 'InitialPodsNumber_y', 'StressUpRate_y', 'Experiment_y', 'Type_y', 'CPUThreshold_y']


In [26]:
# Drop the duplicate columns (_y)
df_merged.drop(columns=["StressUpRate_y", "CPUThreshold_y", "InitialPodsNumber_y", "Experiment_y", "Type_y", "Duration_y"], inplace=True, errors="ignore")

# Rename the _x columns to remove the suffix
df_merged.rename(columns={
    "StressUpRate_x": "StressUpRate",
    "CPUThreshold_x": "CPUThreshold",
    "InitialPodsNumber_x": "InitialPodsNumber",
    "Experiment_x": "Experiment",
    "Type_x": "Type",
    "Duration_x": "Duration"
}, inplace=True)

# Select relevant columns for final dataset
df_final = df_merged[[
    "timeStamp", "elapsed", "responseCode", "success", "bytes",
    "CPU", "PackRecv", "PodsNumber", "StressUpRate", "CPUThreshold"
]]

# Save the cleaned dataset
df_final.to_csv("/content/processed_k8s_dataset.csv", index=False)
print("Processed dataset saved successfully!")


Processed dataset saved successfully!


In [27]:
print(df_final.head())


            timeStamp  elapsed  responseCode  success  bytes       CPU  \
0 2023-07-24 15:18:03      914         200.0     True    201  0.532484   
1 2023-07-24 15:18:03      800         200.0     True    201  0.532484   
2 2023-07-24 15:18:03       46         200.0     True    201  0.532484   
3 2023-07-24 15:18:04      108         200.0     True    201  0.532484   
4 2023-07-24 15:18:04        4         200.0     True    201  0.532484   

   PackRecv  PodsNumber  StressUpRate  CPUThreshold  
0  0.512426           1             5             0  
1  0.512426           1             5             0  
2  0.512426           1             5             0  
3  0.512426           1             5             0  
4  0.512426           1             5             0  


In [31]:
import pandas as pd

# Load the datasets
benign_df = pd.read_csv("elastic_may2021_benign_data.csv")
malicious_df = pd.read_csv("elastic_may2021_malicious_data.csv")
data_df = pd.read_csv("elastic_may2022_data.csv")

# Print column names to inspect
print("Benign Data Columns:", benign_df.columns)
print("Malicious Data Columns:", malicious_df.columns)
print("Data Columns:", data_df.columns)


Benign Data Columns: Index(['_source_flow_id', '_source_flow_final', '_source_source_ip',
       '_source_destination_ip', '_source_network_bytes',
       '_source_network_transport', '_source_@timestamp',
       '_source_event_duration', '_source_destination_port',
       '_source_source_port', 'label'],
      dtype='object')
Malicious Data Columns: Index(['_source_flow_id', '_source_flow_final', '_source_source_ip',
       '_source_destination_ip', '_source_network_bytes',
       '_source_network_transport', '_source_@timestamp',
       '_source_event_duration', '_source_destination_port',
       '_source_source_port', 'label'],
      dtype='object')
Data Columns: Index(['_source_flow_id;_source_flow_final;_source_source_ip;_source_destination_ip;_source_network_bytes;_source_network_transport;_source_@timestamp;_source_event_duration;_source_destination_port;_source_source_port;_source_network_packets;label'], dtype='object')


In [33]:
import pandas as pd

# Load the "Data" file
data_df = pd.read_csv("elastic_may2022_data.csv", delimiter=";")  # Specify ";" as separator

print("Fixed Data Columns:", data_df.columns)
print(data_df.head())


Fixed Data Columns: Index(['_source_flow_id', '_source_flow_final', '_source_source_ip',
       '_source_destination_ip', '_source_network_bytes',
       '_source_network_transport', '_source_@timestamp',
       '_source_event_duration', '_source_destination_port',
       '_source_source_port', '_source_network_packets', 'label'],
      dtype='object')
                           _source_flow_id  _source_flow_final  \
0  EAT/////AP//////CP8AAAEKAAIPwKj3BYUDAQg               False   
1  EAT/////AP//////CP8AAAHAqFSXwKj3Fk6XVCQ               False   
2  EAT/////AP//////CP8AAAHAqFSAwKj3BYYDAQg               False   
3  EAT/////AP//////CP8AAAGsEAIKrBACDCsZxMs               False   
4  EAT/////AP//////CP8AAAHAqFSXwKj3FmKXVCQ               False   

  _source_source_ip _source_destination_ip  _source_network_bytes  \
0     95.180.199.26        244.121.253.144              476424919   
1      95.180.199.6          95.180.91.120              158570149   
2      95.180.91.98          95.180.199.2

In [34]:
# Load Benign & Malicious datasets
benign_df = pd.read_csv("elastic_may2021_benign_data.csv")
malicious_df = pd.read_csv("elastic_may2021_malicious_data.csv")

# Concatenate all datasets
moss_merged_df = pd.concat([benign_df, malicious_df, data_df], ignore_index=True)

# Save the cleaned dataset
moss_merged_df.to_csv("merged_moss_data.csv", index=False)

print("✅ Merged AssureMOSS dataset saved successfully!")


✅ Merged AssureMOSS dataset saved successfully!


In [35]:
pip install pandas numpy faker


Collecting faker
  Downloading faker-37.0.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.0.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.0.0


In [36]:
import pandas as pd
import numpy as np
from faker import Faker
import random

# Initialize Faker for synthetic IPs
fake = Faker()

# Define simulation parameters
NUM_SAMPLES = 5000  # Adjust for larger/smaller datasets
FAILURE_TYPES = ["PodCrash", "CPUHigh", "MemoryLeak", "NetworkLatency", "DiskPressure"]
NODES = ["node-1", "node-2", "node-3", "node-4", "node-5"]

# Generate synthetic Kubernetes failure data
data = {
    "timestamp": pd.date_range(start="2025-01-01", periods=NUM_SAMPLES, freq="T"),
    "node": np.random.choice(NODES, NUM_SAMPLES),
    "pod_name": [f"pod-{i}" for i in range(NUM_SAMPLES)],
    "namespace": np.random.choice(["default", "kube-system", "production"], NUM_SAMPLES),
    "failure_type": np.random.choice(FAILURE_TYPES, NUM_SAMPLES, p=[0.2, 0.25, 0.25, 0.15, 0.15]),
    "cpu_usage": np.random.uniform(0, 100, NUM_SAMPLES),
    "memory_usage": np.random.uniform(0, 16, NUM_SAMPLES),  # GB
    "network_latency": np.random.uniform(0, 500, NUM_SAMPLES),  # ms
    "source_ip": [fake.ipv4() for _ in range(NUM_SAMPLES)],
    "destination_ip": [fake.ipv4() for _ in range(NUM_SAMPLES)],
    "status": np.random.choice(["Recovered", "Ongoing", "Critical"], NUM_SAMPLES, p=[0.6, 0.3, 0.1])
}

# Create DataFrame
simulated_df = pd.DataFrame(data)

# Save to CSV
simulated_df.to_csv("simulated_k8s_failures.csv", index=False)

print("✅ Simulated Kubernetes failure dataset saved as 'simulated_k8s_failures.csv'.")


  "timestamp": pd.date_range(start="2025-01-01", periods=NUM_SAMPLES, freq="T"),


✅ Simulated Kubernetes failure dataset saved as 'simulated_k8s_failures.csv'.


In [None]:
/content/simulated_k8s_failures.csv

In [38]:
import pandas as pd

# Define the file path
file_path = "/content/simulated_k8s_failures.csv"

# Load the CSV file
df_simulated = pd.read_csv(file_path)

# Display the first few rows
print(df_simulated.head())

# Display column names
print("\nColumns in the dataset:\n", df_simulated.columns)


             timestamp    node pod_name   namespace  failure_type  cpu_usage  \
0  2025-01-01 00:00:00  node-5    pod-0     default  DiskPressure  88.059845   
1  2025-01-01 00:01:00  node-1    pod-1  production       CPUHigh   0.072842   
2  2025-01-01 00:02:00  node-1    pod-2     default      PodCrash  17.443240   
3  2025-01-01 00:03:00  node-3    pod-3  production       CPUHigh  58.817619   
4  2025-01-01 00:04:00  node-4    pod-4  production    MemoryLeak  46.992904   

   memory_usage  network_latency      source_ip   destination_ip     status  
0     12.621836        20.926611  193.172.43.15  179.255.186.207  Recovered  
1     12.101612       146.511933   95.146.85.55     166.4.177.97  Recovered  
2      9.174196       334.092657  195.253.46.28    34.167.27.192  Recovered  
3     14.752103       169.867212  125.106.40.35   85.227.219.223    Ongoing  
4     11.958055        61.501088  138.5.130.148  155.143.192.224  Recovered  

Columns in the dataset:
 Index(['timestamp', 'node

In [40]:
import pandas as pd

# File paths (update paths if necessary)
assuremoss_path = "/content/merged_moss_data.csv"
k8s_path = "/content/processed_k8s_dataset.csv"
simulated_path = "/content/simulated_k8s_failures.csv"

# Load datasets
df_assuremoss = pd.read_csv(assuremoss_path)
df_k8s = pd.read_csv(k8s_path)
df_simulated = pd.read_csv(simulated_path)


In [43]:
df_assuremoss.rename(columns={"_source_@timestamp": "timeStamp"}, inplace=True)
df_simulated.rename(columns={"timestamp": "timeStamp"}, inplace=True)


In [44]:
df_final = df_k8s.merge(df_assuremoss, on="timeStamp", how="outer")
df_final = df_final.merge(df_simulated, on="timeStamp", how="outer")


In [45]:
final_path = "final_merged_dataset.csv"
df_final.to_csv(final_path, index=False)
print(f"✅ Final merged dataset saved successfully at: {final_path}")


✅ Final merged dataset saved successfully at: final_merged_dataset.csv


In [46]:
print(df_final.isnull().sum())


timeStamp                          0
elapsed                      4697496
responseCode                 4797108
success                      4697496
bytes                        4697496
CPU                          4697496
PackRecv                     4697496
PodsNumber                   4697496
StressUpRate                 4697496
CPUThreshold                 4697496
_source_flow_id               508960
_source_flow_final            508960
_source_source_ip             508960
_source_destination_ip        508960
_source_network_bytes         508960
_source_network_transport     514312
_source_event_duration        508960
_source_destination_port      515294
_source_source_port           515294
label                         508960
_source_network_packets      4240410
node                         5196456
pod_name                     5196456
namespace                    5196456
failure_type                 5196456
cpu_usage                    5196456
memory_usage                 5196456
n

In [47]:
print(df_final.dtypes)


timeStamp                     object
elapsed                      float64
responseCode                 float64
success                       object
bytes                        float64
CPU                          float64
PackRecv                     float64
PodsNumber                   float64
StressUpRate                 float64
CPUThreshold                 float64
_source_flow_id               object
_source_flow_final            object
_source_source_ip             object
_source_destination_ip        object
_source_network_bytes        float64
_source_network_transport     object
_source_event_duration       float64
_source_destination_port     float64
_source_source_port          float64
label                         object
_source_network_packets      float64
node                          object
pod_name                      object
namespace                     object
failure_type                  object
cpu_usage                    float64
memory_usage                 float64
n

In [49]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive
