In [1]:
!pip install pandas scikit-learn keras numpy



In [2]:
import re
import ast
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from typing import List
from collections import deque
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# If using MSE or RMSE
from sklearn.metrics import mean_squared_error

In [3]:
# ============================
# 1) Reading and Parsing Data
# ============================
file_path = 'GenData.txt'

with open(file_path, 'r') as file:
    content = file.read()

burst_times_lists = []
gangs_lists = []
processes_lists = []

# Extract processes arrays
for idx, chunk in enumerate(content.split('processes =')):
    if idx:
        parsed_str = chunk.split(']\n')[0].strip()
        processes_lists.append(ast.literal_eval(f"{parsed_str}]"))

# Extract gangs arrays
for idx, chunk in enumerate(content.split('gangs =')):
    if idx:
        parsed_str = chunk.split('\n]')[0].strip()
        gangs_lists.append(ast.literal_eval(f"{parsed_str}]"))

# Extract burst_times arrays
for idx, chunk in enumerate(content.split('burst_times =')):
    if idx:
        parsed_str = chunk.split('\n]')[0].strip()
        burst_times_lists.append(ast.literal_eval(f"{parsed_str}]"))

# For demonstration, let's assume we only care about the FIRST sets 
# (if your file has multiple sets, pick the one you want)
processes = processes_lists[0] if processes_lists else []
gangs = gangs_lists[0] if gangs_lists else []
burst_times_all = burst_times_lists[0] if burst_times_lists else []

In [4]:
# ============================
# 2) Build a DataFrame of CPU vs. IO
# ============================
CPU_BURST = []
IO_BURST = []

def split_list_every_other(lst: List[int]):
    even_index_elements = lst[::2]  # CPU bursts at even indices
    odd_index_elements = lst[1::2]  # IO bursts at odd indices
    return even_index_elements, odd_index_elements

for burst_pattern in burst_times_all:
    # e.g. burst_pattern might be [5,2,6,8,7] ...
    if len(burst_pattern) % 2 != 0:
        # if there's an odd number of bursts, drop the last for CPU/IO alignment
        temp_cpu, temp_io = split_list_every_other(burst_pattern[:-1])
    else:
        temp_cpu, temp_io = split_list_every_other(burst_pattern)

    CPU_BURST.extend(temp_cpu)
    IO_BURST.extend(temp_io)

df = pd.DataFrame({
    'CPU_BURST': CPU_BURST,
    'IO_BURST': IO_BURST
})

print("Built DataFrame (CPU_BURST vs IO_BURST) head:")
print(df.head())


Built DataFrame (CPU_BURST vs IO_BURST) head:
   CPU_BURST  IO_BURST
0         40        18
1         26         5
2         45         4
3         35        14
4          7        41


# Window Training

In [5]:

# ============================
# 3) Sliding Window Approach
#    - Build 30-row features => predict next IO
# ============================
window_size = 30
window_features = []
window_targets = []

# For each i, take df.iloc[i : i+window_size+1], flatten, remove last
for i in range(len(df) - window_size):
    # e.g. shape: (31 rows, 2 columns) => flatten => 62 elements
    chunk = df.iloc[i : i + window_size + 1].values.flatten()
    # If your original code did .flatten()[:-1], replicate that
    # We'll do it here:
    feature = chunk[:-1]  # remove last element if that was your logic
    # The target: the 'IO_BURST' of the row i+window_size
    # i+window_size = the last row in that chunk
    next_io = df.iloc[i + window_size]['IO_BURST']
    
    window_features.append(feature)
    window_targets.append(next_io)

X_window = window_features
y_window = window_targets

# Let's do a train/test split
Xw_train, Xw_test, yw_train, yw_test = train_test_split(
    X_window, y_window, test_size=0.2, random_state=42
)

model_window = RandomForestRegressor(n_estimators=100, random_state=42)
model_window.fit(Xw_train, yw_train)

# y_pred_window = model_window.predict(Xw_test)
# rmse_window = mean_squared_error(yw_test, y_pred_window, squared=False)
# print(f"[Sliding Window] RandomForest RMSE: {rmse_window:.2f}")

import numpy as np
from sklearn.metrics import mean_squared_error

y_pred_window = model_window.predict(Xw_test)

mse_window = mean_squared_error(yw_test, y_pred_window)  
rmse_window = np.sqrt(mse_window)  # manually compute the root
print(f"[Sliding Window] RandomForest RMSE: {rmse_window:.2f}")


# Example: Predict next IO for the LAST 31 rows
if len(df) >= 31:
    new_chunk = df.iloc[-31:].values.flatten()
    new_feature = new_chunk[:-1].reshape(1, -1)
    predicted_io_window = model_window.predict(new_feature)
    print(f"Predicted next IO (sliding-window approach) = {predicted_io_window[0]:.2f}")


[Sliding Window] RandomForest RMSE: 17.33
Predicted next IO (sliding-window approach) = 14.71


In [6]:
# ============================
# 4) Single-CPU -> Next IO Approach
# ============================
X_single = df[['CPU_BURST']]  # single feature
y_single = df['IO_BURST']     # next IO

X1_train, X1_test, y1_train, y1_test = train_test_split(X_single, y_single,
    test_size=0.2, random_state=42)

model_single = RandomForestRegressor(n_estimators=100, random_state=42)
model_single.fit(X1_train, y1_train)

# y_pred_single = model_single.predict(X1_test)
# rmse_single = mean_squared_error(y1_test, y_pred_single, squared=False)
# print(f"[Single CPU->IO] RandomForest RMSE: {rmse_single:.2f}")

import numpy as np
from sklearn.metrics import mean_squared_error

y_pred_window = model_window.predict(Xw_test)

mse_window = mean_squared_error(yw_test, y_pred_window)  
rmse_window = np.sqrt(mse_window)  # manually compute the root
print(f"[Sliding Window] RandomForest RMSE: {rmse_window:.2f}")





# Predict next IO for a brand-new CPU burst of, say, 10
new_cpu_burst = np.array([[10.0]])
predicted_io_single = model_single.predict(new_cpu_burst)
print(f"For CPU=10, predicted next IO (single-cpu approach) = {predicted_io_single[0]:.2f}")

[Sliding Window] RandomForest RMSE: 17.33
For CPU=10, predicted next IO (single-cpu approach) = 24.80




# Simple one Cpu Burst to Predict the Next IO Burst

In [7]:
# ============================
# 5) Optional: Evaluate Multiple Models
# ============================
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
    'Elastic Net': ElasticNet(alpha=0.1, l1_ratio=0.5),
    'SVR(rbf)': SVR(kernel='rbf'),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

In [8]:
print("\n[Single-CPU->IO] Model Comparisons:")
for name, model in models.items():
    model.fit(X1_train, y1_train)
    preds = model.predict(X1_test)
    mse_val = mean_squared_error(y1_test, preds)
    rmse_val = np.sqrt(mse_val)
    print(f"  {name}: RMSE = {rmse_val:.2f}")

print("\n[Sliding Window] Model Comparisons:")
for name, model in models.items():
    model.fit(Xw_train, yw_train)
    preds = model.predict(Xw_test)
    mse_val = mean_squared_error(yw_test, preds)
    rmse_val = np.sqrt(mse_val)
    print(f"  {name}: RMSE = {rmse_val:.2f}")



[Single-CPU->IO] Model Comparisons:
  Linear Regression: RMSE = 21.58
  Ridge Regression: RMSE = 21.58
  Lasso Regression: RMSE = 21.58
  Elastic Net: RMSE = 21.58
  SVR(rbf): RMSE = 24.26
  DecisionTree: RMSE = 23.34
  RandomForest: RMSE = 22.81
  GradientBoosting: RMSE = 23.02

[Sliding Window] Model Comparisons:
  Linear Regression: RMSE = 19.02
  Ridge Regression: RMSE = 19.02
  Lasso Regression: RMSE = 18.98
  Elastic Net: RMSE = 19.00
  SVR(rbf): RMSE = 17.25
  DecisionTree: RMSE = 29.21
  RandomForest: RMSE = 17.33
  GradientBoosting: RMSE = 19.78


# Graphs

In [9]:
import matplotlib.pyplot as plt

# Create a box plot of the predictions
plt.figure(figsize=(10, 6))
plt.boxplot(y_pred)
plt.title('Box Plot of Predictions')
plt.ylabel('Predicted IO_BURST')
plt.savefig('box_plot.png')
plt.show()

# Plot 1: Scatter plot with regression line
plt.figure(figsize=(10, 6))
plt.scatter(range(len(y_test)), y_test, color='blue', label='Actual values')
plt.scatter(range(len(y_pred)), y_pred, color='red', label='Predicted values')
plt.plot(range(len(y_pred)), y_pred, color='red', linestyle='--')
plt.title('Scatter Plot with Regression Line')
plt.xlabel('Instance')
plt.ylabel('IO_BURST')
plt.legend()
plt.savefig('scatter_plot_with_regression_line.png')
plt.show()



# Plot 3: Prediction vs. Actual plot
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.6)
plt.xlabel("Actual IO_BURST")
plt.ylabel("Predicted IO_BURST")
plt.title("Actual vs. Predicted IO_BURST (Test Set)")
plt.savefig('prediction_vs_actual_plot.png')
plt.grid(True)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.tight_layout()
plt.show()

NameError: name 'y_pred' is not defined

<Figure size 1000x600 with 0 Axes>