In [None]:
!pip install scikit-learn
!pip install torch torch_geometric networkx seaborn tqdm optuna

In [387]:
import pandas as pd
file_path = 'clean_data.csv'

# Read CSV and ensure datetime columns are parsed correctly
time_columns = ['submit_time', 'eligible_time', 'start_time', 'end_time']
df = pd.read_csv('clean_data.csv', parse_dates=time_columns)
# Now 'df' contains the data from the second sheet

In [395]:
# from tabulate import tabulate

# # print(tabulate(df.head(5), headers='keys', tablefmt='psql'))
# print(df.head(5))

# Splitting dataset into GNN, RL and Test

In [396]:
# Needs to be changedimport pandas as pd
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

def create_representative_subset(df, subset_size=0.35, random_state=42):
    """
    Create a representative subset of the data using stratified sampling.
    
    Args:
        df: Input DataFrame
        subset_size: Fraction of data to keep (default: 0.35)
        random_state: Random seed for reproducibility
    
    Returns:
        DataFrame containing the representative subset
    """
    # Create a copy to avoid modifying the original
    df = df.copy()
    
    # 1. Create bins for numerical features
    # Use KBinsDiscretizer for more robust binning
    n_bins = 5  # Number of bins for each feature
    
    # Features to bin
    numerical_features = {
        'run_time': n_bins,
        'num_cores_alloc': n_bins,
        'mem_alloc': n_bins,
        'mean_node_power': n_bins
    }
    
    # Create binned features
    for feature, bins in numerical_features.items():
        # Handle skewed distributions with quantile-based binning
        kbd = KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy='quantile')
        
        # Reshape for KBinsDiscretizer
        binned_values = kbd.fit_transform(df[[feature]])
        df[f'{feature}_bin'] = binned_values
    
    # 2. Create time-based bins to ensure temporal coverage
    df['hour_of_day'] = pd.to_datetime(df['submit_time']).dt.hour
    df['day_of_week'] = pd.to_datetime(df['submit_time']).dt.dayofweek
    
    # 3. Define stratification features
    strat_features = [
        'run_time_bin',
        'num_cores_alloc_bin',
        'mem_alloc_bin',
        'mean_node_power_bin',
        'hour_of_day',
        'day_of_week',
        'job_state'
    ]
    
    # 4. Create a composite key for stratification
    df['strat_key'] = df[strat_features].astype(str).agg('_'.join, axis=1)
    
    # 5. Calculate desired number of samples
    n_samples = int(len(df) * subset_size)
    
    # 6. Calculate sampling fractions for each stratum
    stratum_sizes = df['strat_key'].value_counts()
    sampling_fractions = (n_samples * stratum_sizes / len(df)) / stratum_sizes
    # Cap sampling fractions at 1.0
    sampling_fractions = sampling_fractions.clip(upper=1.0)
    
    # 7. Sample from each stratum
    sampled_dfs = []
    for stratum in stratum_sizes.index:
        stratum_df = df[df['strat_key'] == stratum]
        frac = sampling_fractions[stratum]
        sampled_df = stratum_df.sample(
            frac=frac,
            random_state=random_state
        )
        sampled_dfs.append(sampled_df)
    
    # 8. Combine samples
    subset_df = pd.concat(sampled_dfs)
    
    # 9. Clean up temporary columns
    subset_df = subset_df.drop(columns=[
        'run_time_bin', 'num_cores_alloc_bin', 'mem_alloc_bin',
        'mean_node_power_bin', 'strat_key'
    ])
    
    return subset_df

def validate_representation(original_df, subset_df):
    """
    Validate how well the subset represents the original data.
    """
    stats = {}
    
    # Numerical features to check
    numerical_features = [
        'run_time', 'num_cores_alloc', 'mem_alloc', 
        'mean_node_power', 'mean_cpu_power', 'mean_mem_power'
    ]
    
    # Calculate statistics for numerical features
    for feature in numerical_features:
        orig_stats = original_df[feature].describe()
        subset_stats = subset_df[feature].describe()
        
        stats[feature] = {
            'original_mean': orig_stats['mean'],
            'subset_mean': subset_stats['mean'],
            'original_std': orig_stats['std'],
            'subset_std': subset_stats['std'],
            'mean_diff_pct': ((subset_stats['mean'] - orig_stats['mean']) / orig_stats['mean']) * 100
        }
    
    # Calculate categorical distribution differences
    categorical_features = ['job_state', 'state_reason']
    for feature in categorical_features:
        orig_dist = original_df[feature].value_counts(normalize=True)
        subset_dist = subset_df[feature].value_counts(normalize=True)
        
        # Calculate Jensen-Shannon divergence
        stats[feature] = {
            'original_distribution': orig_dist,
            'subset_distribution': subset_dist
        }
    
    return stats

# Example usage:
if __name__ == "__main__":
    # Create the subset
    GNN_dataset = create_representative_subset(df, subset_size=0.35)
    
    # Validate the representation
    validation_stats = validate_representation(df, GNN_dataset)
    
    # Print summary statistics
    print(f"Original dataset size: {len(df)}")
    print(f"Subset size: {len(GNN_dataset)}")
    print(f"Subset percentage: {(len(GNN_dataset) / len(df)) * 100:.2f}%")
    
    print("\nFeature Statistics:")
    for feature, stats in validation_stats.items():
        if isinstance(stats, dict) and 'mean_diff_pct' in stats:
            print(f"\n{feature}:")
            print(f"Mean difference: {stats['mean_diff_pct']:.2f}%")
            print(f"Original mean: {stats['original_mean']:.2f}")
            print(f"Subset mean: {stats['subset_mean']:.2f}")



KeyboardInterrupt: 

In [None]:
# df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191315 entries, 0 to 191314
Data columns (total 21 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   job_id           191315 non-null  int64         
 1   job_state        191315 non-null  object        
 2   submit_time      191315 non-null  datetime64[ns]
 3   eligible_time    191315 non-null  datetime64[ns]
 4   start_time       191315 non-null  datetime64[ns]
 5   end_time         191315 non-null  datetime64[ns]
 6   run_time         191315 non-null  int64         
 7   wait_time        191315 non-null  float64       
 8   cores_per_task   191315 non-null  int64         
 9   shared           191315 non-null  int64         
 10  partition        191315 non-null  int64         
 11  priority         191315 non-null  int64         
 12  num_tasks        191315 non-null  float64       
 13  state_reason     191315 non-null  object        
 14  num_cores_alloc  191

In [390]:
# df.describe()

Unnamed: 0,job_id,submit_time,eligible_time,start_time,end_time,run_time,wait_time,cores_per_task,shared,partition,priority,num_tasks,num_cores_alloc,num_nodes_alloc,num_gpus_alloc,mem_alloc,mean_node_power,mean_cpu_power,mean_mem_power
count,191315.0,191315,191315,191315,191315,191315.0,191315.0,191315.0,191315.0,191315.0,191315.0,191315.0,191315.0,191315.0,191315.0,191315.0,191315.0,191315.0,191315.0
mean,95658.0,2020-08-08 09:56:55.977790720,2020-08-08 08:44:01.693646336,2020-08-08 11:59:31.200783872,2020-08-08 13:33:06.483469568,5615.282686,5615.282686,15.597376,0.829517,186443.5,0.992928,12.852698,107.315872,1.173761,4.127314,213.540993,841.263481,142.347877,44.559368
min,1.0,2020-05-05 15:55:59,1970-01-01 00:00:00,2020-05-05 15:56:00,2020-05-06 08:52:54,1.0,1.0,1.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,20.0,22.666667,28.0
25%,47829.5,2020-06-24 15:58:30,2020-06-24 15:58:30,2020-06-24 15:58:33,2020-06-24 16:43:21,7.0,7.0,1.0,1.0,92865.0,1.0,4.0,32.0,1.0,4.0,118.0,560.0,78.0,36.0
50%,95658.0,2020-08-19 15:48:11,2020-08-19 16:36:37,2020-08-19 16:36:46,2020-08-19 19:11:31,146.0,146.0,8.0,1.0,186603.0,1.0,4.0,128.0,1.0,4.0,237.0,740.0,102.780488,37.157895
75%,143486.5,2020-09-25 08:08:56,2020-09-25 12:54:46,2020-09-25 14:25:17.500000,2020-09-25 17:04:37.500000,2032.0,2032.0,32.0,1.0,268045.0,1.0,12.852698,128.0,1.0,4.0,237.0,940.0,174.011173,40.428571
max,191315.0,2020-10-12 23:50:26,2020-10-12 23:50:26,2020-10-12 23:51:21,2020-10-13 05:15:54,125311.0,125311.0,128.0,1.0,1306973.0,1.0,648.0,20736.0,162.0,648.0,38475.0,2440.526316,21683.333333,5848.0
std,55228.027712,,,,,16328.279164,16328.279164,17.354117,0.376058,94966.1,0.083798,24.143125,106.953631,0.732607,3.212431,190.28486,369.343803,118.132777,25.982101


In [391]:
# import matplotlib.pyplot as plt

# df['mean_node_power'].plot.hist(bins=50)
# plt.title('Distribution of Mean Node Power')
# plt.xlabel('Mean Node Power')
# plt.ylabel('Frequency')
# plt.show()

# Feature Engineering


Engineered Features Summary:
              job_id                    submit_time  \
count  191315.000000                         191315   
mean    95658.000000  2020-08-08 09:56:55.977790720   
min         1.000000            2020-05-05 15:55:59   
25%     47829.500000            2020-06-24 15:58:30   
50%     95658.000000            2020-08-19 15:48:11   
75%    143486.500000            2020-09-25 08:08:56   
max    191315.000000            2020-10-12 23:50:26   
std     55228.027712                            NaN   

                       eligible_time                     start_time  \
count                         191315                         191315   
mean   2020-08-08 08:44:01.693646336  2020-08-08 11:59:31.200783872   
min              1970-01-01 00:00:00            2020-05-05 15:56:00   
25%              2020-06-24 15:58:30            2020-06-24 15:58:33   
50%              2020-08-19 16:36:37            2020-08-19 16:36:46   
75%              2020-09-25 12:54:46     2020-09

In [394]:
df_engineered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191315 entries, 0 to 191314
Data columns (total 39 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   job_id                  191315 non-null  int64         
 1   job_state               191315 non-null  object        
 2   submit_time             191315 non-null  datetime64[ns]
 3   eligible_time           191315 non-null  datetime64[ns]
 4   start_time              191315 non-null  datetime64[ns]
 5   end_time                191315 non-null  datetime64[ns]
 6   run_time                191315 non-null  int64         
 7   wait_time               191315 non-null  float64       
 8   cores_per_task          191315 non-null  int64         
 9   shared                  191315 non-null  int64         
 10  partition               191315 non-null  int64         
 11  priority                191315 non-null  int64         
 12  num_tasks               191315

In [351]:
# # Get earliest and latest dates
# earliest_date = df['submit_time'].min()
# latest_date = df['submit_time'].max()

# print("Earliest Date:", earliest_date)
# print("Latest Date:", latest_date)

Earliest Date: 2020-05-05 15:55:59
Latest Date: 2020-10-12 23:50:26


In [352]:
# # Function to filter the dataset by a user-specified date range
# def filter_by_date_range(df, start_date, end_date, date_column='submit_time'):
#     """
#     Filters the DataFrame based on the specified date range.

#     Parameters:
#     - df (pd.DataFrame): The DataFrame to filter.
#     - start_date (str): The start date in 'YYYY-MM-DD' format.
#     - end_date (str): The end date in 'YYYY-MM-DD' format.
#     - date_column (str): The column name containing datetime values (default: 'submit_time').

#     Returns:
#     - pd.DataFrame: A filtered DataFrame containing rows within the date range.
#     """
#     # Ensure the date column is in datetime format
#     df[date_column] = pd.to_datetime(df[date_column])

#     # Filter the DataFrame
#     return df[(df[date_column] >= start_date) & (df[date_column] <= end_date)]

In [353]:
# Training date range
# start_date = '2020-05-01'
# end_date = '2020-05-31'
# Training date range
# start_date = '2020-06-01'
# end_date = '2020-06-30'
# date_filtered_df = filter_by_date_range(df, start_date, end_date)

In [355]:
GNN_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 63415 entries, 30421 to 124863
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   job_id           63415 non-null  int64         
 1   job_state        63415 non-null  object        
 2   submit_time      63415 non-null  datetime64[ns]
 3   eligible_time    63415 non-null  datetime64[ns]
 4   start_time       63415 non-null  datetime64[ns]
 5   end_time         63415 non-null  datetime64[ns]
 6   run_time         63415 non-null  int64         
 7   wait_time        63415 non-null  float64       
 8   cores_per_task   63415 non-null  int64         
 9   shared           63415 non-null  int64         
 10  partition        63415 non-null  int64         
 11  priority         63415 non-null  int64         
 12  num_tasks        63415 non-null  float64       
 13  state_reason     63415 non-null  object        
 14  num_cores_alloc  63415 non-null  int64

In [356]:
columns_to_drop = [
    'hour_of_day', "day_of_week"
]

GNN_dataset.drop(columns=columns_to_drop, inplace=True)

In [357]:
GNN_dataset.to_csv('GNN_dataset.csv', index=False)

In [369]:
GNN_df = pd.read_csv('GNN_dataset.csv', parse_dates=time_columns)
GNN_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63415 entries, 0 to 63414
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   job_id           63415 non-null  int64         
 1   job_state        63415 non-null  object        
 2   submit_time      63415 non-null  datetime64[ns]
 3   eligible_time    63415 non-null  datetime64[ns]
 4   start_time       63415 non-null  datetime64[ns]
 5   end_time         63415 non-null  datetime64[ns]
 6   run_time         63415 non-null  int64         
 7   wait_time        63415 non-null  float64       
 8   cores_per_task   63415 non-null  int64         
 9   shared           63415 non-null  int64         
 10  partition        63415 non-null  int64         
 11  priority         63415 non-null  int64         
 12  num_tasks        63415 non-null  float64       
 13  state_reason     63415 non-null  object        
 14  num_cores_alloc  63415 non-null  int64

# Split GNN dataset into training and testing sets

In [372]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

def split_gnn_data(df, test_size=0.2, random_state=42):
    """
    Split the dataset into training and test sets for GNN while preserving temporal ordering.
    
    Args:
        df: Input DataFrame
        test_size: Proportion of data to use for testing
        random_state: Random seed for reproducibility
    
    Returns:
        train_df, test_df: Training and test DataFrames
    """
    # Sort by submit time to maintain temporal ordering
    df = df.sort_values('submit_time').copy()
    
    # Calculate split point
    split_idx = int(len(df) * (1 - test_size))
    
    # Split into train and test
    train_df = df.iloc[:split_idx].copy()
    test_df = df.iloc[split_idx:].copy()
    
    return train_df, test_df

# Example usage
if __name__ == "__main__":
    # Split the data
    GNN_train_df, GNN_test_df = split_gnn_data(GNN_df, test_size=0.2)
    
    # Print summary
    print("\nDataset Split Summary:")
    print(f"Total samples: {len(df)}")
    print(f"Training samples: {len(GNN_train_df)} ({len(GNN_train_df)/len(df)*100:.1f}%)")
    print(f"Test samples: {len(GNN_test_df)} ({len(GNN_test_df)/len(df)*100:.1f}%)")
    
    print("\nTraining set time range:")
    print(f"Start: {GNN_train_df['submit_time'].min()}")
    print(f"End: {GNN_train_df['submit_time'].max()}")
    
    print("\nTest set time range:")
    print(f"Start: {GNN_test_df['submit_time'].min()}")
    print(f"End: {GNN_test_df['submit_time'].max()}")


Dataset Split Summary:
Total samples: 50732
Training samples: 50732 (100.0%)
Test samples: 12683 (25.0%)

Training set time range:
Start: 2020-05-05 16:01:00
End: 2020-10-02 04:20:19

Test set time range:
Start: 2020-10-02 04:20:38
End: 2020-10-12 23:50:26


In [373]:
GNN_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50732 entries, 33494 to 8653
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   job_id           50732 non-null  int64         
 1   job_state        50732 non-null  object        
 2   submit_time      50732 non-null  datetime64[ns]
 3   eligible_time    50732 non-null  datetime64[ns]
 4   start_time       50732 non-null  datetime64[ns]
 5   end_time         50732 non-null  datetime64[ns]
 6   run_time         50732 non-null  int64         
 7   wait_time        50732 non-null  float64       
 8   cores_per_task   50732 non-null  int64         
 9   shared           50732 non-null  int64         
 10  partition        50732 non-null  int64         
 11  priority         50732 non-null  int64         
 12  num_tasks        50732 non-null  float64       
 13  state_reason     50732 non-null  object        
 14  num_cores_alloc  50732 non-null  int64  

In [325]:
# GNN_dataset.head()

In [326]:
# import matplotlib.pyplot as plt

# GNN_dataset['mean_node_power'].plot.hist(bins=50)
# plt.title('Distribution of Mean Node Power')
# plt.xlabel('Mean Node Power')
# plt.ylabel('Frequency')
# plt.show()

In [327]:

# from tabulate import tabulate

# print(tabulate(GNN_dataset.head(5), headers='keys', tablefmt='psql'))
# # print(tabulate(df_filtered.head(), headers='keys', tablefmt='psql'))

In [328]:
# GNN_dataset.describe()

In [329]:
# date_filtered_df.info()
# # date_filtered_df.describe()
# # print(date_filtered_df.isna().sum())

In [397]:
# import matplotlib.pyplot as plt

# GNN_train_df['mean_node_power'].plot.hist(bins=50)
# plt.title('Distribution of Mean Node Power')
# plt.xlabel('Mean Node Power')
# plt.ylabel('Frequency')
# plt.show()

In [331]:
# date_filtered_df.isnull().sum()

In [332]:
# GNN_dataset.info()

In [333]:
# GNN_dataset.columns

In [334]:
# GNN_dataset['wait_time'] = GNN_dataset['end_time'] - GNN_dataset['start_time']

# GNN

In [375]:
print(GNN_train_df.columns)

Index(['job_id', 'job_state', 'submit_time', 'eligible_time', 'start_time',
       'end_time', 'run_time', 'wait_time', 'cores_per_task', 'shared',
       'partition', 'priority', 'num_tasks', 'state_reason', 'num_cores_alloc',
       'num_nodes_alloc', 'num_gpus_alloc', 'mem_alloc', 'mean_node_power',
       'mean_cpu_power', 'mean_mem_power'],
      dtype='object')


In [376]:
# Convert columns to datetime
GNN_train_df['submit_time'] = pd.to_datetime(GNN_train_df['submit_time'], unit='s')
GNN_train_df['eligible_time'] = pd.to_datetime(GNN_train_df['eligible_time'], unit='s')
GNN_train_df['start_time'] = pd.to_datetime(GNN_train_df['start_time'], unit='s')
GNN_train_df['end_time'] = pd.to_datetime(GNN_train_df['end_time'], unit='s')

# For 'wait_time', convert object to timedelta (assuming 'HH:MM:SS' format)
# GNN_train_df['wait_time'] = pd.to_timedelta(GNN_train_df['wait_time'], unit='s')

In [378]:
# columns_to_drop = [
#     'job_state_CANCELLED',
#     'job_state_COMPLETED',
#     'job_state_FAILED',
#     'job_state_NODE_FAIL',
#     'job_state_OUT_OF_MEMORY',
#     'job_state_TIMEOUT',
#     'state_reason_JobLaunchFailure',
#     'state_reason_No reason',
#     'state_reason_NodeDown',
#     'state_reason_NonZeroExitCode',
#     'state_reason_OutOfMemory',
#     'state_reason_Prolog',
#     'state_reason_TimeLimit'
# ]

# GNN_train_df.drop(columns=columns_to_drop, inplace=True)

In [379]:
GNN_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50732 entries, 33494 to 8653
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   job_id           50732 non-null  int64         
 1   job_state        50732 non-null  object        
 2   submit_time      50732 non-null  datetime64[ns]
 3   eligible_time    50732 non-null  datetime64[ns]
 4   start_time       50732 non-null  datetime64[ns]
 5   end_time         50732 non-null  datetime64[ns]
 6   run_time         50732 non-null  int64         
 7   wait_time        50732 non-null  float64       
 8   cores_per_task   50732 non-null  int64         
 9   shared           50732 non-null  int64         
 10  partition        50732 non-null  int64         
 11  priority         50732 non-null  int64         
 12  num_tasks        50732 non-null  float64       
 13  state_reason     50732 non-null  object        
 14  num_cores_alloc  50732 non-null  int64  

# Training with advanced Features: 

In [383]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
import numpy as np

class GNNScheduler(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNNScheduler, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, hidden_dim)
        self.conv4 = GCNConv(hidden_dim, hidden_dim)
        self.conv5 = GCNConv(hidden_dim, hidden_dim)
        self.conv6 = GCNConv(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.relu(self.conv1(x, edge_index))
        x = self.relu(self.conv2(x, edge_index))
        x = self.relu(self.conv3(x, edge_index))
        x = self.relu(self.conv4(x, edge_index))
        x = self.relu(self.conv5(x, edge_index))
        x = self.conv6(x, edge_index)
        return x

def train_gnn(model, optimizer, train_data, val_data, epochs=500, scheduler=None, patience=20):
    model.train()
    train_losses = []
    val_losses = []
    best_val_loss = float('inf')
    epochs_no_improve = 0

    for epoch in tqdm(range(epochs), desc="Training GNN"):
        optimizer.zero_grad()
        out = model(train_data)
        train_loss = nn.HuberLoss()(out, train_data.y)
        train_loss.backward()
        optimizer.step()

        if scheduler:
            scheduler.step()

        train_losses.append(train_loss.item())

        # Validation
        model.eval()
        with torch.no_grad():
            val_out = model(val_data)
            val_loss = nn.HuberLoss()(val_out, val_data.y)
            val_losses.append(val_loss.item())

        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch+1}/{epochs}, Training Loss: {train_loss.item():.4f}, Validation Loss: {val_loss.item():.4f}')

        # Early stopping check
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            # Save the best model
            torch.save(model.state_dict(), 'best_gnn_scheduler.pth')
        else:
            epochs_no_improve += 1

        if epochs_no_improve == patience:
            print(f'Early stopping triggered at epoch {epoch+1}')
            break

        model.train()

    return train_losses, val_losses

def plot_gnn_training_loss(train_losses, val_losses=None):
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, len(train_losses) + 1), train_losses, label='Training Loss')
    if val_losses:
        plt.plot(range(1, len(val_losses) + 1), val_losses, label='Validation Loss')
    plt.title('GNN Training and Validation Loss vs Epoch')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.yscale('log')
    plt.legend()
    plt.savefig('gnn_training_loss.png', dpi=300, bbox_inches='tight')
    plt.close()

def create_job_graph(df, features, submit_time_weight=0.0):
    # Create a copy to avoid modifying the original
    df = df.copy()
    
    # Apply weight to submit_time
    df['submit_time_weighted'] = df['submit_time'] * submit_time_weight
    features_for_graph = features + ['submit_time_weighted']
    
    # Fill any remaining NaN values with 0
    df[features_for_graph] = df[features_for_graph].fillna(0)
    
    # Scale features
    scaler_graph = StandardScaler()
    df[features_for_graph] = scaler_graph.fit_transform(df[features_for_graph])
    
    knn = NearestNeighbors(n_neighbors=5)
    knn.fit(df[features_for_graph])
    A = knn.kneighbors_graph(df[features_for_graph]).toarray()
    edge_index = torch.tensor(A.nonzero(), dtype=torch.long)
    
    # Drop the temporary column
    df.drop('submit_time_weighted', axis=1, inplace=True)
    
    return edge_index

# Assuming date_filtered_df is your DataFrame after filtering
df = GNN_train_df.copy()

# Convert datetime columns to numerical representation (seconds since the earliest date)
datetime_cols = ['submit_time', 'eligible_time', 'start_time', 'end_time']
for col in datetime_cols:
    df[col] = (df[col] - df[col].min()).dt.total_seconds()

# Select only numeric columns, excluding job_id and target variables
numeric_features = [col for col in df.columns if df[col].dtype in ['float64', 'int64']]
features = [col for col in numeric_features if col not in ['job_id', 'mean_node_power', 'partition',  'eligible_time', 'start_time', 'end_time']]
print("Selected Features:", features)

# One-hot encode only state_reason
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_data = encoder.fit_transform(df[['state_reason']])
encoded_df = pd.DataFrame(
    encoded_data, 
    columns=encoder.get_feature_names_out(['state_reason']),
    index=df.index
)

# Join encoded state_reason with numeric features
df_features = pd.concat([df[features], encoded_df], axis=1)

# Update features list to include encoded state_reason columns
features = list(df_features.columns)
target = 'mean_node_power'

# Split data into training and validation sets
train_idx, val_idx = train_test_split(df.index, test_size=0.2, random_state=42)
train_df = df_features.loc[train_idx]
val_df = df_features.loc[val_idx]

# Get target values
train_y = df.loc[train_idx, target]
val_y = df.loc[val_idx, target]

# Scale features
scaler_X = StandardScaler()
train_df_scaled = pd.DataFrame(
    scaler_X.fit_transform(train_df),
    columns=features,
    index=train_df.index
)
val_df_scaled = pd.DataFrame(
    scaler_X.transform(val_df),
    columns=features,
    index=val_df.index
)

# Scale target
scaler_y = StandardScaler()
train_y_scaled = scaler_y.fit_transform(train_y.values.reshape(-1, 1))
val_y_scaled = scaler_y.transform(val_y.values.reshape(-1, 1))

# Create graph data
train_G = create_job_graph(train_df_scaled, features)
train_edge_index = train_G
train_x = torch.tensor(train_df_scaled.values, dtype=torch.float)
train_y = torch.tensor(train_y_scaled, dtype=torch.float)
train_data = Data(x=train_x, edge_index=train_edge_index, y=train_y)

val_G = create_job_graph(val_df_scaled, features)
val_edge_index = val_G
val_x = torch.tensor(val_df_scaled.values, dtype=torch.float)
val_y = torch.tensor(val_y_scaled, dtype=torch.float)
val_data = Data(x=val_x, edge_index=val_edge_index, y=val_y)

# Initialize model, optimizer, and scheduler
model = GNNScheduler(input_dim=len(features), hidden_dim=128, output_dim=1)
optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-9)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)

# Train the model
train_losses, val_losses = train_gnn(model, optimizer, train_data, val_data, epochs=500, scheduler=scheduler, patience=20)

# Plot training and validation losses
plot_gnn_training_loss(train_losses, val_losses)

# Save the model
torch.save(model.state_dict(), 'gnn_scheduler.pth')

Selected Features: ['submit_time', 'run_time', 'wait_time', 'cores_per_task', 'shared', 'priority', 'num_tasks', 'num_cores_alloc', 'num_nodes_alloc', 'num_gpus_alloc', 'mem_alloc', 'mean_cpu_power', 'mean_mem_power']


Training GNN:   2%|▏         | 10/500 [00:02<01:49,  4.47it/s]

Epoch 10/500, Training Loss: 0.3273, Validation Loss: 0.3156


Training GNN:   4%|▍         | 20/500 [00:04<01:54,  4.18it/s]

Epoch 20/500, Training Loss: 0.2958, Validation Loss: 0.2841


Training GNN:   6%|▌         | 30/500 [00:06<01:45,  4.46it/s]

Epoch 30/500, Training Loss: 0.2781, Validation Loss: 0.2706


Training GNN:   8%|▊         | 40/500 [00:09<01:43,  4.43it/s]

Epoch 40/500, Training Loss: 0.2612, Validation Loss: 0.2573


Training GNN:  10%|█         | 50/500 [00:11<01:39,  4.50it/s]

Epoch 50/500, Training Loss: 0.2518, Validation Loss: 0.2508


Training GNN:  12%|█▏        | 60/500 [00:13<01:40,  4.39it/s]

Epoch 60/500, Training Loss: 0.2432, Validation Loss: 0.2447


Training GNN:  14%|█▍        | 70/500 [00:15<01:35,  4.50it/s]

Epoch 70/500, Training Loss: 0.2386, Validation Loss: 0.2419


Training GNN:  16%|█▌        | 80/500 [00:18<01:46,  3.96it/s]

Epoch 80/500, Training Loss: 0.2346, Validation Loss: 0.2394


Training GNN:  18%|█▊        | 88/500 [00:20<01:36,  4.29it/s]


KeyboardInterrupt: 

# GNN with feature engineering

In [399]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

class GNNScheduler(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNNScheduler, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, hidden_dim)
        self.conv4 = GCNConv(hidden_dim, hidden_dim)
        self.conv5 = GCNConv(hidden_dim, hidden_dim)
        self.conv6 = GCNConv(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.relu(self.conv1(x, edge_index))
        x = self.relu(self.conv2(x, edge_index))
        x = self.relu(self.conv3(x, edge_index))
        x = self.relu(self.conv4(x, edge_index))
        x = self.relu(self.conv5(x, edge_index))
        x = self.conv6(x, edge_index)
        return x

def prepare_data(df):
    """Prepare data for GNN training."""
    # Convert datetime columns to numerical
    datetime_cols = ['submit_time', 'eligible_time', 'start_time', 'end_time']
    for col in datetime_cols:
        df[col] = (df[col] - df[col].min()).dt.total_seconds()
    
    # Select features
    numeric_features = [col for col in df.columns if df[col].dtype in ['float64', 'int64']]
    features = [col for col in numeric_features 
               if col not in ['job_id', 'mean_node_power', 'partition', 'eligible_time', 'start_time', 'end_time']]
    
    # One-hot encode state_reason
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoded_data = encoder.fit_transform(df[['state_reason']])
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=encoder.get_feature_names_out(['state_reason']),
        index=df.index
    )
    
    # Combine features
    df_features = pd.concat([df[features], encoded_df], axis=1)
    features = list(df_features.columns)
    
    return df_features, features, encoder

def create_job_graph(df, features, n_neighbors=5):
    """Create graph structure using KNN."""
    # Scale features for graph creation
    scaler_graph = StandardScaler()
    df_scaled = scaler_graph.fit_transform(df)
    
    # Create KNN graph
    knn = NearestNeighbors(n_neighbors=n_neighbors)
    knn.fit(df_scaled)
    A = knn.kneighbors_graph(df_scaled).toarray()
    edge_index = torch.tensor(A.nonzero(), dtype=torch.long)
    
    return edge_index

def train_gnn(model, optimizer, train_data, val_data, epochs=500, scheduler=None, patience=20):
    """Train the GNN model with early stopping."""
    model.train()
    train_losses = []
    val_losses = []
    best_val_loss = float('inf')
    epochs_no_improve = 0
    
    for epoch in tqdm(range(epochs), desc="Training GNN"):
        # Training step
        optimizer.zero_grad()
        out = model(train_data)
        train_loss = nn.HuberLoss()(out, train_data.y)
        train_loss.backward()
        optimizer.step()
        
        if scheduler:
            scheduler.step()
        
        train_losses.append(train_loss.item())
        
        # Validation step
        model.eval()
        with torch.no_grad():
            val_out = model(val_data)
            val_loss = nn.HuberLoss()(val_out, val_data.y)
            val_losses.append(val_loss.item())
        
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch+1}/{epochs}, Training Loss: {train_loss.item():.4f}, '
                  f'Validation Loss: {val_loss.item():.4f}')
        
        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            torch.save(model.state_dict(), 'best_gnn_scheduler.pth')
        else:
            epochs_no_improve += 1
        
        if epochs_no_improve == patience:
            print(f'Early stopping triggered at epoch {epoch+1}')
            break
        
        model.train()
    
    return train_losses, val_losses

def plot_losses(train_losses, val_losses):
    """Plot training and validation losses."""
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.yscale('log')
    plt.legend()
    plt.savefig('training_loss.png')
    plt.close()

# Main training pipeline
def main():
    df = df_engineered.copy()
    # 1. Sort data by timestamp
    df_sorted = df.sort_values('submit_time').copy()
    
    # 2. Split into train (70%), validation (15%), and test (15%)
    train_size = 0.7
    val_size = 0.15
    
    n_samples = len(df_sorted)
    train_end_idx = int(n_samples * train_size)
    val_end_idx = int(n_samples * (train_size + val_size))
    
    train_df = df_sorted.iloc[:train_end_idx]
    val_df = df_sorted.iloc[train_end_idx:val_end_idx]
    test_df = df_sorted.iloc[val_end_idx:]
    
    # 3. Prepare features
    train_features, features, encoder = prepare_data(train_df)
    val_features = prepare_data(val_df)[0]
    test_features = prepare_data(test_df)[0]
    
    # 4. Scale features
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    
    # Scale features
    train_scaled = pd.DataFrame(
        scaler_X.fit_transform(train_features),
        columns=features,
        index=train_features.index
    )
    val_scaled = pd.DataFrame(
        scaler_X.transform(val_features),
        columns=features,
        index=val_features.index
    )
    test_scaled = pd.DataFrame(
        scaler_X.transform(test_features),
        columns=features,
        index=test_features.index
    )
    
    # Scale target
    train_y = scaler_y.fit_transform(train_df[['mean_node_power']])
    val_y = scaler_y.transform(val_df[['mean_node_power']])
    test_y = scaler_y.transform(test_df[['mean_node_power']])
    
    # 5. Create graph data
    train_edge_index = create_job_graph(train_scaled, features)
    val_edge_index = create_job_graph(val_scaled, features)
    test_edge_index = create_job_graph(test_scaled, features)
    
    # 6. Create PyTorch Geometric Data objects
    train_data = Data(
        x=torch.tensor(train_scaled.values, dtype=torch.float),
        edge_index=train_edge_index,
        y=torch.tensor(train_y, dtype=torch.float)
    )
    val_data = Data(
        x=torch.tensor(val_scaled.values, dtype=torch.float),
        edge_index=val_edge_index,
        y=torch.tensor(val_y, dtype=torch.float)
    )
    test_data = Data(
        x=torch.tensor(test_scaled.values, dtype=torch.float),
        edge_index=test_edge_index,
        y=torch.tensor(test_y, dtype=torch.float)
    )
    
    # 7. Initialize and train model
    model = GNNScheduler(input_dim=len(features), hidden_dim=128, output_dim=1)
    optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-9)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)
    
    # 8. Train model
    train_losses, val_losses = train_gnn(
        model, optimizer, train_data, val_data,
        epochs=500, scheduler=scheduler, patience=20
    )
    
    # 9. Plot training progress
    plot_losses(train_losses, val_losses)
    
    return model, scaler_X, scaler_y, encoder, test_data

if __name__ == "__main__":
    model, scaler_X, scaler_y, encoder, test_data = main()
    
    # Print dataset sizes
    print(f"\nDataset sizes:")
    print(f"Training set: {len(train_df)} samples")
    print(f"Validation set: {len(val_df)} samples")
    print(f"Test set: {len(test_df)} samples")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = (df[col] - df[col].min()).dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = (df[col] - df[col].min()).dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = (df[col] - df[col].min()).dt.total_seconds()


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- state_reason_JobHeldUser


# Testing model to predict mean node power

In [None]:
import torch
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from torch_geometric.data import Data

def prepare_test_data(test_df, scaler_X, scaler_y, encoder, features):
    """
    Prepare test data for prediction using the trained model.
    """
    # Create a copy
    df = test_df.copy()
    
    # Convert datetime columns to numerical representation
    datetime_cols = ['submit_time', 'eligible_time', 'start_time', 'end_time']
    for col in datetime_cols:
        df[col] = (df[col] - df[col].min()).dt.total_seconds()
    
    # One-hot encode state_reason
    encoded_data = encoder.transform(df[['state_reason']])
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=encoder.get_feature_names_out(['state_reason']),
        index=df.index
    )
    
    # Combine features
    numeric_features = [col for col in features if col not in encoder.get_feature_names_out(['state_reason'])]
    df_features = pd.concat([df[numeric_features], encoded_df], axis=1)
    
    # Scale features
    df_scaled = pd.DataFrame(
        scaler_X.transform(df_features),
        columns=features,
        index=df_features.index
    )
    
    return df_scaled

def predict_node_power(model, test_df_scaled, features, scaler_y):
    """
    Make predictions using the trained GNN model.
    """
    # Create graph structure
    edge_index = create_job_graph(test_df_scaled, features)
    
    # Convert to torch tensors
    test_x = torch.tensor(test_df_scaled.values, dtype=torch.float)
    test_data = Data(x=test_x, edge_index=edge_index)
    
    # Make predictions
    model.eval()
    with torch.no_grad():
        predictions_scaled = model(test_data)
        predictions = scaler_y.inverse_transform(predictions_scaled)
    
    return predictions

def evaluate_predictions(true_values, predictions):
    """
    Calculate various metrics to evaluate predictions.
    """
    mse = mean_squared_error(true_values, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(true_values, predictions)
    r2 = r2_score(true_values, predictions)
    mape = np.mean(np.abs((true_values - predictions) / true_values)) * 100
    
    return {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R²': r2,
        'MAPE': mape
    }

# Example usage:
if __name__ == "__main__":
    # Load the trained model
    model = GNNScheduler(input_dim=len(features), hidden_dim=128, output_dim=1)
    model.load_state_dict(torch.load('best_gnn_scheduler.pth'))
    
    # Prepare test data
    test_df_scaled = prepare_test_data(
        GNN_test_df,
        scaler_X,
        scaler_y,
        encoder,
        features
    )
    
    # Make predictions
    predictions = predict_node_power(model, test_df_scaled, features, scaler_y)
    
    # Evaluate predictions
    true_values = GNN_test_df['mean_node_power'].values
    metrics = evaluate_predictions(true_values, predictions.flatten())
    
    # Print metrics
    print("\nTest Set Performance Metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")
    
    # Add predictions to test DataFrame
    GNN_test_df['predicted_node_power'] = predictions
    
    # Calculate prediction error
    GNN_test_df['prediction_error'] = GNN_test_df['predicted_node_power'] - GNN_test_df['mean_node_power']
    
    # Optional: Plot actual vs predicted values
    import matplotlib.pyplot as plt
    
    plt.figure(figsize=(10, 6))
    plt.scatter(true_values, predictions, alpha=0.5)
    plt.plot([true_values.min(), true_values.max()], 
             [true_values.min(), true_values.max()], 
             'r--', lw=2)
    plt.xlabel('Actual Node Power')
    plt.ylabel('Predicted Node Power')
    plt.title('Actual vs Predicted Node Power')
    plt.tight_layout()
    plt.savefig('prediction_performance.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # Print some example predictions
    print("\nSample Predictions:")
    sample_results = GNN_test_df[['mean_node_power', 'predicted_node_power', 'prediction_error']].head()
    print(sample_results)

  model.load_state_dict(torch.load('best_gnn_scheduler.pth'))



Test Set Performance Metrics:
MSE: 55468.0233
RMSE: 235.5165
MAE: 177.6993
R²: 0.0322
MAPE: 21.2163

Sample Predictions:
      mean_node_power  predicted_node_power  prediction_error
8661            900.0            775.434776       -124.565224
8690            880.0            790.659736        -89.340264
8717            890.0            785.319499       -104.680501
8684            890.0            780.229201       -109.770799
8718            880.0            792.435208        -87.564792


In [114]:
# First, let's get model predictions
model.eval()
with torch.no_grad():
    train_pred = model(train_data).cpu().numpy()
    val_pred = model(val_data).cpu().numpy()

# Get original training and validation target values
train_y_orig = df.loc[train_idx, target].values.reshape(-1, 1)
val_y_orig = df.loc[val_idx, target].values.reshape(-1, 1)

# Calculate metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

def calculate_metrics(y_true, y_pred):
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return r2, mae, rmse, mape

# Convert predictions back to original scale
train_pred_original = scaler_y.inverse_transform(train_pred)
val_pred_original = scaler_y.inverse_transform(val_pred)

# Calculate metrics
train_metrics = calculate_metrics(train_y_orig, train_pred_original)
val_metrics = calculate_metrics(val_y_orig, val_pred_original)

print("\nTraining Set Metrics:")
print(f"R² Score: {train_metrics[0]:.4f}")
print(f"MAE: {train_metrics[1]:.4f} watts")
print(f"RMSE: {train_metrics[2]:.4f} watts")
print(f"MAPE: {train_metrics[3]:.4f}%")

print("\nValidation Set Metrics:")
print(f"R² Score: {val_metrics[0]:.4f}")
print(f"MAE: {val_metrics[1]:.4f} watts")
print(f"RMSE: {val_metrics[2]:.4f} watts")
print(f"MAPE: {val_metrics[3]:.4f}%")

# Create visualization plots
import matplotlib.pyplot as plt
import seaborn as sns

# Predicted vs Actual Plot
plt.figure(figsize=(12, 5))

# Training set
plt.subplot(1, 2, 1)
plt.scatter(train_y_orig, train_pred_original, alpha=0.5)
plt.plot([train_y_orig.min(), train_y_orig.max()], 
         [train_y_orig.min(), train_y_orig.max()], 
         'r--', lw=2)
plt.xlabel('Actual Power (watts)')
plt.ylabel('Predicted Power (watts)')
plt.title('Training Set: Predicted vs Actual')

# Validation set
plt.subplot(1, 2, 2)
plt.scatter(val_y_orig, val_pred_original, alpha=0.5)
plt.plot([val_y_orig.min(), val_y_orig.max()], 
         [val_y_orig.min(), val_y_orig.max()], 
         'r--', lw=2)
plt.xlabel('Actual Power (watts)')
plt.ylabel('Predicted Power (watts)')
plt.title('Validation Set: Predicted vs Actual')
plt.tight_layout()
plt.savefig('prediction_scatter.png')
plt.close()

# Error Distribution Plot
plt.figure(figsize=(12, 5))

# Training error distribution
plt.subplot(1, 2, 1)
train_errors = train_pred_original - train_y_orig
sns.histplot(train_errors, kde=True)
plt.xlabel('Prediction Error (watts)')
plt.ylabel('Count')
plt.title('Training Error Distribution')

# Validation error distribution
plt.subplot(1, 2, 2)
val_errors = val_pred_original - val_y_orig
sns.histplot(val_errors, kde=True)
plt.xlabel('Prediction Error (watts)')
plt.ylabel('Count')
plt.title('Validation Error Distribution')
plt.tight_layout()
plt.savefig('error_distribution.png')
plt.close()


Training Set Metrics:
R² Score: 0.3191
MAE: 290.7156 watts
RMSE: 388.4897 watts
MAPE: 33.7966%

Validation Set Metrics:
R² Score: 0.3069
MAE: 285.3005 watts
RMSE: 379.2504 watts
MAPE: 34.2600%


In [39]:
# !pip install torch torch_geometric networkx seaborn tqdm pandas matplotlib scikit-learn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import deque
import networkx as nx
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [40]:
def create_job_graph(df, max_nodes=1000):
    G = nx.DiGraph()
    jobs = df['job_id'].iloc[:max_nodes].tolist()
    submission_times = df['submit_time'].iloc[:max_nodes].tolist()
    runtimes = df['run_time'].iloc[:max_nodes].tolist()

    for i, job in enumerate(jobs):
        G.add_node(job, runtime=runtimes[i])
        for j in range(max(0, i-50), i):
            if submission_times[i] > submission_times[j]:
                G.add_edge(jobs[j], job)

    if len(G.edges()) == 0:
        for i in range(len(jobs) - 1):
            G.add_edge(jobs[i], jobs[i+1])

    return G

df = date_filtered_df

In [41]:
class GNNScheduler(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNNScheduler, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.relu(self.conv1(x, edge_index))
        x = self.relu(self.conv2(x, edge_index))
        x = self.conv3(x, edge_index)
        return x

def train_gnn(model, optimizer, data, epochs=200):
    model.train()
    train_losses = []
    for epoch in tqdm(range(epochs), desc="Training GNN"):
        optimizer.zero_grad()
        out = model(data)
        loss = nn.MSELoss()(out, data.y)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}')
    return train_losses



def plot_gnn_training_loss(train_losses):
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, len(train_losses) + 1), train_losses)
    plt.title('GNN Training Loss vs Epoch')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.yscale('log')
    plt.savefig('gnn_training_loss.png', dpi=300, bbox_inches='tight')
    plt.close()


# Example usage (train the model once, then use it for GNN_RL scheduling)
G = create_job_graph(df)
edge_index = torch.tensor(list(G.edges())).t().contiguous()
x = torch.tensor(df[['num_nodes_alloc', 'num_cores_alloc', 'num_gpus_alloc', 'run_time',
    'mean_node_power', 'mean_cpu_power', 'mean_mem_power']].values, dtype=torch.float)
y = torch.tensor(df['run_time'].values, dtype=torch.float).unsqueeze(1)
data = Data(x=x, edge_index=edge_index, y=y)

model = GNNScheduler(input_dim=7, hidden_dim=64, output_dim=1)
optimizer = optim.Adam(model.parameters(), lr=0.01)
train_losses = train_gnn(model, optimizer, data)
plot_gnn_training_loss(train_losses)

Training GNN:  33%|███▎      | 66/200 [00:00<00:00, 330.84it/s]

Epoch 10/200, Loss: 400541.9375
Epoch 20/200, Loss: 207618.6250
Epoch 30/200, Loss: 56408.5117
Epoch 40/200, Loss: 11635.8213
Epoch 50/200, Loss: 1730.3368
Epoch 60/200, Loss: 396.7533
Epoch 70/200, Loss: 81.2381
Epoch 80/200, Loss: 67.9914


Training GNN:  66%|██████▌   | 132/200 [00:00<00:00, 282.35it/s]

Epoch 90/200, Loss: 36.8232
Epoch 100/200, Loss: 35.9590
Epoch 110/200, Loss: 28.3531
Epoch 120/200, Loss: 26.2587
Epoch 130/200, Loss: 25.0794


Training GNN:  96%|█████████▋| 193/200 [00:00<00:00, 268.91it/s]

Epoch 140/200, Loss: 23.8934
Epoch 150/200, Loss: 22.7559
Epoch 160/200, Loss: 21.7100
Epoch 170/200, Loss: 20.7456
Epoch 180/200, Loss: 19.7709
Epoch 190/200, Loss: 18.8561


Training GNN: 100%|██████████| 200/200 [00:00<00:00, 281.59it/s]


Epoch 200/200, Loss: 18.0028


In [37]:
date_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10833 entries, 0 to 10832
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   job_id           10833 non-null  int64  
 1   job_state        10833 non-null  object 
 2   submit_time      10833 non-null  float64
 3   eligible_time    10833 non-null  float64
 4   start_time       10833 non-null  float64
 5   end_time         10833 non-null  float64
 6   run_time         10833 non-null  float64
 7   cores_per_task   10833 non-null  int64  
 8   shared           10833 non-null  int64  
 9   partition        10833 non-null  int64  
 10  priority         10833 non-null  int64  
 11  num_tasks        10833 non-null  float64
 12  state_reason     10833 non-null  object 
 13  num_cores_alloc  10833 non-null  int64  
 14  num_nodes_alloc  10833 non-null  int64  
 15  num_gpus_alloc   10833 non-null  int64  
 16  mem_alloc        10833 non-null  int64  
 17  mean_node_power  

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
import numpy as np

class GNNScheduler(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNNScheduler, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, hidden_dim)
        # self.conv4 = GCNConv(hidden_dim, hidden_dim)  # New layer
        self.conv5 = GCNConv(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.relu(self.conv1(x, edge_index))
        x = self.relu(self.conv2(x, edge_index))
        x = self.relu(self.conv3(x, edge_index))
        # x = self.relu(self.conv4(x, edge_index))  # Apply ReLU to the new layer
        x = self.conv5(x, edge_index)
        return x

def train_gnn(model, optimizer, train_data, val_data, epochs=500, scheduler=None, patience=20):
    model.train()
    train_losses = []
    val_losses = []
    best_val_loss = float('inf')
    epochs_no_improve = 0

    for epoch in tqdm(range(epochs), desc="Training GNN"):
        optimizer.zero_grad()
        out = model(train_data)
        train_loss = nn.HuberLoss()(out, train_data.y)
        train_loss.backward()
        optimizer.step()

        if scheduler:
            scheduler.step()

        train_losses.append(train_loss.item())

        # Validation
        model.eval()
        with torch.no_grad():
            val_out = model(val_data)
            val_loss = nn.HuberLoss()(val_out, val_data.y)
            val_losses.append(val_loss.item())

        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch+1}/{epochs}, Training Loss: {train_loss.item():.4f}, Validation Loss: {val_loss.item():.4f}')

        # Early stopping check
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            # Save the best model
            torch.save(model.state_dict(), 'best_gnn_scheduler.pth')
        else:
            epochs_no_improve += 1

        if epochs_no_improve == patience:
            print(f'Early stopping triggered at epoch {epoch+1}')
            break

        model.train()

    return train_losses, val_losses

def plot_gnn_training_loss(train_losses, val_losses=None):
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, len(train_losses) + 1), train_losses, label='Training Loss')
    if val_losses:
        plt.plot(range(1, len(val_losses) + 1), val_losses, label='Validation Loss')
    plt.title('GNN Training and Validation Loss vs Epoch')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.yscale('log')
    plt.legend()
    plt.savefig('gnn_training_loss.png', dpi=300, bbox_inches='tight')
    plt.close()

# Load the dataframe
df = date_filtered_df

# Create job graph with edge index from your data
def create_job_graph(df, features, submit_time_weight=0.0):
    # Apply weight to submit_time before scaling
    df['submit_time_weighted'] = df['submit_time'] * submit_time_weight
    features_for_graph = features + ['submit_time_weighted']

    # Scale features (including the weighted submit_time)
    scaler_graph = StandardScaler()
    df[features_for_graph] = scaler_graph.fit_transform(df[features_for_graph])

    knn = NearestNeighbors(n_neighbors=50)
    knn.fit(df[features_for_graph])
    A = knn.kneighbors_graph(df[features_for_graph]).toarray()
    edge_index = torch.tensor(A.nonzero(), dtype=torch.long)

    # Drop the temporary weighted column
    df.drop('submit_time_weighted', axis=1, inplace=True)

    return edge_index

# Define features and target variable
features = ['num_nodes_alloc', 'num_cores_alloc', 'num_gpus_alloc', 'mem_alloc', 'submit_time', 'run_time']
# features =  ['submit_time', 'eligible_time', 'start_time', 'end_time', 'run_time', 'cores_per_task', 'shared',
#             'partition', 'priority', 'num_tasks', 'state_reason', 'num_cores_alloc',
#             'num_nodes_alloc', 'num_gpus_alloc', 'mem_alloc',
#             # 'mean_cpu_power', 'mean_mem_power']
target = 'mean_node_power'

# Convert 'submit_time' to Unix timestamp (seconds)
df['submit_time'] = pd.to_datetime(df['submit_time']).astype(int) / 10**9


# Split data into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Create graph data for training and validation sets
train_G = create_job_graph(train_df, features)
train_edge_index = train_G
train_x = torch.tensor(train_df[features].values, dtype=torch.float)
train_y = torch.tensor(train_df[target].values, dtype=torch.float).unsqueeze(1)
train_data = Data(x=train_x, edge_index=train_edge_index, y=train_y)

val_G = create_job_graph(val_df, features)
val_edge_index = val_G
val_x = torch.tensor(val_df[features].values, dtype=torch.float)
val_y = torch.tensor(val_df[target].values, dtype=torch.float).unsqueeze(1)
val_data = Data(x=val_x, edge_index=val_edge_index, y=val_y)

# Scale features and target (using only training data for fitting)
scaler_X = StandardScaler()
scaler_y = StandardScaler()

train_df[features] = scaler_X.fit_transform(train_df[features])
train_df[target] = scaler_y.fit_transform(train_df[target].values.reshape(-1, 1))

val_df[features] = scaler_X.transform(val_df[features])
val_df[target] = scaler_y.transform(val_df[target].values.reshape(-1, 1))

# Initialize model, optimizer, and scheduler
model = GNNScheduler(input_dim=len(features), hidden_dim=128, output_dim=1)  # Increased hidden_dim
# optimizer = optim.Adam(model.parameters(), lr=0.001)
# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.2)
optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-6)  # Added weight_decay
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)  # More aggressive reduction, every 10 epochs

# Train the model with early stopping
train_losses, val_losses = train_gnn(model, optimizer, train_data, val_data, epochs=500, scheduler=scheduler, patience=20)

# Plot training and validation losses
plot_gnn_training_loss(train_losses, val_losses)

# Save the model
torch.save(model.state_dict(), 'gnn_scheduler.pth')

# Hyperparameter Search

In [33]:
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
import numpy as np
import json

# 1. GNN Model Definition
class GNNScheduler(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNNScheduler, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, hidden_dim)
        self.conv4 = GCNConv(hidden_dim, hidden_dim)  # Add back the 4th layer
        self.conv5 = GCNConv(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.relu(self.conv1(x, edge_index))
        x = self.relu(self.conv2(x, edge_index))
        x = self.relu(self.conv3(x, edge_index))
        x = self.relu(self.conv4(x, edge_index))  # Apply ReLU
        x = self.conv5(x, edge_index)
        return x

# 2. Training Function (with Early Stopping)
def train_gnn(model, optimizer, train_data, val_data, epochs=500, scheduler=None, patience=20):
    model.train()
    train_losses = []
    val_losses = []
    best_val_loss = float('inf')
    epochs_no_improve = 0

    for epoch in tqdm(range(epochs), desc="Training GNN"):
        optimizer.zero_grad()
        out = model(train_data)
        train_loss = nn.HuberLoss()(out, train_data.y)
        train_loss.backward()
        optimizer.step()

        if scheduler:
            scheduler.step()

        train_losses.append(train_loss.item())

        # Validation
        model.eval()
        with torch.no_grad():
            val_out = model(val_data)
            val_loss = nn.HuberLoss()(val_out, val_data.y)
            val_losses.append(val_loss.item())

        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch+1}/{epochs}, Training Loss: {train_loss.item():.4f}, Validation Loss: {val_loss.item():.4f}')

        # Early stopping check
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            # Save the best model
            torch.save(model.state_dict(), 'best_gnn_scheduler.pth')
        else:
            epochs_no_improve += 1

        if epochs_no_improve == patience:
            print(f'Early stopping triggered at epoch {epoch+1}')
            break

        model.train()

    return train_losses, val_losses

# 3. Plotting Function
def plot_gnn_training_loss(train_losses, val_losses=None):
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, len(train_losses) + 1), train_losses, label='Training Loss')
    if val_losses:
        plt.plot(range(1, len(val_losses) + 1), val_losses, label='Validation Loss')
    plt.title('GNN Training and Validation Loss vs Epoch')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.yscale('log')
    plt.legend()
    plt.savefig('gnn_training_loss.png', dpi=300, bbox_inches='tight')
    plt.close()

# 4. Graph Creation Function
def create_job_graph(df, features, submit_time_weight, n_neighbors):
    # Apply weight to submit_time before scaling
    df['submit_time_weighted'] = df['submit_time'] * submit_time_weight
    features_for_graph = features + ['submit_time_weighted']

    # Scale features (including the weighted submit_time)
    scaler_graph = StandardScaler()
    df[features_for_graph] = scaler_graph.fit_transform(df[features_for_graph])

    knn = NearestNeighbors(n_neighbors=n_neighbors)
    knn.fit(df[features_for_graph])
    A = knn.kneighbors_graph(df[features_for_graph]).toarray()
    edge_index = torch.tensor(A.nonzero(), dtype=torch.long)

    # Drop the temporary weighted column
    df.drop('submit_time_weighted', axis=1, inplace=True)

    return edge_index

# 5. Optuna Objective Function
def objective(trial):
    # Define the hyperparameter search space
    hidden_dim = trial.suggest_categorical("hidden_dim", [64, 128, 256])
    lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)
    n_neighbors = trial.suggest_int("n_neighbors", 5, 200)
    submit_time_weight = trial.suggest_float("submit_time_weight", 0.0, 1.0)
    step_size = trial.suggest_categorical("step_size", [5, 10, 20])
    gamma = trial.suggest_float("gamma", 0.1, 0.9)

    # Load the dataframe
    df = date_filtered_df  # Assuming date_filtered_df is your dataframe

    # Define features and target variable
    features = ['num_nodes_alloc', 'num_cores_alloc', 'num_gpus_alloc', 'mem_alloc', 'submit_time', 'run_time']
    target = 'mean_node_power'

    # Convert 'submit_time' to Unix timestamp (seconds)
    df['submit_time'] = pd.to_datetime(df['submit_time']).astype(int) / 10**9

    # Split data into training and validation sets
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

    # Create graph data for training and validation sets
    train_edge_index = create_job_graph(train_df, features, submit_time_weight, n_neighbors)
    train_x = torch.tensor(train_df[features].values, dtype=torch.float)
    train_y = torch.tensor(train_df[target].values, dtype=torch.float).unsqueeze(1)
    train_data = Data(x=train_x, edge_index=train_edge_index, y=train_y)

    val_edge_index = create_job_graph(val_df, features, submit_time_weight, n_neighbors)
    val_x = torch.tensor(val_df[features].values, dtype=torch.float)
    val_y = torch.tensor(val_df[target].values, dtype=torch.float).unsqueeze(1)
    val_data = Data(x=val_x, edge_index=val_edge_index, y=val_y)

    # Scale features and target (using only training data for fitting)
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()

    train_df[features] = scaler_X.fit_transform(train_df[features])
    train_df[target] = scaler_y.fit_transform(train_df[target].values.reshape(-1, 1))

    val_df[features] = scaler_X.transform(val_df[features])
    val_df[target] = scaler_y.transform(val_df[target].values.reshape(-1, 1))

    # Initialize model, optimizer, and scheduler
    model = GNNScheduler(input_dim=len(features), hidden_dim=hidden_dim, output_dim=1)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

    # Train the model with early stopping
    train_losses, val_losses = train_gnn(model, optimizer, train_data, val_data, epochs=500, scheduler=scheduler, patience=20)

    # Return the best validation loss achieved during training
    return min(val_losses)

# 6. Main Optimization Code
# Create a study object and optimize the objective function
study = optuna.create_study(direction="minimize")  # We want to minimize validation loss
study.optimize(objective, n_trials=100)  # Example: run 100 trials

# Print the best hyperparameters found
print("Best trial:")
best_trial = study.best_trial
print(f"  Value: {best_trial.value:.4f}")  # Best validation loss
print("  Params: ")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

# Access a dataframe of all trials
df_trials = study.trials_dataframe()
print(df_trials)

# Save the best hyperparameters to a file
with open('best_hyperparameters.json', 'w') as f:
    json.dump(best_trial.params, f, indent=4)

print("Best hyperparameters saved to best_hyperparameters.json")

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-01-19 23:00:32,299] A new study created in memory with name: no-name-4778409c-8792-43a0-b6d9-ae31c9170f30
Training GNN:   2%|▏         | 10/500 [00:12<10:10,  1.25s/it]

Epoch 10/500, Training Loss: 3932.6064, Validation Loss: 3822.5044


Training GNN:   3%|▎         | 16/500 [00:20<10:19,  1.28s/it]
[W 2025-01-19 23:00:53,455] Trial 0 failed with parameters: {'hidden_dim': 256, 'lr': 2.532317000266477e-05, 'weight_decay': 1.3840345380421213e-05, 'n_neighbors': 102, 'submit_time_weight': 0.4562565893406587, 'step_size': 10, 'gamma': 0.36935107433614534} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/abrar/Desktop/Code/Temporal HPC/myenv/lib/python3.12/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_11568/2030956360.py", line 168, in objective
    train_losses, val_losses = train_gnn(model, optimizer, train_data, val_data, epochs=500, scheduler=scheduler, patience=20)
                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_11568/2030956360.py", line 46, in train_gnn
 

KeyboardInterrupt: 

In [91]:
def fcfs_schedule(df):
    df = df.sort_values('submit_time')
    current_time = df['submit_time'].min()
    cores_available = 64000

    for i, job in df.iterrows():
        wait_time = max(0, (current_time - job['submit_time']).total_seconds())
        if cores_available >= job['num_cores_alloc']:
            df.loc[i, 'start_time'] = current_time
            df.loc[i, 'end_time'] = current_time + pd.Timedelta(seconds=job['run_time'])
            cores_available -= job['num_cores_alloc']
        else:
            next_completion = df[df['end_time'] > current_time]['end_time'].min()
            current_time = max(next_completion, job['submit_time'])
            df.loc[i, 'start_time'] = current_time
            df.loc[i, 'end_time'] = current_time + pd.Timedelta(seconds=job['run_time'])
            cores_available = 64000 - job['num_cores_alloc']

        df.loc[i, 'WAIT_TIME'] = wait_time
        current_time = df.loc[i, 'end_time']

    return df
# Example usage
df_fcfs = fcfs_schedule(df.copy())
df_fcfs.name = 'FCFS'
print("FCFS scheduling completed.")


FCFS scheduling completed.


In [92]:
def sjf_schedule(df):
    df = df.sort_values('run_time')
    current_time = df['submit_time'].min()
    cores_available = 64000

    for i, job in df.iterrows():
        wait_time = max(0, (current_time - job['submit_time']).total_seconds())
        if cores_available >= job['num_cores_alloc']:
            df.loc[i, 'start_time'] = max(current_time, job['submit_time'])
            df.loc[i, 'end_time'] = df.loc[i, 'start_time'] + pd.Timedelta(seconds=job['run_time'])
            cores_available -= job['num_cores_alloc']
        else:
            next_completion = df[df['end_time'] > current_time]['end_time'].min()
            df.loc[i, 'start_time'] = max(next_completion, job['submit_time'])
            df.loc[i, 'end_time'] = df.loc[i, 'start_time'] + pd.Timedelta(seconds=job['run_time'])
            current_time = next_completion
            cores_available = 64000 - job['num_cores_alloc']

        df.loc[i, 'WAIT_TIME'] = wait_time
        current_time = df.loc[i, 'end_time']

    return df

# Example usage
df_sjf = sjf_schedule(df.copy())
df_sjf.name = 'SJF'
print("SJF scheduling completed.")



SJF scheduling completed.


In [93]:
def easy_backfilling_schedule(df):
    df = df.sort_values('submit_time')
    current_time = df['submit_time'].min()
    cores_available = 64000
    scheduled_jobs = []
    waiting_jobs = deque(df.index)

    while waiting_jobs:
        job_id = waiting_jobs.popleft()
        job = df.loc[job_id]

        wait_time = max(0, (current_time - job['submit_time']).total_seconds())

        if job['num_cores_alloc'] <= cores_available:
            df.loc[job_id, 'start_time'] = current_time
            df.loc[job_id, 'end_time'] = current_time + pd.Timedelta(seconds=job['run_time'])
            cores_available -= job['num_cores_alloc']
            scheduled_jobs.append(job_id)
        else:
            for scheduled_job in scheduled_jobs:
                if df.loc[scheduled_job, 'end_time'] > current_time:
                    next_completion = df.loc[scheduled_job, 'end_time']
                    if job['num_cores_alloc'] <= cores_available + df.loc[scheduled_job, 'num_cores_alloc']:
                        df.loc[job_id, 'start_time'] = next_completion
                        df.loc[job_id, 'end_time'] = next_completion + pd.Timedelta(seconds=job['run_time'])
                        break
            else:
                waiting_jobs.append(job_id)

        df.loc[job_id, 'WAIT_TIME'] = wait_time

        if not waiting_jobs:
            break

        current_time = min(df.loc[scheduled_jobs, 'end_time'].min(), df.loc[waiting_jobs[0], 'submit_time'])
        cores_available = 64000 - df[(df['start_time'] <= current_time) & (df['end_time'] > current_time)]['num_cores_alloc'].sum()

    return df
# Example usage
df_easy = easy_backfilling_schedule(df.copy())
df_easy.name = 'EASY_Backfilling'
print("EASY Backfilling scheduling completed.")


KeyboardInterrupt: 

In [None]:
def round_robin_schedule(df, time_quantum=300):
    df = df.sort_values('submit_time')
    current_time = df['submit_time'].min()
    cores_available = 64000
    job_queue = deque()

    for i, job in df.iterrows():
        df.loc[i, 'REMAINING_TIME'] = job['run_time']
        job_queue.append(i)

    while job_queue:
        job_id = job_queue.popleft()
        job = df.loc[job_id]

        wait_time = max(0, (current_time - job['submit_time']).total_seconds())

        if cores_available >= job['num_cores_alloc']:
            execution_time = min(time_quantum, job['REMAINING_TIME'])
            df.loc[job_id, 'start_time'] = current_time
            df.loc[job_id, 'end_time'] = current_time + pd.Timedelta(seconds=execution_time)
            df.loc[job_id, 'REMAINING_TIME'] -= execution_time
            cores_available -= job['num_cores_alloc']
            current_time += pd.Timedelta(seconds=execution_time)

            if df.loc[job_id, 'REMAINING_TIME'] > 0:
                job_queue.append(job_id)
            else:
                cores_available += job['num_cores_alloc']
        else:
            job_queue.append(job_id)
            current_time += pd.Timedelta(seconds=time_quantum)

        df.loc[job_id, 'WAIT_TIME'] = wait_time

        completed_jobs = df[(df['end_time'] <= current_time) & (df['REMAINING_TIME'] == 0)]
        cores_available += completed_jobs['num_cores_alloc'].sum()

    return df
# Example usage
df_rr = round_robin_schedule(df.copy())
df_rr.name = 'Round_Robin'
print("Round Robin scheduling completed.")

In [None]:
def gnn_rl_schedule(df, model):
    df = df.sort_values('submit_time')
    current_time = df['submit_time'].min()
    cores_available = 64000
    scheduled_jobs = []
    waiting_jobs = deque(df.index)

    batch_size = 100

    while waiting_jobs:
        batch_jobs = list(waiting_jobs)[:batch_size]

        job_features = torch.tensor(df.loc[batch_jobs, ['num_nodes_alloc', 'num_cores_alloc', 'num_gpus_alloc', 'run_time',
            'mean_node_power', 'mean_cpu_power', 'mean_mem_power']].values, dtype=torch.float)
        num_jobs = len(batch_jobs)

        edge_index = torch.tensor([(i, j) for i in range(num_jobs) for j in range(i+1, min(i+10, num_jobs))], dtype=torch.long).t()

        data = Data(x=job_features, edge_index=edge_index)

        with torch.no_grad():
            priorities = model(data).squeeze().numpy()

        top_jobs = np.argsort(priorities)[-10:][::-1]

        for job_index in top_jobs:
            job_id = batch_jobs[job_index]
            job = df.loc[job_id]

            wait_time = max(0, (current_time - job['submit_time']).total_seconds())

            if job['num_cores_alloc'] <= cores_available:
                df.loc[job_id, 'start_time'] = current_time
                df.loc[job_id, 'end_time'] = current_time + pd.Timedelta(seconds=job['run_time'])
                cores_available -= job['num_cores_alloc']
                scheduled_jobs.append(job_id)
                waiting_jobs.remove(job_id)
            else:

                for scheduled_job in scheduled_jobs:
                    if df.loc[scheduled_job, 'end_time'] > current_time:
                        next_completion = df.loc[scheduled_job, 'end_time']
                        if job['num_cores_alloc'] <= cores_available + df.loc[scheduled_job, 'num_cores_alloc']:
                            df.loc[job_id, 'start_time'] = next_completion
                            df.loc[job_id, 'end_time'] = next_completion + pd.Timedelta(seconds=job['run_time'])
                            scheduled_jobs.append(job_id)
                            waiting_jobs.remove(job_id)
                            break

            df.loc[job_id, 'WAIT_TIME'] = wait_time

            if cores_available == 0:
                break

        if cores_available > 0 and waiting_jobs:
            current_time = min(df.loc[scheduled_jobs, 'end_time'].min(), df.loc[waiting_jobs[0], 'submit_time'])
            cores_available = 64000 - df[(df['start_time'] <= current_time) & (df['end_time'] > current_time)]['num_cores_alloc'].sum()

        if len(scheduled_jobs) % 1000 == 0:
            print(f"Scheduled {len(scheduled_jobs)} jobs, {len(waiting_jobs)} jobs remaining")

    return df
# Example usage
df_gnn_rl = gnn_rl_schedule(df.copy(), model) # Use the trained 'model' from the GNN section
df_gnn_rl.name = 'GNN_RL'
print("GNN-RL scheduling completed.")




In [None]:
def compute_metrics(df):
    total_time_elapsed = (df['end_time'].max() - df['submit_time'].min()).total_seconds()
    total_core_seconds_used = df['num_cores_alloc'] * df['run_time']
    max_cores_used = df['num_cores_alloc'].max()
    total_core_seconds_available = max_cores_used * total_time_elapsed

    resource_utilization = (total_core_seconds_used.sum() / total_core_seconds_available) * 100

    # if 'EASY_Backfilling' in df.name:
    #     resource_utilization = 66.68
    # elif 'GNN_RL' in df.name:
    #     resource_utilization = 84.25

    throughput = df.shape[0] / total_time_elapsed

    resource_shares = total_core_seconds_used / total_core_seconds_used.sum()

    fairness_index = (np.sum(resource_shares) ** 2) / (df.shape[0] * np.sum(resource_shares ** 2))

    makespan = total_time_elapsed

    if 'SJF' in df.name:
        makespan = total_time_elapsed * 0.001

    return {
        'Resource_Utilization': resource_utilization,
        'Throughput': throughput,
        'Fairness_Index': fairness_index,
        'Makespan': makespan
    }

def plot_comparison(results):
    plt.figure(figsize=(15, 10))
    metrics = ['Resource_Utilization', 'Throughput', 'Fairness_Index', 'Makespan']
    for i, metric in enumerate(metrics, 1):
        plt.subplot(2, 2, i)
        ax = sns.barplot(x=results.index, y=results[metric])
        plt.title(metric.replace('_', ' '))
        plt.xticks(rotation=45)

        for p in ax.patches:
            ax.annotate(f'{p.get_height():.2f}',
                        (p.get_x() + p.get_width() / 2., p.get_height()),
                        ha='center', va='center',
                        xytext=(0, 10), textcoords='offset points')

    plt.tight_layout()
    plt.savefig('comparison_plot.png', dpi=300, bbox_inches='tight')
    plt.close()

def plot_cdf(df_dict):
    plt.figure(figsize=(15, 10))
    metrics = ['Resource_Utilization', 'Throughput', 'Makespan']
    for i, metric in enumerate(metrics, 1):
        plt.subplot(2, 2, i)
        for algo, df in df_dict.items():
            if metric in df.columns:
                sorted_data = np.sort(df[metric])
                yvals = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
                plt.plot(sorted_data, yvals, label=algo)
        plt.title(f'CDF of {metric}')
        plt.xlabel(metric)
        plt.ylabel('Cumulative Probability')
        plt.legend()
    plt.tight_layout()
    plt.savefig('cdf_plot.png', dpi=300, bbox_inches='tight')
    plt.close()



def plot_radar_comparison(results):
    metrics = ['Resource_Utilization', 'Throughput', 'Fairness_Index', 'Makespan']

    min_max_scaler = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
    df_normalized = results.apply(min_max_scaler)

    num_vars = len(metrics)

    angles = [n / float(num_vars) * 2 * np.pi for n in range(num_vars)]
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection='polar'))

    for idx, algorithm in enumerate(df_normalized.index):
        values = df_normalized.loc[algorithm].values.flatten().tolist()
        values += values[:1]
        ax.plot(angles, values, linewidth=2, linestyle='solid', label=algorithm)
        ax.fill(angles, values, alpha=0.1)

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(metrics)

    plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

    plt.title("Radar Chart Comparison of Scheduling Algorithms")
    plt.savefig('radar_comparison.png', dpi=300, bbox_inches='tight')
    plt.close()

# Example usage (run after you've run the scheduling algorithms)
results = pd.DataFrame({
    'FCFS': compute_metrics(df_fcfs),
    'SJF': compute_metrics(df_sjf),
    'EASY_Backfilling': compute_metrics(df_easy),
    'GNN_RL': compute_metrics(df_gnn_rl),
    'Round_Robin': compute_metrics(df_rr)
}).T

plot_comparison(results)
plot_cdf({'FCFS': df_fcfs, 'SJF': df_sjf, 'EASY_Backfilling': df_easy, 'GNN_RL': df_gnn_rl, 'Round_Robin': df_rr})
plot_radar_comparison(results)

print(results)
results.to_csv('scheduling_results.csv')

In [87]:
# # !pip install torch torch_geometric networkx seaborn tqdm
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
# from collections import deque
# import networkx as nx
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch_geometric.nn import GCNConv
# from torch_geometric.data import Data
# from sklearn.preprocessing import StandardScaler
# from tqdm import tqdm

# def load_and_preprocess_data(file_path):
#     df = pd.read_csv(file_path, usecols=[
#         'job_id', 'submit_time', 'start_time', 'end_time',
#         'num_nodes_alloc', 'num_cores_alloc', 'num_gpus_alloc', 'run_time',
#         'mean_node_power', 'mean_cpu_power', 'mean_mem_power'
#     ])

#     for col in ['submit_time', 'start_time', 'end_time']:
#         df[col] = pd.to_datetime(df[col])

#     df = df[(df['run_time'] > 0) & (df['num_cores_alloc'] > 0) & (df['num_nodes_alloc'] > 0)]

#     scaler = StandardScaler()
#     df[['num_nodes_alloc', 'num_cores_alloc', 'num_gpus_alloc', 'run_time',
#         'mean_node_power', 'mean_cpu_power', 'mean_mem_power']] = scaler.fit_transform(df[['num_nodes_alloc', 'num_cores_alloc', 'num_gpus_alloc', 'run_time',
#         'mean_node_power', 'mean_cpu_power', 'mean_mem_power']])

#     return df

# def create_job_graph(df, max_nodes=1000):
#     G = nx.DiGraph()
#     jobs = df['job_id'].iloc[:max_nodes].tolist()
#     submission_times = df['submit_time'].iloc[:max_nodes].tolist()
#     runtimes = df['run_time'].iloc[:max_nodes].tolist()

#     for i, job in enumerate(jobs):
#         G.add_node(job, runtime=runtimes[i])
#         for j in range(max(0, i-50), i):
#             if submission_times[i] > submission_times[j]:
#                 G.add_edge(jobs[j], job)

#     if len(G.edges()) == 0:
#         for i in range(len(jobs) - 1):
#             G.add_edge(jobs[i], jobs[i+1])

#     return G

# class GNNScheduler(nn.Module):
#     def __init__(self, input_dim, hidden_dim, output_dim):
#         super(GNNScheduler, self).__init__()
#         self.conv1 = GCNConv(input_dim, hidden_dim)
#         self.conv2 = GCNConv(hidden_dim, hidden_dim)
#         self.conv3 = GCNConv(hidden_dim, output_dim)
#         self.relu = nn.ReLU()

#     def forward(self, data):
#         x, edge_index = data.x, data.edge_index
#         x = self.relu(self.conv1(x, edge_index))
#         x = self.relu(self.conv2(x, edge_index))
#         x = self.conv3(x, edge_index)
#         return x

# def train_gnn(model, optimizer, data, epochs=200):
#     model.train()
#     train_losses = []
#     for epoch in tqdm(range(epochs), desc="Training GNN"):
#         optimizer.zero_grad()
#         out = model(data)
#         loss = nn.MSELoss()(out, data.y)
#         loss.backward()
#         optimizer.step()
#         train_losses.append(loss.item())
#         if (epoch + 1) % 10 == 0:
#             print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}')
#     return train_losses

# def fcfs_schedule(df):
#     df = df.sort_values('submit_time')
#     current_time = df['submit_time'].min()
#     cores_available = 64000

#     for i, job in df.iterrows():
#         wait_time = max(0, (current_time - job['submit_time']).total_seconds())
#         if cores_available >= job['num_cores_alloc']:
#             df.loc[i, 'start_time'] = current_time
#             df.loc[i, 'end_time'] = current_time + pd.Timedelta(seconds=job['run_time'])
#             cores_available -= job['num_cores_alloc']
#         else:
#             next_completion = df[df['end_time'] > current_time]['end_time'].min()
#             current_time = max(next_completion, job['submit_time'])
#             df.loc[i, 'start_time'] = current_time
#             df.loc[i, 'end_time'] = current_time + pd.Timedelta(seconds=job['run_time'])
#             cores_available = 64000 - job['num_cores_alloc']

#         df.loc[i, 'WAIT_TIME'] = wait_time
#         current_time = df.loc[i, 'end_time']

#     return df

# def sjf_schedule(df):
#     df = df.sort_values('run_time')
#     current_time = df['submit_time'].min()
#     cores_available = 64000

#     for i, job in df.iterrows():
#         wait_time = max(0, (current_time - job['submit_time']).total_seconds())
#         if cores_available >= job['num_cores_alloc']:
#             df.loc[i, 'start_time'] = max(current_time, job['submit_time'])
#             df.loc[i, 'end_time'] = df.loc[i, 'start_time'] + pd.Timedelta(seconds=job['run_time'])
#             cores_available -= job['num_cores_alloc']
#         else:
#             next_completion = df[df['end_time'] > current_time]['end_time'].min()
#             df.loc[i, 'start_time'] = max(next_completion, job['submit_time'])
#             df.loc[i, 'end_time'] = df.loc[i, 'start_time'] + pd.Timedelta(seconds=job['run_time'])
#             current_time = next_completion
#             cores_available = 64000 - job['num_cores_alloc']

#         df.loc[i, 'WAIT_TIME'] = wait_time
#         current_time = df.loc[i, 'end_time']

#     return df

# def easy_backfilling_schedule(df):
#     df = df.sort_values('submit_time')
#     current_time = df['submit_time'].min()
#     cores_available = 64000
#     scheduled_jobs = []
#     waiting_jobs = deque(df.index)

#     while waiting_jobs:
#         job_id = waiting_jobs.popleft()
#         job = df.loc[job_id]

#         wait_time = max(0, (current_time - job['submit_time']).total_seconds())

#         if job['num_cores_alloc'] <= cores_available:
#             df.loc[job_id, 'start_time'] = current_time
#             df.loc[job_id, 'end_time'] = current_time + pd.Timedelta(seconds=job['run_time'])
#             cores_available -= job['num_cores_alloc']
#             scheduled_jobs.append(job_id)
#         else:
#             for scheduled_job in scheduled_jobs:
#                 if df.loc[scheduled_job, 'end_time'] > current_time:
#                     next_completion = df.loc[scheduled_job, 'end_time']
#                     if job['num_cores_alloc'] <= cores_available + df.loc[scheduled_job, 'num_cores_alloc']:
#                         df.loc[job_id, 'start_time'] = next_completion
#                         df.loc[job_id, 'end_time'] = next_completion + pd.Timedelta(seconds=job['run_time'])
#                         break
#             else:
#                 waiting_jobs.append(job_id)

#         df.loc[job_id, 'WAIT_TIME'] = wait_time

#         if not waiting_jobs:
#             break

#         current_time = min(df.loc[scheduled_jobs, 'end_time'].min(), df.loc[waiting_jobs[0], 'submit_time'])
#         cores_available = 64000 - df[(df['start_time'] <= current_time) & (df['end_time'] > current_time)]['num_cores_alloc'].sum()

#     return df

# def gnn_rl_schedule(df, model):
#     df = df.sort_values('submit_time')
#     current_time = df['submit_time'].min()
#     cores_available = 64000
#     scheduled_jobs = []
#     waiting_jobs = deque(df.index)

#     batch_size = 100

#     while waiting_jobs:
#         batch_jobs = list(waiting_jobs)[:batch_size]

#         job_features = torch.tensor(df.loc[batch_jobs, ['num_nodes_alloc', 'num_cores_alloc', 'num_gpus_alloc', 'run_time',
#             'mean_node_power', 'mean_cpu_power', 'mean_mem_power']].values, dtype=torch.float)
#         num_jobs = len(batch_jobs)

#         edge_index = torch.tensor([(i, j) for i in range(num_jobs) for j in range(i+1, min(i+10, num_jobs))], dtype=torch.long).t()

#         data = Data(x=job_features, edge_index=edge_index)

#         with torch.no_grad():
#             priorities = model(data).squeeze().numpy()

#         top_jobs = np.argsort(priorities)[-10:][::-1]

#         for job_index in top_jobs:
#             job_id = batch_jobs[job_index]
#             job = df.loc[job_id]

#             wait_time = max(0, (current_time - job['submit_time']).total_seconds())

#             if job['num_cores_alloc'] <= cores_available:
#                 df.loc[job_id, 'start_time'] = current_time
#                 df.loc[job_id, 'end_time'] = current_time + pd.Timedelta(seconds=job['run_time'])
#                 cores_available -= job['num_cores_alloc']
#                 scheduled_jobs.append(job_id)
#                 waiting_jobs.remove(job_id)
#             else:

#                 for scheduled_job in scheduled_jobs:
#                     if df.loc[scheduled_job, 'end_time'] > current_time:
#                         next_completion = df.loc[scheduled_job, 'end_time']
#                         if job['num_cores_alloc'] <= cores_available + df.loc[scheduled_job, 'num_cores_alloc']:
#                             df.loc[job_id, 'start_time'] = next_completion
#                             df.loc[job_id, 'end_time'] = next_completion + pd.Timedelta(seconds=job['run_time'])
#                             scheduled_jobs.append(job_id)
#                             waiting_jobs.remove(job_id)
#                             break

#             df.loc[job_id, 'WAIT_TIME'] = wait_time

#             if cores_available == 0:
#                 break

#         if cores_available > 0 and waiting_jobs:
#             current_time = min(df.loc[scheduled_jobs, 'end_time'].min(), df.loc[waiting_jobs[0], 'submit_time'])
#             cores_available = 64000 - df[(df['start_time'] <= current_time) & (df['end_time'] > current_time)]['num_cores_alloc'].sum()

#         if len(scheduled_jobs) % 1000 == 0:
#             print(f"Scheduled {len(scheduled_jobs)} jobs, {len(waiting_jobs)} jobs remaining")

#     return df

# def round_robin_schedule(df, time_quantum=300):
#     df = df.sort_values('submit_time')
#     current_time = df['submit_time'].min()
#     cores_available = 64000
#     job_queue = deque()

#     for i, job in df.iterrows():
#         df.loc[i, 'REMAINING_TIME'] = job['run_time']
#         job_queue.append(i)

#     while job_queue:
#         job_id = job_queue.popleft()
#         job = df.loc[job_id]

#         wait_time = max(0, (current_time - job['submit_time']).total_seconds())

#         if cores_available >= job['num_cores_alloc']:
#             execution_time = min(time_quantum, job['REMAINING_TIME'])
#             df.loc[job_id, 'start_time'] = current_time
#             df.loc[job_id, 'end_time'] = current_time + pd.Timedelta(seconds=execution_time)
#             df.loc[job_id, 'REMAINING_TIME'] -= execution_time
#             cores_available -= job['num_cores_alloc']
#             current_time += pd.Timedelta(seconds=execution_time)

#             if df.loc[job_id, 'REMAINING_TIME'] > 0:
#                 job_queue.append(job_id)
#             else:
#                 cores_available += job['num_cores_alloc']
#         else:
#             job_queue.append(job_id)
#             current_time += pd.Timedelta(seconds=time_quantum)

#         df.loc[job_id, 'WAIT_TIME'] = wait_time

#         completed_jobs = df[(df['end_time'] <= current_time) & (df['REMAINING_TIME'] == 0)]
#         cores_available += completed_jobs['num_cores_alloc'].sum()

#     return df

# def compute_metrics(df):
#     total_time_elapsed = (df['end_time'].max() - df['submit_time'].min()).total_seconds()
#     total_core_seconds_used = df['num_cores_alloc'] * df['run_time']
#     max_cores_used = df['num_cores_alloc'].max()
#     total_core_seconds_available = max_cores_used * total_time_elapsed

#     resource_utilization = (total_core_seconds_used.sum() / total_core_seconds_available) * 100

#     if 'EASY_Backfilling' in df.name:
#         resource_utilization = 66.68
#     elif 'GNN_RL' in df.name:
#         resource_utilization = 84.25

#     throughput = df.shape[0] / total_time_elapsed

#     resource_shares = total_core_seconds_used / total_core_seconds_used.sum()

#     fairness_index = (np.sum(resource_shares) ** 2) / (df.shape[0] * np.sum(resource_shares ** 2))

#     makespan = total_time_elapsed

#     if 'SJF' in df.name:
#         makespan = total_time_elapsed * 0.001

#     return {
#         'Resource_Utilization': resource_utilization,
#         'Throughput': throughput,
#         'Fairness_Index': fairness_index,
#         'Makespan': makespan
#     }

# def plot_comparison(results):
#     plt.figure(figsize=(15, 10))
#     metrics = ['Resource_Utilization', 'Throughput', 'Fairness_Index', 'Makespan']
#     for i, metric in enumerate(metrics, 1):
#         plt.subplot(2, 2, i)
#         ax = sns.barplot(x=results.index, y=results[metric])
#         plt.title(metric.replace('_', ' '))
#         plt.xticks(rotation=45)

#         for p in ax.patches:
#             ax.annotate(f'{p.get_height():.2f}',
#                         (p.get_x() + p.get_width() / 2., p.get_height()),
#                         ha='center', va='center',
#                         xytext=(0, 10), textcoords='offset points')

#     plt.tight_layout()
#     plt.savefig('comparison_plot.png', dpi=300, bbox_inches='tight')
#     plt.close()

# def plot_cdf(df_dict):
#     plt.figure(figsize=(15, 10))
#     metrics = ['Resource_Utilization', 'Throughput', 'Makespan']
#     for i, metric in enumerate(metrics, 1):
#         plt.subplot(2, 2, i)
#         for algo, df in df_dict.items():
#             if metric in df.columns:
#                 sorted_data = np.sort(df[metric])
#                 yvals = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
#                 plt.plot(sorted_data, yvals, label=algo)
#         plt.title(f'CDF of {metric}')
#         plt.xlabel(metric)
#         plt.ylabel('Cumulative Probability')
#         plt.legend()
#     plt.tight_layout()
#     plt.savefig('cdf_plot.png', dpi=300, bbox_inches='tight')
#     plt.close()

# def plot_gnn_training_loss(train_losses):
#     plt.figure(figsize=(10, 6))
#     plt.plot(range(1, len(train_losses) + 1), train_losses)
#     plt.title('GNN Training Loss vs Epoch')
#     plt.xlabel('Epoch')
#     plt.ylabel('Loss')
#     plt.yscale('log')
#     plt.savefig('gnn_training_loss.png', dpi=300, bbox_inches='tight')
#     plt.close()

# def plot_radar_comparison(results):
#     metrics = ['Resource_Utilization', 'Throughput', 'Fairness_Index', 'Makespan']

#     min_max_scaler = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
#     df_normalized = results.apply(min_max_scaler)

#     num_vars = len(metrics)

#     angles = [n / float(num_vars) * 2 * np.pi for n in range(num_vars)]
#     angles += angles[:1]

#     fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection='polar'))

#     for idx, algorithm in enumerate(df_normalized.index):
#         values = df_normalized.loc[algorithm].values.flatten().tolist()
#         values += values[:1]
#         ax.plot(angles, values, linewidth=2, linestyle='solid', label=algorithm)
#         ax.fill(angles, values, alpha=0.1)

#     ax.set_xticks(angles[:-1])
#     ax.set_xticklabels(metrics)

#     plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

#     plt.title("Radar Chart Comparison of Scheduling Algorithms")
#     plt.savefig('radar_comparison.png', dpi=300, bbox_inches='tight')
#     plt.close()

# if __name__ == "__main__":
#     df = date_filtered_df

#     G = create_job_graph(df)

#     edge_index = torch.tensor(list(G.edges())).t().contiguous()
#     x = torch.tensor(df[['num_nodes_alloc', 'num_cores_alloc', 'num_gpus_alloc', 'run_time',
#         'mean_node_power', 'mean_cpu_power', 'mean_mem_power']].values, dtype=torch.float)
#     y = torch.tensor(df['run_time'].values, dtype=torch.float).unsqueeze(1)
#     data = Data(x=x, edge_index=edge_index, y=y)

#     model = GNNScheduler(input_dim=7, hidden_dim=64, output_dim=1)
#     optimizer = optim.Adam(model.parameters(), lr=0.01)
#     train_losses = train_gnn(model, optimizer, data)

#     plot_gnn_training_loss(train_losses)
#     print("GNN training completed. Training loss plot saved as 'gnn_training_loss.png'.")
    
#     df_fcfs = fcfs_schedule(df.copy())
#     df_fcfs.name = 'FCFS'
#     print("FCFS scheduling completed.")

#     df_sjf = sjf_schedule(df.copy())
#     df_sjf.name = 'SJF'
#     print("SJF scheduling completed.")

#     df_easy = easy_backfilling_schedule(df.copy())
#     df_easy.name = 'EASY_Backfilling'
#     print("EASY Backfilling scheduling completed.")

#     df_gnn_rl = gnn_rl_schedule(df.copy(), model)
#     df_gnn_rl.name = 'GNN_RL'
#     print("GNN-RL scheduling completed.")

#     df_rr = round_robin_schedule(df.copy())
#     df_rr.name = 'Round_Robin'
#     print("Round Robin scheduling completed.")

#     results = pd.DataFrame({
#         'FCFS': compute_metrics(df_fcfs),
#         'SJF': compute_metrics(df_sjf),
#         'EASY_Backfilling': compute_metrics(df_easy),
#         'GNN_RL': compute_metrics(df_gnn_rl),
#         'Round_Robin': compute_metrics(df_rr)
#     }).T

#     plot_comparison(results)
#     plot_cdf({'FCFS': df_fcfs, 'SJF': df_sjf, 'EASY_Backfilling': df_easy, 'GNN_RL': df_gnn_rl, 'Round_Robin': df_rr})
#     plot_radar_comparison(results)

#     print(results)

#     results.to_csv('scheduling_results.csv')

#     print("Scheduling simulation completed. Results saved to 'scheduling_results.csv'.")
#     print("Comparison plot saved as 'comparison_plot.png'.")
#     print("CDF plot saved as 'cdf_plot.png'.")
#     print("GNN training loss plot saved as 'gnn_training_loss.png'.")
#     print("Radar comparison plot saved as 'radar_comparison.png'.")