In [1]:
import pandas as pd
import plotly.express as px
import plotly.io as pio

# Load the Slurm accounting export and overwrite its header with explicit column names.
data = pd.read_csv("../Data/Slurm-data-2.csv")
data.columns = ['UID','GID','JobIDRaw','Group','Account','JobName','TimelimitRaw','Submit','Start','End','State','ExitCode',
                'ReservationId','Reservation','Priority','Eligible','Constraints','SystemCPU','CPUTimeRAW','ElapsedRaw',
                'Layout','NTasks','QOSREQ','QOS','Restarts','WorkDir','ConsumedEnergyRaw','FailedNode','AveDiskRead',
                'AveDiskWrite','MaxDiskRead','MaxDiskWrite','Partition','Reason','Suspended','AllocNodes','AveRSS','MaxRSS',
                'DerivedExitCode','AveVMSize','MaxVMSize','ReqMem','ReqNodes','NNodes','Planned','PlannedCPURAW','NCPUS',
                'UserCPU','ReqCPUS','TotalCPU','TRESUsageInTot','TRESUsageOutTot','ReqTRES','AllocTRES','TRESUsageInMax',
                'TRESUsageOutMax','Flags','Comment','SystemComment','AdminComment']

# Drop rows with missing JobIDRaw, Submit or Start, then order the jobs by JobIDRaw.
# To verify that no missing values remain in those columns:
#print(data.shape)
#nan_columns = data.columns[data.isna().any()]
#print(data[nan_columns])
data = (
    data
    .dropna(subset=['JobIDRaw', 'Submit', 'Start'])
    .sort_values(by=['JobIDRaw'], ascending=True)
)


def to_epoch_seconds(timestamps):
    """Parse ISO-like 'YYYY-MM-DDTHH:MM:SS' strings and return seconds since 1970-01-01.

    errors='coerce' turns values that do not match the format into NaT instead
    of raising; those rows are not dropped here and stay missing in the result.
    """
    parsed = pd.to_datetime(timestamps, format='%Y-%m-%dT%H:%M:%S', errors='coerce')
    return (parsed - pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')


data['Start'] = to_epoch_seconds(data["Start"])
data['Submit'] = to_epoch_seconds(data['Submit'])

# Queue wait per job, in seconds. 'Start' and 'Submit' already hold epoch
# seconds at this point, so one vectorized subtraction replaces the per-row
# .iloc lookups (each of which materialized a full row Series twice per job).
# Kept as a plain list, in the same row order as `data`.
wait_times = (data['Start'] - data['Submit']).tolist()

# Line plot of wait time against job ID, with one trace per job state.
# Plotly Express matches the keys of `labels` against column names, so the
# wait times are attached as a column and every axis is referenced by name;
# this makes the axis labels apply deterministically.
plot = px.line(data.assign(WaitTime=wait_times), x='JobIDRaw', y='WaitTime', color='State',
               title='Wait Time vs JobID',
               labels={'JobIDRaw': 'Job ID', 'WaitTime': 'Wait Time (seconds)'},
               color_discrete_map={'timeout': 'black'})

#Can add lots of color,opacity, size, shape info to the datapoints
plot.update_traces(mode='markers+lines', marker=dict(size=6))

# Initial viewport: job IDs 2960-3165 and wait times -1 to 55 seconds.
# The figure stays interactive, so the reader can still zoom or pan elsewhere.
plot.update_layout(
    xaxis=dict(range=[2960, 3165]),
    yaxis=dict(range=[-1, 55]),
)
plot.show()

# Save an interactive HTML copy and a static PNG copy of the figure.
plot.write_html("../Plots/Slurm-Data-2-plot.html")
plot.write_image("../Plots/WaitTimevsJobID.png")

#Also, can click and drag on the plot to zoom in on a specific area of the graph

In [2]:
#Task 5

# Tally the number of job rows per UID, keyed in order of first appearance.
uid_job_counts = {}

for user_id in data['UID']:
    # .get supplies 0 the first time a UID is seen.
    uid_job_counts[user_id] = uid_job_counts.get(user_id, 0) + 1

# Keep only the first whitespace-separated token of State, so values that carry
# trailing text collapse onto a single state label.
data['NormalizedState'] = data['State'].str.split().str[0]

# Distinct non-missing states in order of first appearance (unique() preserves
# that order); used to fix the stacking/legend order of the bar chart.
states = data['NormalizedState'].dropna().unique().tolist()

# One row per (UID, NormalizedState) pair with the number of jobs in "Count".
grouped = (
    data
    .value_counts(['UID', 'NormalizedState'])
    .reset_index(name="Count")
    # Casting UID to str makes the x axis categorical instead of continuous;
    # without it the bars are too skinny to see.
    .assign(UID=lambda counts: counts['UID'].astype(str))
)

axis_labels = {'Count': 'Number of Jobs Submitted', 'UID': 'User ID', 'NormalizedState': 'Job State'}
state_order = {'NormalizedState': states}

# Stacked bar chart: one bar per user, split into segments by job state.
bar = px.bar(
    grouped,
    x='UID',
    y='Count',
    color='NormalizedState',
    title='Jobs Submitted per User',
    labels=axis_labels,
    category_orders=state_order,
)
bar.show()

# Save an interactive HTML copy and a static PNG copy of the chart.
bar.write_html("../Plots/Jobs_Submitted_per_User.html")
bar.write_image("../Plots/Jobs_Submitted_per_User.png")

## Task 6

### Gemini API

In [None]:
#gemini-2.5-flash is free
#pro/ultra models cost money
#Those models will probably perform better
import google.generativeai as genai
from google.genai import types
from google import genai
import csv
import os

# The API key is read from the environment so it never appears in the notebook.
google_api_key = os.getenv("GOOGLE_API_KEY")
client = genai.Client(api_key=google_api_key)

# Upload the first image
# The file name must match the bar-chart cell's write_image() output exactly
# ("..._per_User.png", capital U); the lowercase spelling fails on
# case-sensitive filesystems.
image1_path = "../Plots/Jobs_Submitted_per_User.png"
uploaded_file = client.files.upload(file=image1_path)
response_md = "Gemini-analysis_output.md"

# Prepare the second image as inline data
image2_path = "../Plots/WaitTimevsJobID.png"
with open(image2_path, 'rb') as f:
    img2_bytes = f.read()

# Send the prompt together with both charts: the first as an uploaded file
# reference, the second as inline PNG bytes.
response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=[
        "Act as a data scientist to summarize the charts and provide a quantitative analysis of the key trends, relationships, and statistics of the 2 provided charts. Be specific and mention any notable patterns or outliers. Calculate meaningful statistics from the plots.",
        uploaded_file,
        types.Part.from_bytes(
            data=img2_bytes,
            mime_type='image/png'
        )
    ]
)

# Write the model's answer to a Markdown file under an "Analysis" heading.
with open(response_md, 'w', newline='', encoding='utf-8') as mdfile:
    mdfile.write("# Analysis\n")
    mdfile.write(response.text)


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



### OpenAI

In [None]:
#OpenAI API test
import os
from openai import OpenAI

# Use a dedicated name for the OpenAI client: a later cell calls
# client.models.generate_content(...) and expects `client` to still be the
# Gemini client, so rebinding `client` here would break that cell.
openai_client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")
)

# Minimal smoke test of the chat completions endpoint.
completion = openai_client.chat.completions.create(
    model="gpt-4o-mini",
    store=True,
    messages=[
        {"role": "user", "content": "write a haiku about ai"}
    ]
)

print(completion.choices[0].message)

#Error, requires payment


OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

### Claude

In [None]:
# Requires payment
# Setup tutorial: https://tilburg.ai/2025/01/beginners-tutorial-for-the-claude-api-python/

### Llama

In [None]:
#Free, but there is a waitlist to get an api key
#Might be able to do it by creating account with hugging face or kaggle

### NVIDIA

In [None]:
#Only one where use case example was analyzing graphs
#Not sure how to get it, maybe hugging face

In [None]:
#Gemini analysis from text only descriptions of plots, not actual images

# Path to the raw CSV. Deliberately NOT named `data`: that name holds the
# cleaned DataFrame used by the plotting cells, and rebinding it to a string
# would break them on re-run.
data_path = "../Data/Slurm-data-2.csv"
bar_chart = "../Plots/Jobs_Submitted_per_User.html"
response_md = "Gemini-analysis_output.md"

# Render the whole CSV as text so it can be embedded in the prompt.
df = pd.read_csv(data_path)
data_string = df.to_string()

#Job submitted per user
bar_chart_description = {
    "x_axis": "User ID",
    "y_axis": "Number of Jobs Submitted",
    "bar_segments": "State (completed, cancelled, failed, timeout, node_fail)",
    "description": "Each bar represents a user, the total height of the bar shows the total number of jobs submitted by that user, and the segments within the bar show the count of jobs in each state for that user. Failed is blue, Completed is red, Cancelled is green, Node_fail is purple, timeout is orange."
}

#Wait time vs Job ID
line_chart_description = {
    "x_axis": "Job ID",
    "y_axis": "Wait time",
    "data_point": "Wait time for a specific job ID",
    "description": "The plot displays a series of connected points (and lines) where the horizontal position represents the Job ID (specifically focused between 2960 and 3165), the vertical position indicates the wait time in seconds (ranging from -1 to 55). It visualizes how wait times fluctuate across a specific range of submitted jobs, differentiated by their final status."
}

# The prompt combines the instructions, the full data dump and both chart descriptions.
prompt = f"Act as a data scientist to analyze the following CSV data and provide a quantitative summary of the key trends, relationships, and statistics. Be specific and mention any notable patterns or outliers. Once you have summarized the data as a whole, focus in the relationship between the UID, count, and State columns. There is a bar chart that plots User ID along the x axis, number of jobs submitted on the y axis, and each bar is seperated by number of jobs in each state so that you can see total number of jobs submitted per user, along with how many jobs per user were finished, cancelled, failed, etc. Provide a quantitative analyis of this plot and highlight patterns, trends, and outliers. Preform appropriate calculations to show patterns numerically. Do the same operation with the third plot provided, which is a line plot representing wait time for each job \n\nData:\n{data_string}\n\nBar Chart Description:\n{bar_chart_description}\n\nLine Chart Description:\n{line_chart_description}"

# Relies on `client` being the Gemini client created in the Gemini API cell.
response = client.models.generate_content(model='gemini-2.0-flash', contents=prompt)

# Write the model's answer to a Markdown file under an "Analysis" heading.
with open(response_md, 'w', newline='', encoding='utf-8') as mdfile:
    mdfile.write("# Analysis\n")
    mdfile.write(response.text)



