# CircleCI Job Analysis Report

## Setup and Configuration

In [None]:
# Setup and configuration
import analysis
import pandas as pd
import matplotlib.pyplot as plt
import os

# Initialize the analysis environment. The returned mapping provides the
# `pp` and `summarize_dataset` helpers that the later cells call.
helpers = analysis.initialize_notebook()
pp = helpers['pp']
summarize_dataset = helpers['summarize_dataset']

# Configuration - can be set via environment variables or modified directly
filepath = os.getenv("FILEPATH", "/tmp/merged.csv")
project_name = os.getenv("PROJECT_NAME", "my-project")
individual_job_name = os.getenv("JOB_NAME", "deploy")
credit_cost = float(os.getenv("CREDIT_COST", "0.0006"))

# Read only the first data row: the organization name is all the header needs.
temp_df = pd.read_csv(filepath, escapechar="\\", na_values=["\\N"], nrows=1)

# Fall back to a placeholder when the column is absent, the file holds a
# header but no data rows (`.iloc[0]` would raise IndexError), or the first
# value is missing (it would otherwise print as "nan").
org_name = "Unknown Organization"
if 'ORGANIZATION_NAME' in temp_df.columns and not temp_df.empty:
    first_org_name = temp_df['ORGANIZATION_NAME'].iloc[0]
    if pd.notna(first_org_name):
        org_name = first_org_name

print("=" * 60)
print("CircleCI Job Analysis Report")
print("=" * 60)
print(f"Organization: {org_name}")
print(f"Project: {project_name}")
print(f"Job: {individual_job_name}")
print(f"Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("=" * 60)
print()
print("Configuration:")
print(f"  File: {filepath}")
print(f"  Credit Cost: {credit_cost}")


In [None]:
# Load the CSV through the analysis library. The call returns a pair: `df`
# and `project_dfs`, the latter being indexed by dataset name below.
df, project_dfs = analysis.load_circleci_data(
    filepath=filepath, project_name=project_name, credit_cost=credit_cost
)

# Bind each project-specific dataset to its own notebook-level name.
(
    all_jobs,
    ps_jobs,
    ps_master_jobs,
    ps_pr_jobs,
    ps_pr_passed_jobs,
    ps_pr_failed_jobs,
) = (
    project_dfs[_dataset_key]
    for _dataset_key in (
        'all_jobs',
        'ps_jobs',
        'ps_master_jobs',
        'ps_pr_jobs',
        'ps_pr_passed_jobs',
        'ps_pr_failed_jobs',
    )
)

# Print one summary line per dataset, in the same order as they were loaded.
print("Dataset Summary:")
for _dataset, _label in (
    (all_jobs, "All jobs"),
    (ps_jobs, "Project-specific jobs"),
    (ps_master_jobs, "Master branch jobs"),
    (ps_pr_jobs, "PR branch jobs"),
    (ps_pr_passed_jobs, "Passed PR jobs"),
    (ps_pr_failed_jobs, "Failed PR jobs"),
):
    print(summarize_dataset(_dataset, _label))



## Top 10 Jobs Analysis

In [None]:
# Top 10 jobs by longest average duration.
# Aggregate run time and cost per job name; values are rounded to 2 decimals.
job_duration_stats = ps_jobs.groupby("JOB_NAME").agg({
    "JOB_RUN_SECONDS": ["mean", "median", "count"],
    "COST": ["sum", "mean"]
}).round(2)

# Flatten the two-level column index into names such as "JOB_RUN_SECONDS_mean"
job_duration_stats.columns = ["_".join(col).strip() for col in job_duration_stats.columns]
job_duration_stats = job_duration_stats.reset_index()

# Keep only jobs with at least 5 runs so a handful of outliers cannot dominate
job_duration_stats = job_duration_stats[job_duration_stats["JOB_RUN_SECONDS_count"] >= 5]

# Convert duration to minutes for readability
job_duration_stats["AVG_DURATION_MINUTES"] = job_duration_stats["JOB_RUN_SECONDS_mean"] / 60
job_duration_stats["MEDIAN_DURATION_MINUTES"] = job_duration_stats["JOB_RUN_SECONDS_median"] / 60

# Top 10 by average duration
top_duration_jobs = job_duration_stats.sort_values("JOB_RUN_SECONDS_mean", ascending=False).head(10)
pp(top_duration_jobs[[
    "JOB_NAME",
    "AVG_DURATION_MINUTES",
    "MEDIAN_DURATION_MINUTES",
    "JOB_RUN_SECONDS_count",
    "COST_sum",
    "COST_mean"
]], "Top 10 Jobs by Longest Average Duration (minutes)")

# Skip the summary when no job met the minimum-run threshold.
if not top_duration_jobs.empty:
    min_duration = top_duration_jobs['AVG_DURATION_MINUTES'].min()
    print(f"\n📊 Summary: Top {len(top_duration_jobs)} jobs by duration (minimum: {min_duration:.1f} minutes)")
    print(f"💰 Combined cost of these top {len(top_duration_jobs)} slowest job types: ${top_duration_jobs['COST_sum'].sum():.2f}")


In [None]:
# Top 10 jobs by total cost.
# NOTE: `job_duration_stats` only contains jobs with at least 5 runs, so
# rarely-run jobs are excluded from this ranking as well.
top_cost_jobs = job_duration_stats.sort_values("COST_sum", ascending=False).head(10)
pp(top_cost_jobs[[
    "JOB_NAME",
    "COST_sum",
    "COST_mean",
    "JOB_RUN_SECONDS_count",
    "AVG_DURATION_MINUTES",
    "MEDIAN_DURATION_MINUTES"
]], "Top 10 Jobs by Highest Total Cost")

top_cost_total = top_cost_jobs['COST_sum'].sum()
project_cost_total = ps_jobs['COST'].sum()

# Report the actual number of rows: there may be fewer than 10 qualifying jobs.
print(f"\n💰 Combined cost of these top {len(top_cost_jobs)} most expensive job types: ${top_cost_total:.2f}")

# Guard the share calculation: a zero project total would print "nan%"/"inf%".
if project_cost_total > 0:
    print(f"📊 These jobs represent {(top_cost_total / project_cost_total * 100):.1f}% of total project cost")
else:
    print("📊 Total project cost is zero; cost share is not available")


## Individual Job Analysis

Analysis of the most expensive and slowest individual job runs.


In [None]:
# Columns shown for each of the two "worst individual runs" tables.
_expensive_columns = [
    "JOB_NAME",
    "JOB_RUN_DATE",
    "VCS_BRANCH",
    "COST",
    "DURATION",
    "COMPUTE_CREDITS",
    "JOB_URL",
]
_slow_columns = [
    "JOB_ID",
    "JOB_NAME",
    "JOB_RUN_DATE",
    "VCS_BRANCH",
    "COST",
    "DURATION",
    "JOB_URL",
]

# Individual runs ordered from most to least expensive; show the first five.
expensive_jobs = ps_jobs.sort_values(by="COST", ascending=False)
_most_expensive = expensive_jobs[_expensive_columns].head()
pp(_most_expensive, "Most Expensive Individual Jobs")

# Individual runs ordered from longest to shortest run time; show the first five.
slow_jobs = ps_jobs.sort_values(by="JOB_RUN_SECONDS", ascending=False)
_slowest = slow_jobs[_slow_columns].head()
pp(_slowest, "Slowest Individual Jobs")


## Pipeline Analysis

### Analysis of pipeline costs, frequency, and branch patterns.


In [None]:
def _count_non_success(statuses):
    """Return how many entries in *statuses* differ from 'success'."""
    return (statuses != 'success').sum()


# --- PR branches: total cost and non-success job count per pipeline ---
_pr_by_pipeline = ps_pr_jobs.groupby("PIPELINE_ID")
pr_pipeline_costs = _pr_by_pipeline.agg(
    COST=("COST", "sum"),
    NUM_FAILS=("JOB_BUILD_STATUS", _count_non_success),
)
pr_pipeline_costs = pr_pipeline_costs.reset_index()

analysis.plot_cost_distribution(
    pr_pipeline_costs["COST"],
    title="Pipeline cost distribution for PR branches",
    bins=60,
)

print("PR Pipeline Cost Statistics:")
print(pr_pipeline_costs.describe())

# --- Master branch: total cost per pipeline ---
_master_by_pipeline = ps_master_jobs.groupby("PIPELINE_ID")
master_pipeline_costs = _master_by_pipeline.agg(COST=("COST", "sum"))
master_pipeline_costs = master_pipeline_costs.reset_index()

analysis.plot_cost_distribution(
    master_pipeline_costs["COST"],
    title="Pipeline cost distribution for master branch",
    bins=60,
)

print("\nMaster Pipeline Cost Statistics:")
print(master_pipeline_costs.describe())



## Specific Job Analysis

### Detailed analysis of a specific job type.


In [None]:
# Analysis of a single job type, selected by exact name via `individual_job_name`.
job_pattern = individual_job_name
_df = ps_jobs

# List the distinct job names so a mistyped name is easy to spot
available_jobs = _df["JOB_NAME"].dropna().unique()
print(f"Available jobs: {list(available_jobs[:10])}...")  # Show first 10

# Filter to the specific job (exact match on JOB_NAME)
filtered_jobs = _df[_df["JOB_NAME"] == job_pattern]
df_description = f"`{job_pattern}` across all pipelines"

if filtered_jobs.empty:
    # Nothing to plot: point the reader at job names that do exist
    print(f"⚠️  No data found for job '{job_pattern}'. Available jobs: {list(available_jobs[:5])}")
    print("Consider updating the individual_job_name parameter to one of the available jobs.")
else:
    print(f"Found {len(filtered_jobs)} instances of job '{job_pattern}'")

    # Duration analysis; max_xvalue is 30 minutes expressed in seconds
    analysis.analyse_durations(
        filtered_jobs["JOB_RUN_SECONDS"],
        title=f"{df_description} duration distribution",
        max_xvalue=30*60,
        bins=20,
    )

    # Cost distribution
    analysis.plot_cost_distribution(
        filtered_jobs["COST"],
        title=f"{df_description} cost distribution",
        bins=20,
    )

    # Show slowest instances
    _sorted = filtered_jobs.sort_values("JOB_RUN_SECONDS", ascending=False)
    pp(_sorted[["WORKFLOW_ID", "DURATION", "COST", "VCS_BRANCH", "JOB_URL"]].head(10),
       f"Slowest instances of {job_pattern}")

    # Show most expensive instances
    _sorted = filtered_jobs.sort_values("COST", ascending=False)
    pp(_sorted[["WORKFLOW_ID", "JOB_RUN_SECONDS", "COST", "JOB_URL"]].head(10),
       f"Most expensive instances of {job_pattern}")

