# Analysis of Workflows of Public GitHub Repos

In [None]:
import sys
import os

# Prepend the directory three levels above the current working directory to
# the module search path so that the `util` package imported below resolves.
# NOTE(review): this relies on os.getcwd() being the notebook's own directory —
# confirm when the kernel is started from somewhere else.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../../..')))

from util.github import get_workflows, get_workflow_runs, get_commit, get_rate_limit

# Report the remaining API quota up front; the cells below issue one request
# per analysed run.
rate_limit = get_rate_limit()

# Core API rate limit
core = rate_limit['resources']['core']
print(f"Limit: {core['limit']} requests/hour")
print(f"Remaining: {core['remaining']}")

# Repository and workflow under analysis.
owner = "tensorflow"
repository = "tensorflow"
workflow_name = "CodeQL"

workflows = get_workflows(owner, repository)

print(f"Total workflows: {len(workflows)}")

# Pick the first workflow whose name matches exactly (None when there is no match).
workflow = next((wf for wf in workflows if wf.name == workflow_name), None)

if workflow is None:
    # Stop here instead of merely printing: continuing would leave
    # `workflow_id` undefined (or, if this cell succeeded earlier in the same
    # kernel session, still pointing at the previous workflow), and every
    # later cell depends on it.
    available = ", ".join(str(wf.name) for wf in workflows)
    raise LookupError(
        f"Workflow '{workflow_name}' not found in repository '{owner}/{repository}'. "
        f"Available workflows: {available}"
    )

print(f"Found workflow '{workflow.name}' with ID {workflow.id}")
workflow_id = workflow.id

# Show the selected workflow object as the cell output.
workflow

In [None]:
# Request the workflow's runs with status "completed" on branch "master".
# NOTE(review): fetch_all=True presumably pages through every result — confirm
# against util.github.get_workflow_runs.
runs = get_workflow_runs(owner, repository, workflow_id, fetch_all=True, status="completed", branch="master")

print(f"Total runs: {len(runs)}")

# The branch restriction already happened in the request above; this step
# keeps only the runs that concluded successfully. The variable keeps its
# existing name because later cells refer to it.
main_branch_runs = [run for run in runs if run.conclusion == "success"]

print(f"Successful runs: {len(main_branch_runs)}")

In [None]:
# Get commit info for each run to show files and lines changed
import pandas as pd
from dataclasses import asdict

# Assemble one record per successful run, pairing the run's timing with the
# size of the commit it ran against. Every run costs one `get_commit` request
# and the whole list is walked (there is no cap on the number of runs).
run_data = []
total_runs = len(main_branch_runs)
for position, run in enumerate(main_branch_runs, start=1):
    commit = get_commit(owner, repository, run.head_sha)

    record = dict(
        run_number=run.run_number,
        # Falsy durations (None or 0) are stored as None.
        duration_seconds=run.duration_seconds or None,
        conclusion=run.conclusion,
        commit_sha=commit.short_sha,
        commit_message=commit.short_message,
        files_changed=commit.file_count,
        lines_added=commit.total_additions,
        lines_deleted=commit.total_deletions,
        total_changes=commit.total_changes,
        # True when any file in the commit has the same path as the workflow
        # (presumably the workflow's own definition file — verify).
        config_file_changed=any(f.filename == workflow.path for f in commit.files),
        created_at=run.created_at,
    )
    run_data.append(record)

    # Progress report after every fifth run.
    if position % 5 == 0:
        print(f"Processed {position}/{total_runs} runs...")

df = pd.DataFrame(run_data)
print(f"\nProcessed {len(df)} runs on main branch")
df


In [None]:
# Count the fetched runs (all conclusions, not only successful ones) whose
# duration exceeds 600 seconds; runs with a missing or zero duration never match.
count = sum(1 for run in runs if (run.duration_seconds or 0) > 600)
print(f"Number of runs with duration > 600 seconds: {count}")

In [None]:
# Summary statistics
print("Summary Statistics:")
if df.empty:
    # With no rows there is nothing to average (a frame built from an empty
    # list does not even have these columns, so the lookups would raise).
    print("No runs available to summarise")
else:
    print(f"Average duration: {df['duration_seconds'].mean():.2f} seconds")
    print(f"Average files changed: {df['files_changed'].mean():.1f}")
    print(f"Average lines added: {df['lines_added'].mean():.1f}")
    print(f"Average lines deleted: {df['lines_deleted'].mean():.1f}")

# `df` is built exclusively from runs whose conclusion is "success", so a rate
# computed on it is always 100%. Measure against every fetched run instead.
if runs:
    succeeded = sum(run.conclusion == "success" for run in runs)
    print(f"Success rate: {succeeded / len(runs) * 100:.1f}%")
else:
    print("Success rate: n/a (no runs fetched)")


# Query the quota again to show what the analysis above consumed.
rate_limit = get_rate_limit()

# Core API rate limit
core = rate_limit['resources']['core']
print(f"Limit: {core['limit']} requests/hour")
print(f"Remaining: {core['remaining']}")

In [None]:
import matplotlib.pyplot as plt

# Keep only rows with a duration of at most 600 seconds and total_changes of
# at most 2000, then order them by creation time. Note that this narrows `df`
# itself, and the following cell builds on the narrowed frame.
within_limits = (df['duration_seconds'] <= 600) & (df['total_changes'] <= 2000)
df = df[within_limits]
df = df.sort_values('created_at')
df['created_at'] = pd.to_datetime(df['created_at'])

timeline = df['created_at']
deleted = df['lines_deleted']
added = df['lines_added']
durations = df['duration_seconds']

fig, ax1 = plt.subplots(figsize=(24, 6))

# Left axis: stacked bars, deletions at the bottom with additions on top.
ax1.bar(timeline, deleted, color='red', alpha=0.6, label='Lines Deleted')
ax1.bar(timeline, added, bottom=deleted, color='green', alpha=0.6, label='Lines Added')
ax1.set_xlabel('Time')
ax1.set_ylabel('Lines Changed', color='black')
ax1.tick_params(axis='y')
ax1.legend(loc='upper left')

# Right axis: run duration over the same time axis. Points are coloured by
# whether the commit changed the workflow file (first colour) or not (second).
ax2 = ax1.twinx()
palette = {True: '#E52B50', False: '#2233CC'}
colors = [palette[bool(flag)] for flag in df['config_file_changed']]
ax2.plot(timeline, durations, color='gray', linewidth=1, alpha=0.3)
ax2.scatter(timeline, durations, c=colors, marker='o', label='Duration', s=30, zorder=5)
ax2.set_ylabel('Duration (seconds)', color='blue')
ax2.tick_params(axis='y', labelcolor='blue')
ax2.legend(loc='upper right')

plt.title('Workflow Duration and Code Changes Over Time')
plt.show()

In [None]:
# Restrict the frame to the 150 days leading up to the most recent run.
cutoff = df['created_at'].max() - pd.Timedelta(days=150)
df_last = df[df['created_at'] >= cutoff]

recent_times = df_last['created_at']
recent_deleted = df_last['lines_deleted']
recent_added = df_last['lines_added']
recent_durations = df_last['duration_seconds']

fig, ax1 = plt.subplots(figsize=(24, 6))

# Left axis: stacked bars, deletions at the bottom with additions on top.
ax1.bar(recent_times, recent_deleted, color='red', alpha=0.6, label='Lines Deleted')
ax1.bar(recent_times, recent_added, bottom=recent_deleted, color='green', alpha=0.6, label='Lines Added')
ax1.set_xlabel('Time')
ax1.set_ylabel('Lines Changed', color='black')
ax1.tick_params(axis='y')
ax1.legend(loc='upper left')

# Right axis: run duration over the same time axis. Points are purple when the
# commit changed the workflow file and blue otherwise.
ax2 = ax1.twinx()
palette = {True: 'purple', False: 'blue'}
colors = [palette[bool(flag)] for flag in df_last['config_file_changed']]
ax2.plot(recent_times, recent_durations, color='gray', linewidth=1, alpha=0.3)
ax2.scatter(recent_times, recent_durations, c=colors, marker='o', label='Duration', s=30, zorder=5)
ax2.set_ylabel('Duration (seconds)', color='blue')
ax2.tick_params(axis='y', labelcolor='blue')
ax2.legend(loc='upper right')

plt.title('Workflow Duration and Code Changes Over Time')
plt.show()