# Get Resources
This project requires git repos to function.


In [1]:
import requests
from pathlib import Path

# Set to True to analyse only the small fixed list in `test_repos`.
test = False

org = "kuadrant"
location = Path("/tmp/cyclomatic_complexity")
test_repos = [
    "git@github.com:Kuadrant/kuadrant-operator.git",
    "git@github.com:Kuadrant/authorino.git",
    "git@github.com:Kuadrant/testsuite.git"
]

repos = []

if test:
    repos = test_repos
else:
    # The organisation listing is paginated (30 entries per page unless
    # `per_page` is given), so request the largest page size and keep going
    # until a short page signals the end. A single un-paginated request
    # silently drops every repository after the first page.
    url = f"https://api.github.com/orgs/{org}/repos"
    per_page = 100
    page = 1
    while True:
        resp = requests.get(url, params={"per_page": per_page, "page": page}, timeout=30)
        if resp.status_code != 200:
            # Raise so that "Run All" stops at this cell with a clear error
            # instead of carrying on with an empty or partial repo list.
            raise RuntimeError(f"data fetch failed: {resp.status_code}")
        repositories = resp.json()
        for repo in repositories:
            # Only public, original, still-active repositories are analysed.
            if repo['private']:
                print(f"Private Repo: {repo['name']}")
                continue
            if repo['fork']:
                print(f"Forked Repo: {repo['name']}")
                continue
            if repo['archived']:
                print(f"Archived Repo: {repo['name']}")
                continue
            repos.append(repo['ssh_url'])
        if len(repositories) < per_page:
            break
        page += 1


print(repos)

# Directory the repositories get cloned into.
location.mkdir(parents=True, exist_ok=True)

Archived Repo: kuadrant-controller
Forked Repo: kcp
Forked Repo: wasm-test-framework
Forked Repo: istio
['git@github.com:Kuadrant/limitador.git', 'git@github.com:Kuadrant/limitador-operator.git', 'git@github.com:Kuadrant/authorino.git', 'git@github.com:Kuadrant/infinispan-rs.git', 'git@github.com:Kuadrant/kuadrantctl.git', 'git@github.com:Kuadrant/kuadrant.git', 'git@github.com:Kuadrant/petstore-service-sample.git', 'git@github.com:Kuadrant/.github.git', 'git@github.com:Kuadrant/authorino-operator.git', 'git@github.com:Kuadrant/architecture.git', 'git@github.com:Kuadrant/authorino-examples.git', 'git@github.com:Kuadrant/wasm-shim.git', 'git@github.com:Kuadrant/kuadrant-operator.git', 'git@github.com:Kuadrant/testsuite.git', 'git@github.com:Kuadrant/kuadrant.github.io.git', 'git@github.com:Kuadrant/multicluster-gateway-controller.git', 'git@github.com:Kuadrant/multicluster-gateway-controller-addon.git', 'git@github.com:Kuadrant/kuadra.git', 'git@github.com:Kuadrant/gateway-api-machinery

In [2]:
from time import perf_counter
# Clone every repository listed in `repos` into `location` and report how long it took.
start = perf_counter()
for repo in repos:
    # `{location}` and `{repo}` are filled in from the Python variables by IPython.
    !git -C {location} clone --single-branch {repo}
end = perf_counter()
# Elapsed time is truncated to whole seconds.
print(f"Repo cloning took {int(end-start)} seconds")

Cloning into 'limitador'...
remote: Enumerating objects: 7706, done.[K
remote: Counting objects: 100% (1314/1314), done.[K
remote: Compressing objects: 100% (389/389), done.[K
remote: Total 7706 (delta 1101), reused 1055 (delta 925), pack-reused 6392[K
Receiving objects: 100% (7706/7706), 4.74 MiB | 6.77 MiB/s, done.
Resolving deltas: 100% (4214/4214), done.
Cloning into 'limitador-operator'...
remote: Enumerating objects: 2443, done.[K
remote: Counting objects: 100% (958/958), done.[K
remote: Compressing objects: 100% (356/356), done.[K
remote: Total 2443 (delta 656), reused 812 (delta 587), pack-reused 1485[K
Receiving objects: 100% (2443/2443), 21.26 MiB | 10.47 MiB/s, done.
Resolving deltas: 100% (1457/1457), done.
Cloning into 'authorino'...
remote: Enumerating objects: 17792, done.[K
remote: Counting objects: 100% (1962/1962), done.[K
remote: Compressing objects: 100% (694/694), done.[K
remote: Total 17792 (delta 1301), reused 1850 (delta 1236), pack-

# Baseline data
As we want to look at the data over time, we need some method to slice the data up.
Releases will not work, as some projects do not follow a release cycle.
This is why merge commits are used as the markers that set the points in time.


In [3]:
import os
from cyclomatic_complexity import list_to_columns
# Only the first item produced by os.walk is needed: its directory list holds
# the folders sitting directly inside `location`. `projects` stays None when
# the walk produces nothing.
projects = next((top_dirs for _root, top_dirs, _files in os.walk(location)), None)
print(f"Projects: {len(projects)}")
print(list_to_columns(sorted(projects)))

Projects: 26
.github              		gateway-api-state-metrics		limitador-operator                   		
api-petstore         		governance               		logos                                		
api-quickstart       		infinispan-rs            		multicluster-gateway-controller      		
architecture         		kuadra                   		multicluster-gateway-controller-addon		
authorino            		kuadrant                 		petstore-service-sample              		
authorino-examples   		kuadrant-operator        		testsuite                            		
authorino-operator   		kuadrant.github.io       		testsuite-pipelines                  		
docs.kuadrant.io     		kuadrantctl              		wasm-shim                            		
gateway-api-machinery		limitador                		


In [4]:
from datetime import datetime
# Build `data`: project name -> list of {"commit", "timestamp"} dicts, sorted by timestamp.
data = {}
linfo = []  # "project: count" strings for the summary printed at the end
total = 0
for project in projects:
    if project not in data:
        data.setdefault(project, [])
    # One output line per merge commit, formatted as "<hash> ||| <author date>".
    merge_log = !git -C {Path(location, project)} log --merges --pretty=format:'%H ||| %ad'
    last_merge = None
    for log in merge_log:
        log = log.split("|||")
        data[project].append({"commit": log[0].strip(), "timestamp": log[1].strip()})
        # Remember the hash from the first line of the merge log.
        # NOTE(review): presumably this is the most recent merge (git log's default ordering) — confirm.
        if last_merge is None:
            last_merge = log[0].strip()


    # Additionally collect the commits dated after that merge (across all refs).
    commit_log = []
    if last_merge is not None:
        # Timestamp of the remembered merge, used as the --after cut-off below.
        last_merge_time = !git -C {Path(location, project)} show -s --format='%at' {last_merge}
        last_merge_time = last_merge_time[0]
        commit_log = !git -C {Path(location, project)} log --author-date-order --all --after={last_merge_time} --pretty=format:'%H ||| %ad'    

    # NOTE(review): the final line of commit_log is never added. Presumably it is the
    # merge commit itself, which is already recorded above — confirm this is intended.
    for i in range(len(commit_log) - 1):
        log = commit_log[i]
        log = log.split("|||")
        data[project].append({"commit": log[0].strip(), "timestamp": log[1].strip()})

    # Order the entries by their parsed date strings.
    data[project] = sorted(data[project], key=lambda t: datetime.strptime(t['timestamp'], "%a %b %d %H:%M:%S %Y %z"))

    total += len(data[project])
    linfo.append(f"{project}: {len(data[project])}")

# The total labelled "Merges" also counts the extra non-merge commits appended above.
print(f"Merges: {total}")
print(list_to_columns(sorted(linfo)))        

Merges: 2057
.github: 3              		gateway-api-state-metrics: 22		limitador: 225                          		
api-petstore: 0         		governance: 1                		logos: 0                                		
api-quickstart: 9       		infinispan-rs: 23            		multicluster-gateway-controller-addon: 0		
architecture: 29        		kuadra: 18                   		multicluster-gateway-controller: 321    		
authorino-examples: 34  		kuadrant-operator: 384       		petstore-service-sample: 0              		
authorino-operator: 114 		kuadrant.github.io: 43       		testsuite-pipelines: 2                  		
authorino: 302          		kuadrant: 0                  		testsuite: 264                          		
docs.kuadrant.io: 87    		kuadrantctl: 50              		wasm-shim: 29                           		
gateway-api-machinery: 1		limitador-operator: 96       		


# Tools

In [5]:
# Per-project directories to leave out of the analysis.
# Keys are project (repository directory) names; values are paths relative to
# that project's root. Projects without an entry have nothing excluded.
exclude_dir = {
    "kuadrant.github.io": [
        Path('static', 'vendor'),
        Path('_site', 'static'),
    ],
    "wasm-shim": [
        Path('src', 'envoy_ext_auth'), # Seems to be present only in the initial merges and throws off the graphs
    ],
    "kuadrant-operator": [
        Path('vendor'),
    ],
    "authorino": [
        Path('vendor'),
    ]
}

## LOC Lines Of Code
As we are looking at the projects over time we need to know how much the projects have grown. For this we will use the lines of Code, LOC, as the measurement tool.

In [6]:
import json
def get_loc(scan_path, exclude: list[Path]):
    dirs = ''
    if exclude:
        dirs = []
        for e in exclude:
            path = Path(scan_path, e)
            if path.is_dir():
                dirs.append(str(path))
        dirs = ','.join(dirs)        
        
    out = !scc -z --format=json --exclude-dir={dirs} {scan_path} 
    return json.loads(out[0])

## Cyclomatic Complexity
Each language has its own set of tools to get the score. Below is the list of tools used for the different languages.

* go --> [gocyclo](https://github.com/fzipp/gocyclo)
* rust --> [rust-code-analysis-cli](https://github.com/mozilla/rust-code-analysis)
* python --> [rust-code-analysis-cli](https://github.com/mozilla/rust-code-analysis)
* JavaScript --> [rust-code-analysis-cli](https://github.com/mozilla/rust-code-analysis)
* Ruby --> [RuboCop](https://rubocop.org/)

Result structure
```python
cc = {
    'score': 1,
    'function': 'function_name',
    'file': 'path/to/file'
}
```

In [7]:
from pprint import pprint


def unknown(root, exclude: list[Path]):
    """Fallback collector for languages with no tool configured.

    Accepts the same arguments as the real collectors, ignores them and
    returns None.
    """
    return

def gocyclo(root, exclude: list[Path]):
    """
    ['1', 'v1beta1', '(*RateLimitPolicy).DeepCopyInto', '/tmp/cyclomatic_complexity/kuadrant-operator/api/v1beta1/zz_generated.deepcopy.go:502:1']
    """
    print("Getting Cyclomatic Complexity values for Go files")
    out =  []
    result = !gocyclo {root} 
    for row in result:
        row = row.split()
        tmp = {'score': int(row[0]), 'function': row[2], 'file': row[3].split(':')[0]}
        if 'zz_generated.deepcopy.go' in tmp['file']:
            continue
        out.append(tmp)
        
    return out
   
def rust_code_analysis_cli(root, exclude: list[Path]):
    print("Getting Cyclomatic Complexity with rust-code-analysis-cli")
    dirs = ''
    if exclude:
        dirs = []
        for e in exclude:
            path = Path(root, e)
            if path.is_dir():
                dirs.append(str(Path(path, '**')))
        dirs = ' '.join(["--exclude " + p for p in dirs])
    d = !rust-code-analysis-cli --paths {root} --metrics --output-format json {dirs}
    
    files = []
    for i in d:
        i = json.loads(i)
        files.append(i)
    out = []
    def get_data(object):
        for space in object['spaces']:
            if space['kind'] == "function":
                out.append({'file': file_name, 'function': space['name'], 'score': space['metrics']['cyclomatic']['sum']})
            else:
                get_data(space)
    for f in files:
        file_name = f['name']
        get_data(f)
    return out

def rubocop(root, exclude: list[Path]):
    print("Running RuboCop")
    p = Path(root)
    parent = p.parent
    config_file = Path(parent, ".rubocop.yml")
    if not config_file.exists():
        config = """# The behavior of RuboCop can be controlled via the .rubocop.yml
# configuration file. It makes it possible to enable/disable
# certain cops (checks) and to alter their behavior if they accept
# any parameters. The file can be placed either in your home
# directory or in some project directory.
#
# RuboCop will start looking for the configuration file in the directory
# where the inspected file is and continue its way up to the root directory.
#
# See https://docs.rubocop.org/rubocop/configuration

AllCops:
  DisabledByDefault: true

Metrics/CyclomaticComplexity:
  Enabled: true
  Max: 0       
        """
        config_file.write_text(config)
        
    result = !rubocop -c {config_file} --format=json {root}
    
    result = result[0]
    result = json.loads(result)
    out = []
    for f in result['files']:
        file_name = f['path']
        for offense in f['offenses']:
            score = offense['message'].split('[')[1].split('/')[0]
            try:
                score = int(score)
            except ValueError:
                print(f"\n\n{root=}\n{score=}\n\n")
                continue
            name = f"start_line:{offense['location']['start_line']}"
            out.append({'file': file_name, 'function': name, 'score': score})
    
    return out
    
def get_cc_tool(langauge: str):
    """Pick the complexity collector for a language name.

    The lookup is case-insensitive. Names without a registered collector
    resolve to ``unknown``.
    """
    collectors = {
        'go': gocyclo,
        'rust': rust_code_analysis_cli,
        'python': rust_code_analysis_cli,
        'javascript': rust_code_analysis_cli,
        'ruby': rubocop,
        'default': unknown
    }

    key = langauge.lower()
    if key in collectors:
        return collectors[key]
    return unknown



# Per commit actions
Sadly we need to check out each commit in the merge history in order to be able to run the required data collections. 

This will take some time to run.

In [8]:
from time import perf_counter
import json
# guide: takes about .6 seconds per commit

def can_scan(langauge: str):
    """Return False when the language name contains "(gen)" or "(min)", True otherwise."""
    return not any(marker in langauge for marker in ("(gen)", "(min)"))

start = perf_counter()
for project in projects:
    !git -C {ploc} stash
    ploc = Path(location, project)
    current_HEAD = !git -C {ploc} rev-parse --abbrev-ref HEAD
    current_HEAD = current_HEAD[0]
    print(f"{project}: {current_HEAD=}")
    
    for entry in data[project]:

        !git -C {ploc} checkout {entry['commit']}
        print("Getting lines of Code data")
        exclude = exclude_dir.get(project, [])
        
        entry['scc'] = get_loc(ploc, exclude)
        entry.setdefault('cc', [])
        for lang in  entry['scc']:
            if not can_scan(lang['Name']):
                continue
            action = get_cc_tool(lang['Name'])
            result = action(ploc, exclude)
            if result is None:
                continue
            entry['cc'] += result

        # insure entries are only counter once
        tmp_set = {json.dumps(i) for i in entry['cc']}
        entry['cc'] = [json.loads(i) for i in tmp_set]
        
        !git -C {ploc} stash
    !git -C {ploc} checkout {current_HEAD}    
end = perf_counter()

print(f"Code Analysis took {int(end - start)} seconds")

fatal: cannot change to '{ploc}': No such file or directory
limitador: current_HEAD='main'
Note: switching to 'c7fb6f64ae2cb70940f734f1ca91001425bc9536'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at c7fb6f6 Merge branch 'initial-version'
Getting lines of Code data
Getting Cyclomatic Complexity with rust-code-analysis-cli
No local changes to save
Previous HEAD position was c7fb6f6 Merge branch 'initial-version'
HEAD is now at 8050cba Merge branch 'add-storage-check-and-update'
Getting li


# Save data
The data is saved for analysis later

In [9]:
import json
from datetime import datetime

# Persist the collected results as an indented JSON file named after the
# current date and time (YYYYMMDD-HHMM) inside ../data.
data_route = Path("../data")
data_route.mkdir(parents=True, exist_ok=True)
data_file = data_route / f'{datetime.now().strftime("%Y%m%d-%H%M")}.json'
with data_file.open('w') as outfile:
    json.dump(data, outfile, indent=4)
    print(f"Data File: {data_file}")

Data File: ../data/20240513-1840.json
