# Languages
This notebook formats and plots data about the languages used within the projects.


In [None]:
from pathlib import Path
import json

# data_file_name = "20240125-1037.json" #  Data set used in report
data_file_name = "20240411-1655.json"
location = Path("../data")

# Raise instead of print + exit(): inside a Jupyter kernel exit() requests a
# kernel shutdown rather than raising, so it is not a reliable way to stop the
# cell before the file is opened below.
if not location.is_dir():
    raise FileNotFoundError(f"Directory does not exist, do not continue: {location}")

data_file = Path(location, data_file_name)
if not data_file.is_file():
    raise FileNotFoundError(f"File does not exist, do not continue: {data_file}")

# JSON is read as UTF-8 explicitly so the result does not depend on the locale.
with open(data_file, encoding="utf-8") as df:
    data = json.load(df)

# List languages
The list contains every language used across all projects over their lifetimes, with duplicates removed. The `scc` tool tries to detect auto-generated code and, when it does, lists the language as generated.


In [None]:
from cyclomatic_complexity import list_to_columns
# Distinct language names found in the scc results of every entry of every project.
lang = {
    scc['Name']
    for project in data
    for entry in data[project]
    for scc in entry['scc']
}

print(f"Languages: {len(lang)}")
print(list_to_columns(sorted(lang), cols=5))

# Mapping dates across projects


In [None]:
# Find the project with the most entries; on a tie the first project seen wins.
largest = {"project": '', 'merges': 0}
for name, merges in data.items():
    if len(merges) > largest['merges']:
        largest = {"project": name, 'merges': len(merges)}

print(largest)

In [None]:
from tabulate import tabulate
from pprint import pprint
from datetime import datetime
timestamp_format = "%a %b %d %H:%M:%S %Y %z"

# Record the first and last merge time of every project that has at least one merge.
history_dates = []
for project in data:
    if not len(data[project]):
        continue
    merge_times = [
        datetime.strptime(merge['timestamp'], timestamp_format)
        for merge in data[project]
    ]
    # min()/max() over the parsed (timezone-aware) values replaces the naive
    # placeholder datetimes, which could survive into the result and then fail
    # to compare against aware values when the list is sorted below.
    history_dates.append({
        "project": project,
        "first_merge": min(merge_times),
        "last_merge": max(merge_times),
    })

history_dates = sorted(history_dates, key=lambda x: x['first_merge'])
headers = {"project": 'Project', "first_merge": 'First Merge', "last_merge": 'Last Merge'}
table = tabulate(history_dates, headers)

print(table)

In [None]:
# When True, get_row_set() builds its rows from a small hard-coded sample
# instead of the loaded `data`.
testing = False

def xx(dataset):
    """Pick the newest entry overall and per project, then drop the overall newest.

    Args:
        dataset: dict mapping a project name to a list of entry dicts; each
            entry is read through its 'timestamp' key (a datetime).

    Returns:
        A tuple ``(youngest, project_youngest, dataset)``:
        - ``youngest``: ``{'project': name, 'timestamp': datetime}`` for the
          newest entry across all projects, or ``{'project': None, ...}`` with
          a 2000-01-01 placeholder timestamp when every list is empty.
        - ``project_youngest``: project name -> that project's newest entry, or
          ``{'commit': None, 'timestamp': datetime(2000, 1, 1)}`` for a project
          with no entries.
        - ``dataset``: the same dict object; the overall newest entry has been
          removed from its project's list in place.
    """
    placeholder_time = datetime(2000, 1, 1)
    youngest = {'project': None, 'timestamp': placeholder_time}
    project_youngest = {}

    for project, entries in dataset.items():
        if not entries:
            project_youngest[project] = {'commit': None, 'timestamp': placeholder_time}
            continue

        # Select by actual maximum rather than "newer than 2000-01-01", so older
        # entries are still picked (and later removed) instead of being ignored.
        # max() keeps the first of several equally new entries.
        newest = max(entries, key=lambda entry: entry['timestamp'].timestamp())
        project_youngest[project] = newest

        # Strict comparison: on a tie the project seen first stays the youngest.
        if (youngest['project'] is None
                or newest['timestamp'].timestamp() > youngest['timestamp'].timestamp()):
            youngest = {'project': project, 'timestamp': newest['timestamp']}

    if youngest['project'] is not None:
        dataset[youngest['project']].remove(project_youngest[youngest['project']])
    return youngest, project_youngest, dataset
    
def get_row_set():
    """Build the list of rows used by the plots, one row per call to xx() that finds an entry.

    Entries come from the hard-coded sample when the module-level `testing`
    flag is set, otherwise from `data` (projects without entries are skipped
    and timestamps are parsed with `timestamp_format`).

    Each row is a dict with a 'timestamp' key (the timestamp xx() reports as
    the youngest) plus one key per project holding the 'commit' of the entry
    xx() reports for that project.
    """
    if testing:
        sample = {
            'P1': [
                {"commit": 'A', 'timestamp': datetime(2000, 1, 9)},
                {"commit": 'B', 'timestamp': datetime(2000, 1, 5)},
                {"commit": 'C', 'timestamp': datetime(2000, 1, 4)},
            ],
            'P2': [
                {"commit": '1', 'timestamp': datetime(2000, 1, 8)},
                {"commit": '2', 'timestamp': datetime(2000, 1, 7)},
                {"commit": '3', 'timestamp': datetime(2000, 1, 6)},
            ]
        }
    else:
        sample = {
            project: [
                {
                    'commit': entry['commit'],
                    'timestamp': datetime.strptime(entry['timestamp'], timestamp_format),
                }
                for entry in merges
            ]
            for project, merges in data.items()
            if len(merges)
        }

    # Order every project's entries oldest-first (stable sort).
    for merges in sample.values():
        merges.sort(key=lambda item: item['timestamp'])

    rows = []
    # Each pass lets xx() remove one entry; stop once every list is empty.
    while any(sample.values()):
        youngest, project_youngest, sample = xx(sample)

        if youngest['project'] is None:
            continue

        rows.append({
            'timestamp': youngest['timestamp'],
            **{project: newest['commit'] for project, newest in project_youngest.items()},
        })
    return rows

# Lines of Code


In [None]:
import matplotlib.pyplot as plt

def scc_total(scc):
    """Return the sum of the 'Lines' value of every entry in `scc` (0 when empty)."""
    return sum(entry['Lines'] for entry in scc)

loc_rows = get_row_set()

# Replace each project's commit id in a row with that commit's total line count.
for row in loc_rows:
    for project_name, commit_id in row.items():
        if project_name == 'timestamp' or commit_id is None:
            continue

        match = next((m for m in data[project_name] if m['commit'] == commit_id), None)
        if match:
            row[project_name] = scc_total(match['scc'])

# Pivot the rows into one x-axis list and one series per project (None -> 0).
timestamps = []
projects = {}
for row in loc_rows:
    for column, cell in row.items():
        if column == 'timestamp':
            timestamps.append(cell)
        else:
            projects.setdefault(column, []).append(0 if cell is None else cell)

fig, ax = plt.subplots(figsize=(40, 15))

ax.stackplot(timestamps, projects.values(), labels=projects.keys(), alpha=0.8)
ax.legend(loc='upper left', reverse=True)
ax.set_title('LoC for Org')
ax.set_xlabel('Year')
ax.set_ylabel('Lines Of Code')

plt.show()

In [1]:
import matplotlib.pyplot as plt

def scc_was_scanned(source, filters):
    """Split the line counts in `source` by whether the language is in `filters`.

    An entry counts as scanned when its lower-cased 'Name' is contained in
    `filters`. Returns a tuple ``(scanned, not_scanned)`` of summed 'Lines'.
    """
    totals = {True: 0, False: 0}
    for record in source:
        totals[record['Name'].lower() in filters] += record['Lines']
    return totals[True], totals[False]

cc_lang = ['rust', 'go', 'python', 'javascript', 'ruby']
loc_cc = get_row_set()

# For every row, add up scanned / not-scanned lines over all projects' commits.
results = []
for row in loc_cc:
    tally = {'timestamp': '', 'scanned': 0, 'not_scanned': 0}
    for column, cell in row.items():
        if column == 'timestamp':
            tally['timestamp'] = cell
        elif cell is not None:
            match = next((m for m in data[column] if m['commit'] == cell), None)
            if match:
                hit, miss = scc_was_scanned(match['scc'], cc_lang)
                tally['scanned'] += hit
                tally['not_scanned'] += miss
    results.append(tally)

# Pivot the tallies into one x-axis list and one series per status (None -> 0).
timestamps = []
status = {}
for tally in results:
    for field, amount in tally.items():
        if field == 'timestamp':
            timestamps.append(amount)
        else:
            status.setdefault(field, []).append(0 if amount is None else amount)

fig, ax = plt.subplots(figsize=(40, 15))

ax.stackplot(timestamps, status.values(), labels=status.keys(), alpha=0.8)
ax.legend(loc='upper left', reverse=True)
ax.set_title('Scanned lines of code Vs Non Scanned for Cyclomatic Complexity')
ax.set_xlabel('Year')
ax.set_ylabel('Lines Of Code')

plt.show()



NameError: name 'get_row_set' is not defined