In [None]:
import re
import pandas as pd

from pymongo import MongoClient

In [4]:
# MongoDB connection setup
CONNECTION_STRING='localhost:27017'

client = MongoClient(CONNECTION_STRING)
swagger = client['swagger']['apis']

---

## Versions

Here we analyze how the APIs have been versioned. In particular, we take a look at the number of APIs with versions and without. Moreover, for the APIs with a versions, we check if that version is semantically correct (to do so we use the semantic versioning regex taken from [semver](https://semver.org/spec/v2.0.0.html#is-there-a-suggested-regular-expression-regex-to-check-a-semver-string)).

In [5]:
versions = pd.DataFrame(list(swagger.aggregate(
    [{ '$group': { '_id': '$_API_spec.info.version', 'count': { '$sum': 1 } } }]
)))

In [None]:
# You can use python-semver package to check if a version is following semver
# https://python-semver.readthedocs.io/en/2.9.0/usage.html

In [None]:
semantic_versioning_regex = r'^(?P<major>0|[1-9]\d*)\.(?P<minor>0|[1-9]\d*)\.(?P<patch>0|[1-9]\d*)(?:-(?P<prerelease>(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+(?P<buildmetadata>[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$'

no_version = versions[(versions['_id'].isna()) | (versions['_id'] == 'None') | (versions['_id'] == 'unversioned')]['count'].sum()
version = versions[(versions['_id'].notna()) & (versions['_id'] != 'None') & (versions['_id'] != 'unversioned')]['count'].sum()
versioned_correctly = versions[versions['_id'].str.contains(semantic_versioning_regex, regex=True, na=False)]['count'].sum()
total_length = versions['count'].sum()

In [7]:
print(f'With versioning: {version} [{str(version / total_length * 100)[:4]}%]')
print(f'Without versioning: {no_version} [{str(no_version / total_length * 100)[:4]}%]\n')
print(f'With semantic versioning:')
print(f'\tW.r.t. versioned: {versioned_correctly} [{str(versioned_correctly * 100 / version)[:4]}%]')
print(f'\tW.r.t. total: {versioned_correctly} [{str(versioned_correctly / total_length * 100)[:4]}%]')

With versioning: 356933 [82.5%]
Without versioning: 75332 [17.4%]

With semantic versioning:
	W.r.t. versioned: 267672 [74.9%]
	W.r.t. total: 267672 [61.9%]


---

## Schemes

Here we analyze how many APIs use one or both schemes (`http`, `https`).

In [17]:
schemes = pd.DataFrame(list(swagger.find({}, { 'name': 1, '_API_spec.schemes': 1 })))

schemes['_API_spec'] = schemes['_API_spec'].apply(lambda el: '' if str(el) == '{}' else ' '.join(re.sub(r'[^\w\s]', '', str(el)).split()[1:]))
schemes['http'] = schemes['_API_spec'].str.contains('http')
schemes['https'] = schemes['_API_spec'].str.contains('https')

In [41]:
has_both = schemes[(schemes['http'] == True) & (schemes['https'] == True)]['_id'].count()
has_none = schemes[(schemes['http'] == False) & (schemes['https'] == False)]['_id'].count()
only_http = schemes[(schemes['http'] == True) & (schemes['https'] == False)]['_id'].count()

total_length = schemes['_id'].count()

In [47]:
print(f'Only `http` schema: {only_http} [{str(only_http / total_length * 100)[:3]}%]')
print(f'Both `http` and `https` schemes: {has_both} [{str(has_both / total_length * 100)[:3]}%]')
print(f'No schema: {has_none} [{str(has_none / total_length * 100)[:4]}%]')

Only `http` schema: 41730 [9.6%]
Both `http` and `https` schemes: 4367 [1.0%]
No schema: 386168 [89.3%]


In [None]:
# swagger.find({}, {'test': { '$size': { '$objectToArray': '$_API_spec.paths' } }})