In [1]:
#! Install library
import os
import importlib.util

if importlib.util.find_spec("requests") is None:
	os.system("pip install requests")

if importlib.util.find_spec("beautifulsoup4") is None:
	os.system("pip install beautifulsoup4")

if importlib.util.find_spec("Counter") is None:
	os.system("pip install Counter")

In [2]:
#! Authentication details
import os
import dotenv
import importlib.util

JiraHost = None
ConfluenceHost = None
Username = None
Password = None

if importlib.util.find_spec("google.colab") is not None: ## if using google colab
    if not os.path.exists('.env'):
        from google.colab import files
        uploaded = files.upload()
        file_name = list(uploaded.keys())[0]
        try:
            os.rename(file_name, '.env')
        except:
            pass

try:
    dotenv.load_dotenv('../.env', override=True)

    JiraHost = os.getenv('SECRETS_HOST')
    ConfluenceHost = os.getenv('SECRETS_CONFLUENCE')
    Username = os.getenv('SECRETS_USERNAME')
    Password = os.getenv('SECRETS_PASSWORD')
except:
    display("trouble loading dot env")
    pass

if JiraHost is None or JiraHost == "":
    JiraHost = input("Enter Jira Host")

if ConfluenceHost is None or ConfluenceHost == "":
    ConfluenceHost = input("Enter Confluence Host")

if Username is None or Username == "":
    Username = input("Enter Username")

if Password is None or Password == "":
    Password = input("Enter Password")

display("Jira Host: " + JiraHost)
display("Confluence Host: " + ConfluenceHost)

'Jira Host: https://jira.budgetdirect.com.au/'

'Confluence Host: https://confluence.budgetdirect.com.au/'

In [3]:
#! Functions
import base64
import re
from bs4 import BeautifulSoup
import requests
from collections import Counter
import pandas as pd
import requests
from functools import reduce
import warnings

requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

def _ExpandColumn(self:pd.DataFrame, colName:str, columnsToExpand = [], prefix:str = "Prefix", sentenceCase:bool = True) -> pd.DataFrame:
    if (prefix == "Prefix"):
        prefix = colName + " "
        with warnings.catch_warnings():
          warnings.simplefilter(action='ignore', category=FutureWarning)
          expandedCols = self[colName].apply(lambda x: pd.Series(x).add_prefix(prefix))
        columnsToExpand = [prefix + c for c in columnsToExpand]
    else:
        expandedCols = self[colName].apply(lambda x: pd.Series(x))

    if len(columnsToExpand) > 0:
        expandedCols = expandedCols[columnsToExpand]

    if sentenceCase:
        expandedCols.columns = [fnSentenceCase(c) for c in expandedCols.columns]

    return pd.concat([self.drop(colName, axis=1), expandedCols], axis=1)

pd.DataFrame.expand = _ExpandColumn

def fnSentenceCase(s):
    s = (' '.join(dict.fromkeys(s.split())))  # remove duplicate words
    s = s.replace("0", "") # remove "0"
    s = s.strip()
    return ' '.join([x.capitalize() for x in re.sub(r"([A-Z])", r" \1", s).split()]) # sentence case

def _SentenceCaseColumns(self:pd.DataFrame) -> pd.DataFrame:
    self.columns = [fnSentenceCase(c) for c in self.columns]
    return self

pd.DataFrame.sentence_case_columns = _SentenceCaseColumns

def fnGetDefaultHeaders():
    return {
        "content-type": "application/json",
        "authorization": "Basic " + base64.b64encode((Username + ":" + Password).encode()).decode(),
        "retry-after": "120"
    }

def ApiSpaces(startAt) :
	url = "/rest/api/space"
	headers = fnGetDefaultHeaders()
	params = {
		"type": "global",
		"limit": "50",
		"expand": "permissions",
		"start": str(startAt)
	}
	response = requests.get(ConfluenceHost + url, headers = headers, params = params, verify=False)
	return response.json()

def ApiSpaceContent(key, startAt) :
	url = "/rest/api/space/" + key + "/content"
	headers = fnGetDefaultHeaders()
	params = {
		"depth": "all",
		"limit": "50",
		"expand": "history.contributors.publishers.users",
		"start": str(startAt)
	}
	response = requests.get(ConfluenceHost + url, headers = headers, params = params, verify=False)
	return response.json()

def ApiSpaceWatchers(key, startAt) :
	url = "/rest/api/space/" + key + "/watch"
	headers = fnGetDefaultHeaders()
	params = {
		"limit": "50",
		"start": str(startAt)
	}
	response = requests.get(ConfluenceHost + url, headers = headers, params = params, verify=False)
	return response.json()

def ApiContent(contentId, startAt) :
	url = "/rest/api/content/" + contentId
	headers = fnGetDefaultHeaders()
	params = {
		"limit": "50",
		"start": str(startAt),
		"expand": "history,history.lastUpdated"
	}
	response = requests.get(ConfluenceHost + url, headers = headers, params = params, verify=False)
	return response.json()

def SpacePageCall(key) :
	url = "/spaces/viewspacesummary.action"
	headers = fnGetDefaultHeaders()
	params = {
		"showAllAdmins": "true",
		"key": key
	}
	response = requests.get(ConfluenceHost + url, headers = headers, params = params, verify=False)
	return BeautifulSoup(response.content, "html.parser")

def fnAPI(webRequestDelegate, startAt = 0) -> pd.DataFrame:
    def flatten_reduce_lambda(frm):
        try:
            return list(reduce(lambda x, y: x + y, frm, []))
        except:
            return list(reduce(lambda x, y: x + y, [frm], []))
    def innerGetResults(webRequestDelegate, startAt = 0):
        results = webRequestDelegate(startAt)
        if isinstance(results, dict) and "total" in results and "maxResults" in results:
            if startAt + results["maxResults"] < results["total"]:
                return [results] + innerGetResults(webRequestDelegate, startAt + results["maxResults"])
            else:
                return [results]
        else:
            return [results]
    Source = flatten_reduce_lambda(innerGetResults(webRequestDelegate, startAt))
    df = pd.DataFrame(Source)
    return df

def most_frequent(List):
	try:
		c = Counter(List)
		most_common = [key for key, _ in c.most_common(5)]
		return most_common
	except:
		return ""

In [9]:
def fnGetConfluenceSpaceContentViaScan():
    def ApiCall(startAt) :
        url = "/rest/api/content"
        headers = fnGetDefaultHeaders()
        params = {
            "limit": 100,
            "expand": "space,body.view,version,container",
            "type": "page",
            "start": str(startAt)
		}
        response = requests.get(ConfluenceHost + url, headers = headers, params = params, verify=False, allow_redirects=True)
        return response.json()

    startAt = 0
    limit = 100
    request = ApiCall(startAt)
    results = []

    try:
        count = len(request["results"])
    except:
        count = 0

    while count >  0:
        for res in request["results"]:
            results.append(res)
        startAt += limit
        print ("fetching", startAt)
        try:
            request = ApiCall(startAt)
            count = len(request["results"])
        except:
            count = 0

    return results

pages = fnGetConfluenceSpaceContentViaScan()

results = "<html><head><meta name='pageCount' content='" + str(len(pages)) + "' />"
results += "<body>"

i = 0
for page in pages:
    results += "<article class='7ft10-article'><h1 class='7ft10-page-title'>" + page["space"]["key"] + " | " + page["id"] + " | " + page["title"] + "</h1>"
    results += "<section class='7ft10-section'>" + page["body"]["view"]["value"] + "</section>"
    results += "</article>"
    i += 1
    print ("printing", i)

results += "</body></html>"

print ("done")

page = BeautifulSoup(markup=results)

f = open("results.html", "w", encoding="utf-8")
f.write(page.prettify())
f.close

#display (len(results))


fetching 100
fetching 200
fetching 300
fetching 400
fetching 500
printing 1
printing 2
printing 3
printing 4
printing 5
printing 6
printing 7
printing 8
printing 9
printing 10
printing 11
printing 12
printing 13
printing 14
printing 15
printing 16
printing 17
printing 18
printing 19
printing 20
printing 21
printing 22
printing 23
printing 24
printing 25
printing 26
printing 27
printing 28
printing 29
printing 30
printing 31
printing 32
printing 33
printing 34
printing 35
printing 36
printing 37
printing 38
printing 39
printing 40
printing 41
printing 42
printing 43
printing 44
printing 45
printing 46
printing 47
printing 48
printing 49
printing 50
printing 51
printing 52
printing 53
printing 54
printing 55
printing 56
printing 57
printing 58
printing 59
printing 60
printing 61
printing 62
printing 63
printing 64
printing 65
printing 66
printing 67
printing 68
printing 69
printing 70
printing 71
printing 72
printing 73
printing 74
printing 75
printing 76
printing 77
printing 78
printing

<function TextIOWrapper.close()>