# Keyword Visualisation

### Usage:
* Save the registration sheet as a .tsv file
* Run this Jupyter Notebook

### Dependencies

 * WordCloud: ```pip3 install --user wordcloud```



### Load the TSV file


In [1]:
def standardise_kw(k):
	k = k.replace("4", " for ")
	k = k.replace("domain specific", "domain-specific")
	k = k.replace("langauge", "language")
	return k


member_keywords = {}
member_affil = {}
with open("reg_form.tsv") as f:

	for line in f.readlines()[2:]:

		s = line.split("\t")
		name = s[0] + " " + s[1]
		print(name)

		if not name.strip():
			continue

		affil = s[2]
		member_affil[name] = affil

		keywords = []
		for keyword in s[6].split(","):
			keyword = keyword.strip().lower()
			if not ";" in keyword:
				keywords.append(standardise_kw(keyword))
			else:
				kws = keyword.split(";")
				for kw in kws:
					keywords.append(standardise_kw(kw))

		print(keywords)

		member_keywords[name] = keywords

Maximilian Schiedermeier
['planned model reuse', 'model composition and transformation', 'dsmls', 'restful services']
Mathieu Nassif
['software maintenance', ' software documentation', ' knowledge extraction and management']
Bentley Oakes
['model transformation', 'cyber-physical system verification', 'machine learning', 'model-driven engineering']
Michalis Famelis
['mde', 'variability', 'uncertainty', 'rationale']
Mouna  Dhaouadi
['rationale knowledge extraction and management']
Armstrong Foundjem 
['certifiability of ml algorithms', 'safety on mission-critical systems', 'mlops', 'software ecosystem sustainability']
Martin Robillard
['human-centric aspects of software engineering', 'recommendation systems for software engineering', 'software traceability', 'documentation generation', 'software architecture and design', 'privacy engineering']
Houari  Sahraoui
['sbse', 'ai for se', 'mde']
Breandan Considine
['developer tools', 'software language engineering', 'domain-specific languages']

### Do wordcloud

In [2]:
from wordcloud import WordCloud

collapsed_keywords = []
for ks in member_keywords.values():
	for k in ks:
		collapsed_keywords.append(k)

print(collapsed_keywords)

freqs = {}
for ck in collapsed_keywords:
	freqs[ck] = collapsed_keywords.count(ck)
freqs = dict(sorted(freqs.items(), key=lambda item: item[1]))
print(freqs)

# Generate a word cloud image
wordcloud = WordCloud(background_color ='white', width=2048, height=1600, relative_scaling=0.8)\

out = wordcloud.generate_from_frequencies(freqs)
out.to_file("phrases.png")

out = wordcloud.generate(" ".join(collapsed_keywords))
out.to_file("words.png")


# Display the generated image:
# the matplotlib way:
# import matplotlib.pyplot as plt
# plt.imshow(out, interpolation='bilinear')
# plt.axis("off")
# #plt.show()
# plt.savefig('phrases.png')

['planned model reuse', 'model composition and transformation', 'dsmls', 'restful services', 'software maintenance', ' software documentation', ' knowledge extraction and management', 'model transformation', 'cyber-physical system verification', 'machine learning', 'model-driven engineering', 'mde', 'variability', 'uncertainty', 'rationale', 'rationale knowledge extraction and management', 'certifiability of ml algorithms', 'safety on mission-critical systems', 'mlops', 'software ecosystem sustainability', 'human-centric aspects of software engineering', 'recommendation systems for software engineering', 'software traceability', 'documentation generation', 'software architecture and design', 'privacy engineering', 'sbse', 'ai for se', 'mde', 'developer tools', 'software language engineering', 'domain-specific languages', 're-engineering', 'mde', 'bpm', '', 'ml for se', 'sports application analysis', 'ai for se', 'software documentation', 'knowledge extraction', 'user behaviour/hci', 's

<wordcloud.wordcloud.WordCloud at 0x10f2c6b80>

Generate connection graph

In [None]:
from IPython.display import Image, display
import pydot

affil_colours = {
	'UdeM': "#006BB6",
	'ÉTS': "#EF3E45",
	'Poly': "#6AA84F",
	'UQAM': "#46BDC6",
	'McGill': "#F4B400",
	'Concordia': "#912338",
}

graph = pydot.Dot(name, graph_type='digraph')

mermaid_output = "graph TD\n"

mer_ids = {}
for member in reversed(member_keywords):
	col = affil_colours[member_affil[member]]

	text_color = 'black'
	mer_color = "#000000"
	if member_affil[member] in ["UdeM", "Concordia"]:
		text_color = 'white'
		mer_color = "#ffffff"
	n = pydot.Node(member, style="filled", color='black', fillcolor=col, fontcolor=text_color)
	#graph.add_node(n)

	member_id = member[:4].lower().strip()
	mer_ids[member] = member_id

	mermaid_output += "    " + member_id + "[" + member + "]\n"
	mermaid_output += "    style " + member_id + " fill:" + col + ",color:" + mer_color + "\n\n"

mk_list = list(member_keywords.keys())

mk_ids = {}
for i, mk in enumerate(set(collapsed_keywords)):
	if not mk:
		continue
	if freqs[mk] <= 1:
		continue

	mermaid_output += "    id" + str(i) + "(\"" + mk + "\")\n"
	mermaid_output += "    style " + "id" + str(i) + " fill:" + "#ffffff" + ",color:" + "#000000" + "\n\n"
	mk_ids[mk] = "id" + str(i)

mermaid_output += "\n"

for source_member in member_keywords:

	mer_src_member = mer_ids[source_member]

	for kw in member_keywords[source_member]:
		if not kw:
			continue
		if freqs[kw] <= 1:
			continue
		mermaid_output += "    " + mk_ids[kw] + " <--- " + mer_ids[source_member] + "\n"


	for target_member in member_keywords:

		if mk_list.index(target_member) <= mk_list.index(source_member):
			continue

		# print("Source: " + source_member)
		# print("Target: " + target_member)

		#print(member_keywords[source_member])
		sum = 0
		shared = []
		for smk in member_keywords[source_member]:
			if not smk:
				continue
			if smk in member_keywords[target_member]:
				sum += 1
				shared.append(smk)
				#print("Shared: " + smk)

		#if sum >0:
			#shared_keywords = " ".join(shared)
			#print(shared_keywords)
			#print(shared)



		# graph.add_edge(pydot.Edge(source_member, target_member, color='black', label=", ".join(shared), arrowhead="none", penwidth=sum))

filename = "connections"
# graph.write(filename + ".dot", prog='dot')
#
# (graph,) = pydot.graph_from_dot_file(filename + ".dot")
# graph.write_png(filename + '.png')

#plt = Image(filename + '.png')
#display(plt)

with open("mermaid_output.txt", 'w') as f:
	f.write(mermaid_output)

The wordcloud is exported to a [file](words.png):
![words](words.png)