In [None]:
# Python dependencies -- REQUIREMENT: Python >=3.9, <3.12
%pip install matplotlib scipy
%pip install setuptools wheel networkx
%pip install "git+https://github.com/tournesol-app/tournesol.git@solidago-pipeline#egg=solidago&subdirectory=solidago"

# If anything was installed, restart the notebook kernel

In [None]:
# Imports
import math
import time
import colorsys
import warnings

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mtick

from datetime import datetime
from matplotlib.axes import Axes
from dateutil.relativedelta import relativedelta
from solidago.pipeline.inputs import TournesolInputFromPublicDataset

In [None]:
# Dataset object shared by every cell below (they read PUBLIC_DATASET.comparisons, .users, ...).
# NOTE(review): `download()` presumably fetches the data over the network on each run -- confirm whether it caches.
PUBLIC_DATASET = TournesolInputFromPublicDataset.download()

In [None]:
def prepare_graph_pos(graph: nx.Graph, time_to_run:int, weight_key:str) -> dict:
	"""Compute node positions for `graph` with a time-boxed spring layout.

	Nodes start on a circle, then `nx.spring_layout` is run in successive batches,
	each batch starting from the positions produced by the previous one. After each
	batch the measured speed is used to size the next one, so that the remaining
	time budget is spread over the batches that are left (at most 11 batches run).

	Args:
		graph: Graph whose nodes are laid out.
		time_to_run: Time budget in seconds; with a budget <= 0 the circular layout is returned as is.
		weight_key: Edge attribute name forwarded to `nx.spring_layout` as `weight`.

	Returns:
		The positions returned by the last layout call.
	"""
	## Preparing Graph Layout
	# perf_counter is monotonic and high-resolution: unlike time.time() it cannot go
	# backwards and is far less likely to report a zero-length batch.
	start = time.perf_counter()

	# Initially put all nodes on a circle around the center
	pos = nx.circular_layout(graph)

	iterations_count = 10
	total_iterations = 0
	timer_a = time.perf_counter()
	loops_count = 0
	while timer_a - start < time_to_run:
		loops_count += 1
		# Move nodes towards each other if connected, move them apart from each other if not connected
		pos = nx.spring_layout(graph, pos=pos, weight=weight_key, iterations=iterations_count)
		total_iterations += iterations_count
		timer_b = time.perf_counter()

		elapsed = timer_b - timer_a
		if elapsed <= 0:
			# The batch was too fast to be timed, so no speed can be estimated
			# (the original code divided by zero here). Keep the positions we have.
			break

		speed = iterations_count / elapsed
		expected_remaining_iterations = speed * (time_to_run - timer_b + start)
		print(f"Iterations: {total_iterations}/{total_iterations + expected_remaining_iterations:.0f} -- Time: {timer_b-start:.1f}/{time_to_run}s -- Speed: {speed:.1f}/s")

		# Spread what is left of the budget over the batches remaining out of 10
		remaining_batches = 10 - loops_count if loops_count < 10 else 1
		next_iteration_count = int(math.ceil(expected_remaining_iterations / remaining_batches))
		if loops_count > 10 or (next_iteration_count > iterations_count*2 and loops_count > 1):
			# Spring Layout may stop iterating if found an equilibrium: the batch then looks
			# abnormally fast. Try to detect this event and stop before max_duration
			break

		# Prepare next iteration
		iterations_count = next_iteration_count
		timer_a = timer_b

	return pos

-----

# Plot User Graph

In [None]:
# Public username whose comparisons are turned into a graph in the next cells
USER_TO_PLOT = 'NatNgs'

In [None]:
# Select the chosen user's rows first, then keep only the 'largely_recommended' criterion
all_comparisons = PUBLIC_DATASET.comparisons
comparisons_of_user = all_comparisons[all_comparisons.public_username == USER_TO_PLOT]
user_comparisons = comparisons_of_user.loc[all_comparisons.criteria == 'largely_recommended'][['entity_a', 'entity_b']]
user_comparisons_list = zip(user_comparisons.entity_a, user_comparisons.entity_b)

# One node per entity, one edge per compared pair
graph = nx.Graph()
graph.add_edges_from(user_comparisons_list)

# Only the largest connected component is kept; every other node is dropped
largest_group = max(nx.connected_components(graph), key=len)
graph.remove_nodes_from([n for n in graph.nodes if n not in largest_group])
print('Loaded', graph)

In [None]:
pos = prepare_graph_pos(graph, time_to_run=120, weight_key='spring')

# Order nodes by color (number of neighbours), so nodes are drawn from lowest to highest
colors_map = {n: len(graph[n]) for n in graph.nodes}
nodes = sorted(graph.nodes, key=colors_map.get)
sorted_graph = nx.Graph()
sorted_graph.add_nodes_from(nodes)
sorted_graph.add_edges_from(graph.edges.data())

# node color
min_c = min(colors_map.values())
max_c = max(colors_map.values())
# When every node has the same number of neighbours the range is 0:
# fall back to 1 so the normalisation below does not divide by zero.
mm_c = (max_c - min_c) or 1
print('min & max colors:', min_c, max_c)

# Make colors from red(min) to green(max)
colors = [colorsys.hsv_to_rgb((colors_map[n]-min_c)/mm_c * (128/360), .9, .9) for n in nodes]

# Prepare image
plt.box(False)
plt.clf()
plt.tight_layout()
plt.rcParams['svg.fonttype'] = 'none'
plt.rc('axes', unicode_minus=False)

# Output svg dimensions
size = (sorted_graph.number_of_nodes()+1)**0.25
print(f"Image size: {size*1.4+1:.1f}x{size+1:.1f}")
fig = plt.figure(figsize=(size*1.4+1, size+1), frameon=False)

# Axis
fig.clear()
ax = fig.add_axes([0, 0, 1, 1])
ax.axis('off')
ax.set_facecolor('#FFF') # Background color

# Node size grows linearly with the node degree, from min_display to min_display+mm_display
nodes_width = dict(graph.degree)
min_w = min(nodes_width.values())
mm_w = (max(nodes_width.values()) - min_w) or 1 # same zero-range guard as for the colors
min_display = 1
mm_display = 25 - min_display

# Silence UserWarnings before drawing rather than after, so the filter also covers the
# draw calls below. The filter is process-wide and stays active for the following cells.
warnings.filterwarnings("ignore", category=UserWarning)

nx.draw_networkx_nodes(sorted_graph,
	pos=pos,
	nodelist=nodes,
	node_size=[min_display+mm_display*(nodes_width[n]-min_w)/mm_w for n in nodes],
	node_color=colors
)

nx.draw_networkx_edges(sorted_graph,
	pos=pos,
	edge_color='#8888',
	width=0.5,
);

-----

# Active Users

In [None]:
## User count over time
def usercount_graph():
	"""Plot weekly user statistics for roughly the last year, on a log scale.

	Four series are derived from PUBLIC_DATASET.comparisons, one point per `week_date`:
	cumulative users (by week of first comparison), users active in the week or one of
	the three previous weeks, users active in the week, and users whose first comparison
	falls in the week.

	`week_date` values are compared as strings against an ISO timestamp, so they are
	expected to be ISO-formatted dates.
	"""
	comparisons = PUBLIC_DATASET.comparisons
	# `groupby(...).apply(set)` builds one set per week directly. The previous
	# `.unique().aggregate(set)` relied on Series.agg falling back to element-wise
	# application, which pandas deprecated.
	actives: dict[str, set[str]] = comparisons.groupby('week_date')['public_username'].apply(set).to_dict() # date:{user, ...}
	account_creation: dict[str, str] = comparisons.groupby('public_username')['week_date'].min().to_dict() # user: date

	# Users active during a week or any of the 3 weeks before it. The window is simply
	# shorter for the first weeks of the dataset, so every week has an entry.
	l_dates: list[str] = sorted(actives)
	activesmonth: dict[str, set[str]] = {
		d: set().union(*(actives[week] for week in l_dates[max(0, i-3):i+1]))
		for i, d in enumerate(l_dates)
	} # date:{user, ...}

	one_year_ago = (datetime.now() - relativedelta(years=1, weeks=5)).isoformat()

	dates = [d for d in l_dates if d > one_year_ago]

	# The first date only serves as the baseline for the "new users" differences,
	# which is why every plotted series starts at dates[1].
	l_total: list[int] = [sum(1 for first_week in account_creation.values() if first_week <= d) for d in dates]
	l_actives_4: list[int] = [len(activesmonth[d]) for d in dates[1:]]
	l_actives: list[int] = [len(actives[d]) for d in dates[1:]]
	l_news: list[int] = [l_total[i] - l_total[i-1] for i in range(1,len(l_total))]
	l_datetimes: list[datetime] = [datetime.fromisoformat(k) for k in dates[1:]]
	l_total.pop(0)


	fig, ax = plt.subplots()
	fig.set_size_inches(14, 6)
	ax.set_yscale('log')
	# Upper limit: the power of ten at or above the latest cumulative total
	ax.set_ylim(ymin=1, ymax=10**math.ceil(np.log10(l_total[-1])))
	ax.set_xlim(xmin=l_datetimes[1], xmax=l_datetimes[-1])
	ax.yaxis.set_ticks_position('right')
	ax.yaxis.set_major_formatter(mtick.ScalarFormatter())
	ax.yaxis.set_minor_formatter(mtick.ScalarFormatter())
	ax.yaxis.set_tick_params('minor', labelsize=7, labelcolor='gray')

	# Major x ticks on the first day of each month, minor x ticks on every plotted week
	myFmt = mdates.DateFormatter('%Y-%m')
	ax.xaxis.set_ticks(pd.date_range(one_year_ago, datetime.now(), freq='MS'))
	ax.xaxis.set_major_formatter(myFmt)
	ax.xaxis.set_ticks(l_datetimes, minor=True)
	ax.xaxis.set_tick_params('minor', color='gray')
	ax.grid(visible=True, which='major', axis='y', color='gray')
	ax.grid(visible=True, which='minor', axis='y', color='lightgray')
	ax.grid(visible=True, which='major', axis='x', color='gray')
	ax.grid(visible=True, which='minor', axis='x', color='lightgray', linestyle=':')

	# `Axes.plot` handles datetime x values itself; `Axes.plot_date` is deprecated.
	ax.plot(l_datetimes, l_total, '|--', color='blue', label='Total users (min. 1cmp total) # Public dataset only #')
	ax.plot(l_datetimes, l_actives_4, '|:', color='red', label='Monthly active users (min. 1cmp in the last 4 weeks)')
	ax.plot(l_datetimes, l_actives, '|--', color='orange', label='Weekly active users (min. 1cmp in the week)')
	ax.plot(l_datetimes, l_news, '|-', color='green', label='New users (first cmp ever)')
	ax.legend()
usercount_graph()

-----

# All users Videos vs Comparisons Scatter

In [None]:
## Users comparisons graph
plt.rc('axes', unicode_minus=False)
def users_cmp_graph():
	"""Scatter every user: x = 'videos' count (log scale), y = average comparisons per video.

	Only 'largely_recommended' comparisons are used and users with x <= 1 are dropped.
	Marker size is the number of users sharing the same (x, y) point. Users with y > 12
	or x > 5000 are labelled with their public username, and a trend fitted on log(x)
	is drawn on top.
	"""
	recom = PUBLIC_DATASET.get_comparisons(criteria='largely_recommended')
	# Each comparison appears twice, once from the point of view of each of its two videos
	mirrored = pd.concat([
		recom.rename(columns={'entity_a': 'vid', 'entity_b': 'comparedwith'}),
		recom.rename(columns={'entity_a': 'comparedwith', 'entity_b': 'vid'}),
	], ignore_index=True)

	# NOTE(review): this counts comparison rows per user, not distinct videos, although it is
	# named 'videos' and plotted as "Total number of video compared" -- confirm the intent.
	videos_per_user = recom.groupby('user_id')[['entity_a']].count().rename(columns={'entity_a': 'videos'})
	cmps_per_video_per_user = mirrored[['user_id', 'vid', 'comparedwith']].groupby(['user_id', 'vid']).count().groupby('user_id').mean().rename(columns={'comparedwith': 'averagecmps'})

	data = (videos_per_user.join(cmps_per_video_per_user, on='user_id')
	                       .join(PUBLIC_DATASET.users, on='user_id')
	                       .rename(columns={'videos': 'x', 'averagecmps': 'y'})
	                       [['public_username', 'x', 'y']]
	)
	data = data[data.x > 1]
	sizes = data.groupby(['x', 'y']).count().rename(columns={'public_username': 's'})
	# Least-squares fit of y = polyfit[0]*ln(x) + polyfit[1]
	polyfit = np.polyfit(np.log(data.x), data.y, 1)

	fig, ax = plt.subplots()
	fig.set_size_inches(8, 6)

	#ax.set_yscale('log')
	ax.set_xscale('log')
	ax.yaxis.set_major_formatter(mtick.ScalarFormatter())
	ax.xaxis.set_major_formatter(mtick.ScalarFormatter())

	ymax = data['y'].max()
	xmax = data['x'].max()
	ax.set_xlim(xmin=1, xmax=10**math.ceil(math.log10( xmax )))
	# About 10 major ticks; at least 1 so a zero y range cannot produce a zero step
	y_ticks_spacing = max(1, math.ceil(( ymax - data['y'].min() )/10))
	ax.yaxis.set_ticks(np.arange(0, (1+round(ymax/y_ticks_spacing))*y_ticks_spacing, y_ticks_spacing))
	ax.yaxis.set_ticks(np.arange(0, (1+round(ymax)), 1), minor=True)
	ax.set_axisbelow(True)

	sizes.reset_index().plot.scatter(x='x', y='y', c='blue', marker='.', s='s', label='Users', ax=ax)

	# Label only the outliers; filtering first avoids walking through every user row
	outliers = data[(data.y > 12) | (data.x > 5000)]
	for row in outliers.itertuples(index=False):
		ax.annotate(row.public_username, (row.x, row.y), fontsize=5, color="#300")

	###########################################


	# On a log x axis, a*ln(x)+b is a straight line, so two points are enough.
	# NOTE(review): the left end is pinned at y=1 for x=2 instead of the fitted value
	# polyfit[0]*ln(2)+polyfit[1], so the drawn line is not exactly the fit -- confirm this is intended.
	poly_xx = [2, xmax]
	poly_yy = [1, polyfit[0]*math.log(xmax)+polyfit[1]]
	# The legend states the formula that was actually fitted (the old text said "x/log(x)")
	ax.plot(poly_xx, poly_yy, color='#000', linewidth=1, label=f"Log. trend (y={polyfit[0]:0.2f}*log(x){polyfit[1]:+0.2f})")
	ax.set_ylim(ymin=0, ymax=math.floor(ymax)+1)


	# plt.title('How many comparisons every Tournesol users have done')
	ax.legend(loc='upper left')
	ax.set_ylabel('Average number of comparisons per video')
	ax.set_xlabel('Total number of video compared (Log. scale)')
	ax.grid(visible=True, which='major', axis='both', color='#888')
	ax.grid(visible=True, which='minor', axis='both', color='#eee')

users_cmp_graph()

-----

# User Score Histograms

In [None]:
# Public username whose comparison scores are plotted as histograms in this cell
USER='NatNgs'

def user_histogram(ax: Axes, username: str, CRITERION: str, title: bool=False):
	"""Draw on `ax` the histogram of the scores one user gave for one criterion.

	Args:
		ax: Axes to draw on.
		username: Public username whose comparisons are selected.
		CRITERION: Criterion name to select (e.g. 'largely_recommended').
		title: When True, use the detailed style (a tick on every integer, axis labels);
			otherwise use the compact style (a tick every 5, criterion name as title).

	Scores are counted in 21 unit-wide bins centred on the integers -10..10; scores
	outside [-10.5, 10.5] are not counted. A bell curve centred on 0 is overlaid
	when the scores have a non-zero spread.
	"""
	comparisons = PUBLIC_DATASET.comparisons
	votes = comparisons[(comparisons.public_username == username) & (comparisons.criteria == CRITERION)].score

	ax.set_axisbelow(True)

	ax.set_xlim(xmin=-10.5, xmax=10.5)
	ax.xaxis.set_ticks(range(-10,11,1 if title else 5))
	ax.tick_params(axis='x', length=0)

	# Count with the same fixed edges that are drawn below. Letting numpy pick 21 bins over
	# the observed min..max misplaced the bars whenever the scores did not span -10..10.
	bins = [i/10.0 for i in range(-105,106,10)]
	counts, _ = np.histogram(votes, bins=bins)
	# Scores that are multiples of 5 (-10, -5, 0, 5, 10) are drawn in a stronger color
	counts_highlights = [(b if i%5 == 0 else 0) for i,b in enumerate(counts, -10)]
	counts_others = [(b if i%5 != 0 else 0) for i,b in enumerate(counts, -10)]

	# One sample per bin (its left edge) weighted by the bin count: draws precomputed counts as bars
	ax.hist(bins[:-1], bins, weights=counts_others, align='mid', color='#0088AA')
	ax.hist(bins[:-1], bins, weights=counts_highlights, align='mid', color='#0022FF')
	for i in bins: # Plot white lines to separate columns
		ax.axvline(i, color='white')

	# plt.title('How many comparisons every Tournesol users have done')
	if title:
		# Label with the user actually plotted, not with the notebook-level USER constant
		ax.set_xlabel(CRITERION + ' by "' + username + '"')
		ax.set_ylabel('Number of comparisons')
	else:
		ax.set_title(CRITERION)

	# Plot orange bell curve: zero-centred normal density scaled by the number of votes
	# NOTE(review): the 1.25 widening factor applied to the standard deviation is unexplained.
	stdv = np.std(votes)*1.25 if len(votes) > 0 else 0.0
	if stdv > 0: # no votes or identical votes: no spread, nothing meaningful to draw
		reg_x = np.arange(-10.5, 10.5, 0.1)
		reg_y = np.exp(-np.square(reg_x/stdv)/2)/(stdv*np.sqrt(2*np.pi)) * len(votes)
		ax.plot(reg_x, reg_y, color='orange', alpha=.5)

	ax.set_ylim(bottom=0)



# Main criterion: one large, fully labelled histogram
fig, ax = plt.subplots()
fig.set_size_inches(8, 6)
user_histogram(ax, USER, 'largely_recommended', True)

# Secondary criteria: one compact histogram each, filling the 3x3 grid row by row
SECONDARY_CRITERIA = [
	'reliability', 'pedagogy', 'importance',
	'layman_friendly', 'entertaining_relaxing', 'engaging',
	'diversity_inclusion', 'better_habits', 'backfire_risk',
]
fig, ax = plt.subplots(3,3)
# Resize before tight_layout so the spacing is computed for the final figure size
fig.set_size_inches(8, 6)
for criterion_ax, criterion in zip(ax.flat, SECONDARY_CRITERIA):
	user_histogram(criterion_ax, USER, criterion)
fig.tight_layout()

-----

# Users Proximity graph

In [None]:
def users_proxim_graph(draw: bool = False):
	"""Export, and optionally draw, which users compared the same entities.

	Only 'largely_recommended' comparisons from users having at least 10 of them are
	used. 'edges.csv' is always written with columns user_a, entity, user_b: one row per
	ordered pair of users for each entity they both compared (a user paired with itself
	is included).

	Args:
		draw: When False (default) the function stops after writing the CSV. When True it
			also builds a user graph weighted by the number of shared entities and draws
			it with a Kamada-Kawai layout, nodes colored by closeness centrality.
	"""
	# PUBLIC_DATASET.comparisons is like: "public_username", "entity_a", "entity_b", with numeric index
	comparisons = PUBLIC_DATASET.comparisons
	largely_recom = comparisons[comparisons.criteria == 'largely_recommended'][['public_username', 'entity_a', 'entity_b']]
	users_comparisons = largely_recom.groupby('public_username', as_index=False).count().rename(columns={'entity_a': 'cmps'})[['public_username', 'cmps']]
	# df is a subdataset of comparisons, with only users having a minimum of comparisons.
	# The mask is built from largely_recom itself so it lines up with the rows it filters.
	active_users = users_comparisons[users_comparisons.cmps >= 10].public_username
	df = largely_recom[largely_recom.public_username.isin(active_users)]

	# Make a new dataset with one row for every pair public_username, entity.
	# Duplicates are dropped after stacking both sides: an entity a user compared on both
	# the 'a' and the 'b' side must count once, or the merge below multiplies it.
	df = pd.concat([
		df[['public_username', 'entity_a']].rename(columns={'entity_a': 'entity'}),
		df[['public_username', 'entity_b']].rename(columns={'entity_b': 'entity'}),
	]).drop_duplicates()
	# Df is now like: "public_username", "entity"

	# df2 is like "user_a", "entity", "user_b" with one row for every ordered pair of users having
	# the same entity (including user_a == user_b; those rows are filtered out further down)
	df2 = df.merge(df, on='entity', how='inner').rename(columns={'public_username_x': 'user_a', 'public_username_y': 'user_b'})

	# Export shared entities as CSV: user_a,entity,user_b
	df2.to_csv('edges.csv', index=False, header=True, sep=',')
	if not draw:
		return

	# One row per unordered pair of distinct users, with the number of entities they share
	df3 = df2.loc[df2.user_a < df2.user_b].groupby(['user_a', 'user_b']).count().reset_index().rename(columns={'entity': 'shared_entities'})

	# Draw graph having user_x as nodes, and count as the weight between user_a and user_b
	MIN_SHARED_ENTITIES=1 # Minimum value = 1 - Will ignore connections between users having less than this amount of shared entities
	max_shared_entities = df3.shared_entities.max()
	graph = nx.Graph()
	for row in df3.itertuples():
		if row.shared_entities >= MIN_SHARED_ENTITIES:
			graph.add_edge(row.user_a, row.user_b,
				shared=row.shared_entities,
				weight=row.shared_entities/MIN_SHARED_ENTITIES,
				dist=MIN_SHARED_ENTITIES/row.shared_entities,
			)
	print(graph)
	if graph.number_of_nodes() == 0:
		# Nothing to lay out or draw (min()/max() below would fail on an empty graph)
		return

	# Compute nodes positions
	print('Computing closeness centrality...')
	centrality = nx.closeness_centrality(graph, distance='dist')
	# pos = prepare_graph_pos(graph, 60, 'weight')

	print('Computing Kamada-Kawai layout...')
	# Only distances between directly connected users are provided to the layout
	pos = nx.kamada_kawai_layout(graph, center=(0,0), dist={n1:{n2:graph[n1][n2]['dist'] for n2 in graph[n1]} for n1 in graph.nodes})

	# Prepare image
	plt.box(False)
	plt.clf()
	plt.tight_layout()
	plt.rcParams['svg.fonttype'] = 'none'
	plt.rc('axes', unicode_minus=False)

	# Output svg dimensions
	size = 10
	print(f"Drawing {size*1.4:.1f}x{size:.1f} image...")
	fig = plt.figure(figsize=(size*1.4, size), frameon=False)

	# Colors of every node depends on closeness centrality of the node
	mincolor = min(centrality.values())
	maxcolor = max(centrality.values())
	color_range = (maxcolor - mincolor) or 1 # all nodes equally central: avoid dividing by zero
	# Map colors from min to max => hue 0.3 (green) down to hue 0 (red)
	colors = [colorsys.hsv_to_rgb(.3-(centrality[n]-mincolor)/color_range*.3, .9, .9) for n in graph.nodes]

	# Axis
	fig.clear()
	ax = fig.add_axes([0, 0, 1, 1])
	ax.axis('off')
	ax.set_facecolor('#FFF') # Background color


	# Center all pos around (0,0): subtract the midpoint of the bounding box
	# (the previous code subtracted half of its width/height, which shifts instead of centering)
	xs = [p[0] for p in pos.values()]
	ys = [p[1] for p in pos.values()]
	mid_x = (max(xs) + min(xs)) / 2
	mid_y = (max(ys) + min(ys)) / 2
	for n in pos:
		x = pos[n][0] - mid_x
		y = pos[n][1] - mid_y
		pos[n] = (
			# Also apply sqrt to each centered coordinate, keeping its sign
			math.copysign(math.sqrt(abs(x)), x),
			math.copysign(math.sqrt(abs(y)), y),
		)

	# Draw !
	warnings.filterwarnings("ignore", category=UserWarning)
	nx.draw_networkx_nodes(graph,
		pos=pos,
		node_size=2,
		node_color=colors,
	)
	# Alpha grows with the number of shared entities, from 1/(max-MIN+1) when equal to
	# MIN_SHARED_ENTITIES up to 1 when equal to max_shared_entities
	alpha_span = max_shared_entities - (MIN_SHARED_ENTITIES-1)
	nx.draw_networkx_edges(graph,
		pos=pos,
		edge_color='#888',
		alpha=[(shared - (MIN_SHARED_ENTITIES-1)) / alpha_span for _, _, shared in graph.edges.data('shared')],
		width=0.5,
	)

users_proxim_graph()
