# Statistiques d'utilisation des `publications`

- last_update: 2023-11-27

## objectif

Comprendre l'utilisation des publications par nos clients.

## TLDR

## setup

In [None]:
from pymongo.mongo_client import MongoClient
from utils import test_mongo_client, list_dbs, pull_db, RenderJSON
from utils import URI_CLUSTER_LOCAL, URI_CLUSTER_PROD
import pandas as pd
import plotly.express as px
import random
import seaborn as sns
import matplotlib.pyplot as plt
from enum import Enum

import plotly.express as px

## launch local instance mongodb
# launch-local-mongodb.sh

In [None]:
# Name of the MongoDB collection that get_pubs reads publications from.
# NOTE(review): although named "course", its documents are later split by `_cls`
# into several publication types — confirm this is the intended collection.
COLL_PUBLICATIONS = "course"

class PUB_TYPE(Enum):
    """Publication kinds; each value is the string compared with a document's ``_cls`` field."""
    # is_pub_category matches pub["_cls"] against these values.
    COURSE = "Publication.Course"
    SINGLE_MODULE = "Publication.SingleModule"
    ADAPTIVE = "Publication.Adaptive"
 
def get_pubs(client, db_name):
    """Return every document of the publications collection of one database.

    Args:
        client: Object indexable as ``client[db_name][collection]`` whose
            result exposes ``find`` (a Mongo client in this notebook).
        db_name: Name of the database to read.

    Returns:
        A list of the documents found; each one is tagged in place with a
        ``"db_name"`` key holding ``db_name``.
    """
    collection = client[db_name][COLL_PUBLICATIONS]
    publications = []
    for doc in collection.find({}):
        # Remember which database the publication came from.
        doc["db_name"] = db_name
        publications.append(doc)
    return publications

def get_pub_type(pub):
    """Return the type of a single publication.

    Args:
        pub: One publication document (a mapping with a ``"_cls"`` key).

    Returns:
        The value stored under ``"_cls"``.

    Raises:
        KeyError: If ``pub`` is a dict without a ``"_cls"`` key.
    """
    pub_type = pub["_cls"]
    return pub_type

def get_pub_name(pub):
    """Return the name of a single publication.

    Args:
        pub: One publication document (a mapping with a ``"name"`` key).

    Returns:
        The value stored under ``"name"``.

    Raises:
        KeyError: If ``pub`` is a dict without a ``"name"`` key.
    """
    pub_name = pub["name"]
    return pub_name


def is_pub_category(pub, cat: PUB_TYPE):
    """Tell whether a publication belongs to the given category.

    Args:
        pub: One publication document (a mapping with a ``"_cls"`` key).
        cat: The PUB_TYPE member to compare against.

    Returns:
        True when ``pub["_cls"]`` equals ``cat.value``, False otherwise.
    """
    actual_type = pub["_cls"]
    return actual_type == cat.value


def get_pub_chapters(pub):
    """Placeholder for extracting the chapters of a publication.

    Args:
        pub: One publication document (currently unused).

    Returns:
        Always None — the extraction is not implemented yet.
    """
    # TODO: implement; for now the function deliberately yields nothing.
    return None

def get_pub_modules(pub):
    """Placeholder for extracting the modules of a publication.

    Args:
        pub: One publication document (currently unused).

    Returns:
        Always None — no module list is built or returned yet.
    """
    # TODO: implement; callers currently receive None.
    return None


def cast_to_string(obj):
    """Recursively convert the leaves of a nested dict/list structure to strings.

    Args:
        obj: A dict, a list, or any other value.

    Returns:
        A new dict (same keys) or list with every nested value converted, or
        ``str(obj)`` when ``obj`` is neither a dict nor a list.
    """
    if isinstance(obj, dict):
        converted = {}
        for key, value in obj.items():
            # Keys are kept as they are; only values are converted.
            converted[key] = cast_to_string(value)
        return converted
    if isinstance(obj, list):
        return list(map(cast_to_string, obj))
    # Anything else (including tuples) is stringified as a whole.
    return str(obj)

def render_json(d, return_res=False):
    """Wrap a stringified copy of ``d`` in a RenderJSON object.

    Args:
        d: Data to render; it is first passed through cast_to_string.
        return_res: When truthy, return the RenderJSON object.

    Returns:
        The RenderJSON object if ``return_res`` is truthy, otherwise None.
    """
    rendered = RenderJSON(cast_to_string(d))
    return rendered if return_res else None

def render_json_random(l):
    """Render one randomly chosen element of ``l`` with render_json.

    Args:
        l: Non-empty sequence of items to pick from.

    Raises:
        IndexError: If ``l`` is empty (raised by ``random.choice``).
    """
    # random.choice picks the element directly; no need to draw an index first.
    render_json(random.choice(l))


from pywaffle import Waffle

def plot_waffle(cats, vals, title, is_pct=False):
    """Draw a waffle chart of ``vals`` on a 20 x 50 grid and show it.

    Args:
        cats: Category names, one per value; also sets the palette size.
        vals: Values to plot, in the same order as ``cats``.
        title: Title displayed above the chart.
        is_pct: When truthy, " %" is appended to each rounded value in the
            legend labels. Defaults to False, like the other plot helpers.
    """
    # The suffix does not depend on the item, so compute it once.
    suffix = ' %' * is_pct
    palette = sns.color_palette(palette='plasma_r', n_colors=len(cats)).as_hex()

    plt.figure(
        FigureClass=Waffle,
        rows=20,
        columns=50,
        values=vals,
        colors=list(palette),
        labels=[f"{c} ({round(v)}{suffix})" for c, v in zip(cats, vals)],
        figsize=(15, 6),
        legend={'loc': 'upper right', 'fontsize': 15},
    )
    plt.title(title, fontsize=20, pad=20)
    plt.tight_layout()
    plt.show()


def plot_treemap(cats, vals, title, is_pct=False):
    """Draw a single-level treemap of ``vals`` and show it.

    Args:
        cats: Category names, one per value.
        vals: Values that size each tile, in the same order as ``cats``.
        title: Title displayed above the chart.
        is_pct: When truthy, " %" is appended to each rounded value in the
            tile names.
    """
    # Built outside the f-string: reusing the enclosing quote character inside
    # a replacement field is a syntax error before Python 3.12.
    suffix = " %" * is_pct
    fig = px.treemap(
        names=[f"{c} ({round(v)}{suffix})" for c, v in zip(cats, vals)],
        parents=[""] * len(cats),  # every tile hangs directly under the root
        values=vals,
    )
    fig.update_traces(root_color="white", textposition='middle center', legend="legend")
    fig.update_layout(
        title=title,
        margin=dict(t=50, l=25, r=25, b=25),
        font=dict(size=20),
        title_x=0.5,
        title_y=0.95,
        showlegend=True,
    )
    fig.show()


def plot_proportion_bar(cats, vals, title, is_pct=False, show_legend=True):
    """Draw one horizontal stacked bar split by category and show it.

    Args:
        cats: Category names, one per value; used for colouring and labels.
        vals: Segment sizes, in the same order as ``cats``.
        title: Title displayed above the bar.
        is_pct: When truthy, " %" is appended to each rounded value in the
            segment labels.
        show_legend: Whether the legend is displayed.
    """
    suffix = is_pct * ' %'
    segment_labels = [f"{cat} ({round(val)}{suffix})" for cat, val in zip(cats, vals)]

    # All segments share the same empty y value so they stack on a single bar.
    fig = px.bar(
        x=vals,
        y=[""] * len(cats),
        color=cats,
        orientation="h",
        text=segment_labels,
    )
    fig.update_xaxes(visible=False, showticklabels=False)
    fig.update_yaxes(visible=False, showticklabels=False)
    fig.update_layout(
        paper_bgcolor='white',
        plot_bgcolor='white',
        title=dict(text=title, font=dict(size=30), x=0.4, y=0.95),
        legend_title=None,
        showlegend=show_legend,
    )
    fig.update_traces(insidetextanchor="middle", width=0.5)
    fig.show()


In [None]:
## Create the MongoDB client for the local cluster.
client_local = MongoClient(URI_CLUSTER_LOCAL)
# Presumably a connectivity check — see utils.test_mongo_client to confirm.
test_mongo_client(client_local)

# Production client, left disabled; uncomment to target the production cluster.
#client_prod = MongoClient(URI_CLUSTER_PROD)
#test_mongo_client(client_prod)

# `client` is the handle used by the following cells; it points at the local cluster.
client = client_local

In [None]:
## refresh databases
#_ = pull_db(URI_CLUSTER_PROD, URI_CLUSTER_LOCAL, "org_afev")

In [None]:
# Database used for single-organisation exploration.
# Note: the loops over db_org in later cells rebind db_name.
db_name = "org_amnesty"
#db_name = "org_albioma"
# Disabled exploration snippet (the helper defined above is get_pubs, not get_pub):
#pubs = get_pubs(client, db_name)
#len(pubs)

In [None]:
# Names of the databases that carry the "org_" prefix.
db_org = [
    name
    for name in (d["name"] for d in list_dbs(client))
    if name.startswith("org_")
]

# Gather the publications of every selected database into one flat list.
all_pubs = []
for db_name in db_org:
    all_pubs.extend(get_pubs(client, db_name))

# Split the publications by type.
pub_course = [pub for pub in all_pubs if is_pub_category(pub, PUB_TYPE.COURSE)]
pub_adaptive = [pub for pub in all_pubs if is_pub_category(pub, PUB_TYPE.ADAPTIVE)]
pub_single_module = [pub for pub in all_pubs if is_pub_category(pub, PUB_TYPE.SINGLE_MODULE)]

## analyse

#### quel est le nombre total d'espaces client ?

In [None]:
# Number of databases selected in db_org (names starting with "org_").
nb_db_org = len(db_org)
print(f"Nombre d'espaces client: {nb_db_org}")

#### quel est le nombre total de publications ?

In [None]:
# Re-read the publications of every selected database, then count them.
all_pubs = []
for db_name in db_org:
    all_pubs.extend(get_pubs(client, db_name))

nb_pub = len(all_pubs)
print(f"total number of publications: {nb_pub}")

### Types de publication

In [None]:
# One entry per publication: its type, as a pandas Series.
all_pub_type = pd.Series(list(map(get_pub_type, all_pubs)))
# Occurrences per type, with the type exposed as a "pub_type" column.
dist_pub_type = (
    all_pub_type
    .value_counts()
    .to_frame()
    .reset_index(names="pub_type")
)

#### quels types de publications est-ce que Didask propose ?

In [None]:
# Print each distinct publication type found in all_pub_type.
print("types de publications:")
for e in all_pub_type.unique():
    print(f"- {e}")

#### quelle est la distribution des publications par type ?

In [None]:
# Share of each type as a fraction of the total count (not multiplied by 100 here).
dist_pub_type["pct"] = dist_pub_type["count"]/dist_pub_type["count"].sum()

In [None]:
# Bare expression so the notebook displays the table.
dist_pub_type

In [None]:
# Keep the part after the first "." of each type name (e.g. "Publication.Course" -> "Course").
ccc = [c.split(".")[1] for c in dist_pub_type["pub_type"]]
# Turn the fractions into percentages for display.
vvv = dist_pub_type["pct"] * 100
plot_proportion_bar(
    cats=ccc,
    vals=vvv,
    title="Types de publication",
    is_pct=True,
    show_legend=False,
)

### mode de diffusion

- Les chiffres qui suivent sont restreints aux publications de type `Course`

#### quels sont les modes de diffusion possibles ?

In [None]:
# Diffusion mode of every Course publication that has a "diffusionMode" key;
# publications without the key are left out.
diff_modes = pd.Series([p["diffusionMode"] for p in pub_course if ("diffusionMode" in p)])
print("modes de diffusion:")
# Print each distinct mode once.
for dm in diff_modes.unique():
    print(f"- {dm}")

#### quelle est la distribution par mode de diffusion ?

In [None]:
# Count publications per diffusion mode, add each mode's share in percent
# (already multiplied by 100), and expose the mode as a "mode" column.
df_diff_modes = (
    diff_modes
    .value_counts()
    .to_frame()
    .assign(pct=lambda counts: counts["count"] / counts["count"].sum() * 100)
    .reset_index(names='mode')
)
df_diff_modes
# Disabled: reshape to long format.
#df_diff_modes = df_diff_modes.melt(id_vars = "mode")


In [None]:
# Plot the share of each diffusion mode as a single proportion bar.
cats = df_diff_modes["mode"]
# "pct" is already expressed in percent in df_diff_modes.
vals = df_diff_modes["pct"]
title = "Mode de diffusion des publications"
is_pct = True
show_legend = False
plot_proportion_bar(cats=cats, vals=vals, title=title, is_pct=is_pct, show_legend=show_legend)