In [None]:
import os
import json
import re
import datetime

import pandas as pd
import numpy as np
import pickle as pkl

from itertools import permutations
from matplotlib import pyplot as plt
from unidecode import unidecode
from google_trans_new import google_translator  

# Setup Google translator
# NOTE(review): `translator` is not referenced anywhere else in the visible part of this notebook.
translator = google_translator()

# Configuring matplotlib to output opaque images to avoid issues with Jupyter's dark mode
plt.rcParams.update({
    "axes.facecolor": "white",
    "figure.facecolor": "white",
})

# Configuration for parsing time data
# Each entry maps an output column prefix to the key path read from a user's per-sim
# time record when the time data is assembled: a one-element path reads record[key],
# a two-element path reads record[key][subkey].
selects = {
    "times": ["times"],
    "counts": ["counts"],
    "firsts": ["firsts"],
    "loc_mean": ["loc_means_stds", "mean"],
    "loc_std": ["loc_means_stds", "std"],
    "len_mean": ["elem_means_stds", "mean"],
    "len_std": ["elem_means_stds", "std"]
}

In [None]:
def json_data(path, indices, verbose=False):
    """Load every per-user JSON log found under ``path`` into a single DataFrame.

    Parameters
    ----------
    path : list of str
        Path components (joined with ``os.path.join``) of the folder that holds one
        sub-folder per session. Every file ending in ``json`` inside those sub-folders
        is read; entries of ``path`` that are not directories are skipped.
    indices : dict
        Mapping from the ``"name"`` stored in each JSON file to the value used as row
        label of the returned frame. Names missing from the mapping get a NaN label.
    verbose : bool, default False
        When true, print each session folder name with its file count divided by 4.

    Returns
    -------
    pandas.DataFrame
        One row per JSON file, sorted by row label. Columns are a two-level MultiIndex
        ``(step, field)`` where ``step`` is the integer form of the JSON top-level key
        (``"feedback"`` is stored as step 16). Empty lists are turned into missing
        values and columns that are entirely missing are dropped.
    """
    root = os.path.join(*path)
    dat = []

    # Extract data from json files
    for dn in os.listdir(root):
        session_dir = os.path.join(root, dn)

        # Stray files next to the session folders would make os.listdir() fail
        if not os.path.isdir(session_dir):
            continue

        fns = os.listdir(session_dir)

        if verbose:
            print(dn, "\t", len(fns) // 4)

        for fn in fns:
            if fn.endswith("json"):
                with open(os.path.join(session_dir, fn), "r") as f:
                    dat.append(json.load(f))

    dicts = []
    keys = set()

    # Reshaping json data: flatten {step: {field: value}} into {(step, field): value}
    for v in dat:
        reformed_dict = {}
        for outerKey, innerDict in v.items():
            if outerKey == "name":
                reformed_dict[(0, outerKey)] = innerDict
                continue
            elif outerKey == "feedback":
                outerKey = "16"

            for innerKey, values in innerDict.items():
                reformed_dict[(int(outerKey), innerKey)] = values

        # Store dict and gather all possible column names
        dicts.append(reformed_dict)
        keys |= set(reformed_dict.keys())

    # Recent pandas versions reject a set for `columns` and a plain list of tuples
    # would yield a flat Index of tuples, so build the two-level column index explicitly.
    columns = pd.MultiIndex.from_tuples(sorted(keys)) if keys else None

    df = pd.DataFrame(dicts, columns=columns)           # Create Dataframe from dicts
    df = df.sort_index(axis=1, level=0)                 # Sort columns
    df[(0, "name")] = df[(0, "name")].map(indices)      # Replace username with custom index values
    df = df.set_index((0, "name"))                      # Set it as index
    df = df.sort_index()                                # Sort index

    # Remove empty arrays; a per-column Series.map works on every pandas version,
    # unlike DataFrame.applymap, which is deprecated.
    df = df.apply(lambda col: col.map(lambda e: None if isinstance(e, list) and not e else e))
    df = df.dropna(axis=1, how='all')                   # Drop useless columns
    df.index.name = None                                # Remove index name

    return df

In [None]:
# Define paths
# Paths are kept as lists of components and joined with os.path.join(*...) at each use.
data_path = ["data"]
json_path = data_path + ["raw"]
log_path = data_path + ["parsed"]

# Define steps of sim interactions
sim_steps = [2, 3, 14]

# Read session data
# Only the listed schedule columns are kept; the session number is taken from the last
# space-separated token of the "Session" field and becomes the index.
sessions = pd.read_csv(os.path.join(*data_path, "schedule.csv"))[["Session", "Day", "Study Level", "Field of Study", "Language", "Year", "Sim Version"]]
sessions["Session"] = sessions["Session"].map(lambda s : s.split(" ")[-1]).astype(int)
sessions = sessions.set_index("Session")

# Read DB data
df = pd.read_csv(os.path.join(*data_path, "users.csv"))

# Generate group-user mapping
# Maps username -> session number, derived from the folder layout of the raw data.
groups = {}

for dn in os.listdir(os.path.join(*json_path)):
    # The session number is the second space-separated token of the folder name
    idx = int(dn.split(" ")[1])
    
    # The part of each file name before the first "-" is used as the username
    for user in set([fn.split("-")[0] for fn in os.listdir(os.path.join(*json_path, dn))]):
        groups[user] = idx

# Get session IDs
df["session"] = df["username"].map(groups)

# Drop useless data
# NOTE: dropna() removes every row holding a NaN in any column, which includes users
# whose username was not found in any session folder.
df = df.dropna().reset_index().drop(["id", "index", "test", "attempt", "created_at"], axis=1)

# NOTE(review): this copy is not read before `temp_df` is reassigned further down in this cell.
temp_df = df.copy()

# Filter out users that did not consent to data usage
# NOTE : This should not change anything since the data is supposed to be pre-filtered but is there for safety reasons
no_consent = df[df["consent"] == 0]

if len(no_consent):
    df = df[df["consent"] == 1]
    df = df.reset_index(drop=True)
    
    print("The following users were filtered out for lack of consent:")
    print(no_consent[["username", "session"]])

# Map session info
# Every per-session attribute from the schedule is copied onto the users of that session.
df["session"] = df["session"].astype(int)
df["date"] =    df["session"].map(sessions["Day"].to_dict())
df["year"] =    df["session"].map(sessions["Year"].to_dict())
df["level"] =   df["session"].map(sessions["Study Level"].to_dict())
df["field"] =   df["session"].map(sessions["Field of Study"].to_dict())
df["version"] = df["session"].map(sessions["Sim Version"].to_dict())
df["lang"] =    df["session"].map(sessions["Language"].to_dict())

# Convert date strings to datetime objects
df["date"] = pd.to_datetime(df["date"])

# Convert to 2-level index
# All DB columns are placed under the level-0 label 'Base'.
df = pd.concat({'Base': df}, axis=1)

# Join with JSON data, ignoring users with no JSON data
# `indices` maps username -> row label of the base frame; json_data() uses it as row label
# so that the join below aligns both frames on that label.
indices = {v: k for (k, v) in df[("Base", "username")].to_dict().items()}
df = df.join(json_data(json_path, indices), how="right")
# Split versions
# Rows whose version is NaN satisfy neither comparison and are therefore left out of both halves.
df_v1 = df[df[("Base", "version")] == 1]
df_v23 = df[df[("Base", "version")] > 1]

# Free space for column shift
# Removes every column whose level-0 label is in 9..15 from the version 1 half.
df_v1 = df_v1.drop(range(9, 16), axis=1, level=0)

# Shift old question logs to indices of the newer version. Correspondences are the following:
# Version 1  :                 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 -   - 8 
# Version 2+ : 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 -10 -11 -12 -13 -14 -15 -16 -17
# Columns are visited in reverse order so that a target label is always free before it is
# written: labels below 8 move up by 4, label 8 moves up by 5. Non-integer labels ("Base")
# and labels >= 9 are left alone.
for k0, k1 in df_v1.columns[::-1]:    
    if type(k0) == int and k0 < 9:
        k = k0 + 1 if k0 == 8 else k0
        k += 4
        df_v1[(k, k1)] = df_v1[(k0, k1)]
        df_v1.drop((k0, k1), axis=1, inplace=True)
        
# Shift old progress variables to indices of the newer version. Correspondences are the following:
# Version 1  : 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 -10 -11 -   -12 -13 -   -   -14
# Version 2+ : 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 -10 -11 -12 -13 -14 -15 -16 -17
# The two increments are applied one after the other, so the second threshold is tested
# on values that have already been shifted by the first one.
prog_v1 = df_v1["Base"]["progress"].copy()
prog_v1[prog_v1 > 11] += 1
prog_v1[prog_v1 > 14] += 2
df_v1[("Base", "progress")] = prog_v1

# Merge back the two halves, this triggers a warning that can be ignored
# NOTE: after this concat the rows are ordered version 1 first, then the newer versions.
df = pd.concat([df_v1, df_v23], axis=0)

# Rename "Base" to 0 for column sorting purposes, now that column 0 is gone
for k in df["Base"].columns:
    df[(0, k)] = df[("Base", k)]
    df.drop(("Base", k), axis=1, inplace=True)

# Delete empty strings and remove empty columns
df = df.replace("", np.nan)
df = df.dropna(axis=1, how="all")

# Access info about the minor status of the participants
# prefills.json is read as a dict whose values are lists of usernames.
with open(os.path.join(*data_path, "prefills.json"), "r") as f:
    prefills = json.load(f)

# Order the info like the rows of the main DataFrame, then add it to the dataframe.
# The reindexed Series follows the row ORDER of df, not its row labels, so it is written
# positionally; assigning it by label would misalign rows whenever df.index is not
# exactly 0..n-1 in order (e.g. after users without JSON data were dropped).
prefills = pd.concat([pd.Series(data=k, index=v) for k, v in prefills.items()]) \
                .reindex(index=df[(0, "username")])
df[(0, "minor")] = prefills.notnull().to_numpy()

# Create a validity column, it is true when the participant is a minor and the data has been acquired less than 31 days ago, meaning the data should not be used yet
df[(0, "invalid")] = (df[0]["date"] > datetime.datetime.now() - datetime.timedelta(31)) & df[0]["minor"]

# Reunite all ranking data of step 4
# "ranks" entries are lists converted to ints; "choices" entries are lists of strings whose
# second space-separated token is turned into an offset from the character code 65 ("A").
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.
ans_r = df[(4, "ranks")].apply(lambda l: list(map(int, l)) if type(l) == list else np.nan)
ans_c = df[(4, "choices")].apply(lambda l: [ord(elem.split(" ")[1]) - 65 for elem in l] if type(l) == list else np.nan)

# Rows without "ranks" fall back to the value derived from "choices"
df[(4, "ranks")] = ans_r.fillna(ans_c)
df = df.drop((4, "choices"), axis=1)

# Load answer correction data
# ans_map is keyed by the step number as a string; every entry has a "type" and
# type-specific fields ("init", "correct", "map", "choices") read below.
with open(os.path.join(*data_path, "ans_map.json"), "r") as f:
    ans_map = json.load(f)
    
# Define correct answer margin
margin = 0.05

# Process answer data
# The column list is snapshotted first because the loop adds and drops columns of df.
for c in [(k1, k2) for (k1, k2) in df.columns]:
    idx1 = str(c[0])
    idx2 = str(c[1])
    
    # Text cleanup
    if idx2 == "text":
        df[c] = df[c].apply(lambda e: e if type(e) == float else unidecode(e))                        # Remove accentuation
        df[c] = df[c].str.replace(r'([0-9]+),([0-9]+)', r'\1.\2', regex=True)                         # Replace commas with dots in decimal numbers
        df[c] = df[c].str.replace("\n", " ", regex=False)                                             # Remove newlines
        df[c] = df[c].str.replace("\r", " ", regex=False)                                             # Remove carriage returns
        df[c] = df[c].str.replace(" +", " ", regex=True)                                              # Remove double spaces
        df[c] = df[c].str.strip()                                                                     # Remove leading and trailing spaces
    
    # Search for answer columns
    if ans_map.get(idx1) is not None:
        q_type = ans_map[idx1]["type"]
        
        # Questions 5-8
        if idx2 == "text" and q_type == "num":
            # Set new columns
            df[(c[0], "ans")] = np.nan
            df[(c[0], "rel")] = np.nan

            # Read answer data
            init = ans_map[idx1]["init"]
            corr = ans_map[idx1]["correct"]

            # Process data: known free-text answers are first replaced through the answer map
            processed = df[c].replace(ans_map[idx1]["map"])
            # NOTE(review): the fractional group is not optional, so integer answers such as "5"
            # are not treated as numbers - confirm this is intended (otherwise append "?" to the group).
            num_mask = processed.str.match(r'^-?\d+(?:\.\d+)$').fillna(False)
            temp = num_mask.index[num_mask]

            # Generate quantitative and qualitative data columns
            # "rel" is 0 within `margin` of the correct value, else 1 above the initial value, else -1
            df.loc[temp, (c[0], "ans")] = processed[num_mask].apply(lambda n: float(n))
            df.loc[temp, (c[0], "rel")] = df[(c[0], "ans")][num_mask].apply(lambda n: 0 if abs(n - corr) < margin
                                                                                        else 1 if n > init
                                                                                        else -1)
        # Questions 9-10
        elif idx2 == "sliders" and q_type == "sliders":
            # Score = sum of the slider values weighted by the "correct" entry of the answer map
            df[(c[0], "score")] = df[c[0]][idx2].apply(pd.Series) \
                                                .astype(float) \
                                                .multiply(ans_map[idx1]["correct"]) \
                                                .sum(axis=1)
        # Questions 11 & 13
        elif idx2 == "text" and "text" in q_type:
            # Compile data from the answer mapping
            temp = pd.json_normalize([{"user": k, "text": v["text"], "r": v["res"]} for k, v in ans_map[idx1]["map"].items()], sep="_").set_index("user")
            
            # Add index level and order the rows like the existing dataset
            temp = pd.concat({c[0]: temp}, axis=1) \
                     .reindex(index=df[(0, "username")])

            # The rows now follow the row ORDER of df, so reuse its labels; resetting to
            # 0..n-1 would misalign the join whenever df.index is not exactly 0..n-1 in order.
            temp.index = df.index
            
            # Join to the existing dataset and drop duplicate text column
            df = df.join(temp)
            df = df.drop((c[0], "text"), axis=1)
            
            # Extra for question 11, compute the score
            # Score = max(r_formula, mean of the three other "r_" columns of this step)
            if q_type == "text4":
                col = (c[0], "r_formula")
                df[(c[0], "score")] = pd.concat([df[col], (df[[col for col in df.columns if col[0] == c[0] and "r_" in col[1]]].sum(axis=1) - df[col]) / 3], axis=1).max(axis=1)
            
        # Question 12
        elif idx2 == "choices" and q_type == "notes":
            # Define column labels
            cols = [(c[0], "n_" + e) for e in ["text", "formula", "table", "diagram"]]
            
            # Turn arrays of answers into 4 columns while keeping NaNs for those who never answered the question
            # (per-column Series.map replaces the deprecated DataFrame.applymap)
            df[cols] = df[c[0]][idx2].apply(pd.Series) \
                                     .apply(lambda col: col.map(lambda e: e if e is None or type(e) == float 
                                                                          else len(e) > 0))
        # Question 15
        elif idx2 == "choices" and q_type == "select":
            # Turn column of lists into a DataFrame
            temp = df[c[0]][idx2].apply(lambda e: np.nan if e == [""] * 3 else e).apply(pd.Series)
            
            # Turn text answers into categories, "I don't know" answers are all at the end of the list and we do min(8, <value>) to map "I don't know" in all languages to the same value
            ans_cols = [(c[0], "a_" + str(i)) for i in range(3)]
            ans = temp.apply(lambda col: col.map(lambda e: np.nan if type(e) == float or len(e) == 0 
                                                           else str(min(8, ans_map[idx1]["choices"].index(e)))))
            df[ans_cols] = ans
            
            # Count number of "I don't know" answers
            df[(c[0], "idk")] = (ans == "8").sum(axis=1)

            # Get answer correctness
            corr_cols = [(c[0], "c_" + str(i)) for i in range(3)]
            corr = temp.apply(lambda e: pd.Series([a == b for a, b in zip(e, ans_map[idx1]["correct"])]), axis=1)
            df[corr_cols] = corr

            # Get number of correct answers
            df[(c[0], "score")] = corr.sum(axis=1)
            
            # Set whole section row to NaN if initial value was NaN
            cols = [col for col in df.columns if col[0] == c[0] and col[1] != "time"]
            df.loc[df[c[0]]["a_0"].isnull(), cols] = np.nan
            
# Shift DataFrame columns to make room for the sim steps
# Every level-0 label x is increased by the number of entries of `sim_steps` that are <= x.
shift_steps = [0] + sim_steps
df.columns = df.columns.set_levels([x + shift_steps.index(([0] + [s for s in shift_steps if s <= x])[-1]) for x in df.columns.levels[0]], level=0)

# Shift sim steps to fit in the gaps
# The i-th sim step is moved up by i, i.e. [2, 3, 14] becomes [2, 4, 16].
sim_steps = [a + b for a, b in zip(sim_steps, range(len(sim_steps)))]
            
# Add one column per sim interaction to insert sim start times in
for step in sim_steps:
    df[(step, "time")] = np.nan

# Add sim start times to the DataFrame
# SECURITY: pickle.load can execute arbitrary code; only run this on trusted files.
# NOTE(review): df.at with a row label absent from df.index adds a new row. `indices` covers
# every user of the base frame, so confirm that a logged user dropped from df cannot reappear here.
for fn in os.listdir(os.path.join(*log_path)):                                               # Cycle through each session data file
    with open(os.path.join(*log_path, fn), "rb") as f:
        while True:                                                                          # Run as long as there are Dataframes in the file
            try:
                un, dfs = pkl.load(f)                                                        # Extract one (username, mapping of sim logs) record
                
                try:
                    for step, elem in zip(sim_steps, dfs.values()):                          # Execute for each sim step
                        try:
                            # First "time" entry divided by 1000 (presumably ms -> s, TODO confirm)
                            df.at[indices[un], (step, "time")] = elem["time"][0] / 1000      # Write time in column
                        except TypeError:                                                    # Exception: Sim was never started
                            pass
                except KeyError:                                                             # Exception: User is not in the DB
                    pass
            except EOFError:                                                                 # Exception: No more users to process for this session
                break
                
# Sort columns
df = df.sort_index(axis=1, level=0)

# Extract time columns and compute total time
# Total time = latest minus earliest timestamp over all steps of a row.
time_cols = [col for col in df.columns if "time" in col[1]]
df_time = df[time_cols]
tot = df_time.max(axis=1) - df_time.min(axis=1)

# Compute time differences
# After diff + shift(-1), the column of step i holds timestamp(next step) - timestamp(step i),
# i.e. the time spent on step i; the last column has no successor and is dropped.
df_time = df_time.diff(axis=1).shift(-1, axis=1).drop(df_time.columns[-1], axis=1)
df_time.columns = df_time.columns.droplevel(1)
df_time = pd.concat({"time": df_time}, axis=1)

# Store total time, recorded time and lost time
# "lost" is the part of the total span that is not covered by the summed per-step durations.
aggr = df_time.sum(axis=1)
df[("time", "total")] = tot
df[("time", "aggregate")] = aggr
df[("time", "lost")] = tot - aggr

# Store sim-specific time aggregates, and set to NaN for version 1
aggr_sim = df_time["time"][sim_steps].sum(axis=1)
aggr_sim[df[0]["version"] == 1] = np.nan
df[("time", "sim_aggregate")] = aggr_sim
df[("time", "sim_percent")] = aggr_sim / aggr

# Join back on main DataFrame and drop timestamps
# The raw timestamp columns are replaced by the per-step durations stored under ("time", step).
df = df.drop(time_cols, axis=1)
df = df.join(df_time)

# Extract confidence columns and convert to float
# Per-column Series.map is used instead of DataFrame.applymap, which is deprecated in
# recent pandas versions; the element-wise float() conversion is unchanged.
conf_cols = [col for col in df.columns if "conf" == col[1]]
df_conf = df[conf_cols].apply(lambda col: col.map(float))

# Change column names: (step, "conf") becomes ("conf", step)
df_conf.columns = df_conf.columns.droplevel(1)
df_conf = pd.concat({"conf": df_conf}, axis=1)

# Compute average confidence, store back in main DataFrame and drop old confidence columns
df_conf[("conf", "avg")] = df_conf.mean(axis=1)
df = df.drop(conf_cols, axis=1)
df = df.join(df_conf)

# Turn feedback values into numbers
# Feedback columns are the ones whose second-level label is a string containing "s_"
fdbk_cols = [col for col in df.columns if type(col[1]) == str and "s_" in col[1]]
df[fdbk_cols] = df[fdbk_cols].apply(lambda col: col.map(float))

# Extract time data
# `elems` ends up as {uid: {idx: data}}, built from the (uid, idx, data) records that are
# pickled one after the other in time_data.pkl.
# SECURITY: pickle.load can execute arbitrary code; only run this on trusted files.
elems = dict()

with open(os.path.join(*data_path, "time_data.pkl"), "rb") as f:
    try:
        while True:
            uid, idx, data = pkl.load(f)
            
            if elems.get(uid) is None:
                elems[uid] = dict()
                
            elems[uid][idx] = data
    except EOFError:
        # End of file reached: every record has been read
        pass
    
# Assemble it into a DataFrame
# For each of the 3 sim interactions, one block of columns per entry of `selects` is built
# (labelled "<key>_<sim index>") and joined onto the main DataFrame.
for sim_idx in range(3):
    time_data = []
    
    for k, v in selects.items():
        # Select depth: a two-element path reads a nested value, a one-element path a direct one.
        # Users with no record for this sim are left out.
        if len(v) == 2:
            temp_df = pd.DataFrame({uid: e.get(sim_idx)[v[0]][v[1]] for uid, e in elems.items() if e.get(sim_idx) is not None})
        else:
            temp_df = pd.DataFrame({uid: e.get(sim_idx)[v[0]] for uid, e in elems.items() if e.get(sim_idx) is not None})

        # Transpose (one row per user) and add index level
        temp_df = temp_df.T
        temp_df = pd.concat({k + "_" + str(sim_idx): temp_df}, axis=1)
        time_data.append(temp_df)

    # Concatenate DataFrames and order the rows like the main dataframe
    time_df = pd.concat(time_data, axis=1) \
                .reindex(index=df[(0, "username")])

    # The rows now follow the row ORDER of df, so reuse its labels before joining; resetting
    # to 0..n-1 would misalign the join whenever df.index is not exactly 0..n-1 in order.
    time_df.index = df.index
    
    df = df.join(time_df)
    
# Keep only the rows whose "invalid" flag is False (see the validity column created above in this cell)
df = df[df[0]["invalid"] == False]

# Export clean data
with open(os.path.join(*data_path, "post_test.pkl"), "wb") as f:
    pkl.dump(df, f)
    
# Display the final DataFrame as the cell output
df

In [None]:
# Inspect the distinct first-level column labels of the processed DataFrame
df.columns.levels[0]

In [None]:
# Inspect the columns stored under the first-level label 0 (the former "Base" columns)
df[0].columns

In [None]:
# Inspect the columns stored under the first-level label 19
df[19]

In [None]:
# Total number of columns of the processed DataFrame
len(df.columns)

# OLD

In [None]:
# Legacy loader: read every raw JSON log into `dat`, keyed by the "name" stored in the file.
path = ["data", "raw"]
dat = dict()

# Session folders are visited in numeric order of the last space-separated token of their name
for dn in sorted(os.listdir(os.path.join(*path)), key=lambda n : int(n.split(" ")[-1])):
    keys = None
    fns = os.listdir(os.path.join(*path, dn))
    print(dn, "\t", len(fns) // 4)
    
    for fn in fns:
        if fn.endswith("json"):
            username = fn.split("-")
            with open(os.path.join(*path, dn, fn), "r") as f:
                n_keys = json.load(f)

            # Logs without a usable "name" entry are skipped on purpose; only the errors
            # such a lookup can raise are ignored, instead of every exception.
            try:
                dat[n_keys["name"]] = n_keys
            except (KeyError, TypeError):
                pass

In [None]:
# Inspect the "time" columns of the current `df`
df["time"]

In [None]:
# Legacy pipeline: rebuild a flat (single-level) `df` from the user DB and the raw JSON logs.
# NOTE: this overwrites the `df` produced by the main processing cell and relies on `groups`,
# `sessions` and `dat` defined in earlier cells.
df = pd.read_csv("data/users.csv")
df["Session"] = df["username"].map(groups)
df = df.dropna().reset_index().drop(["id", "index", "test", "attempt", "created_at"], axis=1)
df = df[df["consent"] == 1]
df["Session"] = df["Session"].astype(int)

# Keep a reference to the frame before the progress filter below
uu = df

#df["year"] = df.rename(index=sessions.set_index('Session')['year']).index
df["year"] = df["Session"].map(sessions["Year"].to_dict())
#df["field"] = df["Session"].map(sessions["Field"].to_dict())
#df = df[df["Session"] > 2]

# TEMPORARY
# Only keep users whose progress value is above 15
df = df[df["progress"] > 15]

df = df.reset_index(drop=True)

# Attach the raw JSON log of every user (None when no log was loaded for that username)
df["json"] = df["username"].apply(lambda n: dat.get(n))

# Steps whose timestamps and confidences are read from the JSON logs (step 4 is not included)
keys = ['1','2','3','5','6','7','8','9','10','11','12','13','14','15']

#time_abs = [test_data[k]["time"] - test_data["1"]["time"] for k in keys]
#time_rel = [a - b for a, b in zip(time_abs[1:], time_abs[:-1])]

# time_abs: timestamp of every step relative to step "1"; time_rel: difference between consecutive entries
df["time_abs"] = df["json"].apply(lambda d : [d[k]["time"] - d["1"]["time"] for k in keys])
df["time_rel"] = df["time_abs"].apply(lambda d : [a - b for a, b in zip(d[1:], d[:-1])])

df["avg_time"] = df["time_abs"].apply(lambda d : d[-1] / len(d))
df["time"] = df["time_abs"].apply(lambda d : d[-1])

# Confidence of every step (None where the step has no "conf" entry)
df["conf"] = df["json"].apply(lambda d : [d[k].get("conf") for k in keys])

# NOTE(review): the sum of the available confidences is always divided by 6, whatever the
# number of answers actually present - presumably 6 questions carry a confidence, confirm.
df["avg_conf"] = df["conf"].apply(lambda d : sum([int(e) for e in d if e]) / 6)

# Feedback scores, converted to numbers
df[["fun", "hard"]] = pd.json_normalize(df["json"].apply(lambda d: d.get("feedback", {})))[["s_entertain", "s_difficult"]]
df["fun"] = pd.to_numeric(df["fun"])
df["hard"] = pd.to_numeric(df["hard"])
df

In [None]:
def proc_json(elem):
    """Flatten one raw JSON log into a single-level dict.

    Every ``elem[section][field]`` value becomes ``"<section>_<field>"`` in the result.
    The ``"name"`` entry is skipped, and so are values equal to the empty list.
    """
    return {
        section + "_" + field: value
        for section, fields in elem.items()
        if section != "name"
        for field, value in fields.items()
        if value != []
    }
                
# Example: flatten the JSON log stored at row label 40 of the legacy `df`
proc_json(df["json"][40])

In [None]:
# Flatten every JSON log with proc_json() and append the resulting columns to the legacy frame
df_json = pd.json_normalize(df["json"].apply(lambda elem : proc_json(elem)))
df_extended = pd.concat([df.drop("json", axis=1), df_json], axis=1)



In [None]:
# Scatter of the time spent on every step (one dot per participant) with the per-step mean
# drawn as a black square. Uses the ("time", step) columns of the processed DataFrame.
times = df["time"][range(1,20)]
means = times.mean()

fig = plt.figure(figsize=(16, 9))
d_min = 0 - 0.5
d_max = 12 + 0.5

for i in times.columns:
    ax = plt.scatter([i,] * len(times[i]), times[i], s=2)
    ax = plt.scatter([i,], [means[i],], s=50, c="black", marker=",")

# NOTE(review): the x range stops at 12.5 although steps 1 to 19 are plotted - confirm.
plt.xlim(d_min, d_max)
plt.grid(None)
        
plt.xlabel("Step in the experiment")
plt.ylabel("Time spent (seconds)")
# This figure is not split by year, so the title no longer claims it is
fig.suptitle("Time spent per step in the experiment", fontsize=20)
fig.savefig("plots/time_per_step.png")

In [None]:
# Grouped histogram of the per-step durations: one set of 20-bin bars per step, each set
# shifted sideways by a fraction of the bin width of the first step.
times = pd.DataFrame(df["time_rel"].to_list())

fig, ax = plt.subplots(figsize=(16, 9))

# Bar width derived from the bin width of the first step's histogram
a_heights, a_bins = np.histogram(times[0], bins=20)
width = (a_bins[1] - a_bins[0])/30

# NOTE: every step gets its own bin edges, so bars of different steps do not share the same x bins
for i in range(len(times.iloc[0])):
    b_heights, b_bins = np.histogram(times[i], bins=20)
    ax.bar(b_bins[:-1] + i * width, b_heights, width=width)
    
# Display the bin counts of the last step as the cell output
b_heights

In [None]:
# Scatter of the time spent on every step (one dot per participant) with the per-step mean
# drawn as a black square. Uses the "time_rel" lists of the legacy frame.
fig = plt.figure(figsize=(16, 9))
d_min = 0 - 0.5
d_max = 12 + 0.5

times = pd.DataFrame(df["time_rel"].to_list())
means = times.mean()

for i in range(len(means)):
    ax = plt.scatter([i,] * len(times[i]), times[i], s=2)
    ax = plt.scatter([i,], [means[i],], s=50, c="black", marker=",")

plt.xlim(d_min, d_max)
plt.grid(None)
        
plt.xlabel("Step in the experiment")
plt.ylabel("Time spent (seconds)")
# This figure is not split by year, so the title no longer claims it is
fig.suptitle("Time spent per step in the experiment", fontsize=20)
fig.savefig("plots/time_per_step.png")

In [None]:
# Same scatter as the overall time-per-step plot, with one panel per study year.
fig, ax = plt.subplots(3, 1, figsize=(16, 9))
d_min = 0 - 0.5
d_max = 12 + 0.5

for j, (a, year) in enumerate(zip(ax, sorted(df["year"].unique()))):
    times = pd.DataFrame(df[df["year"] == year]["time_rel"].to_list())
    means = times.mean()

    for i in range(len(means)):
        a.scatter([i,] * len(times[i]), times[i], s=2)
        a.scatter([i,], [means[i],], s=50, c="black", marker=",")

    a.set_xlim(d_min, d_max)
    a.set_title(year)
    # Horizontal grid lines only: the first positional argument of grid() is the
    # visibility flag, so the axis has to be passed by keyword.
    a.grid(True, axis="y")
    
    # Only the bottom panel keeps its x ticks
    if j < 2:
        a.set_xticks([])
        
ax[2].set_xlabel("Step in the experiment")
ax[1].set_ylabel("Time spent (seconds)")
fig.suptitle("Time spent per step in the experiment by year", fontsize=20)
fig.savefig("plots/time_per_step_by_year.png")

In [None]:
# Same scatter as the overall time-per-step plot, with one panel per gender code.
fig, ax = plt.subplots(2, 2, figsize=(16, 9))
d_min = 0 - 0.5
d_max = 12 + 0.5

for j, (a, gender) in enumerate(zip(ax.flatten(), sorted(df["gender"].unique()))):
    times = pd.DataFrame(df[df["gender"] == gender]["time_rel"].to_list())
    means = times.mean()

    for i in range(len(means)):
        a.scatter([i,] * len(times[i]), times[i], s=2)
        a.scatter([i,], [means[i],], s=50, c="black", marker=",")

    a.set_xlim(d_min, d_max)
    # Label the panel from the gender code itself (1..4) rather than from the loop position,
    # so the titles stay right when one of the codes is absent from the data.
    a.set_title(["Male", "Female", "Other", "Unspecified"][int(gender) - 1])
    # Horizontal grid lines only: the first positional argument of grid() is the
    # visibility flag, so the axis has to be passed by keyword.
    a.grid(True, axis="y")
    
    # Only the bottom row keeps its x ticks
    if j < 2:
        a.set_xticks([])

    if j % 2 == 0:
        a.set_ylabel("Time spent (seconds)")

    if j > 1:
        a.set_xlabel("Step in the experiment")

fig.suptitle("Time spent per step in the experiment by gender", fontsize=20)
fig.savefig("plots/time_per_step_by_gender.png")

In [None]:
# Grouped histogram of the confidence given on each question.
# Positions 3, 4, 5, 6, 9 and 13 of the "conf" lists are kept; missing confidences count as 0.
confs = pd.DataFrame(df["conf"].to_list())[[3, 4, 5, 6, 9, 13]].fillna(0).astype(int)

fig, ax = plt.subplots(figsize=(16, 9))

# Bin edges are taken from the first kept column and shared by all questions
a_heights, a_bins = np.histogram(confs[3], bins=10)
width = (a_bins[1] - a_bins[0])/10

for i, cat in enumerate([3, 4, 5, 6, 9, 13]):
    b_heights, b_bins = np.histogram(confs[cat], bins=a_bins)
    # Bars of the same bin are drawn side by side; the legend number is min(10, cat - 1)
    ax.bar(b_bins[:-1] + i * width + 2.5, b_heights, width=width, label="Question " + str(min(10, cat - 1)))
    
ax.legend()
ax.set_title("Confidence per question", fontsize=20)
ax.set_xlabel("Confidence (%)")
ax.set_ylabel("Number of people")
fig.savefig("plots/conf_per_q.png")

In [None]:
# Grouped histogram of the confidence given on each question, one panel per gender code.
fig, ax = plt.subplots(2, 2, figsize=(16, 9))
    
d_min, d_max = 0, 100
binwidth = (d_max - d_min) / 10
bins = np.arange(d_min, d_max + binwidth, binwidth)

for i, (a, gender) in enumerate(zip(ax.flatten(), sorted(df["gender"].unique()))):
    # Positions 3, 4, 5, 6, 9 and 13 of the "conf" lists are kept; missing confidences count as 0
    confs = pd.DataFrame(df[df["gender"] == gender]["conf"].to_list())[[3, 4, 5, 6, 9, 13]].fillna(0).astype(int)
    
    # Bin edges are taken from the first kept column of this subset and shared by all questions
    a_heights, a_bins = np.histogram(confs[3], bins=10)
    width = (a_bins[1] - a_bins[0])/10
    
    for j, cat in enumerate([3, 4, 5, 6, 9, 13]):
        b_heights, b_bins = np.histogram(confs[cat], bins=a_bins)
        a.bar(b_bins[:-1] + j * width + 2.5, b_heights, width=width, label="Question " + str(min(10, cat - 1)))

    a.set_xlim(0, 100)
    # Label the panel from the gender code itself (1..4) rather than from the loop position,
    # so the titles stay right when one of the codes is absent from the data.
    a.set_title(["Male", "Female", "Other", "Unspecified"][int(gender) - 1])

    if i % 2 == 0:
        a.set_ylabel("Number of people")

    if i > 1:
        a.set_xlabel("Confidence (%)")
        
ax[0][1].legend()
fig.suptitle("Confidence per question by gender", fontsize=20)
fig.savefig("plots/conf_per_q_by_gender.png")

In [None]:
# Grouped histogram of the confidence given on each question, one panel per study year.
fig, ax = plt.subplots(3, 1, figsize=(16, 9))
    
# NOTE(review): `bins` is computed here but the histograms below use `a_bins` instead.
d_min, d_max = 0, 100
binwidth = (d_max - d_min) / 10
bins = np.arange(d_min, d_max + binwidth, binwidth)

for i, (a, year) in enumerate(zip(ax, sorted(df["year"].unique()))):
    # Positions 3, 4, 5, 6, 9 and 13 of the "conf" lists are kept; missing confidences count as 0
    confs = pd.DataFrame(df[df["year"] == year]["conf"].to_list())[[3, 4, 5, 6, 9, 13]].fillna(0).astype(int)
    
    # Bin edges are taken from the first kept column of this subset and shared by all questions
    a_heights, a_bins = np.histogram(confs[3], bins=10)
    width = (a_bins[1] - a_bins[0])/10
    
    for j, cat in enumerate([3, 4, 5, 6, 9, 13]):
        b_heights, b_bins = np.histogram(confs[cat], bins=a_bins)
        a.bar(b_bins[:-1] + j * width + 2.5, b_heights, width=width, label="Question " + str(min(10, cat - 1)))

    a.set_xlim(0, 100)
    a.set_title(year)
    
    # Only the bottom panel keeps its x ticks
    if i < 2:
        a.set_xticks([])
        
ax[2].set_xlabel("Confidence (%)")
ax[1].set_ylabel("Number of people")
ax[0].legend()
fig.suptitle("Confidence per question by year", fontsize=20)
fig.savefig("plots/conf_per_q_by_year.png")

1 - intro  
2 - sim 1  
3 - sim 2  
4 - Q ranking  
5 - Q text  
6 - Q text  
7 - Q text  
8 - Q text  
9 - Q sliders  
10 - Q sliders  
11 - Q text  
12 - Q checkboxes  
13 - Q text  
14 - sim 3  
15 - Q dropdown  
16 - feedback  
17 - end  

In [None]:
# Line plot of the mean cumulative time (relative to step "1") at each step, over all users
s = df["time_abs"]
pd.DataFrame.from_dict(dict(zip(s.index, s.values))).transpose().mean().plot()

In [None]:
# Bar plot of the mean duration between consecutive steps, over all users
s = df["time_rel"]
pd.DataFrame.from_dict(dict(zip(s.index, s.values))).transpose().mean().plot(kind="bar")

In [None]:
#UNUSED FOR NOW
# NOTE(review): despite the marker above, `params` is iterated by the two histogram cells that follow.
# Each entry is (display name, column of the legacy frame, unit shown on the axis label).
params = [
    ("Confidence", "avg_conf", "%"),
    ("Enjoyment", "fun", "%"),
    ("Difficulty", "hard", "%"),
    ("Total time", "time", "seconds")
]

In [None]:
# One figure per entry of `params`: histogram of the value, split into one panel per study year.
for name, ref, unit in params:
    # 40 equal-width bins spanning the observed range of the column
    d_min, d_max = df[ref].min(), df[ref].max()
    binwidth = (d_max - d_min) / 40
    bins = np.arange(d_min, d_max + binwidth, binwidth)
    
    fig, ax = plt.subplots(3, 1, figsize=(12, 9))
    df[ref].hist(by=df["year"], ax=ax, bins=bins, width=binwidth / 1.5)

    for i, a in enumerate(ax):
        a.set_xlim(0, df[ref].max())
        a.set_ylabel("Number of people")
        
        # Only the bottom panel keeps its x ticks
        if i < 2:
            a.set_xticks([])

    ax[2].set_xlabel(name + " (" + unit + ")")
    fig.suptitle(name + ', by year', fontsize=20)
    fig.savefig("plots/" + ref + "_by_year.png")

In [None]:
# One figure per entry of `params`: histogram of the value, split into one panel per gender.
for name, ref, unit in params:
    # 20 equal-width bins spanning the observed range of the column
    d_min, d_max = df[ref].min(), df[ref].max()
    binwidth = (d_max - d_min) / 20
    bins = np.arange(d_min, d_max + binwidth, binwidth)
    
    fig, ax = plt.subplots(2, 2, figsize=(12, 9))
    # Gender codes 1..4 are turned into their labels before grouping
    df[ref].hist(by=df["gender"].apply(lambda d : ["Male", "Female", "Other", "Unspecified"][int(d)-1]), ax=ax, bins=bins, width=binwidth / 1.5)

    for i, a in enumerate(ax.flatten()):
        a.set_xlim(0, df[ref].max())
        
        if i % 2 == 0:
            a.set_ylabel("Number of people")

        if i > 1:
            a.set_xlabel(name + " (" + unit + ")")

    fig.suptitle(name + ', by gender', fontsize=20)
    fig.savefig("plots/" + ref + "_by_gender.png")

In [None]:
from bokeh import palettes
from bokeh.io import output_file
from bokeh.plotting import figure, save, reset_output
from bokeh.models import ColumnDataSource, GroupFilter, CDSView, HoverTool

In [None]:
# Extract the step "4" entry of every JSON log (None when the log has no such step)
x = pd.DataFrame(df["json"].apply(lambda d : d.get("4")))
# NOTE: the count of missing entries computed on the next line is not displayed; only the
# last expression of the cell (the total number of rows) is shown.
len([y for y in x["json"].to_list() if y is None])
len(x)

In [None]:
# Distribution of the "progress" value in the legacy frame referenced by `uu`
uu["progress"].value_counts()

In [None]:
# Histogram of the row labels of the users whose step "4" log has no ranking
# (rows where the string form of "ranks" is "None")
temp = pd.DataFrame(df["json"].apply(lambda x : x.get("4", None)).to_dict()).transpose()
pd.Series(temp[temp["ranks"].astype(str) == "None"].index).hist(bins=129)

In [None]:
# Spot check: print the study year and show the feedback of the user at position 122
idx = 122
print(df.iloc[idx]["year"])
df.iloc[idx]["json"]["feedback"]

In [None]:
# Contingency table of gender against study year
gy = pd.crosstab(df.gender,df.year)
# NOTE(review): the four labels are assigned by position, which assumes that all four
# gender codes are present in the data - confirm.
gy.index = ["Male", "Female", "Other", "Unspecified"]
gy

In [None]:
# Show the properties (e.g. colors) of matplotlib's current default property cycle
plt.rcParams['axes.prop_cycle'].by_key()

In [None]:
# Grouped bar chart: number of participants of each gender for every study year.
# NOTE: this cell reads df[0]["gender"] and df[0]["year"], i.e. it expects the two-level
# DataFrame produced by the main processing cell.
fig, ax = plt.subplots(figsize=(16, 9))

gender_labels = {
    1 : "Male",
    2 : "Female",
    3 : "Other",
    4 : "Unspecified"
}

years = ["1st", "2nd", "3rd"]
cmap = plt.rcParams['axes.prop_cycle'].by_key()["color"]

# Each year gets a group of bars of total width `group_width`, split evenly between genders
group_width = 0.75
bar_width = group_width / len(gender_labels)

for i, c in enumerate(cmap[:4]):
    # Number of participants with gender code i + 1, per year label
    heights = df[df[0]["gender"] == i + 1][0]["year"].value_counts()
    # Position of every year label inside `years`
    bins = heights.index.map(years.index)
    
    ax.bar(bins + (i + 0.5) * bar_width - group_width / 2 + 1, heights, width=bar_width, label=gender_labels[i + 1])
    
    # Write the count above every bar, in the color taken from the default cycle
    for x, y in zip(bins, heights):
        ax.text(x + (i + 0.5) * bar_width - group_width / 2 + 1, y + 0.5, y, ha="center", c=c, size=16)
    
plt.xticks(ticks=range(1, len(years) + 1), labels=years)
ax.set_xlim(0.5, len(years) + 0.5)
ax.set_xlabel("Year of Study", size=16)
ax.set_ylabel("Number of people", size=16)
ax.legend(prop={'size': 16})
#fig.suptitle("Repartition of gender per study year", fontsize=20)
fig.savefig("plots/gender_by_year.png")

In [None]:
#ans_r = df_extended["4_ranks"].apply(lambda l : list(map(int, l)) if type(l) == list else np.NaN)
#ans_c = df_extended["4_choices"].apply(lambda l : [ord(elem.split(" ")[1]) - 65 for elem in l] if type(l) == list else np.NaN)
#ans = pd.DataFrame(ans_r.fillna(ans_c))

# Ranking answers are read from the (6, "ranks") column of the two-level DataFrame
ans = df[6]["ranks"]

def make_autopct(values):
    """Build a label formatter for pie wedges showing the percentage and the absolute count.

    ``values`` are the wedge sizes. The returned callable receives a percentage and
    returns ``"<pct with 2 decimals>%  \\n(<count>)"``, where the count is the
    percentage of ``sum(values)`` rounded to the nearest integer.
    """
    def my_autopct(pct):
        # Recover the absolute count behind the percentage
        count = int(round(pct * sum(values) / 100.0))
        return '{p:.2f}%  \n({v:d})'.format(p=pct, v=count)

    return my_autopct

# One pie per study year: share of participants whose ranking answer was collected.
# NOTE: relies on `ans`, `years`, `df_extended` and `make_autopct` from earlier cells.
fig, ax = plt.subplots(1, 3, figsize=(16, 6))

for i, y in enumerate(years):
    # value_counts() orders its result by frequency, so the slice order is fixed explicitly
    # (collected first, missing second) to keep colors and legend consistent across pies.
    values = ans[df_extended["year"] == y].isnull().value_counts().reindex([False, True], fill_value=0)
    
    ax[i].pie(values,
        autopct=make_autopct(values),
        shadow=True,
        startangle=90)

    ax[i].set_title(y + " year")
fig.legend(["Collected", "Non collected"], prop={'size': 16})
#fig.suptitle("Success of the ranking collection", fontsize=20)
fig.savefig("plots/ranking_success.png")

In [None]:
# For every study year, list each submitted ranking with its count; an arrow flags good_rank
for year, year_rows in rank_choices.groupby("year"):
    print(year, "year")

    for ranking, same_ranking in year_rows.groupby([0, 1, 2, 3]):
        marker = "<-------" if ranking == good_rank else ""
        print("\t", ranking, len(same_ranking), marker)

ordre largeurs 0 3 1 2 / 2 1 3 0
ordre concentrations 2 1 0 3 / 3 0 1 2

no order 8 permutations
vert vert rouge rouge 4 permutations
rouge rouge vert vert 2 0 1 3 / 0 2 3 1
0 2 1 3
bonne solution 2 0 3 1

OTHER FACTORS  

Concepts à comprendre :
Couleur (ORDRES DE MAGNITUDE PLUS IMPORTANT)
Largeur de bécher
Concentration

[2] = 4 / 3 * [0] >>>>>>>>>>>>>> [3] = 9 / 8 * [1]
 
SEC = privilégier la concentration par rapport à la couleur ; la largeur n'est pas prise en compte

COULEUR
- OK  
    CONCENTRATION
    - OK
        (2, 0, 3, 1)  21
    - INV
        (0, 2, 1, 3)  13
    - OTHER
        (2, 0, 1, 3)  11  
        (0, 2, 3, 1)  23
    - SEC
        (0, 3, 2, 1)   6
- INV  
    CONCENTRATION
    - OK
        (3, 1, 2, 0)  14
    - INV
        (1, 3, 0, 2)   1
    - OTHER
        (3, 1, 0, 2)   9  
        (1, 3, 2, 0)   4
    - SEC
        (3, 0, 1, 2)  17
- SEQ (looked at b * c only), similar to ok_inv et inv_ok
    - OK
        (3, 2, 1, 0)   5
    - INV
        (0, 1, 2, 3)   5
- SHADE ONLY
    - CONS
        - OK
            (2, 1, 0, 3)   2  
        - INV
            (1, 2, 3, 0)   1  
            
    - INCONS
        - OK
            (0, 3, 1, 2)   7  
            (3, 0, 2, 1)   4  
        - INV
            (2, 1, 3, 0)   6  
            (1, 2, 0, 3)   3  
- OTHER  
    (0, 1, 3, 2)       2  
    (1, 0, 2, 3)       1  
    (1, 0, 3, 2)       3  
    (2, 3, 0, 1)       2  
    (2, 3, 1, 0)       3  
    (3, 2, 0, 1)       3  

Tester des groupements en ignorant certains facteurs
Il y a bcp de variables

(2, 0, 1, 3) Mauvais calcul sur les verts
(3, 1, 0, 2) Ordre couleur inversé, sombres avant
(0, 2, 1, 3) Ordre couleur juste, sombres à l'extérieur au lieu de l'intérieur
(3, 1, 2, 0) Erreur de mettre les verts devant les rouges, les clairs au milieu, exact inverse de la bonne réponse
(3, 0, 1, 2) Erreur de favoriser l'opacité avant la couleur, mis les sombres d'abord puis les verts d'abord
(0, 2, 3, 1) Erreur de mettre les sombres systématiquement avant, ordre couleurs juste
(2, 0, 3, 1) Tout juste

Comprendre que les rouges vont devant car beaucoup plus d'absorption
Prendre en compte largeur et concentration

Green laser (520 nm)

2 5.0cm 400mM 2000 Light Red    13.4
0 2.5cm 600mM 1500 Dark Red     5.4
3 3.0cm 750mM 2250 Dark Green   
1 4.0cm 500mM 2000 Light Green

Check table by Jade

Comparer la justesse du ranking selon qu'ils ont utilisé la transmittance ou non

In [None]:
# Rankings grouped by the reasoning attributed to them in the notes above.
# Outer key: how the colour order was handled ("ok", "inv" = inverted, "seq" = looked at b * c only).
# Inner key: how concentration was handled ("ok", "inv", "sec" = concentration favoured over
# colour, "other"). Each value lists the ranking tuples assigned to that combination.
reasonings = {
    "ok" : {
        "ok" : [(2, 0, 3, 1)],
        "inv" : [(0, 2, 1, 3)],
        "sec" : [(0, 3, 2, 1)],
        "other" : [(2, 0, 1, 3), (0, 2, 3, 1)]
    },
    "inv" : {
        "ok" : [(3, 1, 2, 0)],
        "inv" : [(1, 3, 0, 2)],
        "sec" : [(3, 0, 1, 2)],
        "other" : [(3, 1, 0, 2), (1, 3, 2, 0)]
    },
    # Rankings not listed here (e.g. the "SHADE ONLY" ones from the notes) are not covered by this dict
    "seq" : {
        "ok" : [(3, 2, 1, 0)],
        "inv" : [(0, 1, 2, 3)]
    }
}

In [None]:
def _count_rankings_by_year(perms):
    """Count the rows of ``rank_choices`` whose ranking is one of ``perms``.

    Parameters
    ----------
    perms : iterable of 4-tuples
        Rankings to look for in columns 0-3 of ``rank_choices``.

    Returns
    -------
    dict
        One entry per element of ``years`` with the number of matching rows
        for that year, plus an ``"all"`` entry with the total number of matches.
    """
    counts = {y: 0 for y in years}
    total = 0

    for perm in perms:
        matches = rank_choices[(rank_choices[[0, 1, 2, 3]] == perm).all(axis=1)]
        per_year = matches.value_counts("year")
        for y in years:
            counts[y] += per_year.get(y, 0)
        total += len(matches)

    counts["all"] = total
    return counts


levels = ["color", "concentration x width"]

outputs = dict()   # nested: outputs[color reasoning][concentration reasoning] -> counts
covered = []       # every ranking that belongs to a named reasoning
final = dict()     # flattened: "<color>_<concentration>" -> counts

for k1, groups in reasonings.items():
    outputs[k1] = dict()

    for k2, perms in groups.items():
        covered += perms
        outputs[k1][k2] = _count_rankings_by_year(perms)
        final[k1 + "_" + k2] = outputs[k1][k2]

# Every permutation that was not assigned a reasoning is pooled under "other"
others = [p for p in permutations(range(4)) if p not in covered]
final["other"] = _count_rankings_by_year(others)

final

In [None]:
# Sanity check: total number of rankings across all reasoning categories
sum(counts["all"] for counts in final.values())

In [None]:
# Rows = reasoning categories, columns = the per-year counts plus the "all" total
reasoning_table = pd.DataFrame(final).T

fig, ax = plt.subplots(figsize=(16, 9))
reasoning_table.plot.bar(ax=ax)
ax.set_xlabel("Reasoning", size=16)
ax.set_ylabel("Number of people", size=16)
ax.legend(prop={'size': 16})
fig.suptitle("Repartition of reasonings per study year", fontsize=20)

In [None]:
# For each distinct ranking: total number of occurrences and the breakdown per year
for ranking, rows in rank_choices.groupby([0, 1, 2, 3]):
    per_year = rows["year"].value_counts()
    print(ranking, per_year.sum(), per_year.to_dict())

In [None]:
# Per-year listing of the submitted rankings and their counts; good_rank is marked with an arrow
for year, year_rows in rank_choices.groupby("year"):
    print(year, "year")

    for ranking, same_ranking in year_rows.groupby([0, 1, 2, 3]):
        marker = "<-------" if ranking == good_rank else ""
        print("\t", ranking, len(same_ranking), marker)

# NEED TO DO THE PLOT OF YEAR+GENDER BUT FOR ANSWER+YEAR INSTEAD
gender => year
year => answer

# Prendre les gens qui n'ont pas fini

Quantitatif
Valeur exacte

RQ


Qualitatif
a compris que la mère est plus lourde mais n'a pas le nombre exact

Qualitatif > Rough Quantitatif > Quantitatif

3 niveaux de score : Qualitatif, rough quantitatif, quantitatif

Faire des hypothèses sur le ranking et voir si le post-test confirme ces hypothèses

In [None]:
valid_ans["4_ranks"].explode()

In [None]:
rank_choices[[0, 1, 2, 3]].drop_duplicates().agg(lambda x : tuple(x), axis=1).reset_index(drop=True)

In [None]:
df[(df["time"] < 0).sum(axis=1) > 0][[0, "time"]]

In [None]:
df[[6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18]].columns

## Set default end time per session and check whether they stopped or couldn't finish

In [None]:
# Per row of df_times: difference between the largest and the smallest value in that row
diffs = df_times.max(axis=1) - df_times.min(axis=1)

In [None]:
# Strip plot of `diffs` per study year: values are binned on a shared grid and the points of a
# bin are spread horizontally around the year's x position so that they do not overlap.
fig, ax = plt.subplots(figsize=(16, 9))
cmap = plt.rcParams['axes.prop_cycle'].by_key()["color"]

# Bin edges are computed once over all participants so that the years are comparable
_, g_bins = np.histogram(diffs[df.index], bins=50)
left_edges = g_bins[:-1]

for i, y in enumerate(years):
    sub = diffs[df[df[0]["year"] == y].index]
    sizes, _ = np.histogram(sub, bins=g_bins)

    # Bin index of every value = number of left edges <= value, minus one. Values equal to the
    # last edge land in the last bin, which is also how np.histogram counts them.
    bin_idx = np.searchsorted(left_edges, sub.to_numpy(), side="right") - 1
    placed = np.zeros(len(left_edges), dtype=int)   # points already drawn in each bin

    for b in bin_idx:
        total = sizes[b]
        # Half-step shift that depends on the parity of (bin number + bin size)
        odd = (b + 1 + total) % 2

        # x: centre the `total` points of this bin around i, 1/40 apart; y: lower edge of the bin
        ax.plot(i + placed[b] / 40 + odd / 80 - total / 80, left_edges[b], "o", c=cmap[i])
        placed[b] += 1

ax.set_xticks(range(len(years)))
ax.set_xticklabels(years)
ax.set_xlabel("Year of Study", size=16)
ax.set_ylabel("Span of recorded times (lower bin edge)", size=16)

### Correct answers

```
4  - (2, 0, 3, 1)
5  - 0.37 (1/2x container)
6  - 1.59 (3x concentration)
7  - 0.96 (1/2x container, 2x concentration)
8  - 0.80 (4x container, 1/3x concentration)
9  - 
    YES - 0.01 - Blue
    OK  - 0.02 - Purple
    OK  - 0.02 - Yellow
    NO  - 0.20 - Red
10 - 
    YES - 0.00 - Cobalt nitrate
    OK  - 0.01 - Copper sulfate
    OK  - 0.05 - Potassium permanganate
    NO  - 0.48 - Potassium chromate
11 - Relationship
12 - Notes
13 - Knowledge of Law
15 - 
16 - Feedback
17 - End (might have to discard)
```

In [None]:
# Show every answer of ans_map["8"] still marked "?" together with its German -> English translation
pending = (answer for answer, label in ans_map["8"].items() if label == "?")

for answer in pending:
    print(answer)
    print(translator.translate(answer, lang_src="de", lang_tgt='en'))
    print()

In [None]:
# Emit one `"<answer>": "?",` line per distinct text answer of step 8, most frequent first
for answer in df[8]["text"].value_counts().index:
    print('"' + answer + '": "?",')


## CHECK WHY df[5]["text"] has fewer values 

In [None]:
# For each progress value (ascending): 230 minus the cumulative number of rows at or below it.
# NOTE(review): 230 is presumably the total number of participants - confirm, or use len(df).
230 - df[0]["progress"].value_counts().sort_index().cumsum()

## Remove the username that said TEST everywhere

In [None]:
# Store the German text answer of step 11 and its English translation for every user in ans_map.
for _, row in df.iterrows():
    txt = row[11]["text"]

    # Only real strings are sent to the translator. Missing answers (NaN, None, ...) are kept
    # as they are; the previous float-only check let None reach the translator.
    if isinstance(txt, str):
        trans = translator.translate(txt, lang_src="de", lang_tgt="en")
    else:
        trans = txt

    # Raises KeyError if the username has no entry in ans_map["11"]["map"]
    ans_map["11"]["map"][row[0]["username"]]["text"] = {
        "de": txt,
        "en": trans
    }

E = molar extinction coefficient  
C = substance concentration  
D = layer thickness of the cuvette

8ethqmkd  
6tgyhcuh  
w7asnymz  

Noter -1 quand ils ont inversé la proportionnalité

Calculer ratio utilisation transmittance/absorbance dans la sim  
Graph qui map les stratégies et l'utilisation de la transmittance

In [None]:
# Build, per username, the German text answer of step 13, its English translation and an
# empty "res" slot to be filled in later.
d = dict()

for _, row in df.iterrows():
    txt = row[13]["text"]

    # Only real strings are sent to the translator; missing answers (NaN, None, ...) are kept as-is
    if isinstance(txt, str):
        trans = translator.translate(txt, lang_src="de", lang_tgt="en")
    else:
        trans = txt

    d[row[0]["username"]] = {
        "text": {
            "de": txt,
            "en": trans,
        },
        "res": None
    }

    # Not written back to ans_map yet:
    #ans_map["13"]["map"][row[0]["username"]]["text"] = elem

Break time
Break count
Avg break time
Active time
Active count
Avg active time


## NOTIFY IF THERE ARE DATA FILES WITH NO MATCHING CODE

Get latest users.csv and regenerate auxiliary dfs

Also reorganize to have everything in one spot

Generate plots in other DF

In [None]:
# One (value, count) tuple of cat_15 per line, most frequent first
for value_and_count in cat_15.value_counts().items():
    print(value_and_count)

Starting points:
```
- 1 :    Info 1
- 2 :    Sim 1
- 3 :    Info 2
- 4 :    Sim 2
- 5 :    Question 1 (ranking, absorption)
- 6 :    Question 2 (text, absorption)
- 7 :    Question 3 (text, absorption)
- 8 :    Question 4 (text, absorption)
- 9 :    Question 5 (text, absorption)
- 10 :   Question 6 (sliders, color)
- 11 :   Question 7 (sliders, solution)
- 12 :   Question 8 (text, proportionality)
- 13 :   Question 9 (checkboxes & text, notes)
- 14 :   Question 10 (text, previous knowledge)
- 15 :   Info 3
- 16 :   Sim 3
- 17 :   Question 11 (dropdown, solution)
- 18 :   Feedback
- 19 :   Info End
- 20 :   End
```

Time for step n = t_n - t_[n-1]

In [None]:
# Display names of steps 1-19 from the "Starting points" list above, in order
# (list index 0 corresponds to step 1). Step 20 ("End") has no label here.
step_labels = [
    "Info 1",
    "Sim 1",
    "Info 2",
    "Sim 2",
    "Question 1 (ranking, absorption)",
    "Question 2 (text, absorption)",
    "Question 3 (text, absorption)",
    "Question 4 (text, absorption)",
    "Question 5 (text, absorption)",
    "Question 6 (sliders, color)",
    "Question 7 (sliders, solution)",
    "Question 8 (text, proportionality)",
    "Question 9 (checkboxes & text, notes)",
    "Question 10 (text, previous knowledge)",
    "Info 3",
    "Sim 3",
    "Question 11 (dropdown, solution)",
    "Feedback",
    "Info End"
]

In [None]:
# For participants with version > 1: which "time" column is the first one without a value,
# counted over all participants that miss at least one of them.
end_cols = [col for col in df_time.columns if col[1] == "time"]
df_ver = df_time[df[0]["version"] > 1]

end_missing = df_ver[end_cols].isna()

# idxmax returns the first column when a row has no missing value at all, so those rows are
# masked out. The mask is built from the same columns as idxmax (previously it looked at every
# column of df_ver), and `axis` is passed by keyword because DataFrame.any no longer accepts
# positional arguments in pandas >= 2.0.
end_missing.idxmax(axis=1).where(end_missing.any(axis=1)).value_counts().sort_index()

In [None]:
# Number of rows whose progress equals 17. Counting the matches directly yields 0 when nobody
# reached that value, whereas value_counts()[17] raises a KeyError in that case.
completed_count = (df[0]["progress"] == 17).sum()
completed_count

In [None]:
total_count = len(df)
total_count

In [None]:
completed_ratio = completed_count / total_count
completed_ratio * 100

In [None]:
df[0]["year"].value_counts()

In [None]:
df[0]["field"].value_counts()

In [None]:
df[0]["level"].value_counts()

In [None]:
# Education level label -> ordinal rank (0 = low, 1 = mid, 2 = high).
# NOTE(review): this rebinds `levels`, which an earlier cell assigns to a list of strings.
levels = {
    "General (Low)": 0,
    "Apprenticeship (Mid)": 1,
    "Professional Maturity (High)": 2
}

levels

In [None]:
# Bar chart of participant counts per study year, one bar per value of "version".
# NOTE(review): the legend title and the file name say "level" while the data column is
# "version" - confirm which one is intended.
version_counts = df[0][["year", "version"]].value_counts().unstack()
version_counts.plot.bar(figsize=(16, 9))
plt.xticks(rotation=0)
plt.xlabel("Study Year")
plt.ylabel("Number of people")
plt.legend(title="Level")
plt.savefig("plots/lvl_per_year.png")

In [None]:
df[15]

In [None]:
df[12]

In [None]:
# Bar chart of participant counts per gender code, one bar per value of the step-12 "score".
df[[(0, "gender"), (12, "score")]].value_counts().unstack().plot(kind="bar", figsize=(16, 9), cmap='RdYlBu')
plt.xticks(rotation=0)
# The bars are grouped by gender, so the axis is labelled accordingly (it used to say "Study Year")
plt.xlabel("Gender")
plt.ylabel("Number of people")
# NOTE(review): legend title and labels are the same as in the (15, "r") plot of the next cell;
# confirm that they really describe the step-12 score and that it takes exactly five values.
plt.legend(title="Previous knowledge of Beer's Law", labels=["None", "Little", "Average", "Good", "Perfect"])
#plt.savefig("plots/beerknow_per_year.png")

In [None]:
# Bar chart of participant counts per study year, one bar per value of the step-15 "r" column
knowledge_by_year = df[[(0, "year"), (15, "r")]].value_counts().unstack()
knowledge_by_year.plot.bar(figsize=(16, 9), cmap='RdYlBu')
plt.xticks(rotation=0)
plt.xlabel("Study Year")
plt.ylabel("Number of people")
# The five labels are applied in column order of the unstacked table
plt.legend(title="Previous knowledge of Beer's Law", labels=["None", "Little", "Average", "Good", "Perfect"])
plt.savefig("plots/beerknow_per_year.png")

In [None]:
# Distribution of the step-15 "r" values over all participants.
fig, ax = plt.subplots(figsize=(16, 9))

# Draw on the axes created above; without ax=ax pandas opens a second figure and leaves this one empty
df[15]["r"].value_counts().sort_index().plot(kind="bar", ax=ax)

# Assumes five distinct values; only both ends and the middle are named
ax.set_xticks(range(5))
ax.set_xticklabels(["Unknown", "", "Average", "", "Well known"], rotation=0)
ax.set_ylabel("Number of people")
ax.set_xlabel("Previous knowledge of Beer's Law")

In [None]:
# Histogram of the step-15 "r" values.
fig, ax = plt.subplots(figsize=(16, 9))

# `widths` is not a histogram option (matplotlib rejects it as an unknown keyword), so it is
# dropped; the bar width follows from the bin edges.
# NOTE(review): these edges give four bins of width 0.5 over [-1, 1]. If "r" takes the five
# values -1, -0.5, 0, 0.5, 1, then 0.5 and 1 share the last bin - confirm the intended edges.
df[15]["r"].hist(ax=ax, bins=[-1, -0.5, 0, 0.5, 1])

In [None]:
df[[c for c in df.columns if c[1] == "rel"]].applymap(lambda x: (1 if x == 0 else 0) if x is not None else None)

In [None]:
df[[c for c in df.columns if c[1] == "score"]]

In [None]:
# Counts per education level (x axis, in the key order of `levels`), one bar per study year
year_by_level = df[0][["year", "level"]].value_counts().unstack()[list(levels)]
year_by_level.T.plot.bar(figsize=(16, 9))
plt.xticks(rotation=90)

In [None]:
# Gender code -> display label (same mapping as `gender_labels` in the gender-by-year plot cell)
gender_map = {
    1: "Male",
    2: "Female",
    3: "Other",
    4: "Unspecified"
}

In [None]:
# Counts per (year, level) pair, with one column per gender code
gender_counts = df[0][["year", "level", "gender"]].value_counts().unstack()

# Row labels read "<year> year\n<level>", cut at the first "(" of the level name
row_labels = gender_counts.index.to_flat_index().map(lambda key: " year\n".join(key).split("(")[0])
year_level = gender_counts.set_index(row_labels)

# Replace the gender codes in the column header by their display names
year_level.columns = year_level.columns.map(lambda code: gender_map[code])

ax = year_level.plot(kind="bar", figsize=(16, 9))
plt.legend(title="Gender")
plt.xticks(rotation=40, ha="right")
plt.xlabel("Formation & Study Year")
plt.ylabel("Student count")
plt.title("Gender repartition per formation and study year")
plt.savefig("plots/gender_year_level.png", bbox_inches='tight')

In [None]:
df[0][["year", "gender"]].value_counts().unstack().plot(kind="bar", figsize=(16, 9))

In [None]:
df[0][["level", "gender"]].value_counts().unstack().plot(kind="bar", figsize=(16, 9))

In [None]:
df[7]["ans"].hist()

In [None]:
# TODO: unfinished helper. This cell held only the bare header `def hex`, which is a SyntaxError
# and stops "Run All". When it is written, pick another name so the built-in hex() is not shadowed.

In [None]:
# Strip plot of `diffs` per study year.
# NOTE(review): this cell repeats the strip-plot cell that follows the definition of `diffs`
# earlier in the notebook; consider keeping only one of them.
fig, ax = plt.subplots(figsize=(16, 9))
cmap = plt.rcParams['axes.prop_cycle'].by_key()["color"]

# Shared bin edges, computed once over all rows of df
_, g_bins = np.histogram(diffs[df.index], bins=50)

for i, y in enumerate(years):
    sub = diffs[df[df[0]["year"] == y].index]
    sizes, bins = np.histogram(sub, bins=g_bins)
    
    # Number of points already drawn per bin, keyed by bin edge
    cnts = {x: 0 for x in bins}
    
    for s in sub:
        # Left edges not above the value; the last of them is the lower edge of the value's bin
        low_bins = [b for b in bins[:-1] if b <= s]
        last = low_bins[-1]
        total = sizes[len(low_bins)-1]
        # Half-step shift that depends on the parity of (bin number + bin size)
        odd = (len(low_bins) + total) % 2
        
        # x: spread the points of a bin around i, 1/40 apart; y: lower edge of the bin
        plt.plot(i + cnts[last] / 40 + odd/80 - total / 80, last, "o", c=cmap[i])
        cnts[last] += 1
        
    #ax.bar(bins, sizes)
    
    #ax.scatter([i] * len(sub), sub)

1 fichier par utilisateur nommé avec le uid

mapping ranking -> groupe pour 3 et 4 class case

liste des actions