In [None]:
import os, json
import pickle as pkl
import pandas as pd
import numpy as np

from operator import or_ as union
from functools import reduce, partial

from bokeh import palettes
from bokeh.io import output_file
from bokeh.plotting import figure, save, reset_output
from bokeh.models import ColumnDataSource, GroupFilter, CDSView, HoverTool

In [None]:
# Root of the project: the kernel's current working directory.
# Every other path in this notebook is derived from it.
root_path = os.getcwd()
root_path

In [None]:
# Directory holding all data files (the "raw" and "parsed" subfolders and entries.json are joined onto it below)
data_path = os.path.join(root_path, "data")
data_path

In [None]:
# Directory listed session by session by the parsing cell below to find the raw log files
raw_path = os.path.join(data_path, "raw")
raw_path

In [None]:
# Directory where the parsing cell writes one pickle file per session
parsed_path = os.path.join(data_path, "parsed")
parsed_path

In [None]:
# Load the lookup used by gen_all: it is indexed first by the phetioID (without its first
# dot-separated segment) and then by the event name, yielding a string that is split on "-" into object and action
with open(os.path.join(data_path, "entries.json"), "r") as f:
    mapping = json.load(f)

In [None]:
# Substrings removed from the raw log text before JSON parsing
# (each key is replaced by its value, here always the empty string)
replacements = {
    "beers-law-lab_en-phetio.html?phetioStandalone&phetioLog=console:983 ": "",
    "\u200b": "",
    "[Intervention] Unable to preventDefault inside passive event listener due to target being treated as passive. See <URL>": ""
}

# Columns kept from each flattened log entry
wanted_rows = ["phetioID", "event", "time", "parameters"]

In [None]:
# Object labels belonging to each plotted group; render_group draws one
# glyph set per label listed under the group's key
entries = dict()

# Every label coloured from the "general" palette (the mode labels are appended when the palette is built)
entries["general"] = [
    'ruler',
    'concentration',
    'wavelength',
    'container',
    'solution',
    'window',
    'probe',
    'sim',
]

# Labels rendered from the "drags" frame
entries["drags"] = [
    'ruler',
    'concentration',
    'wavelength',
    'container',
    'probe',
]

# Labels rendered from the "laser" frame
entries["laser"] = [
    'laser',
]

# Labels rendered from the "pdf" frame
entries["pdf"] = [
    'pdf',
]

# Labels rendered from the "modes" frame
entries["modes"] = [
    'transmittance',
    'absorbance',
]

# Labels rendered from the "other" frame
entries["other"] = [
    'concentration',
    'wavelength',
    'solution',
    'window',
    'sim',
]

# Label -> colour lookups, one dict per palette family
palette = dict()

# 8 general labels + 2 mode labels are zipped with the 10 colours of Category10
palette["general"] = dict(zip(entries["general"] + entries["modes"], palettes.Category10[10]))
# pdf and laser take the last two colours of the 8-colour Colorblind palette
palette["toggles"] = dict(zip(entries["pdf"] + entries["laser"],     palettes.Colorblind[8][6:]))

# Keyword arguments shared by every bokeh figure created in this notebook
# NOTE(review): 'plot_width' is the name used by older bokeh releases — confirm against the installed version
figure_kwargs = {
    'plot_width': 1200,
    'x_axis_label': 'Time (seconds)',
    'y_axis_label': 'UIDs',
    'toolbar_location': "below",
}

# Base opacity of the rectangles
alpha = 0.7

# Glyph styling shared by all rectangles: the outline is slightly more opaque than the fill (capped at 1)
common_kwargs = {
    'fill_alpha': alpha,
    'line_alpha': min(1, alpha + .2)
}

In [None]:
import os

# Apply a whole dict of substring replacements to a string
def replace_all(string, rep):
    """Return ``string`` with every key of ``rep`` replaced by its value.

    Replacements are applied one after another in the dict's iteration
    order, so a later key can match text produced by an earlier one.
    """
    return reduce(
        lambda text, pair: text.replace(pair[0], pair[1]),
        rep.items(),
        string,
    )

# Separate jsons from a raw log file string
def split_jsons(log):
    """Split a raw log string into parsed JSON documents.

    The log is cut at every newline that is immediately followed by ``{``;
    each piece is then parsed on its own.

    Returns:
        (res, fails): ``res`` is the list of successfully parsed objects,
        ``fails`` the list of raw pieces that could not be parsed.
    """
    res, fails = [], []

    # str.split removes the separator, so it is put back on every piece but the first
    head, *tail = log.split("\n{")
    pieces = [head] + ["\n{" + piece for piece in tail]

    for piece in pieces:
        try:
            res.append(json.loads(piece))
        # json.JSONDecodeError is a ValueError; RecursionError covers absurdly nested input.
        # Anything else (e.g. KeyboardInterrupt) must not be swallowed.
        except (ValueError, RecursionError):
            fails.append(piece)

    return res, fails

# Non tail-recursive flatten operation
# If keep_kids is False, retain only the head element
def flatten(entry, keep_kids=True):
    """Flatten a nested log entry into a list of entries without "children".

    Args:
        entry: mapping that may carry a "children" list of nested entries.
        keep_kids: when True (default) the descendants are appended, depth
            first, after their parent; when False only the head entry is
            returned and all descendants are discarded.

    Returns:
        A list of shallow copies; ``entry`` itself is not modified and no
        returned element has a "children" key.
    """
    copy = entry.copy()
    # Always strip the key, even when it is empty or None, so every
    # returned element has the same shape
    children = copy.pop("children", None)
    ret = [copy]

    if keep_kids and children:
        for c in children:
            ret += flatten(c, keep_kids)

    return ret

# Parse a log file into a DataFrame of flattened json elements
def parse_log_file(path, replacements, wanted_rows, keep_kids=True):
    """Read one raw log file and return its entries as a DataFrame.

    Args:
        path: path of the UTF-8 encoded log file.
        replacements: dict of substrings to replace before parsing.
        wanted_rows: names of the columns to keep in the result.
        keep_kids: forwarded to ``flatten``.

    Returns:
        ``(None, None)`` when the file is empty; otherwise ``(df, fails)``
        where ``df`` has exactly the columns in ``wanted_rows`` (columns
        absent from the log are filled with NaN, and ``df`` has zero rows
        when nothing could be parsed) and ``fails`` is a dict with the raw
        pieces that failed JSON parsing ("json") and the parsed elements
        that failed flattening ("flatten").
    """
    with open(path, "r", encoding="utf-8") as f:
        data = f.read()

    if len(data) == 0:
        return None, None

    data = replace_all(data, replacements)
    jsons, json_fails = split_jsons(data)

    fails = {"json": json_fails, "flatten": []}
    batch = []

    for elem in jsons:
        try:
            batch += flatten(elem, keep_kids)
        # Best effort: remember elements that are not flattenable (e.g. a
        # top-level JSON list or scalar) but let KeyboardInterrupt through
        except Exception:
            fails["flatten"].append(elem)

    # reindex instead of [] so that an empty batch or a missing column
    # yields an empty/NaN column rather than a KeyError
    return pd.DataFrame(batch).reindex(columns=wanted_rows), fails

# Collect the distinct usernames present in a directory
def get_usernames(path):
    """Return the set of file-name prefixes (text before the first "-") in ``path``."""
    return set(map(lambda name: name.partition("-")[0], os.listdir(path)))

In [None]:
# Template for the per-session output pickle; "{}" is filled with the session folder name
out_path = os.path.join(parsed_path, "{}.pkl")
# Collects (session, uid, exp_idx, fails) for every log that had parsing failures
mem_f = []
# Safety switch: set to True to (re)generate the parsed pickles from the raw logs
run = False

if run:
    for session in os.listdir(raw_path):
        s_path = os.path.join(raw_path, session)

        # One pickle file per session, holding one (uid, data) record per user appended back to back
        with open(out_path.format(session), "wb") as f_out:
            for uid in get_usernames(s_path):
                u_path = os.path.join(s_path, uid + "-{}.log")
                curr_data = dict()

                # Logs are read as <uid>-1.log .. <uid>-3.log and stored under keys 0..2
                # NOTE(review): a missing file raises FileNotFoundError here — confirm every user has all three logs
                for exp_idx in range(3):
                    batch, fails = parse_log_file(u_path.format(exp_idx + 1), replacements, wanted_rows, keep_kids=False)
                    curr_data[exp_idx] = batch

                    # fails is None for an empty file; otherwise report the json / flatten failure counts
                    if fails and (fails["json"] or fails["flatten"]):
                        print(session, "-", uid, exp_idx, ":", len(fails["json"]), "/", len(fails["flatten"]))
                        mem_f.append((session, uid, exp_idx, fails))

                pkl.dump((uid, curr_data), f_out)

In [None]:
# Scratch cell: load every (uid, data) record of one parsed session file for manual inspection
uids = []

# REMOVE THIS
ds = []

# NOTE(review): os.listdir returns entries in arbitrary order, so index 10 does not
# designate a stable file — confirm which session is meant
# NOTE(review): pickle.load executes arbitrary code from the file; only use on files produced by this notebook
with open(os.path.join(parsed_path, os.listdir(parsed_path)[10]), "rb") as f:
    # Records were dumped back to back, so keep loading until the end of the file
    while True:
        try:
            uid, d = pkl.load(f)
            ds.append(d)
            #if uid == "rve4n5nv":
                #break
            uids.append(uid)
        except EOFError:
            break

# Parsing logs

In [None]:
def reassign_previous(df, prev_idx):
    """Close, in place, the drag sequence whose latest row is ``prev_idx``.

    A plain "drag" row is promoted to "drag-end"; any other row (a lone
    "drag-start" with nothing after it) is dropped from ``df``.
    """
    prev_a = df.loc[prev_idx, "action"]

    # Previous was a drag, make it a drag-end
    if prev_a == "drag":
        df.loc[prev_idx, "action"] = "drag-end"
    # Previous was a drag-start, just wipe it
    else:
        df.drop(prev_idx, inplace=True)

def fix_drags(drags, clear_dummies=True):
    """Repair a sequence of drag rows so that it alternates start / end.

    Args:
        drags: DataFrame with "action" and "object" columns whose actions
            contain "start", "end", or neither (intermediate drag rows).
            It is modified in place.
        clear_dummies: when True, intermediate rows (action exactly
            "drag") are removed from the returned frame.

    Returns:
        The repaired frame. Unmatched ends and lone starts are dropped,
        sequences interrupted by another object are closed on their last
        drag row, and a sequence still open at the end of the log is closed
        the same way.
    """
    if not len(drags):
        return drags

    obj, prev_idx = None, None

    # Work from a snapshot of the rows: the frame is relabelled and rows
    # are dropped while we walk through it
    snapshot = list(zip(drags.index, drags["action"], drags["object"]))

    for idx, n_act, n_obj in snapshot:
        if "start" in n_act:
            # Previous was not ended, need to close it
            if obj is not None:
                reassign_previous(drags, prev_idx)

            # Memorize new object
            obj = n_obj

        elif "end" in n_act:
            # No matching start row, just wipe current row
            if obj != n_obj:
                drags.drop(idx, inplace=True)

                # Different previous object, need to close it
                if obj is not None:
                    reassign_previous(drags, prev_idx)

            # Reset memorized object
            obj = None

        else:
            # No matching start row, just make this the start row
            if obj != n_obj:
                drags.loc[idx, "action"] = "drag-start"

                # Different previous object, need to close it
                if obj is not None:
                    reassign_previous(drags, prev_idx)

            # Memorize object in any case
            obj = n_obj

        # Memorize previous index
        prev_idx = idx

    # Close the sequence still open at the end of the log, if any.
    # Doing this only when one is open avoids re-creating a row that was
    # just dropped (unmatched trailing end) and avoids turning a lone
    # trailing start into an orphan end.
    if obj is not None:
        reassign_previous(drags, prev_idx)

    # Remove dummy entries if asked
    if clear_dummies:
        drags = drags[drags["action"] != "drag"]

    return drags

def gen_drags(df):
    """Turn the drag rows of ``df`` into one (object, len, mid) row per drag.

    Returns:
        (drags, index): ``drags`` is None when no drag survives the repair
        step, otherwise a frame with the dragged object, the duration
        ("len") and the midpoint time ("mid") of each drag. ``index`` holds
        the labels, in ``df``, of every row whose action contains "drag".
    """
    is_drag = df["action"].str.contains("drag")
    candidates = df[is_drag].copy()
    index = candidates.index
    repaired = fix_drags(candidates)

    if not len(repaired):
        return None, index

    # After the repair, rows alternate start / end: pair them up side by side
    starts = repaired.iloc[::2][["object", "time"]].reset_index(drop=True)
    ends = repaired.iloc[1::2][["time"]].reset_index(drop=True)
    paired = pd.concat([starts, ends], axis=1)
    paired.columns = ["object", "start", "end"]

    # Describe each drag by its duration and its midpoint
    paired["len"] = paired["end"] - paired["start"]
    paired["mid"] = (paired["end"] + paired["start"]) / 2

    return paired[["object", "len", "mid"]], index

In [None]:
def check_drags(drags):
    """Print one diagnostic line per inconsistency in a drag sequence.

    Walks the rows in order while remembering which object is currently
    being dragged, and reports starts that arrive before the previous
    sequence ended, and ends / intermediate drags that have no start or
    that concern another object.
    """
    current = None

    for row in drags.itertuples():
        action, target = row.action, row.object

        if "start" in action:
            problem = None if current is None else "start before end"
            current = target
        elif "end" in action:
            if current is None:
                problem = "end without start"
            elif current != target:
                problem = "end of wrong obj"
            else:
                problem = None
            current = None
        else:
            if current is None:
                problem = "drag without start"
            elif current != target:
                problem = "drag of wrong obj"
            else:
                problem = None
            current = target

        if problem is not None:
            print(row.Index, problem)

In [None]:
# Generate block positions and sizes
def process_toggles(df, duo=False):
    """Pair up toggle rows into blocks described by length and midpoint.

    Args:
        df: frame with "params" and "time" columns, one row per toggle.
        duo: when False, rows are paired two by two (rows 0-1, 2-3, ...);
            when True, every row is paired with the row that follows it,
            so each row both closes a block and opens the next one.

    Returns:
        A new frame with the "params" of the opening row, the duration
        ("len") and the midpoint ("mid") of every block.
    """
    if duo:
        openers, closers = df.iloc[:-1], df.iloc[1:]
    else:
        openers, closers = df.iloc[::2], df.iloc[1::2]

    # Both sides are re-indexed from 0 so that concat lines them up by
    # position, whatever index the caller's frame carries
    openers = openers[["params", "time"]].reset_index(drop=True)
    closers = closers["time"].reset_index(drop=True)

    blocks = pd.concat([openers, closers], axis=1)
    blocks.columns = ["params", "start", "end"]

    # Define block positions and sizes
    blocks["len"] = blocks["end"] - blocks["start"]
    blocks["mid"] = (blocks["end"] + blocks["start"]) / 2

    # Copy so that callers can add columns without chained-assignment warnings
    return blocks[["params", "len", "mid"]].copy()

def gen_lasers(df):
    """Build the blocks during which the laser toggle is truthy.

    Returns:
        (laser, index): ``laser`` holds one row per kept block, with
        "params", "len", "mid" and an "object" column set to "laser";
        ``index`` holds the labels, in ``df``, of every laser row.
    """
    laser = df[df["object"] == "laser"].copy()
    index = laser.index
    # Reduce each params mapping to its "newValue" entry
    # NOTE(review): assumes every laser row carries a params mapping with a "newValue" key — confirm against the mapping file
    laser.loc[:, "params"] = laser["params"].apply(lambda elem : elem["newValue"])
    laser.reset_index(drop=True, inplace=True)

    # If the last laser row is truthy, close it with a synthetic falsy row
    # stamped with the time of the last row of df
    if len(laser) and laser.iloc[-1]["params"]:
        end_time = df.iloc[-1]["time"]
        
        # Add last row
        laser.loc[len(laser)] = laser.iloc[-1]
        laser.iloc[-1, laser.columns.get_loc("time")] = end_time
        laser.iloc[-1, laser.columns.get_loc("params")] = False

    # Generate block positions and sizes (rows are paired two by two)
    laser = process_toggles(laser)
    # Keep only the blocks whose opening row is truthy
    laser = laser[laser["params"]].reset_index(drop=True)
    
    # Add label
    laser["object"] = "laser"

    return laser, index

# WIP Can be at least partially merged with gen_lasers
# WIP Need to check if other actions happen at the same time indicating that a pdf entry was dropped
def gen_pdfs(df):
    """Build the blocks spanned by consecutive pairs of pdf rows.

    Unlike gen_lasers, "params" is used as stored in ``df`` and no block is
    filtered out afterwards.

    Returns:
        (pdfs, index): ``pdfs`` holds one row per block, with "params",
        "len", "mid" and an "object" column set to "pdf"; ``index`` holds
        the labels, in ``df``, of every pdf row.
    """
    pdfs = df[df["object"] == "pdf"].copy()
    index = pdfs.index
    pdfs.reset_index(drop=True, inplace=True)

    # If the last pdf row has a truthy params, close it with a synthetic row
    # stamped with the time of the last row of df
    # NOTE(review): what a truthy params means for a pdf row is not visible here — confirm
    if len(pdfs) and pdfs.iloc[-1]["params"]:
        end_time = df.iloc[-1]["time"]
        
        # Add last row
        pdfs.loc[len(pdfs)] = pdfs.iloc[-1]
        pdfs.iloc[-1, pdfs.columns.get_loc("time")] = end_time
        pdfs.iloc[-1, pdfs.columns.get_loc("params")] = False

    # Generate block positions and sizes (rows are paired two by two)
    pdfs = process_toggles(pdfs)
    
    # Add label
    pdfs["object"] = "pdf"

    return pdfs, index

def gen_modes(df):
    """Build the consecutive blocks during which each display mode is active.

    The session is assumed to start in transmittance mode at time 0; every
    mode row then opens a new block that lasts until the next change of
    mode, the last block ending at the time of the last row of ``df``.

    Returns:
        (mode, index): ``mode`` holds one row per block with "params"
        (True for transmittance), "len", "mid" and an "object" column set
        to "transmittance" or "absorbance"; it is empty when ``df`` has no
        mode row. ``index`` holds the labels, in ``df``, of every mode row.
    """
    mode = df[df["object"] == "mode"].copy()
    index = mode.index
    mode["params"] = mode["params"].apply(lambda elem : elem["value"] == "transmittance")
    mode.reset_index(drop=True, inplace=True)
    # Shift labels to 1..n so that label 0 is free for the initial state
    mode.index = mode.index + 1

    # Add the two sentinel rows only when at least one mode row exists
    if len(mode):
        end_time = df.iloc[-1]["time"]
        # Must be read before the sentinels are appended
        last_state = bool(mode.iloc[-1]["params"])

        # Add first row: initial state at time 0 (appended last positionally, sorted below)
        mode.loc[0] = mode.loc[1]
        mode.iloc[-1, mode.columns.get_loc("time")] = 0
        mode.iloc[-1, mode.columns.get_loc("params")] = True

        # Add last row: closes the final block. Its state is the opposite
        # of the last real state so the de-duplication below keeps it.
        mode.loc[len(mode)] = mode.loc[0]
        mode.iloc[-1, mode.columns.get_loc("time")] = end_time
        mode.iloc[-1, mode.columns.get_loc("params")] = not last_state

    # Delete consecutive identical entries, keeping the FIRST row of each
    # run so that a block starts when its mode was first selected
    mode.sort_index(inplace=True)
    mode = mode[mode['params'] != mode['params'].shift(1)]
    mode.reset_index(drop=True, inplace=True)

    # Generate block positions and sizes
    mode = process_toggles(mode, duo=True)
    
    # Add label
    mode["object"] = mode["params"].apply(lambda elem : "transmittance" if elem else "absorbance")

    return mode, index

In [None]:
def gen_all(df, mapping):
    """Turn one parsed log into per-category frames ready for plotting.

    Args:
        df: parsed log with "phetioID", "event", "time" and "parameters"
            columns, or None.
        mapping: nested dict indexed by the phetioID without its first
            dot-separated segment, then by the event name, giving an
            "<object>-<action>" label. Unknown pairs get "other-other".

    Returns:
        ``(None, df)`` when ``df`` is None or empty; otherwise
        ``(ret, n_df)`` where ``ret`` maps "drags", "laser", "modes", "pdf"
        and "other" to their frames and ``n_df`` is the normalized event
        frame without the intermediate "drag" rows.
    """
    if df is None or not len(df):
        return None, df

    n_df = pd.DataFrame()
    ret = dict()

    # Normalize time: relative to the first row, divided by 1000
    n_df["time"] = (df["time"] - df["time"].iloc[0]) / 1000

    # Extract context info
    def label_of(row):
        # partition never fails: an id without "." simply maps to "other-other"
        component = row["phetioID"].partition(".")[2]
        return mapping.get(component, dict()).get(row["event"], "other-other")

    labels = df[["phetioID", "event"]].apply(label_of, axis=1)
    # n must be passed by keyword: pandas 2 rejects it as a positional argument
    n_df[["object", "action"]] = labels.str.split("-", n=1, expand=True)
    n_df["params"] = df["parameters"]
    # Parameters are only meaningful for toggles
    n_df["params"] = n_df.apply(lambda elem : elem["params"] if elem["action"] == "toggle" else None, axis=1)

    # Process special categories
    ret["drags"], idx_d = gen_drags(n_df)
    ret["laser"], idx_l = gen_lasers(n_df)
    ret["modes"], idx_m = gen_modes(n_df)
    ret["pdf"],   idx_p = gen_pdfs(n_df)

    # Get the remaining entries
    idx_o = n_df.index.difference(idx_d
                                    .union(idx_l)
                                    .union(idx_m)
                                    .union(idx_p))
    ret["other"] = n_df.loc[idx_o].reset_index(drop=True)

    # Clean up the original df
    n_df = n_df[n_df["action"] != "drag"]

    return ret, n_df

# Plotting

In [None]:
def render_group(fig, dfs, entries, key, palette, name, h_pos, v_pos, v_shift, width, height):
    """Draw the rectangles of one category on ``fig``, one glyph set per label.

    Nothing is drawn when ``dfs[key]`` is None or empty. Every label listed
    in ``entries[key]`` gets its own filtered view of the same data source,
    its own colour from ``palette`` and its own legend entry. ``h_pos``,
    ``width`` and ``height`` are forwarded as-is to the glyph; the vertical
    position is ``v_pos + v_shift``.
    """
    frame = dfs[key]

    if frame is None or not len(frame):
        return

    cds = ColumnDataSource(frame)
    y_pos = v_pos + v_shift

    for entry in entries[key]:
        only_entry = CDSView(source=cds, filters=[GroupFilter(column_name='object', group=entry)])

        fig.rect(
            name=name,
            view=only_entry,
            source=cds,
            color=palette[entry],
            legend_label=entry,
            x=h_pos,
            y=y_pos,
            width=width,
            height=height,
            muted_alpha=0.1,
            **common_kwargs,
        )

In [None]:
def plot_on_figure(dfs, fig, name, idx):
    """Draw every category of one log on ``fig``, on the row ``idx``.

    Does nothing when ``dfs`` is None. ``name`` is attached to every glyph.
    """
    if dfs is None:
        return

    # (category, palette family, x column, vertical offset, width, height)
    layout = (
        ("drags", "general", 'mid',  0,   'len', .18),
        ("laser", "toggles", 'mid',  .35, 'len', .08),
        ("modes", "general", 'mid',  .25, 'len', .08),
        ("pdf",   "toggles", 'mid',  .15, 'len', .08),
        ("other", "general", 'time', 0,   .1,    .18),
    )

    for key, family, h_pos, v_pos, width, height in layout:
        render_group(fig, dfs, entries, key, palette[family], name, h_pos, v_pos, idx, width, height)

In [None]:
def plot_by_task(path, mapping, session):
    """Save one HTML timeline per task, with one row per user of a session.

    Args:
        path: parsed session pickle holding (uid, data) records dumped back
            to back, ``data`` being indexed by task number 0..2.
        mapping: forwarded to ``gen_all``.
        session: session name used in the titles and the file names.

    The files are written to plots/bokeh/by_task, which is created when
    missing.
    """
    figs, uids, s_id = [], [], 0
    tooltips = [("UID", "$name"), ('Component', "@object")]

    # save() does not create missing folders
    out_dir = os.path.join("plots", "bokeh", "by_task")
    os.makedirs(out_dir, exist_ok=True)

    for i in range(3):
        fig = figure(**figure_kwargs, title="Beer's Law Lab " + session + " - Task " + str(i + 1), sizing_mode='scale_width')
        figs.append(fig)

    # NOTE(review): pickle.load executes arbitrary code from the file; only use on files produced by this notebook
    with open(path, "rb") as f:
        while True:
            # Only the load may signal the end of the file
            try:
                uid, data = pkl.load(f)
            except EOFError:
                break

            uids.append(uid)

            for i in range(3):
                dfs, _ = gen_all(data[i], mapping)
                plot_on_figure(dfs, figs[i], uid, s_id)

            s_id += 1

    for i in range(3):
        # Row k of the figure belongs to the k-th user read from the file
        figs[i].yaxis.major_label_overrides = dict(zip(range(len(uids)), uids))
        figs[i].add_tools(HoverTool(tooltips=tooltips))
        figs[i].legend.click_policy = 'hide'
        output_file(os.path.join(out_dir, session + " Task " + str(i + 1) + ".html"))
        save(figs[i])
        reset_output()

def plot_by_student(path, mapping, session):
    """Save one HTML timeline per user of a session, with one row per task.

    Args:
        path: parsed session pickle holding (uid, data) records dumped back
            to back, ``data`` being indexed by task number 0..2.
        mapping: forwarded to ``gen_all``.
        session: session name used in the titles and the file names.

    The files are written to plots/bokeh/by_student, which is created when
    missing.
    """
    tooltips = [("Task", "$name"), ('Component', "@object")]

    # save() does not create missing folders
    out_dir = os.path.join("plots", "bokeh", "by_student")
    os.makedirs(out_dir, exist_ok=True)

    # NOTE(review): pickle.load executes arbitrary code from the file; only use on files produced by this notebook
    with open(path, "rb") as f:
        while True:
            # Only the load may signal the end of the file
            try:
                uid, data = pkl.load(f)
            except EOFError:
                break

            fig = figure(**figure_kwargs, title="Beer's Law Lab " + session + " - UID " + uid, sizing_mode='scale_width')

            for i in range(3):
                dfs, _ = gen_all(data[i], mapping)
                plot_on_figure(dfs, fig, str(i + 1), i)

            fig.yaxis.major_label_overrides = dict([(j, "Task " + str(j + 1)) for j in range(3)])
            fig.add_tools(HoverTool(tooltips=tooltips))
            fig.legend.click_policy = 'hide'
            output_file(os.path.join(out_dir, session + " UID " + uid + ".html"))
            save(fig)
            reset_output()

In [None]:
# Safety switch: set to True to regenerate the per-task plots of every parsed session
plot = False

if plot:
    for session in os.listdir(parsed_path):
        path = os.path.join(parsed_path, session)
        # Session name = file name up to its first "."
        name = session.split(".")[0]
        print(name)
        plot_by_task(path, mapping, name)

In [None]:
# Safety switch: set to True to regenerate the per-student plots of every parsed session
plot = False
    
if plot:
    for session in os.listdir(parsed_path):
        path = os.path.join(parsed_path, session)
        # Session name = file name up to its first "."
        name = session.split(".")[0]
        print(name)
        plot_by_student(path, mapping, name)

In [None]:
# Load one session and keep, per task, the processed frames of every user
path = os.path.join(parsed_path, "Session 9.pkl")

# Task number -> list of gen_all results (None for an empty log), one per user
dfs_list = {0 : [], 1 : [], 2 : []}

with open(path, "rb") as f:
    # Records were dumped back to back, so keep loading until the end of the file
    while True:
        try:
            uid, data = pkl.load(f)

            for i in range(3):
                dfs, orig = gen_all(data[i], mapping)
                dfs_list[i].append(dfs)
                
        except EOFError:
            break

# NOTE(review): later cells call time_cover(dfs, ...) and fix_dfs(dfs) on the `dfs` left over from the
# last iteration above, which is None when that user's last log was empty — confirm this is intended

# Time information extraction

In [None]:
def relax(elems_orig):
    """Merge the first pair of overlapping intervals found, if any.

    Args:
        elems_orig: list of (start, end) pairs; it is left untouched.

    Returns:
        (elems, changed): a new list in which the first overlapping pair
        has been replaced by its union (placed where the first member of
        the pair was), and whether such a pair was found.
    """
    pending = elems_orig.copy()

    for pos in range(len(pending)):
        first = pending[pos]

        for second in pending[pos + 1:]:
            (lo1, hi1), (lo2, hi2) = first, second

            # The two intervals are apart when one lies entirely after or before the other
            apart = (lo1 > lo2 and lo1 > hi2) or (hi1 < lo2 and hi1 < hi2)
            if apart:
                continue

            pending.remove(first)
            pending.remove(second)
            pending[pos:pos] = [(min(lo1, lo2), max(hi1, hi2))]
            return pending, True

    return pending, False

def usetime_old(elems):
    """Merge overlapping (start, end) intervals by repeated pairwise relaxation.

    Returns ``elems`` itself when it is empty; otherwise a new list, built
    from the intervals sorted by start, in which no overlapping pair is left.
    """
    if not len(elems):
        return elems

    def by_start(interval):
        return interval[0]

    merged = sorted(elems, key=by_start)

    # Each pass merges at most one pair; stop once a pass changes nothing
    while True:
        merged, changed = relax(merged)
        if not changed:
            return merged

In [None]:
def series_mean_std(series):
    """Return the mean and the population standard deviation of ``series``.

    Both values are NaN for an empty series.
    """
    vcnt = len(series)

    if vcnt == 0:
        return float("nan"), float("nan")

    vmean = series.sum() / vcnt

    # Var[X] = E[X^2] - E[X]^2
    vvar = ((series ** 2).sum() / vcnt) - vmean ** 2
    # max() guards against a slightly negative variance caused by rounding
    vstd = max(0, vvar) ** .5

    return vmean, vstd

def mean_std_map(df, mids, lens=None):
    """Compute, per object, the partial sums needed for means and deviations.

    Args:
        df: frame with an "object" column.
        mids: name of the column holding the position of each element.
        lens: name of the column holding the length of each element; when
            omitted, every element counts for a length of 0.5.

    Returns:
        ``{object: {"loc": sums, "len": sums}}`` where every ``sums`` dict
        holds "mean_up" (weighted sum), "mean2_up" (weighted sum of squares)
        and "mean_down" (normalizer). "loc" describes positions weighted by
        length; "len" describes the lengths themselves, normalized by the
        number of elements.
    """
    stats = {}

    for label in df["object"].unique():
        rows = df[df["object"] == label]
        positions = rows[mids]
        count = len(positions)

        if lens:
            weights = rows[lens]
            total_weight = weights.sum()
            len_sum = weights.sum()
            len_sq_sum = (weights ** 2).sum()
        else:
            weights = 0.5
            total_weight = 0.5 * count
            len_sum = 0.5 * count
            len_sq_sum = 0.25 * count

        stats[label] = {
            "loc": {
                "mean_up": (positions * weights).sum(),
                "mean2_up": ((positions ** 2) * weights).sum(),
                "mean_down": total_weight,
            },
            "len": {
                "mean_up": len_sum,
                "mean2_up": len_sq_sum,
                "mean_down": count,
            },
        }

    return stats

def mean_std_reduce(elems):
    """Combine partial sums from ``mean_std_map`` into means and deviations.

    Args:
        elems: iterable of ``{object: {"loc": sums, "len": sums}}`` dicts;
            they are not modified.

    Returns:
        (loc_df, len_df): two frames indexed by object with "mean" and
        "std" columns, for the positions and for the lengths respectively.
    """
    temp = dict()
    res = dict()

    for elem in elems:
        for k in elem.keys():
            if temp.get(k) is None:
                # Copy the inner dicts too: they are accumulated into below
                # and must not alias the caller's data
                temp[k] = {k2: dict(v2) for k2, v2 in elem[k].items()}
            else:
                for k2 in elem[k].keys():
                    for k3 in elem[k][k2]:
                        temp[k][k2][k3] += elem[k][k2][k3]

    for k, v in temp.items():
        sub = dict()

        for k2, v2 in v.items():
            # A (nearly) zero normalizer means there is nothing to average
            if abs(v2["mean_down"]) < 0.000001:
                mean = 0
                mean2 = 0
            else:
                mean = v2["mean_up"] / v2["mean_down"]
                mean2 = v2["mean2_up"] / v2["mean_down"]

            # We use max to correct for floating point inaccuracies that would cause a negative variance in certain cases
            std = max(0, mean2 - (mean ** 2)) ** .5

            sub[k2] = {
                "mean": mean,
                "std": std
            }

        res[k] = sub

    # Swap the nesting so that "loc" / "len" become the outer column level
    reform = {(innerKey, outerKey): values for outerKey, innerDict in res.items() for innerKey, values in innerDict.items()}
    reform = pd.DataFrame(reform)

    loc_df = reform["loc"].T
    len_df = reform["len"].T

    return loc_df, len_df

def usetime(elems):
    """Merge overlapping (start, end) intervals in a single sweep.

    Args:
        elems: sequence of (start, end) pairs, in any order; it is left
            untouched.

    Returns:
        A new list of disjoint intervals ordered by start; an empty list
        for empty input.
    """
    if len(elems) == 0:
        return []

    # The sweep only compares each interval with the one being built, which
    # is correct only when intervals arrive ordered by start
    ordered = sorted(elems, key=lambda e: e[0])

    new_elems = []
    mem = ordered[0]

    for elem in ordered[1:]:
        s1, e1 = mem
        s2, e2 = elem

        if ((s1 > s2 and s1 > e2) or (e1 < s2 and e1 < e2)):
            # Disjoint: the interval being built is complete
            new_elems.append(mem)
            mem = elem
        else:
            # Overlapping: extend the interval being built
            mem = (min(s1, s2), max(e1, e2))

    new_elems.append(mem)

    return new_elems

def first_uses(m, l, n):
    """Return, per object, its first row together with the value ``m - l``.

    Args:
        m: series of positions.
        l: offset subtracted from ``m`` (series or scalar).
        n: series of object labels, named "object".

    Returns:
        An empty frame when ``m`` is empty; otherwise a frame with columns
        "object" and 0 holding the first row of every object, the objects
        appearing in group order.
    """
    if not len(m):
        return pd.DataFrame()

    table = pd.concat([n, m - l], axis=1)
    heads = [rows.iloc[:1] for _, rows in table.groupby("object")]

    firsts = pd.concat(heads)
    firsts.columns = ["object", 0]

    return firsts

def tc_drags(drags):
    """Extract activity intervals, first uses and total times from blocks.

    Args:
        drags: frame with "object", "len" and "mid" columns.

    Returns:
        (active, firsts, times): the list of (start, end) tuples of every
        block, the result of ``first_uses`` on the block starts, and a
        series giving the summed length per object.
    """
    centre = drags["mid"]
    half = drags["len"] / 2
    labels = drags["object"]

    spans = pd.concat([centre - half, centre + half], axis=1)
    active = list(spans.itertuples(index=False, name=None))

    firsts = first_uses(centre, half, labels)

    times = pd.Series({obj: rows["len"].sum() for obj, rows in drags.groupby("object")})

    return active, firsts, times

def tc_clicks(clicks, tpc):
    """Extract activity intervals, first uses and total times from clicks.

    Args:
        clicks: frame with "time" and "object" columns.
        tpc: time attributed to one click.

    Returns:
        (active, firsts, times): the list of (time, time + tpc) tuples of
        every click, the result of ``first_uses`` on the click times, and a
        series giving ``tpc`` times the number of clicks per object.
    """
    stamps = clicks["time"]

    spans = pd.concat([stamps, stamps + tpc], axis=1)
    active = list(spans.itertuples(index=False, name=None))

    firsts = first_uses(stamps, 0, clicks["object"])

    per_object = [(obj, len(rows) * tpc) for obj, rows in clicks.groupby("object")]
    times = pd.DataFrame(per_object).set_index(0)[1]
    times.index.name = None

    return active, firsts, times

def tc_rest(rest, tpc):
    """Extract activity intervals, first uses and total times from toggle blocks.

    Only the two boundaries of a block count as activity: each contributes
    an interval reaching ``tpc`` before and after it.

    Args:
        rest: frame with "object", "len" and "mid" columns.
        tpc: time attributed to one click.

    Returns:
        (active, firsts, times): the list of boundary intervals (all block
        starts first, then all block ends), the result of ``first_uses``,
        and a series giving ``2 * tpc`` times the number of blocks per
        object (empty when ``rest`` is empty).
    """
    m = rest["mid"]
    l = rest["len"] / 2
    n = rest["object"]

    # Column 0 holds the block starts, column 1 the block ends
    temp = pd.concat([m-l, m+l], axis=1)
    starts = temp.iloc[:, 0]
    ends = temp.iloc[:, 1]

    # One interval of width 2 * tpc around every boundary
    starts = pd.concat([starts-tpc, starts+tpc], axis=1)
    ends = pd.concat([ends-tpc, ends+tpc], axis=1)
    # Give `ends` the same column labels as `starts` so the two stack vertically
    ends.columns = [0, 0]
    active = pd.concat([starts, ends], axis=0, ignore_index=True).itertuples(index=False, name=None)
        
    # NOTE(review): with a zero offset this records the block midpoint, whereas tc_drags records the block start — confirm
    firsts = first_uses(m, 0, n)
    
    if len(rest) == 0:
        times = pd.Series(dtype=np.float64)
    else:
        times = pd.DataFrame([(k, len(v) * 2 * tpc) for k, v in rest.groupby("object")]).set_index(0)[1]
        times.index.name = None
    
    return list(active), firsts, times

def time_cover(dfs, tpc):
    """Summarize how the time of one log is spent, per object.

    Args:
        dfs: dict of category name -> frame (or None), as built by
            ``gen_all``.
        tpc: time attributed to one click.

    Returns:
        A dict with:
        "firsts": first use of every object;
        "counts": number of rows per object, plus the number of breaks;
        "times": total time per object, plus the total break time;
        "loc_means_stds" / "elem_means_stds": mean and deviation of the
        positions / lengths per object, including the pseudo objects
        "breaks" (gaps between activity) and "all" (merged activity).
    """
    active = []
    firsts = []
    times = []
    means_stds = []

    # Ignore missing and empty categories
    dfs = {k: df for k, df in dfs.items() if df is not None and len(df) > 0}

    for k, df in dfs.items():
        if k == "drags" or k == "pdf":
            a, f, t = tc_drags(df)
        elif k == "other":
            a, f, t = tc_clicks(df, tpc)
        elif k == "modes":
            # The row labelled 0 is skipped for modes
            a, f, t = tc_rest(df.loc[1:], tpc)
        else:
            a, f, t = tc_rest(df, tpc)

        active += a
        firsts.append(f)
        times.append(t)

        if k == "other":
            ms = mean_std_map(df, "time")
        else:
            ms = mean_std_map(df, "mid", "len")

        means_stds.append(ms)

    # The intervals come from several categories and are therefore not
    # ordered: sort them by start BEFORE merging, since the merge is a
    # single left-to-right sweep
    active = sorted(active, key=lambda a: a[0])
    active = usetime(active)

    # Row i pairs the start of interval i + 1 (column 0) with the end of
    # interval i (column 1): their difference is the gap between the two
    breaks = pd.DataFrame(active)
    breaks[0] = breaks[0][1:].reset_index(drop=True)
    breaks = breaks.iloc[:-1]

    ms_breaks = breaks.copy()
    ms_breaks["mid"] = (ms_breaks[1] + ms_breaks[0]) / 2
    ms_breaks["len"] = ms_breaks[0] - ms_breaks[1]
    ms_breaks["object"] = "breaks"
    ms_breaks = ms_breaks.drop([0, 1], axis=1)

    # Negative gaps are clipped to zero
    breaks = (breaks[0] - breaks[1]).apply(lambda x: 0 if x < 0 else x)

    ms_all = pd.DataFrame(active)
    ms_all["mid"] = (ms_all[1] + ms_all[0]) / 2
    ms_all["len"] = ms_all[1] - ms_all[0]
    ms_all["object"] = "all"
    ms_all = ms_all.drop([0, 1], axis=1)

    means_stds.append(mean_std_map(ms_breaks, "mid", "len"))
    means_stds.append(mean_std_map(ms_all, "mid", "len"))
    loc_means_stds, len_means_stds = mean_std_reduce(means_stds)

    # Keep the earliest first use of every object
    firsts = pd.concat(firsts)
    firsts = firsts.sort_values(0).reset_index(drop=True)
    firsts = firsts.drop_duplicates(subset=["object"])
    firsts = firsts.set_index("object")[0]
    firsts.index.name = None

    # An object may appear in several categories: add its counts up
    counts = pd.concat(dfs[k]["object"].value_counts() for k in dfs.keys())
    counts = counts.groupby(level=0).sum()
    counts["breaks"] = len(breaks)

    # Same for the times; objects missing on one side count as 0
    adder = partial(pd.Series.add, fill_value=0)
    times = reduce(adder, times)
    times["breaks"] = sum(breaks)

    return {
        "firsts" : firsts,
        "counts" : counts,
        "times" : times,
        "loc_means_stds": loc_means_stds,
        "elem_means_stds": len_means_stds
    }
    
# Time attributed to one click; also used by the time-data extraction cell at the end of the notebook
time_per_click = 0.5
# Try time_cover on the frames left in `dfs` by the session-loading cell and show the keys of the result
a = time_cover(dfs, time_per_click)
a.keys()

In [None]:
def fix_dfs(dfs, click_len=0.5):
    """Merge the per-category frames of one log into a single timeline.

    Args:
        dfs: dict of category name -> frame (or None), as built by
            ``gen_all``. The "other" frame carries "time", "object",
            "action" and "params"; the others carry "object", "len", "mid"
            and possibly "params".
        click_len: length attributed to every row of the "other" frame.

    Returns:
        (res, obj): ``res`` holds the "object", "len" and "mid" of every
        row, sorted by "mid", with incomplete rows removed; ``obj`` is the
        set of objects present in ``res``. Both are empty when ``dfs``
        holds no frame at all.
    """
    frames = []

    for k, df in dfs.items():
        if df is None:
            continue

        if k == "other":
            # Clicks have no extent: place them at their time and give them a fixed length
            x = df.copy()
            x["mid"] = x["time"]
            x["len"] = click_len
            x = x.drop(["action", "params", "time"], axis=1)

            frames.append(x)
        else:
            frames.append(df)

    if not frames:
        return pd.DataFrame(columns=["object", "len", "mid"]), set()

    # "params" is only present when a toggle category took part in the merge
    res = pd.concat(frames, axis=0) \
            .drop(["params"], axis=1, errors="ignore") \
            .sort_values("mid", axis=0) \
            .dropna(how="any", axis=0) \
            .reset_index(drop=True)

    obj = set(res["object"].unique())

    return res, obj

# Disabled exports of the merged timeline, kept for reference
#with open("jade_test.pkl", "wb") as f:
#    pkl.dump(fix_dfs(dfs), f)
    
#fix_dfs(dfs).to_csv("jade_test.csv")
# Show the merged timeline for the frames left in `dfs` by an earlier cell
fix_dfs(dfs)

In [None]:
def df_merge(df0, df1):
    """Placeholder for merging two timelines; not implemented yet.

    Raises:
        NotImplementedError: always.
    """
    # An explicit body keeps the cell importable until the merge is written
    raise NotImplementedError("df_merge is not implemented yet")

def df_variants(df0, df1):
    """Stack two timelines and tag each row with the frame it came from.

    Args:
        df0, df1: frames with an "object" column holding strings; they are
            left untouched.

    Returns:
        (comb, comb_str): ``comb`` is the concatenation with an "extra"
        column equal to 0 for rows of ``df0`` and 1 for rows of ``df1``;
        ``comb_str`` is the same without "extra", the tag being appended to
        the object name instead.
    """
    # assign() works on copies, so the callers' frames do not grow a column
    tagged = [df0.assign(extra=0), df1.assign(extra=1)]

    comb = pd.concat(tagged, axis=0).reset_index(drop=True)

    comb_str = comb.drop("extra", axis=1)
    comb_str["object"] = comb["object"] + comb["extra"].astype(str)

    return comb, comb_str

In [None]:
# Scratch test: build the same timeline twice and combine the two copies with df_variants
a, b = fix_dfs(dfs)
c, d = fix_dfs(dfs)
e, f = df_variants(a, c)
# Check that concatenating a None entry with a frame is accepted
pd.concat([None, f], axis=0)

## SPECIAL

In [None]:
# Export, for every user, the timelines of the first two tasks combined in one CSV
# (once with a numeric "extra" column, once with the task number appended to the object name)
all_obj = set()

for session in os.listdir(parsed_path):
    print(session)
    with open(os.path.join(parsed_path, session), "rb") as f_in:
        # Position of the current user in the session file, for the progress line
        j = 0
        try:
            while True:
                try:
                    uid, data = pkl.load(f_in)
                    print("\t" + str(j) + " - " + uid, end="\r")
                    
                    curr = []

                    for i in range(3):
                        dfs, _ = gen_all(data[i], mapping)

                        # Only tasks 0 and 1 are exported
                        if dfs is not None and i < 2:
                            df, obj = fix_dfs(dfs)
                            df["extra"] = i
                            all_obj |= obj
                            curr.append(df)
                            #df.to_csv(os.path.join("csv", uid + ".csv"))
                            
                    # Skip users for whom one of the two tasks is missing
                    if len(curr) == 2:
                        # NOTE(review): same steps as df_variants — consider calling it instead
                        comb = pd.concat(curr, axis=0).reset_index(drop=True)
                        comb_str = comb.copy()
                        comb_str["object"] = comb_str["object"] + comb_str["extra"].astype(str)
                        comb_str = comb_str.drop("extra", axis=1)
                        
                        # NOTE(review): the folders csv_extra and csv_merge are not created here — confirm they exist
                        comb.to_csv(os.path.join("csv_extra", uid + ".csv"))
                        comb_str.to_csv(os.path.join("csv_merge", uid + ".csv"))
                        
                except EOFError:
                    break
                j += 1
        # Interrupting the kernel abandons the current session file and moves on to the next one
        except KeyboardInterrupt:
            print("Interrupted!")
                
print("Finished!")

## NORMAL

In [None]:
# Export, for every user, the timeline of the second task (index 1) as a CSV
all_obj = set()

for session in os.listdir(parsed_path):
    print(session)
    with open(os.path.join(parsed_path, session), "rb") as f_in:
        # Position of the current user in the session file, for the progress line
        j = 0
        try:
            while True:
                try:
                    uid, data = pkl.load(f_in)
                    print("\t" + str(j) + " - " + uid, end="\r")

                    for i in range(3):
                        dfs, _ = gen_all(data[i], mapping)

                        # Only task 1 is exported
                        if dfs is not None and i == 1:
                            #curr_data = time_cover(dfs, time_per_click)
                            #pkl.dump((uid, i, curr_data), f_out)
                            df, obj = fix_dfs(dfs)
                            all_obj |= obj
                            # NOTE(review): the folder csv is not created here — confirm it exists
                            df.to_csv(os.path.join("csv", uid + ".csv"))
                except EOFError:
                    break
                j += 1
        # Interrupting the kernel abandons the current session file and moves on to the next one
        except KeyboardInterrupt:
            print("Interrupted!")
                
print("Finished!")

In [None]:
# Show every object name collected by whichever export cell ran last
list(all_obj)

In [None]:
# Save the collected object names, one per line
# NOTE(review): a set has no defined order, so the line order may differ between runs
with open("objects.txt", "w") as f:
    f.write("\n".join(list(all_obj)))

### SOLUTION SHOULD BE TREATED AS A DRAG SINCE WE OPEN AND CLOSE THE LIST

# NOTES

In the future, check whether all message indices are present in the logs

Make module imports work, e.g. `from helpers import parsing as par`

Tester toutes les entrées PHET qui existent dans les logs

Report on the number of wrong drag entries

Vérifier le parsing du laser

Mean and standard deviation of each component

Next week's goal is to have plots showing, for each feature, how it is distributed across the different students, in order to choose the RF features

Demo de défense avec labo possible

Lire fonctions depuis des helper files

In [None]:
## EXTRACTION OF TIME RELATED DATA

# Write one (uid, task index, time_cover result) record per non-empty log, for all sessions, into a single pickle
with open(os.path.join(data_path, "time_data.pkl"), "wb") as f_out:
    for session in os.listdir(parsed_path):
        print(session)
        with open(os.path.join(parsed_path, session), "rb") as f_in:
            # Position of the current user in the session file, for the progress line
            j = 0
            try:
                while True:
                    try:
                        uid, data = pkl.load(f_in)
                        print("\t" + str(j) + " - " + uid, end="\r")
                        
                        for i in range(3):
                            dfs, _ = gen_all(data[i], mapping)

                            # gen_all returns None for a missing or empty log
                            if dfs is not None:
                                curr_data = time_cover(dfs, time_per_click)
                                pkl.dump((uid, i, curr_data), f_out)
                    except EOFError:
                        break
                    j += 1
            # Interrupting the kernel abandons the current session file and moves on to the next one
            except KeyboardInterrupt:
                print("Interrupted!")
                
print("Finished!")