In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch

In [2]:
# Ask the inline backend to render figures in "retina" (high-resolution) format
%config InlineBackend.figure_format = "retina"
# Switch matplotlib to seaborn's default theme for every plot that follows
sns.set()

In [3]:
# Load the data set: the first CSV column becomes the index, `success` is
# read with the nullable "boolean" dtype and the two timestamp columns are
# parsed as datetimes
df = pd.read_csv(
    "data_character_meaning.csv",
    index_col=0,
    dtype={"success": "boolean"},
    parse_dates=["ts_display", "ts_reply"],
)

In [6]:
# Reorder the frame in place: by user first, then by display time
df.sort_values(by=["user", "ts_display"], inplace=True)

In [7]:
# Display the frame as it currently stands (pandas abbreviates long frames,
# showing only the first and last rows/columns)
df

Unnamed: 0,user,domain,condition,item,item_character,item_meaning,success,teacher_md,learner_md,psy_md,...,is_eval,ts_display,ts_reply,n_session_done,pos_reply_0,pos_reply_1,pos_reply_2,pos_reply_3,pos_reply_4,pos_reply_5
69531,acacia@active.fi,active.fi,ForwardCondition,351,老,elderly,True,forward,exp_decay,grid,...,False,2020-10-01 07:05:35.714000+00:00,2020-10-01 07:05:44.169000+00:00,6,elderly,flower,nourishing,obstruct,old woman,thunder
69532,acacia@active.fi,active.fi,ForwardCondition,273,局,bureau,True,forward,exp_decay,grid,...,False,2020-10-01 07:05:45.184000+00:00,2020-10-01 07:05:48.765000+00:00,6,bureau,elderly,highness,leg,old woman,slippery
69533,acacia@active.fi,active.fi,ForwardCondition,1375,拠,based on,True,forward,exp_decay,grid,...,False,2020-10-01 07:05:49.781000+00:00,2020-10-01 07:05:52.245000+00:00,6,based on,bureau,claw,elderly,government,vip
69534,acacia@active.fi,active.fi,ForwardCondition,1421,握,grip,True,forward,exp_decay,grid,...,False,2020-10-01 07:05:53.261000+00:00,2020-10-01 07:05:55.492000+00:00,6,based on,bureau,domesticate,elderly,grip,leg
69535,acacia@active.fi,active.fi,ForwardCondition,912,濃,thick,True,forward,exp_decay,grid,...,False,2020-10-01 07:05:56.492000+00:00,2020-10-01 07:05:58.606000+00:00,6,based on,bureau,elderly,grip,prolong,thick
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71676,vulture@active.fi,active.fi,ThresholdCondition,1656,珠,pearl,True,leitner,,grid,...,True,2020-10-07 15:53:39.446000+00:00,2020-10-07 15:53:43.800000+00:00,14,equilibrium,orphan,pearl,select,skin,stretch
71677,vulture@active.fi,active.fi,ThresholdCondition,736,脱,undress,True,leitner,,grid,...,True,2020-10-07 15:53:44.817000+00:00,2020-10-07 15:53:46.636000+00:00,14,arrest,carry out,inside,previously,skin,undress
71678,vulture@active.fi,active.fi,ThresholdCondition,1810,据,install,True,leitner,,grid,...,True,2020-10-07 15:53:47.638000+00:00,2020-10-07 15:54:01.750000+00:00,14,colleague,disaster,equilibrium,fly,install,right
71679,vulture@active.fi,active.fi,ThresholdCondition,198,首,neck,True,leitner,,grid,...,True,2020-10-07 15:54:02.768000+00:00,2020-10-07 15:54:05.187000+00:00,14,colleague,fasten,grate,neck,select,song


In [5]:
# Keep only users from the last experiment who completed it
# (6 + 1 sessions for each teacher, i.e. `n_session_done == 14`).
# Rows are selected with a boolean mask rather than dropped by index label,
# so the selection stays correct even if index labels are duplicated, and
# `df` is rebound to an independent copy instead of being mutated in place.
to_drop = (df["domain"] != "active.fi") | (df["n_session_done"] != 14)
df = df.loc[~to_drop].copy()

# Convert reply timestamps into seconds elapsed since 1970-01-01 UTC
beginning_history = pd.Timestamp("1970-01-01", tz="UTC")
df["timestamp"] = (df["ts_reply"] - beginning_history).dt.total_seconds().to_numpy()

# Keep the original item IDs in `item_id`.
# The guard makes this cell safe to re-run: without it, a second execution
# would overwrite the original IDs with the already re-indexed ones.
# NOTE(review): assumes the raw CSV has no `item_id` column of its own - confirm.
if "item_id" not in df.columns:
    df["item_id"] = df["item"]

# Re-index items with consecutive integers starting from zero, numbered in
# order of first appearance in `df` (the same numbering as enumerating
# `df.item_id.unique()`), in a single pass instead of one full-column scan
# per distinct item.
df["item"] = pd.factorize(df["item_id"])[0]

# How many distinct users are present in the frame
n_u = df["user"].unique().size

# Usable observations per user: every row of the user except the first
# presentation of each item (there is exactly one such row per distinct item)
n_o_by_u = np.zeros(n_u, dtype=int)
for pos_u, (_, rows_u) in enumerate(df.groupby("user")):
    n_items_u = rows_u["item"].unique().size
    n_o_by_u[pos_u] = rows_u.shape[0] - n_items_u

# Grand total of usable observations, all users together
n_obs = n_o_by_u.sum()

# Build the observation arrays. There is one entry per presentation that is
# NOT the first presentation of its item for its user; entries are grouped by
# user and, within a user, ordered by reply time:
#   y: reply (1: success, 0: error)
#   x: time elapsed since the previous presentation of the same item (seconds)
#   r: number of earlier presentations of the same item, minus one
#      (0 at the second presentation, 1 at the third, ...)
#   w: item ID (re-indexed `item` column)
#   u: user ID (rank of the user in the iteration order of `df.groupby("user")`)
#
# The per-item bookkeeping is done with grouped pandas operations instead of
# a Python loop over every row.

# Rows without a user are ignored, exactly as `groupby("user")` ignores them
history = df.loc[df["user"].notna(), ["user", "item", "timestamp", "success"]]

# Integer user ID, numbered in the order `groupby("user")` iterates its groups
history = history.assign(
    user_idx=history.groupby("user").ngroup().to_numpy()
)

# One user after the other, each in chronological order. The sort is stable,
# so rows sharing the same timestamp keep their original relative order and
# the result is deterministic.
history = history.sort_values(["user_idx", "timestamp"], kind="mergesort")

# For every row: how many times the same user already saw the same item,
# and how long ago the previous presentation was (NaN if there was none)
per_item = history.groupby(["user_idx", "item"], sort=False)
n_prev = per_item.cumcount().to_numpy()
elapsed = per_item["timestamp"].diff().to_numpy()

# Keep only observations that are not the first presentation of an item
repeated = n_prev > 0

# A missing reply among the retained rows raises here rather than being
# silently coerced to 0 or 1
y = history.loc[repeated, "success"].to_numpy(dtype=int)
x = elapsed[repeated].astype(float)
r = (n_prev[repeated] - 1).astype(int)
w = history.loc[repeated, "item"].to_numpy(dtype=int)
u = history.loc[repeated, "user_idx"].to_numpy(dtype=int)

# Guard against hidden state: the arrays must agree with the per-user counts
assert len(y) == n_obs, "observation arrays do not match `n_obs`"

# Summary figures for the extracted data set
n_w = np.unique(w).size
n_o_min, n_o_max = n_o_by_u.min(), n_o_by_u.max()

for label, value in (
    ("number of user", n_u),
    ("number of items", n_w),
    ("total number of observations (excluding first presentation)", n_obs),
    ("minimum number of observation for a single user", n_o_min),
    ("maximum number of observation for a single user", n_o_max),
):
    print(label, value)

# Show the observations as a table:
#   u: user ID
#   w: item ID
#   x: time elapsed since the last presentation of the same item (in seconds)
#   r: repetition counter
#   y: reply (0: error, 1: success)
pd.DataFrame(dict(u=u, w=w, x=x, r=r, y=y))

number of user 53
number of items 1998
total number of observations (excluding first presentation) 70618
minimum number of observation for a single user 1285
maximum number of observation for a single user 1404


Unnamed: 0,u,w,x,r,y
0,0,1537,61.980,0,1
1,0,1967,116.744,0,1
2,0,1488,115.660,0,1
3,0,1198,138.515,0,1
4,0,563,154.035,0,1
...,...,...,...,...,...
70613,52,296,410.185,9,1
70614,52,606,418.810,11,1
70615,52,807,407.315,14,1
70616,52,538,374.817,10,1


In [6]:
# Mean of the repetition counter `r` over all retained observations
np.mean(r)

6.4676711320060045

In [7]:
# Fraction of retained observations answered correctly (mean of `y`)
np.mean(y)

0.8626837350250645

In [8]:
# Shortest and longest delay (in seconds) between two successive
# presentations of the same item, over all retained observations
np.min(x), np.max(x)

(1.7849998474121094, 476175.66499996185)