In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
pwd

'/Users/cock/kDrive/PhD/Projects/students/ml4science/notebooks/1 - data processing'

# Data

In [3]:
# Beer's Law Data
path = '/Users/cock/kDrive/PhD/Projects/labs/beerslaw-lab/data/beerslaw/sequenced_simulations/simplestate_secondslstm/id_dictionary.pkl'
with open(path, 'rb') as fp:
    idd = pickle.load(fp)

root = '/Users/cock/kDrive/PhD/Projects/labs/beerslaw-lab'

In [4]:
# Beer's Law reproducibility
sim_pm = '/Users/cock/kDrive/PhD/Projects/labs/behavioural-pattern-mining/data/chemlab/beerslaw/sim_dictionary.pkl'
with open(sim_pm, 'rb') as fp:
    simpm = pickle.load(fp)
learners = {simpm['sequences'][idx]['learner_id']: simpm['sequences'][idx]['language'] for idx in simpm['sequences']}

In [5]:
# Create new dictionary
new_dic = {
    'sequences': {},
    'index': {
        'learner_idx': {},
        'idx_learner': {}
    }
}
new_index = 0
for seq_index in idd['sequences']:
    if idd['sequences'][seq_index]['learner_id'] in learners:
        path = idd['sequences'][seq_index]['path'].replace('..', root)
        with open(path, 'rb') as fp:
            curr_seq = pickle.load(fp)

        new_dic['sequences'][new_index] = dict(idd['sequences'][seq_index])
        new_dic['sequences'][new_index]['raw_sequence'] = curr_seq['sequence']
        new_dic['sequences'][new_index]['raw_begin'] = curr_seq['begin']
        new_dic['sequences'][new_index]['raw_end'] = curr_seq['end']
        new_dic['sequences'][new_index]['last_timestamp'] = curr_seq['last_timestamp']
        new_dic['sequences'][new_index]['permutation'] = curr_seq['permutation']
        new_dic['sequences'][new_index]['gender'] = curr_seq['gender']
        new_dic['sequences'][new_index]['year'] = curr_seq['year']
        new_dic['sequences'][new_index]['language'] = learners[idd['sequences'][seq_index]['learner_id']]

        new_dic['index']['learner_idx'][idd['sequences'][seq_index]['learner_id']] = new_index
        new_dic['index']['idx_learner'][new_index] = idd['sequences'][seq_index]['learner_id']
        new_index += 1
        

# Impute all breaks

In [6]:
def get_all_breaks(begin, end):
    b = begin + [0]
    e = [0] + end 
    
    breaks = list(np.array(b) - np.array(e))
    breaks = breaks[:-1]
    
    breaks = [b for b in breaks if b > 0]
    
    return breaks

def get_threshold(begins, ends, threshold=0.6):
    begin = [b for b in begins]
    end = [e for e in ends]
    breaks = get_all_breaks(begin, end)
    if len(breaks) == 0:
        return 0
    breaks.sort()
    threshold = int(np.floor(threshold * len(breaks)))
    threshold = breaks[threshold]
    return threshold

def create_break_vector(state, break_time):
    break_vector = [s for s in state] + [0 for _ in range(5)] + [break_time]
    return break_vector

def impute_breaks(begins, ends, labels):
    assert len(begins) == len(ends) and len(ends) == len(labels)
    threshold = get_threshold(begins, ends)
    
    bs = []
    es = []
    ls = []

    if begins[0] > threshold:
        l = create_break_vector([0, 0, 0, 1], begins[0])
        bs.append(0)
        es.append(begins[0])
        ls.append(l)

    for i in range(len(labels) - 1):
        bs.append(begins[i])
        es.append(ends[i])
        ls.append(labels[i])

        if begins[i+1] - ends[i] > threshold:
            break_v = create_break_vector(
                ls[-1][:4], begins[i+1] - ends[i]
            )
            bs.append(ends[i])
            es.append(begins[i+1])
            ls.append(break_v)

    assert len(bs) == len(es) and len(es) == len(ls)
    return bs, es, ls

In [7]:
for lid in new_dic['sequences']:
    for ts in range(len(new_dic['sequences'][lid]['raw_sequence'])):
        if np.sum(new_dic['sequences'][lid]['raw_sequence'][ts][0:4]) < 1:
            print('student {}, timestep {}, \n    {}\n    {}'.format(
                lid, ts, new_dic['sequences'][lid]['raw_sequence'][ts],
                lid, ts, new_dic['sequences'][lid]['raw_sequence'][ts-1]
            ))

student 20, timestep 0, 
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05000000000000071, 0.0, 0.0]
    20
student 28, timestep 0, 
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.050000000000000044, 0.0, 0.0]
    28
student 28, timestep 88, 
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05000000000001137, 0.0, 0.0]
    28
student 28, timestep 97, 
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5699999999999932, 0.0, 0.0]
    28
student 51, timestep 107, 
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 71.87599999999998, 0.0, 0.0]
    51
student 120, timestep 4, 
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 108.27699999999999, 0.0, 0.0]
    120
student 120, timestep 6, 
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 47.56800000000001, 0.0, 0.0]
    120
student 120, timestep 9, 
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05000000000001137, 0.0, 0.0]
    120
student 174, timestep 18, 
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05000000000001137, 0.0, 0.0]
    174
student 250, timestep 23, 
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

In [8]:
new_dic['sequences'][20]['raw_sequence'][0][3] = 1.0
new_dic['sequences'][28]['raw_sequence'][0][3] = 1.0
new_dic['sequences'][28]['raw_sequence'][88][:4] = new_dic['sequences'][28]['raw_sequence'][87][:4]
new_dic['sequences'][28]['raw_sequence'][97][:4] = new_dic['sequences'][28]['raw_sequence'][96][:4]
new_dic['sequences'][51]['raw_sequence'][107][:4] = new_dic['sequences'][51]['raw_sequence'][106][:4]
new_dic['sequences'][120]['raw_sequence'][4][:4] = new_dic['sequences'][120]['raw_sequence'][3][:4]
new_dic['sequences'][120]['raw_sequence'][6][:4] = new_dic['sequences'][120]['raw_sequence'][5][:4]
new_dic['sequences'][120]['raw_sequence'][9][:4] = new_dic['sequences'][120]['raw_sequence'][8][:4]
new_dic['sequences'][174]['raw_sequence'][18][:4] = new_dic['sequences'][174]['raw_sequence'][17][:4]
new_dic['sequences'][250]['raw_sequence'][23][:4] = new_dic['sequences'][250]['raw_sequence'][22][:4]
new_dic['sequences'][250]['raw_sequence'][25][:4] = new_dic['sequences'][250]['raw_sequence'][24][:4]

In [9]:
for lid in new_dic['sequences']:
    for ts in range(len(new_dic['sequences'][lid]['raw_sequence'])):
        if np.sum(new_dic['sequences'][lid]['raw_sequence'][ts]) < 1:
            print('student {}, timestep {}, \n    {}\n    {}'.format(
                lid, ts, new_dic['sequences'][lid]['raw_sequence'][ts],
                lid, ts, new_dic['sequences'][lid]['raw_sequence'][ts-1]
            ))

In [10]:
new_dic['sequences'][28]['raw_sequence'][95:99]

[[0.0, 1.0, 0.0, 0.0, 1.507000000000005, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 3.9289999999999736, 0.0, 0.0, 0.0],
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5699999999999932, 0.0, 0.0],
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.701999999999998, 0.0]]

In [11]:
# Create new dictionary

for seq_index in new_dic['sequences']:
    b, e, l = impute_breaks(
        new_dic['sequences'][seq_index]['raw_begin'],
        new_dic['sequences'][seq_index]['raw_end'],
        new_dic['sequences'][seq_index]['raw_sequence']
    )

    new_dic['sequences'][seq_index]['begin'] = [bb for bb in b]
    new_dic['sequences'][seq_index]['end'] = [ee for ee in e]
    new_dic['sequences'][seq_index]['break_sequence'] = [ll for ll in l]

In [12]:
def scale_sequence(sequence):
    scaler = MinMaxScaler()
    scaler.fit(
        [sequence[i][4:] for i in range(len(sequence))]
    )
    scaled = scaler.transform(
        [sequence[i][4:] for i in range(len(sequence))]
    )

    scaled_sequence = [
        [st for st in sequence[i][:4]] + [sc for sc in scaled[i]] for i in range(len(sequence))
    ]
    assert len(scaled_sequence) == len(sequence)
    for scs in scaled_sequence:
        assert len(scs) == 10
    return scaled_sequence

In [13]:
# Create new dictionary
for seq_index in new_dic['sequences']:
    scaled_seq = scale_sequence(
        new_dic['sequences'][seq_index]['break_sequence']
    )
    new_dic['sequences'][seq_index]['sequence'] = [scs for scs in scaled_seq]

In [14]:
for lid in new_dic['sequences']:
    for ts in range(len(new_dic['sequences'][lid]['sequence'])):
        if np.sum(new_dic['sequences'][lid]['sequence'][ts]) < 1:
            print('student {}, timestep {}, \n    {}\n    {}'.format(
                lid, ts, new_dic['sequences'][lid]['sequence'][ts],
                lid, ts, new_dic['sequences'][lid]['sequence'][ts-1]
            ))

In [15]:
new_dic['sequences'][51]['sequence'][150:155]

[[0.0, 0.0, 0.0, 1.0, 0.003124218945260839, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.10440966212657436],
 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1184814508239899],
 [0.0, 0.0, 0.0, 1.0, 0.003124218945260839, 0.0, 0.0, 0.0, 0.0, 0.0]]

# Labels

In [16]:
label_map = {
      '3012': 0,
      '2130': 0,
      '0231': 1,
      '0213': 0,
      '3210': 1,
      '3120': 1,
      '2013': 1,
      '2031': 1,
      '1302': 0,
      '0123': 0,
      '1320': 0,
      '2103': 0,
      '0132': 0,
      '1023': 0,
      '1032': 0,
      '3201': 0,
      '2310': 0,
      '2301': 0,
      '0312': 0,
      '3102': 0,
      '0321': 0,
      '1203': 0,
      '1230': 0,
      '3021': 0
}
# Create new dictionary
for seq_index in new_dic['sequences']:
    new_dic['sequences'][seq_index]['label'] = label_map[new_dic['sequences'][seq_index]['permutation']]

In [17]:
new_dic['available_demographics'] = [
    'gender', 'year', 'language', 'label'
]

# Save Sequence

In [18]:
with open('../../data/ml4science_data.pkl', 'wb') as fp:
    pickle.dump(new_dic, fp)

In [19]:
for i in range(len(new_dic['sequences'])):
    for j in range(len(new_dic['sequences'][i]['sequence'])):
        non_zero_count = np.count_nonzero(new_dic['sequences'][i]['sequence'][j])
        if non_zero_count != 2:
            print(f"Vector at student {i}, sequence{j} has {non_zero_count} nonzero values")
            non_zero_positions = np.nonzero(new_dic['sequences'][i]['sequence'][j])[0]
            print(f"Positions of non-zero values: {non_zero_positions}")

In [19]:
new_dic['sequences'][0].keys()

dict_keys(['path', 'length', 'learner_id', 'raw_sequence', 'raw_begin', 'raw_end', 'last_timestamp', 'permutation', 'gender', 'year', 'language', 'begin', 'end', 'break_sequence', 'sequence', 'label'])

In [20]:
sizes = []
for i in range(len(new_dic['sequences'])):
    sizes.append(len(new_dic['sequences'][i]['sequence']))

In [21]:
max(sizes)

819

# Demo

In [22]:
with open('./ml4science_data.pkl', 'rb') as fp:
    new_dic = pickle.load(fp)

In [23]:
new_dic.keys()

dict_keys(['sequences', 'index', 'available_demographics'])

In [24]:
new_dic['available_demographics']

['gender', 'year', 'language', 'label']

In [25]:
new_dic['index']

{'learner_idx': {'svdphyjs': 0,
  '2ae6q3hw': 1,
  '8jp62suc': 2,
  'e93na59g': 3,
  'chm4sr6j': 4,
  'gc663sap': 5,
  '8nh4zvcp': 6,
  '7t9zwtmr': 7,
  'wvxkvhne': 8,
  '43e33t3h': 9,
  'a3vdjxy9': 10,
  'mkw5afyy': 11,
  'dq4zzkyt': 12,
  '5f4q4ng5': 13,
  'temu2736': 14,
  'f54jkrsw': 15,
  'ujpk3gf4': 16,
  'jkbx6axr': 17,
  'vwfpuqaz': 18,
  'w7asnymz': 19,
  '2hr6mkdc': 20,
  'nrxpa2ac': 21,
  '5zrt4f8z': 22,
  'm3d89p29': 23,
  'vjr7tshm': 24,
  'egg8756y': 25,
  'uhbpoog9': 26,
  'bg7x3db2': 27,
  'xz56yha8': 28,
  'rrykpeqz': 29,
  '55yavcue': 30,
  'p44vw7td': 31,
  'fu6nsdhs': 32,
  '2ejxq2u8': 33,
  'ky8kttpv': 34,
  'rdbqasqa': 35,
  'zgyc948n': 36,
  'c9dj36r7': 37,
  'xvsgn53d': 38,
  'qsd9cb5e': 39,
  '9pjrsbth': 40,
  'cp7mfn24': 41,
  'j9qgkaej': 42,
  '982cf4dn': 43,
  '4zsmj6s3': 44,
  '85pdk9mq': 45,
  'hf8uz5t3': 46,
  'k7p5eryf': 47,
  'x844md8u': 48,
  'baqyfz2h': 49,
  'jwntt96k': 50,
  'wktpsvp8': 51,
  'h75kp27p': 52,
  '26z3wbqz': 53,
  'x5sm9pfu': 54,
  'mn

In [26]:
new_dic['sequences'].keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,

In [27]:
new_dic['sequences'][0].keys()

dict_keys(['path', 'length', 'learner_id', 'raw_sequence', 'raw_begin', 'raw_end', 'last_timestamp', 'permutation', 'gender', 'year', 'language', 'begin', 'end', 'break_sequence', 'sequence', 'label'])

In [28]:
for i in range(len(new_dic['sequences'][0]['begin'])):
    print(
        'begin {} - end {}: {}'.format
        (new_dic['sequences'][0]['begin'][i],
        new_dic['sequences'][0]['end'][i],
        new_dic['sequences'][0]['sequence'][i])
    )

begin 0 - end 7.889: [0, 0, 0, 1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3440621047581669]
begin 7.889 - end 83.03: [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.9999999999999999, 0.0]
begin 83.03 - end 86.28: [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.14174189890531658]
begin 86.28 - end 86.33: [0.0, 0.0, 0.0, 1.0, 0.006079027355622759, 0.0, 0.0, 0.0, 0.0, 0.0]
begin 87.278 - end 90.562: [0.0, 0.0, 0.0, 1.0, 0.3992705167173245, 0.0, 0.0, 0.0, 0.0, 0.0]
begin 90.562 - end 99.597: [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3940424789567799]
begin 99.597 - end 100.32: [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.1239499399965717, 0.0, 0.0, 0.0]
begin 101.267 - end 107.071: [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0772414527355239, 0.0]
begin 108.989 - end 114.653: [0.0, 0.0, 0.0, 1.0, 0.6886322188449855, 0.0, 0.0, 0.0, 0.0, 0.0]
begin 115.424 - end 115.935: [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.08760500600034259, 0.0, 0.0, 0.0]
begin 117.488 - end 135.421: [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.23865798

In [48]:
for i in range(len(new_dic['sequences'])):
    for j in range(len(new_dic['sequences'][i]['sequence'])):
        non_zero_count = np.count_nonzero(new_dic['sequences'][i]['sequence'][j])
        if non_zero_count != 2:
            print(f"Vector at student {i}, sequence{j} has {non_zero_count} nonzero values")
            non_zero_positions = np.nonzero(new_dic['sequences'][i]['sequence'][j])[0]
            print(f"Positions of non-zero values: {non_zero_positions}")