In [2]:
import cv2

In [None]:
# NOTE(review): this cell has no execution count and calls cv2.imshow with no arguments;
# imshow presumably needs a window name and an image, so this looks like leftover
# scratch work unrelated to the preprocessing below — confirm and consider removing.
cv2.imshow()

In [1]:
import json
from collections import Counter
from itertools import chain
import matplotlib as mpl
import matplotlib.pyplot as plt
from datetime import datetime

import numpy as np
import pandas as pd

# Load Data

In [2]:
def load_iscream(fpath='iscream_public_edu_v3.txt'):
    """Load the line-delimited iScream dump into a DataFrame.

    Each non-blank line of the file is expected to hold one dict-like record
    written with single quotes; the quotes are swapped for double quotes so the
    line can be parsed as JSON.

    Parameters
    ----------
    fpath : str
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per record, one column per key, in first-seen key order.
        An empty file yields an empty DataFrame.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is still not valid JSON after the quote swap.
    """
    with open(fpath, 'r') as f:
        records = [
            # The quote swap is a blunt conversion: it would corrupt any value
            # that itself contains an apostrophe.
            json.loads(raw.strip().replace("'", "\""))
            for raw in f
            if raw.strip()  # tolerate blank / trailing empty lines
        ]

    # Build the frame in one shot. Appending row by row copies the whole frame
    # on every iteration and relies on DataFrame.append, which newer pandas
    # releases no longer provide.
    return pd.DataFrame(records)

In [3]:
%%time
# Parse the dump at load_iscream's default path into `df`; timed so readers can see the load cost.
df = load_iscream()

Wall time: 17.9 s


# train.csv

## Convert Timestamp

In [15]:
# Turn every per-row list of epoch values into a list of datetime objects.
# datetime.fromtimestamp yields naive datetimes in the machine's local timezone.
# NOTE(review): this cell overwrites the column, so running it a second time would
# feed datetimes back into fromtimestamp — run once per fresh load.
df['Timestamp'] = df['Timestamp'].apply(
    lambda stamps: [datetime.fromtimestamp(stamp) for stamp in stamps]
)

## Sort by timestamp

In [6]:
def sortby_timestamp(line):
    """Reorder one row's event lists into ascending timestamp order.

    The lists under 'assessmentItemID', 'testId', 'answerCode' and 'Timestamp'
    are all permuted with the same ordering, so entries at the same position
    still describe the same event afterwards.

    Parameters
    ----------
    line : mapping (e.g. a DataFrame row)
        Must expose the four keys above, each holding an equally long sequence.

    Returns
    -------
    The same `line` object, with the four entries replaced by sorted lists.
    The input is modified in place; callers should still use the return value.
    """
    # A stable sort keeps events that share a timestamp in their original
    # relative order; the default argsort algorithm gives no such guarantee.
    order = np.array(line['Timestamp']).argsort(kind='stable')

    # 'Timestamp' is rewritten last, matching the order it was read in above.
    for column in ('assessmentItemID', 'testId', 'answerCode', 'Timestamp'):
        values = line[column]
        line[column] = [values[i] for i in order]

    return line

In [7]:
%%time
df.apply(sortby_timestamp, axis=1)

Wall time: 28.6 s


Unnamed: 0,assessmentItemID,testId,answerCode,Timestamp
0,"[187, 188, 189, 190, 191, 197, 198, 199, 200, ...","[37, 37, 37, 37, 37, 39, 39, 39, 39, 39, 41, 4...","[1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, ...","[1582836716.0, 1582839096.0, 1582839124.0, 158..."
1,"[263, 264, 265, 266, 267, 40, 41, 42, 43, 44, ...","[52, 52, 52, 52, 52, 8, 8, 8, 8, 8, 10, 10, 10...","[0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, ...","[1580686959.0, 1580687009.0, 1580687122.0, 158..."
2,"[3269, 3270, 3271, 3272, 3273, 3365, 3366, 336...","[625, 625, 625, 625, 625, 643, 643, 643, 643, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, ...","[1586743520.0, 1586743522.0, 1586743529.0, 158..."
3,"[3239, 3240, 3241, 3243, 3242, 3269, 3270, 327...","[619, 619, 619, 619, 619, 625, 625, 625, 625, ...","[0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, ...","[1582326717.0, 1582326729.0, 1582326758.0, 158..."
4,"[3370, 3371, 3372, 3373, 3374, 3375, 3229, 323...","[644, 644, 644, 644, 644, 644, 617, 617, 617, ...","[1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[1592511061.0, 1592511087.0, 1592511106.0, 159..."
...,...,...,...,...
7437,"[3452, 3453, 3454, 3455, 3456, 3457, 3469, 347...","[660, 660, 660, 660, 660, 660, 663, 663, 663, ...","[1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...","[1589232795.0, 1589234016.0, 1589234023.0, 158..."
7438,"[79, 80, 81, 82, 83, 84, 85, 86, 95, 96, 97, 9...","[16, 16, 16, 16, 16, 16, 16, 16, 18, 18, 18, 1...","[0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1588633551.0, 1588633589.0, 1588633595.0, 158..."
7439,"[3116, 3117, 3118, 3119, 3120, 3121, 3376, 337...","[595, 595, 595, 595, 595, 595, 645, 645, 645, ...","[1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, ...","[1581017302.0, 1581017339.0, 1581017369.0, 158..."
7440,"[3082, 3083, 3084, 3085, 3086, 3407, 3408, 340...","[589, 589, 589, 589, 589, 651, 651, 651, 651, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, ...","[1595372548.0, 1595372576.0, 1595372666.0, 159..."


## Student Answer rate

In [8]:
def answer_rate(lst):
    """Return the fraction of entries in `lst` that equal 1.

    Parameters
    ----------
    lst : sequence
        Answer codes for one row.

    Returns
    -------
    float
        Count of entries equal to 1 divided by the total number of entries,
        or NaN when `lst` is empty (the rate is undefined without answers).
    """
    answers = np.array(lst)
    if len(answers) == 0:
        # Avoid dividing by zero for a row with no recorded answers.
        return float('nan')
    return len(answers[answers == 1]) / len(answers)

# One rate per row, computed from that row's full answerCode list.
student_answer_rate = df['answerCode'].apply(answer_rate)

In [9]:
# Attach the per-row rate as a new scalar column.
df['studentAnswerRate'] = student_answer_rate

## Add Elapsed Time

In [16]:
def timedelta2float(td):
    """Return the duration of `td` in seconds as a float.

    Parameters
    ----------
    td : datetime.timedelta
        Duration to convert; may be negative.

    Returns
    -------
    float
        Total length of `td` in seconds, including the fractional part.
    """
    # The standard library already combines days, seconds and microseconds.
    return td.total_seconds()

def calculate_elapsed(lst):
    """Return the gap in seconds between each entry of `lst` and the one before it.

    The first element of the result is always 0.0 (it is emitted even when
    `lst` is empty); each later element is the difference between an entry and
    its predecessor, converted with timedelta2float.
    """
    gaps = []
    for earlier, later in zip(lst, lst[1:]):
        gaps.append(timedelta2float(later - earlier))
    return [0.] + gaps

In [17]:
# Per-row list of gaps between successive timestamps; each list starts with 0.0.
elapsed = df['Timestamp'].apply(calculate_elapsed)

In [19]:
# Store the gap lists next to the other per-row list columns.
df['Elapsed'] = elapsed

## Flag consecutive items from the same test

In [20]:
def test_consecutive(lst):
    
    return [1] + [1 if lst[i] == lst[i-1] else 0 for i in range(1, len(lst))]

In [21]:
# Per-row list of 0/1 flags: 1 where a testId repeats the previous entry (first entry is always 1).
consecutive = df['testId'].apply(test_consecutive)

In [22]:
# Store the flag lists as a new column.
df['testConsecutive'] = consecutive

## Save

In [33]:
# Sanity check before saving: what Python type does a cell of this column hold?
type(df['assessmentItemID'][0])

list

In [28]:
# Write the processed frame to disk without the index column.
# NOTE(review): cells in the list columns hold Python lists, so they are presumably
# written as text and come back as strings when the CSV is read — confirm the
# consumer parses them.
df.to_csv('processed_iscream.csv', index=False)

# Side information