<a href="https://colab.research.google.com/github/CrrazyPeach/GitColab/blob/main/DKT_ass2017.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [57]:
# Mount Google Drive at /content/drive; the CSV read below and every output
# file written later live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [58]:
import random
import pandas as pd
import tqdm
import numpy as np

# Load the assignment log, keeping only the listed columns, and drop rows
# without a skill label (they cannot be mapped to a skill id further down).
# 'AveCorrect' used to be listed twice in usecols; it is listed once here.
data = pd.read_csv(
    '/content/drive/MyDrive/assignment2017.csv',
    usecols=[
        'studentId', 'MiddleSchoolId', 'problemId', 'assignmentId', 'skill',
        'correct', 'timeTaken', 'AveCorrect', 'hintCount', 'hintTotal',
        'bottomHint', 'attemptCount',
        'frIsHelpRequest', 'frPast5HelpRequest', 'frPast8HelpRequest',
        'totalFrPercentPastWrong', 'timeGreater10SecAndNextActionRight',
    ],
).dropna(subset=['skill'])

In [59]:
# Distinct skill labels found in the data.
raw_question = data['skill'].unique().tolist()
num_skill = len(raw_question)

# Give every skill a dense integer id in the range 0 .. num_skill - 1.
questions = dict(zip(raw_question, range(num_skill)))

print("number of skills: %d" % num_skill)

number of skills: 102


In [60]:
def parse_all_seq(students):
    """Build one (questions, answers) pair per student id.

    Args:
        students: iterable of student ids; each id selects the rows of the
            module-level ``data`` frame whose ``studentId`` equals it.

    Returns:
        A list with one ``parse_student_seq`` result per id, in the same
        order as ``students``.
    """
    progress = tqdm.tqdm(students, 'parse student sequence:\t')
    return [
        parse_student_seq(data[data.studentId == sid])
        for sid in progress
    ]


def parse_student_seq(student):
    """Turn one student's rows into parallel question-id / answer lists.

    Args:
        student: DataFrame holding a single student's rows; the columns
            ``problemId``, ``skill`` and ``correct`` are read.

    Returns:
        ``(q, a)`` where ``q`` is the list of skill ids (looked up in the
        module-level ``questions`` mapping) and ``a`` is the list of
        ``correct`` values, both ordered by ``problemId``.

    Raises:
        KeyError: if a skill value is missing from ``questions``.
    """
    # A stable sort keeps rows that share a problemId in their original
    # order; the default sort algorithm gives no such guarantee, which made
    # the resulting sequence order unpredictable for repeated problems.
    # NOTE(review): problemId order may not be chronological order - confirm
    # whether a timestamp column should be the sort key instead.
    seq = student.sort_values('problemId', kind='mergesort')
    q = [questions[skill] for skill in seq.skill.tolist()]
    a = seq.correct.tolist()
    return q, a


# One (question_sequence, answer_sequence) pair per student, in the order of
# data.studentId.unique():
# [(question_sequence_0, answer_sequence_0), ..., (question_sequence_n, answer_sequence_n)]
sequences = parse_all_seq(data.studentId.unique())

parse student sequence:	: 100%|██████████| 1709/1709 [00:04<00:00, 426.42it/s]


In [61]:
def train_test_split(data, train_size=.7, shuffle=True, seed=None):
    """Split a sequence of items into a train part and a test part.

    Args:
        data: sequence of items to split.
        train_size: fraction of the items that go to the train part; the
            boundary index is ``round(len(data) * train_size)``.
        shuffle: when true, the items are shuffled before splitting.
        seed: optional seed for a private RNG used for the shuffle; with the
            default ``None`` the global ``random`` module state is used.

    Returns:
        ``(train, test)`` - the items before and after the boundary.
    """
    if shuffle:
        # Shuffle a copy so the caller's sequence keeps its order; shuffling
        # in place meant that re-running a cell reshuffled shuffled data.
        data = list(data)
        rng = random if seed is None else random.Random(seed)
        rng.shuffle(data)
    boundary = round(len(data) * train_size)
    return data[:boundary], data[boundary:]


# Seed the global RNG right before the shuffled split so that a fresh
# Restart & Run All produces the same train/test partition (and therefore
# the same saved files) every time.
random.seed(42)
train_sequences, test_sequences = train_test_split(sequences)

In [62]:
def sequences2tl(sequences, trgpath, mode='w'):
    """Write sequences to a text file in the three-lines-per-sequence format.

    For every ``(questions, answers)`` pair three lines are written: the
    number of questions, the comma-separated question ids, and the
    comma-separated answers.

    Args:
        sequences: iterable of ``(questions, answers)`` pairs.
        trgpath: path of the file to write.
        mode: file mode passed to ``open``. Defaults to ``'w'`` so that
            re-running the cell replaces the file; the previous behaviour
            (``'a'``) appended a second copy of every sequence on each run.
            Pass ``'a'`` explicitly to append.
    """
    with open(trgpath, mode, encoding='utf8') as f:
        for seq in tqdm.tqdm(sequences, 'write into file: '):
            question_ids, answers = seq
            f.write(str(len(question_ids)) + '\n')
            f.write(','.join(str(q) for q in question_ids) + '\n')
            f.write(','.join(str(a) for a in answers) + '\n')


# Save both splits in the triple-line text format (sequence length, question
# ids, answers - one line each per sequence) for other tasks.
sequences2tl(train_sequences, '/content/drive/MyDrive/assignment2017train.txt')
sequences2tl(test_sequences, '/content/drive/MyDrive/assignment2017test.txt')

write into file: 100%|██████████| 1196/1196 [00:00<00:00, 1317.03it/s]
write into file: 100%|██████████| 513/513 [00:00<00:00, 1155.47it/s]


In [63]:
# Number of time steps per encoded chunk (passed as max_step to encode_onehot).
MAX_STEP = 50
# Number of distinct skills; each one-hot row is 2 * NUM_QUESTIONS wide.
NUM_QUESTIONS = num_skill


def encode_onehot(sequences, max_step, num_questions):
    """One-hot encode (question, answer) sequences into fixed-length chunks.

    Each step becomes a row of width ``2 * num_questions`` with a single 1:
    at column ``q_id`` when the answer is greater than 0, otherwise at column
    ``q_id + num_questions``. Every sequence is zero-padded up to a multiple
    of ``max_step`` rows, and all rows are then cut into chunks of
    ``max_step`` steps.

    Args:
        sequences: iterable of ``(questions, answers)`` pairs of equal length.
        max_step: number of steps per output chunk.
        num_questions: number of distinct question ids.

    Returns:
        Float array of shape ``(num_chunks, max_step, 2 * num_questions)``;
        ``num_chunks`` is 0 when ``sequences`` is empty.
    """
    width = 2 * num_questions
    # Collect per-sequence blocks and join them once at the end: calling
    # np.append inside the loop re-copied everything gathered so far on each
    # iteration, and with no sequences at all the accumulator stayed a plain
    # list, so the final .reshape raised AttributeError.
    blocks = []

    for q, a in tqdm.tqdm(sequences, 'convert to one-hot format: '):
        length = len(q)
        # Round the length up to the next multiple of max_step.
        mod = 0 if length % max_step == 0 else (max_step - length % max_step)
        onehot = np.zeros(shape=[length + mod, width])
        for i, q_id in enumerate(q):
            index = int(q_id if a[i] > 0 else q_id + num_questions)
            onehot[i][index] = 1
        blocks.append(onehot)

    stacked = np.concatenate(blocks) if blocks else np.zeros(shape=[0, width])
    return stacked.reshape(-1, max_step, width)


# Encode only the leading fraction of each split so the example runs faster.
percentage = 0.05
train_subset = train_sequences[: int(len(train_sequences) * percentage)]
test_subset = test_sequences[: int(len(test_sequences) * percentage)]
train_data = encode_onehot(train_subset, MAX_STEP, NUM_QUESTIONS)
test_data = encode_onehot(test_subset, MAX_STEP, NUM_QUESTIONS)

convert to one-hot format: 100%|██████████| 59/59 [00:01<00:00, 54.28it/s]
convert to one-hot format: 100%|██████████| 25/25 [00:00<00:00, 100.48it/s]


In [64]:
# Save the one-hot arrays. np.save appends '.npy' to a file name that does
# not already end in it, so these are stored as 'assignment2017train.txt.npy'
# and 'assignment2017test.txt.npy' next to the triple-line text files written
# earlier (same base names, different files).
np.save('/content/drive/MyDrive/assignment2017train.txt', train_data)
np.save('/content/drive/MyDrive/assignment2017test.txt', test_data)