In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, TimeDistributed, Masking, StringLookup
from keras.layers import Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn.functional as F
import math
from tqdm import tqdm
import keras

In [2]:
# Load the raw interaction log, then draw a reproducible 5% random sample of
# its rows to work with.
df = pd.read_csv('../Data/non_skill_builder_data_new.csv')  # fill with data file
data = df.sample(random_state=69, frac=0.05)

In [3]:
# Keep only the columns used downstream, order rows by user and then by
# order_id, and replace missing values with 0 (placeholder for now).
kept_columns = ['user_id', 'problem_id', 'skill_id', 'correct', 'order_id']
data = (
    data[kept_columns]
    .sort_values(by=['user_id', 'order_id'])
    .fillna(0)
)

In [4]:
# Distinct problem ids as strings, then one token per outcome for each
# problem: "<problem_id>+0" and "<problem_id>+1".
un = data['problem_id'].astype(str).unique()
zer, on = (un + outcome_suffix for outcome_suffix in ('+0', '+1'))

In [5]:
# Full token vocabulary: all "+0" tokens followed by all "+1" tokens.
vocab = np.concatenate((zer, on))

In [6]:
# Display the token vocabulary.
vocab

In [7]:
# Seed the combined "<problem_id>+<correct>" column with the problem id as text.
problem_id_text = data['problem_id'].astype(str)
data['prob_id_x_correct'] = problem_id_text.copy()

In [8]:
# Display the token column as it currently stands.
data['prob_id_x_correct']

In [9]:
# Replace the index with a fresh 0..n-1 range. `drop` is left at its default,
# so the previous index is kept as a new column; running this cell more than
# once keeps adding such columns.
data = data.reset_index()

In [10]:
# Append the outcome to the token text: "+0" where correct == 0, "+1" where
# correct == 1. Rows with any other `correct` value are left without a suffix.
# `+=` appends to the current text, so re-running this cell appends again.
for outcome, suffix in ((0, '+0'), (1, '+1')):
    data.loc[data['correct'] == outcome, 'prob_id_x_correct'] += suffix

In [11]:
# StringLookup with a mask token reserves id 0 for the mask and (by default)
# id 1 for out-of-vocabulary strings, so the id space is two larger than the
# number of tokens in `vocab`.
vocab_size = len(vocab) + 2
max_seq_len = 10  # Set maximum sequence length
numDim = math.ceil(math.log(vocab_size))  # Number of dimensions for embedding layer

# Map every "<problem_id>+<correct>" string to its integer token id.
labelEnc = StringLookup(vocabulary=vocab, output_mode='int', mask_token='na')
data['encoded_problem_id'] = labelEnc(data['prob_id_x_correct'])

grouped = data.groupby('user_id')

# Per-window tensors are collected in their own lists so that `seq` / `lab`
# only ever name the final stacked tensors.
seq_chunks = []
lab_chunks = []

for user, group in tqdm(grouped):
    group = group.sort_values(by='order_id')
    feature_seq = group['encoded_problem_id'].to_numpy()
    correct_seq = group['correct'].to_numpy()

    # Cut each user's history into consecutive windows of at most max_seq_len steps.
    for start_idx in range(0, len(feature_seq), max_seq_len):
        # Slicing clamps at the end of the array, so the last window may be shorter.
        sub_feature_seq = feature_seq[start_idx:start_idx + max_seq_len]
        sub_correct_seq = correct_seq[start_idx:start_idx + max_seq_len]
        n_steps = len(sub_feature_seq)

        # Right-pad the token ids with 0 (the mask id) up to max_seq_len.
        padded_feature_seq = F.pad(torch.tensor(sub_feature_seq, dtype=torch.float32),
                                   (0, max_seq_len - n_steps),
                                   value=0)
        seq_chunks.append(padded_feature_seq)

        # Labels have shape [timesteps, vocab_size]; -1 marks "no label".
        blank_labels = np.full((max_seq_len, vocab_size), -1, dtype=np.float32)
        # Pair row t with column sub_feature_seq[t] only. Indexing with
        # `[:n_steps, sub_feature_seq]` would instead select the whole
        # n_steps x n_steps grid and broadcast every step's outcome into every
        # row, exposing later answers at earlier timesteps.
        blank_labels[np.arange(n_steps), sub_feature_seq] = sub_correct_seq
        # NOTE(review): the label at step t is the outcome already encoded in
        # the input token at step t -- confirm whether targets should instead
        # be shifted to the next step.

        lab_chunks.append(torch.tensor(blank_labels, dtype=torch.float32))

# Convert seq and lab to tensors
seq = torch.stack(seq_chunks)
lab = torch.stack(lab_chunks)

In [12]:
# (rows, columns) of the working frame.
data.shape

In [13]:
# Report the stacked tensor shapes followed by the vocabulary and embedding sizes.
for reported_value in (seq.shape, lab.shape, vocab_size, numDim):
    print(reported_value)

In [14]:
# Split data into training and testing
X_train, X_test = train_test_split(seq, test_size=0.2, random_state=200)
y_train, y_test = train_test_split(lab, test_size=0.2, random_state=200)

In [15]:
# Shapes of the training features and training labels, in that order.
for train_split in (X_train, y_train):
    print(train_split.shape)

In [16]:
# Shape of the training label tensor.
y_train.shape

In [17]:
def custom_loss(y_true, y_pred):
    """Binary cross-entropy restricted to the labelled entries.

    Entries of ``y_true`` equal to -1 are treated as "no label": they, and the
    predictions at the same positions, are dropped before the loss is computed.

    Args:
        y_true: Target tensor in which -1 marks positions to ignore.
        y_pred: Prediction tensor; expected to have the same shape as
            ``y_true`` so the same mask can be applied to both.

    Returns:
        The value of ``tf.keras.losses.binary_crossentropy`` computed on the
        flattened remaining targets and predictions.
    """
    labelled = tf.math.not_equal(y_true, -1)
    targets = tf.boolean_mask(y_true, labelled)
    predictions = tf.boolean_mask(y_pred, labelled)
    return tf.keras.losses.binary_crossentropy(targets, predictions)

In [18]:
# Model Instantiation
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=numDim, input_length=None, mask_zero=True))
model.add(Dropout(0.2))
model.add(LSTM(124, activation='tanh', return_sequences=True))
model.add(TimeDistributed(Dense(vocab_size, activation='sigmoid')))

model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4),
              loss=custom_loss,
              metrics=['accuracy', 'AUC'])

In [23]:
# Inspect the training label tensor.
y_train

In [21]:
# Training
model.fit(x=X_train, y=y_train, 
         epochs=10, 
         batch_size=32)