In [41]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.nn.utils.rnn import pad_sequence
import math
from tqdm import tqdm

In [42]:
# Load the raw interaction log; replace the path with the data file to use.
data = pd.read_csv(filepath_or_buffer='test_2.csv')

In [None]:
import torch.nn.functional as F

# Build fixed-length training tensors from the per-user interaction log in `data`:
#   seq: (num_chunks, max_seq_len)             encoded problem IDs, padded with -1
#   lab: (num_chunks, max_seq_len, vocab_size) correctness at [t, problem_t], -1 elsewhere
vocab_size = data['problem_id'].nunique() + 1  # Number of unique problem IDs, plus one
max_seq_len = 500  # Maximum number of interactions per sequence; longer users are chunked

# Keep only the columns used below and order every user's interactions by order_id.
data = data[['user_id', 'problem_id', 'skill_id', 'correct', 'order_id']]
data = data.sort_values(by=['user_id', 'order_id'])
data = data.fillna(0)  # Fill missing values with 0 for now

# Map raw problem IDs onto contiguous integer codes 0..K-1.
le = LabelEncoder()
data['encoded_problem_id'] = le.fit_transform(data['problem_id'])

grouped = data.groupby('user_id')

seq = []
lab = []

for user, group in tqdm(grouped):
    # No per-group re-sort needed: the frame is already sorted by
    # (user_id, order_id) and groupby keeps row order inside each group.
    feature_seq = group['encoded_problem_id'].to_numpy()
    correct_seq = group['correct'].to_numpy()

    # Cut the user's history into consecutive chunks of at most max_seq_len steps.
    for start_idx in range(0, len(feature_seq), max_seq_len):
        end_idx = min(start_idx + max_seq_len, len(feature_seq))

        # Get subsequence for this user
        sub_feature_seq = feature_seq[start_idx:end_idx]
        sub_correct_seq = correct_seq[start_idx:end_idx]
        n_steps = len(sub_feature_seq)

        # Pad feature sequence to max_seq_len (-1 marks padded timesteps)
        padded_feature_seq = F.pad(torch.tensor(sub_feature_seq, dtype=torch.float32),
                                   (0, max_seq_len - n_steps),
                                   value=-1)
        seq.append(padded_feature_seq)

        # Label matrix with shape [timesteps, vocab_size]; -1 means "no label".
        blank_labels = np.full((max_seq_len, vocab_size), -1, dtype=np.float32)
        # Pair each timestep with ITS OWN problem column. Indexing with
        # [:n_steps, sub_feature_seq] would address an n_steps x n_steps block
        # and copy every answer of the chunk into every timestep row.
        blank_labels[np.arange(n_steps), sub_feature_seq] = sub_correct_seq

        lab.append(torch.tensor(blank_labels, dtype=torch.float32))

# Convert seq and lab to tensors
seq = torch.stack(seq)
lab = torch.stack(lab)

In [None]:
# Show the dimensions of the stacked feature and label tensors, one per line.
print(seq.shape, lab.shape, sep='\n')


In [None]:
# Build padded per-student tensors from `df`:
#   padded_seq: (batch_size, timesteps)     encoded problem IDs, 0 = padding
#   padded_lab: (batch_size, timesteps, 1)  discrete_score per attempt, 0.0 = padding
df = df[['user_xid', 'old_problem_id', 'skill_id', 'discrete_score', 'start_time']]
df = df.sort_values(by=['user_xid', 'start_time'])
df = df.fillna(0)  # Fill missing values with 0 for now

le = LabelEncoder()
# Shift by 1 to reserve 0 for padding: real problems become 1..K, so the
# zero padding added by pad_sequence below never collides with a real ID.
df['encoded_problem_id'] = le.fit_transform(df['old_problem_id']) + 1

# Embedding input_dim must exceed the largest index (K), hence K + 1.
# Counted after fillna so a filled-in placeholder ID is included as well.
vocab_size = len(le.classes_) + 1
print(vocab_size)

# Data Preprocessing
grouped = df.groupby('user_xid')

seq = []
lab = []
for user, group in grouped:
    # Rows are already in start_time order: the frame was sorted above and
    # groupby keeps row order inside each group.
    feature_seq = group['encoded_problem_id'].to_numpy()
    seq.append(torch.tensor(feature_seq, dtype=torch.float32))

    # Ensure labels are treated as a tensor
    labels = torch.tensor(group['discrete_score'].to_numpy(), dtype=torch.float32)
    lab.append(labels)

# Padding sequences with zeros using PyTorch's pad_sequence
padded_seq = pad_sequence(seq, batch_first=True, padding_value=0)  # (batch_size, timesteps)
padded_lab = pad_sequence(lab, batch_first=True, padding_value=0.0)  # (batch_size, timesteps)

# Reshape labels to have 3D shape (batch_size, timesteps, 1)
padded_lab = padded_lab.unsqueeze(-1)

In [None]:
# Inspect the padded tensors.
print(padded_seq.shape)
print(padded_lab.shape)
# Drop only the trailing singleton label axis. A bare squeeze would also
# remove the batch or time axis whenever either happens to have size 1.
y = padded_lab.squeeze(-1)
print(y)

In [62]:
# Split data into training and testing
# Use the padded tensors (uniform timesteps), not the ragged per-student
# lists in `seq`/`lab`, and split features and labels in one call so that
# rows stay paired by construction.
X_train, X_test, y_train, y_test = train_test_split(
    padded_seq, padded_lab, test_size=0.2, random_state=200
)

In [None]:
# Count of distinct problem IDs in df, plus one extra index.
VOCAB_SIZE = 1 + df['old_problem_id'].nunique()
print(VOCAB_SIZE)

In [67]:
# Model Instantiation
# Embedding -> LSTM -> per-timestep sigmoid: one probability per timestep.
# mask_zero=True makes index 0 a padding marker that downstream layers skip.
# NOTE(review): `numDim` (embedding width) is not defined in any visible cell;
# it must be set before this cell runs or this raises NameError.
model = Sequential([
    # `input_length` is deprecated and None was its no-op value, so it is omitted.
    Embedding(input_dim=vocab_size, output_dim=numDim, mask_zero=True),
    # return_sequences=True keeps an output for every timestep.
    LSTM(64, activation='tanh', return_sequences=True),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy', 'AUC'])

In [None]:
# Training
# Hold out part of the training data for validation. Validating on the very
# samples the model is fitted on reports training performance, not
# generalisation, and the test set stays untouched for the final evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=200
)

model.fit(X_fit, y_fit,
          validation_data=(X_val, y_val),
          epochs=1,
          batch_size=32)

In [None]:
# Score the fitted model on the held-out test split; the cell displays the
# loss followed by the compiled metrics.
model.evaluate(x=X_test, y=y_test)

In [None]:
# Run inference on the test inputs; the cell displays the predicted values.
model.predict(x=X_test)