In [105]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.nn.utils.rnn import pad_sequence


In [106]:
df = pd.read_csv('23-24-problem_logs.csv') # fill with data file

In [136]:
# Data Cleaning

df = df[['user_xid', 'old_problem_id', 'skill_id', 'discrete_score', 'start_time']]
df = df.sort_values(by=['user_xid', 'start_time'])
df = df.fillna(0)  # Fill missing values with 0 for now

le = LabelEncoder()
df['encoded_problem_id'] = le.fit_transform(df['old_problem_id']) + 1 # Shift by 1 to reserve 0 for padding


In [137]:
# Data Preprocessing
grouped = df.groupby('user_xid')

seq = []
lab = []
for user, group in grouped:
    group = group.sort_values(by='start_time')
    feature_seq = group['encoded_problem_id'].to_numpy()
    seq.append(torch.tensor(feature_seq, dtype=torch.float32))
    
    # Ensure labels are treated as a tensor
    labels = torch.tensor(group['discrete_score'].to_numpy(), dtype=torch.float32)
    lab.append(labels)

# Padding sequences with zeros using PyTorch's pad_sequence
padded_seq = pad_sequence(seq, batch_first=True, padding_value=0)  # (batch_size, timesteps)
padded_lab = pad_sequence(lab, batch_first=True, padding_value=0.0)  # (batch_size, timesteps)

# Reshape labels to have 3D shape (batch_size, timesteps, 1)
padded_lab = padded_lab.unsqueeze(-1)

In [138]:
print(padded_seq.shape)
print(padded_lab.shape)

torch.Size([50689, 1888])
torch.Size([50689, 1888, 1])


In [139]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(padded_seq, padded_lab, test_size=0.2, random_state=200)

In [140]:
VOCAB_SIZE = df['old_problem_id'].nunique()
print(VOCAB_SIZE)

54437


In [141]:
# Model Instantiation
model = Sequential()
model.add(Embedding(input_dim=VOCAB_SIZE, output_dim=512, mask_zero=True, input_length=None))
model.add(LSTM(64, activation='tanh', return_sequences=True))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', 
              loss='binary_crossentropy',
              metrics=['accuracy', 'AUC'])

model.summary()

In [142]:
# Training
model.fit(X_train, y_train, 
         validation_data=(X_train, y_train),
         epochs=10, 
         batch_size=32)

Epoch 1/10
[1m 166/1268[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1:37:43[0m 5s/step - AUC: 0.6244 - accuracy: 0.0573 - loss: 0.6664

In [93]:
# Evaluate Model on Test Data
model.evaluate(X_test, y_test)

[1m317/317[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 445ms/step - AUC: 0.5895 - accuracy: 0.3931 - loss: 1.2926


[1.2533890008926392, 0.39679819345474243, 0.5959181785583496]

In [None]:
# Predict on Test Data
model.predict(X_test)