In [25]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Masking
from sklearn.model_selection import train_test_split
import torch

In [26]:
# Input CSV for the notebook; change DATA_PATH to run on a different file.
DATA_PATH = 'samples/23-24/sample1.csv'
df = pd.read_csv(DATA_PATH)

In [27]:
# Data cleaning and per-user sequence construction
# Coerce the two feature columns to numeric; values that cannot be parsed become NaN.
df[['skill_id', 'old_problem_id']] = df[['skill_id', 'old_problem_id']].apply(pd.to_numeric, errors='coerce')

# Replace every missing value in the frame with 0 (placeholder strategy for now).
# NOTE(review): 0 is also the padding value used below, so a real attempt whose
# skill_id and old_problem_id are both missing is indistinguishable from padding,
# and a missing discrete_score becomes a 0 label -- confirm this is acceptable.
data_cleaned = df.fillna(0)
grouped = data_cleaned.groupby('user_xid')

seq = []  # one float32 tensor of shape (attempts, 2) per user: [skill_id, old_problem_id]
lab = []  # one float32 tensor of shape (attempts,) per user: discrete_score
for user, group in grouped:
    # Order each user's attempts chronologically.
    group = group.sort_values(by='start_time')

    # Take an explicit writable copy: writing into `.values` directly fails when
    # pandas hands back a read-only view (Copy-on-Write).
    feature_seq = group[['skill_id', 'old_problem_id']].to_numpy(dtype='float64', copy=True)
    # Drop any fractional part of the last feature column (old_problem_id).
    # NOTE(review): this step was labelled as a `saw_answer` conversion, but
    # `saw_answer` is not among the selected features -- presumably a leftover.
    feature_seq[:, -1] = np.trunc(feature_seq[:, -1])
    seq.append(torch.tensor(feature_seq, dtype=torch.float32))

    # Labels are always stored as a 1-D tensor, even for a single attempt.
    labels = torch.tensor(group['discrete_score'].to_numpy(), dtype=torch.float32)
    lab.append(labels)

# The number of attempts varies per user, so right-pad every sequence with 0
# up to the length of the longest one.
padded_seq = pad_sequences([features.numpy() for features in seq], padding='post', dtype='float32')
padded_lab = pad_sequences([scores.numpy() for scores in lab], padding='post', dtype='float32')

# Add a trailing axis so labels are 3-D: (users, timesteps, 1), one value per timestep.
padded_lab = padded_lab.reshape(padded_lab.shape[0], padded_lab.shape[1], 1)


# Masking
# `mask` has shape (users, timesteps). It is True for timesteps that carry data
# and False where every feature is 0 (padding, or an attempt with all features zero).
mask = (padded_seq != 0).any(axis=-1)

# Zero out features and labels at the masked-out timesteps.
# NOTE(review): in the cells visible here the train/test split consumes
# `padded_seq` / `padded_lab`, not these masked copies -- confirm which is intended.
masked_padded_seq = padded_seq * mask[:, :, None]
masked_padded_lab = padded_lab * mask[:, :, None]



In [28]:
# Hold out 20% of the padded user sequences for testing; the fixed seed keeps
# the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    padded_seq,
    padded_lab,
    random_state=200,
    test_size=0.2,
)

In [29]:
# Model definition
# Sequence-to-sequence binary classifier: for every timestep the network emits
# one sigmoid probability.
model = Sequential()
# Declare the input once with an explicit Input object (variable number of
# timesteps, 2 features) instead of passing `input_shape` to individual layers,
# which Keras warns against for Sequential models.
model.add(Input(shape=(None, 2)))
# Skip timesteps whose features are all 0.0 (the padding value).
model.add(Masking(mask_value=0.0))
# return_sequences=True keeps one output per timestep rather than only the last.
model.add(LSTM(64, activation='tanh', return_sequences=True))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy', 'AUC'])

model.summary()

  super().__init__(**kwargs)
  super().__init__(**kwargs)


In [32]:
# Fit the network on the training split.
# NOTE(review): the test split is also passed as validation data, so it is
# seen (for monitoring) during training.
EPOCHS = 10
BATCH_SIZE = 32
model.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_test, y_test),
)

In [31]:
# Score the trained model on the held-out split; the result is the loss
# followed by the metrics configured at compile time.
model.evaluate(x=X_test, y=y_test)

[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 57ms/step - AUC: 0.5235 - accuracy: 0.0134 - loss: 0.6427


[0.6455414891242981, 0.013351756148040295, 0.5215486288070679]

In [22]:
# Per-timestep predicted probabilities for every sequence in the held-out split.
model.predict(x=X_test)

[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step


array([[[0.6298417 ],
        [0.6677111 ],
        [0.67334247],
        ...,
        [0.67334247],
        [0.67334247],
        [0.67334247]],

       [[0.6298417 ],
        [0.6677111 ],
        [0.67334247],
        ...,
        [0.6742352 ],
        [0.6742352 ],
        [0.6742352 ]],

       [[0.6298417 ],
        [0.6677111 ],
        [0.67334247],
        ...,
        [0.67334247],
        [0.67334247],
        [0.67334247]],

       ...,

       [[0.6298417 ],
        [0.6677111 ],
        [0.67334247],
        ...,
        [0.67334247],
        [0.67334247],
        [0.67334247]],

       [[0.6298417 ],
        [0.6677111 ],
        [0.67334247],
        ...,
        [0.6742352 ],
        [0.6742352 ],
        [0.6742352 ]],

       [[0.6298417 ],
        [0.6677111 ],
        [0.67334247],
        ...,
        [0.6742355 ],
        [0.6742355 ],
        [0.6742355 ]]], dtype=float32)