In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, LSTM, Masking
from sklearn.model_selection import train_test_split
import torch

In [11]:
df = pd.read_csv('23-24-problem_logs.csv') # fill with data file
#'23-24-problem_logs.csv'

In [12]:
# Data Cleaning
df[['skill_id', 'old_problem_id']] = df[['skill_id', 'old_problem_id']].apply(pd.to_numeric, errors='coerce')
data_cleaned = df.fillna(0)  # Fill missing values with 0 for now

# One-hot encode 'skill_id' and 'old_problem_id'
skill_ohe = pd.get_dummies(data_cleaned['skill_id'], dtype=int, prefix='skill')
problem_ohe = pd.get_dummies(data_cleaned['old_problem_id'], dtype=int, prefix='problem')

# Combine the encoded columns back into the DataFrame
data_cleaned = pd.concat([data_cleaned, skill_ohe, problem_ohe], axis=1)

# Remove the original 'skill_id' and 'old_problem_id' columns
data_cleaned.drop(['skill_id', 'old_problem_id'], axis=1, inplace=True)

# Group by 'user_xid'
grouped = data_cleaned.groupby('user_xid')

MemoryError: Unable to allocate 213. GiB for an array with shape (54437, 1048575) and data type int32

In [10]:
# data_cleaned.head()
print(data_cleaned.shape)

(511, 571)


In [14]:
seq = []
lab = []
for user, group in grouped:
    group = group.sort_values(by='start_time')

    skill_feature_seq = group[skill_ohe.columns].to_numpy()  # One-hot encoded skill features
    problem_feature_seq = group[problem_ohe.columns].to_numpy()  # One-hot encoded problem features

    # Create a combined list of arrays: [[skill_ohe], [problem_ohe]]
    feature_seq = [zip(skill_feature_seq, problem_feature_seq)]

    seq.append(torch.tensor(feature_seq, dtype=torch.float32))
    
    # Ensure labels are treated as a tensor, even if they are single values
    labels = torch.tensor(group['discrete_score'].to_numpy(), dtype=torch.float32)
    lab.append(labels)

# Numeber of attempts vary, so padding is required, padding with 0
padded_seq = pad_sequences([s.numpy() for s in seq], padding='post', dtype='float32')
padded_lab = pad_sequences([l.numpy() for l in lab], padding='post', dtype='float32') 

# Reshape labels to match output format (1 for each timestep)
padded_lab = padded_lab.reshape(padded_lab.shape[0], padded_lab.shape[1], 1)  # Ensure 3D shape (batch_size, timesteps, 1)


# Masking
# Assuming `skill_id` is at index 0 and `problem_id` is at index 1 in the feature vector
mask = np.logical_not((padded_seq[:, :, 0] == 0) & (padded_seq[:, :, 1] == 0))  # True where both are 0

# Apply the mask to the features and labels
masked_padded_seq = padded_seq * mask[:, :, None]  # Apply mask to all features
masked_padded_lab = padded_lab * mask[:, :, None]  # Apply mask to labels as well


ValueError: expected sequence of length 173 at dim 2 (got 391)

In [36]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(padded_seq, padded_lab, test_size=0.2, random_state=200)

In [None]:
# Model Instantiation
model = Sequential()
model.add(Masking(mask_value=0.0, input_shape=(None, 2)))  # Mask rows where all features are 0 (after masking)
model.add(LSTM(64, input_shape=(None, 2), activation='tanh', return_sequences=True))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', 
              loss='binary_crossentropy',
              metrics=['accuracy', 'AUC'])

model.summary()

In [None]:
# Training
model.fit(X_train, y_train, 
         validation_data=(X_test, y_test),
         epochs=10, 
         batch_size=32)

In [None]:
# Evaluate Model on Test Data
model.evaluate(X_test, y_test)

In [None]:
# Predict on Test Data
model.predict(X_test)