In [2]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [3]:
# Build lookup tables between course ids and their numeric indices
import collections

def build_dataset(words):
    """Build lookup tables between items and integer indices.

    Items are ranked by how often they occur in ``words`` (most frequent
    first; equal counts keep first-seen order) and numbered from 1, so
    index 0 is never assigned to an item.

    Indices are plain ``int`` values because they are used as embedding
    indices and class labels; an int index compares and hashes equal to
    the float of the same value, so lookups with either type still work.

    Args:
        words: Iterable of hashable items (e.g. course ids). May be empty.

    Returns:
        A ``(dictionary, reverse_dictionary)`` tuple where ``dictionary``
        maps each distinct item to its index and ``reverse_dictionary``
        maps each index back to its item.
    """
    ranked = collections.Counter(words).most_common()
    dictionary = {word: index for index, (word, _) in enumerate(ranked, start=1)}
    reverse_dictionary = {index: word for word, index in dictionary.items()}
    return dictionary, reverse_dictionary


# Load the course table and index its distinct course ids.
courses_data = pd.read_csv("csv/data.csv")
courses_dict, reverse_course_dict = build_dataset(courses_data["course_id"])

# One slot per distinct course, plus index 0 which build_dataset never assigns.
# Counting CSV rows instead would oversize the vocabulary whenever a
# course_id appears on more than one row.
vocab_size = len(courses_dict) + 1

In [4]:
# Load the prepared warehouse tables.
prep_courses_df = pd.read_csv("warehouse/prep/courses.csv")
prep_user_courses_df = pd.read_csv("warehouse/prep/user_courses.csv")
prep_users_df = pd.read_csv("warehouse/prep/users.csv")

# Step 1: inner-join users to their user-course rows (UserID == user_id).
merged_df = prep_users_df.merge(
    prep_user_courses_df,
    how='inner',
    left_on='UserID',
    right_on='user_id',
)

# Step 2: inner-join the course table on course_id, then drop the user_id
# key, which duplicates UserID after the first join.
final_merged_df = merged_df.merge(prep_courses_df, how='inner', on='course_id')
final_merged_df = final_merged_df.drop(columns="user_id")

# Parse the timestamps so the sort below is chronological.
final_merged_df['enrolled_at'] = pd.to_datetime(final_merged_df['enrolled_at'])

# One row per user: course ids and scores collected into lists, in
# enrolled_at order.
rows_by_enrolment_time = final_merged_df.sort_values(by='enrolled_at')
per_user_columns = {'course_id': list, 'score': list}
data = (
    rows_by_enrolment_time
    .groupby('UserID')
    .agg(per_user_columns)
    .reset_index()
)

In [5]:
X = []
y = []

# Encode every user's course history as integer indices.
# NOTE: this overwrites data['course_id'] in place, so the cell that builds
# `data` must be re-run before this cell is run a second time.
data['course_id'] = data['course_id'].apply(
    lambda history: [int(courses_dict[course]) for course in history]
)

# Every proper prefix of a history is one input; the course that follows
# the prefix is its target.
for courses in data['course_id']:
    for i in range(1, len(courses)):
        X.append(courses[:i])       # All courses up to i
        y.append(courses[i])        # Next course (encoded index)

if not X:
    raise ValueError("no training samples: every user has fewer than two courses")

# Pad sequences to ensure uniform input shape (pad_sequences already
# returns a NumPy array, so no extra conversion is needed).
max_seq_length = max(len(seq) for seq in X)
X_padded = pad_sequences(X, maxlen=max_seq_length)

# Fix the one-hot width to vocab_size so the labels always match the
# model's Dense(vocab_size) output, even when the highest index never
# occurs as a target.
y_encoded = to_categorical(y, num_classes=vocab_size)

In [6]:
# Split data into training and testing sets.
# A fixed random_state keeps the same samples in each split on every run,
# so results can be compared across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded, y_encoded, test_size=0.2, random_state=42
)

In [7]:
# Define the model: embedding -> LSTM -> softmax over course indices
embedding_dim = 8  # Embedding dimension

model = Sequential()
# mask_zero=True tells downstream layers to treat index 0 as padding.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True))
model.add(LSTM(64))
# Output probabilities for course IDs
model.add(Dense(vocab_size, activation='softmax'))

# Compile the model
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [8]:

# Train the model, evaluating on the held-out split after every epoch
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=2,
    epochs=20,
    validation_data=(X_test, y_test),
)


Epoch 1/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.0150 - loss: 4.3936 - val_accuracy: 0.0000e+00 - val_loss: 4.3977
Epoch 2/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0814 - loss: 4.3735 - val_accuracy: 0.0000e+00 - val_loss: 4.4061
Epoch 3/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0347 - loss: 4.3248 - val_accuracy: 0.0000e+00 - val_loss: 4.4515
Epoch 4/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0168 - loss: 4.2362 - val_accuracy: 0.0000e+00 - val_loss: 4.6402
Epoch 5/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0309 - loss: 4.1487 - val_accuracy: 0.0000e+00 - val_loss: 4.9467
Epoch 6/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0186 - loss: 4.0577 - val_accuracy: 0.0000e+00 - val_loss: 5.1616
Epoch 7/20
[1m

In [9]:
# Example input: a batch containing one history of two encoded course indices
new_data = np.array([[12, 2]])

# Predict probabilities for each course index
predictions = model.predict(new_data)

# Indices ordered from least to most probable. Named `ranking` rather than
# `sorted` so the built-in sorted() is not shadowed for the rest of the notebook.
ranking = np.argsort(predictions)

# Top three indices, most probable first, as plain Python ints.
# NOTE(review): index 0 is the padding slot rather than a course and is not
# filtered out here - confirm whether it should be excluded.
res = ranking[0, ::-1][:3].tolist()
print(res)
print(ranking)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 479ms/step
[18, 14, 9]
[[65 61 75 56 80 68 32 69 41 10  0 64 48  2 57 42 33 39 50 26 51 70 21 31
  35 43 59 36 54 23 15 60 25 49 38 44 67 27 55 29 30 34 40 13 28  8 22 79
  77 53  7 12 37 24 46 17 73 66 16  6 72 20 45  3 78 47 63 58 19 71 52  5
  62  1 74  4 76 11  9 14 18]]
