In [9]:
import pandas as pd
import numpy as np
#import seaborn as sns
import matplotlib.pyplot as plt
import scipy as sc
import tensorflow as tf
# In this demo, we use a lot of SciKit-Learn functions, as imported below.
from sklearn import feature_extraction, model_selection
from sklearn.metrics import mean_squared_error, roc_auc_score, balanced_accuracy_score
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.preprocessing import MinMaxScaler


In [29]:
activity = pd.read_csv('data/features/activity.csv')
performances = pd.read_csv('data/features/performances.csv')

In [30]:
performances_math = performances[performances['domain']== 'math'].copy()
activity_math = activity[activity['domain']== 'math'].copy()

In [31]:
print(performances_math.head())

    user_id domain test_id  course        date                 time  \
9         6   math      42    3865  2024-11-23  2024-11-23 10:25:34   
10        6   math      48    3865  2025-01-08  2025-01-08 14:48:04   
11        6   math      49    3865  2025-01-08  2025-01-08 15:29:07   
12        6   math      50    3865  2025-02-04  2025-02-04 15:36:38   
13        6   math      54    3865  2024-11-23  2024-11-23 11:26:10   

    percentage  performance  
9        25.00       -36.04  
10       50.00        -1.92  
11       66.67        21.23  
12       54.55        19.57  
13       14.29       -47.71  


In [32]:
print(activity_math.head())

   activity_id  user_id  post_id  course_id activity_type  activity_status  \
0         1128     2533       42         42        course                0   
1         1129     2533       55         42        lesson                0   
2         1130     2533       98         42         topic                1   
3         1131     2533      100         42         topic                1   
4         1132     2533      102         42         topic                1   

      activity_started   activity_completed     activity_updated domain  \
0  2023-04-07 16:42:35  2023-04-07 17:35:15  2023-04-07 17:35:15   math   
1  2023-04-07 16:42:35                  NaN  2023-04-07 16:42:35   math   
2  2023-04-07 16:42:38  2023-04-07 16:43:58  2023-04-07 16:43:58   math   
3  2023-04-07 16:43:59  2023-04-07 16:46:13  2023-04-07 16:46:13   math   
4  2023-04-07 16:46:14  2023-04-07 16:46:27  2023-04-07 16:46:27   math   

   date_restored  times_valid        date       time_spent  time_in_minutes  \
0

In [41]:
import pandas as pd
import numpy as np

# 1) Parse timestamps & extract calendar day
activity_math['activity_updated'] = pd.to_datetime(activity_math['activity_updated'])
activity_math['day'] = activity_math['activity_updated'].dt.normalize()

performances_math['exam_time'] = pd.to_datetime(performances_math['time'])
performances_math['day'] = performances_math['exam_time'].dt.normalize()

# 2) Build the set of all active days per user and assign a relative day index
days = (
    pd.concat([
        activity_math[['user_id','day']],
        performances_math[['user_id','day']]
    ])
    .drop_duplicates()
    .sort_values(['user_id','day'])
)
days['day_idx'] = days.groupby('user_id').cumcount()
n_days = days['day_idx'].max() + 1

# 3) Build df_y_filled: exam percentages pivoted on day_idx
perf = (
    performances_math[['user_id','day','performance']]
    .merge(days[['user_id','day','day_idx']], on=['user_id','day'], how='left')
)
df_y = perf.groupby(['user_id','day_idx'])['performance'].mean().unstack(fill_value=np.nan)
# ensure columns 0..n_days-1
df_y = df_y.reindex(columns=range(n_days))
df_y_filled = df_y.fillna(-1)

# 4) Build df_x: daily activity features pivoted on day_idx
daily_feats = (
    activity_math
    .groupby(['user_id','day'])
    .agg(
        total_time     = ('time_in_minutes','sum'),
        activity_count = ('activity_id','count'),
        diversity      = ('activity_type','nunique')
    )
    .reset_index()
)
daily = (
    days[['user_id','day','day_idx']]
    .merge(daily_feats, on=['user_id','day'], how='left')
    .fillna({'total_time':0,'activity_count':0,'diversity':0})
)

# pivot
feat_piv = daily.pivot(index='user_id',
                       columns='day_idx',
                       values=['total_time','activity_count','diversity'])
# flatten and ensure full days
feat_cols = []
for feat in ['total_time','activity_count','diversity']:
    for d in range(n_days):
        feat_cols.append(f"{feat}_{d}")
feat_piv.columns = feat_cols
df_x = feat_piv.reindex(columns=feat_cols, fill_value=0).reset_index()

# 5) Align df_x to only exam-taking users in df_y_filled, in same order
exam_users = df_y_filled.index
df_x_aligned = df_x.set_index('user_id').loc[exam_users].reset_index()

# 6) Build mask for missing exams and apply to df_x_aligned
n_users = len(exam_users)
n_features = 3

mask = (df_y_filled.values == -1)                       # shape (n_users, n_days)
mask_feat = np.repeat(mask, n_features, axis=1)         # (n_users, n_days*n_features)

X_flat = df_x_aligned.drop('user_id', axis=1).values.copy()  # (n_users, n_days*n_features)
X_flat[mask_feat] = -1
df_x_masked = pd.DataFrame(X_flat, index=exam_users, columns=feat_cols)

# 7) Reshape into arrays
y_array = df_y_filled.values                               # (n_users, n_days)
X_array = df_x_masked.values.reshape(n_users, n_days, n_features)

# 8) Verify
print("X_array.shape =", X_array.shape)  # → (num_users, num_days, 3)
print("y_array.shape =", y_array.shape)  # → (num_users, num_days)


X_array.shape = (469, 40, 3)
y_array.shape = (469, 40)


In [42]:
df_x_aligned

Unnamed: 0,user_id,total_time_0,total_time_1,total_time_2,total_time_3,total_time_4,total_time_5,total_time_6,total_time_7,total_time_8,...,diversity_30,diversity_31,diversity_32,diversity_33,diversity_34,diversity_35,diversity_36,diversity_37,diversity_38,diversity_39
0,6,4.416667,0.000000,38.600000,256.516667,0.000000,0.000000,0.000000,98.716667,0.000000,...,,,,,,,,,,
1,7,92.866667,13.883333,0.000000,5.800000,0.000000,30.000000,30.000000,2.133333,16.966667,...,,,,,,,,,,
2,9,0.000000,36.116667,15.033333,1.983333,21.466667,37.083333,34.833333,30.366667,,...,,,,,,,,,,
3,11,0.000000,115.550000,133.783333,0.866667,7.200000,9.066667,11.800000,23.966667,0.083333,...,,,,,,,,,,
4,16,97.450000,108.450000,1.833333,0.766667,0.733333,6.883333,28.750000,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
464,4085,39.000000,8.433333,0.083333,81.900000,37.083333,18.733333,,,,...,,,,,,,,,,
465,4086,52.633333,63.500000,90.000000,68.533333,65.566667,14.066667,50.933333,37.450000,0.000000,...,1.0,,,,,,,,,
466,4087,0.000000,34.416667,12.950000,0.000000,34.416667,260.700000,21.916667,28.450000,65.216667,...,,,,,,,,,,
467,4092,31.516667,0.683333,0.883333,0.000000,0.000000,0.000000,2.100000,1.900000,117.783333,...,,,,,,,,,,


In [43]:
df_y_filled

day_idx,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,-1.00,-1.000,-1.00,-41.875000,-1.000000,-1.000000,-1.000,-5.730000,-1.000,9.6550,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7,-12.78,-1.000,-1.00,-1.000000,-1.000000,-1.000000,-1.000,-1.000000,-1.000,-1.0000,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
9,-1.00,-1.000,-1.00,-1.000000,-1.000000,11.390000,9.680,-10.015000,-1.000,-1.0000,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
11,-1.00,-1.000,0.03,-1.000000,6.613333,-16.113333,-10.795,-6.023333,-1.000,-11.2700,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
16,-1.00,-58.675,-1.00,-1.000000,-1.000000,-51.920000,18.070,-1.000000,-1.000,-1.0000,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4085,-1.00,-1.000,-1.00,-9.086667,-1.000000,-13.770000,-1.000,-1.000000,-1.000,-1.0000,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4086,-1.00,-1.000,-1.00,12.780000,-45.820000,-1.000000,-1.000,35.580000,10.120,28.6600,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4087,-1.00,-1.000,-1.00,-1.000000,19.030000,-45.820000,11.040,10.470000,-1.000,-1.0000,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4092,-1.00,-36.810,-48.65,-1.000000,-1.000000,-1.000000,-1.000,-1.000000,-19.515,-3.5060,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
