In [158]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [167]:
# Read the two pre-computed feature tables: per-user activity logs and exam performances.
activity, performances = (
    pd.read_csv(csv_path)
    for csv_path in ('data/features/activity.csv', 'data/features/performances.csv')
)

In [168]:
# Keep only the rows belonging to the math domain; .copy() detaches the subsets from
# the original frames so they can be modified safely later on.
performances_math = performances.loc[performances['domain'].eq('math')].copy()
activity_math = activity.loc[activity['domain'].eq('math')].copy()

In [169]:
# Discard every activity row that has a missing value in ANY column.
# NOTE(review): this also removes rows whose only gap is `activity_completed`
# (i.e. activities that were started but not finished) — confirm that is intended,
# otherwise restrict the check with `subset=[...]`.
activity_math = activity_math.dropna()

In [170]:
# Length (in days) of the look-back window used for the "recent activity" features
rolling_window_days = 10

# Parse the timestamp columns ('activity_updated' and 'time') so they can be
# compared and subtracted as datetimes
activity_math = activity_math.assign(
    activity_updated=pd.to_datetime(activity_math['activity_updated'])
)
performances_math = performances_math.assign(
    time=pd.to_datetime(performances_math['time'])
)

def compute_all_features_for_exam(exam_row, user_activities, user_exams, window_days=rolling_window_days):
    """Compute the activity- and history-based features for one exam.

    Only activities and exams that happened strictly before the exam timestamp
    are used.

    Parameters
    ----------
    exam_row : mapping / pd.Series
        The exam being described; its 'time' entry is the exam timestamp.
    user_activities : pd.DataFrame
        Activity rows with the columns 'activity_updated', 'time_in_minutes'
        and 'activity_type'.
    user_exams : pd.DataFrame
        Exam rows with the columns 'time' and 'percentage'.
    window_days : int
        Size of the look-back window (in days) for the "recent" features.

    Returns
    -------
    pd.Series
        Keys, in order: 'recent_avg_time_per_activity',
        'days_since_last_activity', 'total_time_spent_on_activity_before_exam',
        'average_percentage_past_exams', 'avg_activities_per_day_recent',
        'active_days_ratio_recent', 'diversity_recent'.
    """
    exam_time = exam_row['time']

    # Everything logged strictly before the exam timestamp
    earlier = user_activities[user_activities['activity_updated'] < exam_time]

    # Subset of those falling inside the last `window_days` days before the exam
    window_open = exam_time - pd.Timedelta(days=window_days)
    recent = earlier[earlier['activity_updated'] >= window_open]
    n_recent = len(recent)

    # Mean minutes per recent activity (0 when there was no recent activity)
    recent_minutes = recent['time_in_minutes'].sum()
    recent_avg_time = recent_minutes / n_recent if n_recent > 0 else 0

    # Whole days between the latest earlier activity and the exam (NaN when none exists)
    if earlier.empty:
        days_since_last = np.nan
        total_minutes_before = 0
    else:
        days_since_last = (exam_time - earlier['activity_updated'].max()).days
        total_minutes_before = earlier['time_in_minutes'].sum()

    # Mean score over the exams taken strictly before this one (NaN for a first exam)
    earlier_exams = user_exams[user_exams['time'] < exam_time]
    if earlier_exams.empty:
        past_average = np.nan
    else:
        past_average = earlier_exams['percentage'].mean()

    # Number of distinct calendar days with activity, and of distinct activity types,
    # inside the window
    if recent.empty:
        active_days = 0
        type_count = 0
    else:
        active_days = recent['activity_updated'].dt.normalize().nunique()
        type_count = recent['activity_type'].nunique()

    # Usage-frequency ratios are undefined for a non-positive window
    if window_days > 0:
        activities_per_day = n_recent / window_days
        active_days_ratio = active_days / window_days
    else:
        activities_per_day = np.nan
        active_days_ratio = np.nan

    # Key order is preserved because it determines the column order downstream
    return pd.Series({
        'recent_avg_time_per_activity': recent_avg_time,
        'days_since_last_activity': days_since_last,
        'total_time_spent_on_activity_before_exam': total_minutes_before,
        'average_percentage_past_exams': past_average,
        'avg_activities_per_day_recent': activities_per_day,
        'active_days_ratio_recent': active_days_ratio,
        'diversity_recent': type_count,
    })

# Compute the feature vector of every math exam, one user at a time.

# Split the activity table once so each user's (chronologically sorted) history is a
# dictionary lookup rather than a boolean scan of the whole table per user.
activities_by_user = {
    uid: user_rows.sort_values('activity_updated')
    for uid, user_rows in activity_math.groupby('user_id')
}
# Zero-row frame with the same columns/dtypes, used for users without any activity
no_activities = activity_math.iloc[0:0]

features_list = []
exam_indices = []

for user_id, user_exams in performances_math.groupby('user_id'):
    user_activities = activities_by_user.get(user_id, no_activities)
    user_exams_sorted = user_exams.sort_values('time')

    for exam_index, exam_row in user_exams_sorted.iterrows():
        features_list.append(
            compute_all_features_for_exam(exam_row, user_activities, user_exams_sorted, rolling_window_days)
        )
        # Keep the row label outside the float-valued feature Series: storing it
        # inside would coerce it to float and turn the joined index into 9.0, 10.0, ...
        exam_indices.append(exam_index)

# Output df: one row of features per exam, labelled with the exam's original index
features_df = pd.DataFrame(features_list, index=pd.Index(exam_indices, name='exam_index'))
performances_math_features = performances_math.join(features_df, how='left')

In [163]:
# Do not run: this kNN-based imputation of 'average_percentage_past_exams' actually
# makes the model worse. The code is kept below only as an inert string literal for
# reference; executing this cell just echoes the string.
'''
from sklearn.neighbors import NearestNeighbors

# 1) Define which features we’ll use to compute “similarity”:
sim_features = [
    'recent_avg_time_per_activity',
    'days_since_last_activity',
    'total_time_spent_on_activity_before_exam',
    'avg_activities_per_day_recent',
    'active_days_ratio_recent',
    'diversity_recent'
]

# 2) Prepare a column to hold the imputed values
performances_math_features['avg_pct_past_exams_imputed'] = performances_math_features['average_percentage_past_exams']

# 3) Group by test_id and run kNN inside each group
for test_id, group in performances_math_features.groupby('test_id'):
    # indices of rows we need to fill
    missing_idx = group[group['average_percentage_past_exams'].isna()].index
    if len(missing_idx) == 0:
        continue

    # candidate neighbors: same test, non‐missing avg_pct
    candidates = group[group['average_percentage_past_exams'].notna()]
    if candidates.shape[0] == 0:
        # no one else took that test (we can skip or fill global median)
        continue

    # Build a little matrix of sim_features, median‐imputed for any remaining NaNs
    feat_mat = group[sim_features].copy()
    feat_mat = feat_mat.fillna(feat_mat.median())

    # Split into X_train (candidates) and X_query (the missing rows)
    X_train = feat_mat.loc[candidates.index].values
    X_query = feat_mat.loc[missing_idx].values

    # We’ll use up to 3 neighbors (fewer if not enough candidates)
    k = min(3, X_train.shape[0])
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='auto').fit(X_train)
    distances, neighbors = nbrs.kneighbors(X_query)

    # For each missing row, average the **actual test scores** of its neighbors
    candidate_scores = candidates['percentage'].values  # their score on that same test
    for i, idx in enumerate(missing_idx):
        nbr_idxs = neighbors[i]                   # e.g. array([5, 12,  3])
        imputed_val = candidate_scores[nbr_idxs].mean()
        performances_math_features.at[idx, 'avg_pct_past_exams_imputed'] = imputed_val

# 4) Replace the original column (or keep both)
performances_math_features['average_percentage_past_exams'] = performances_math_features['avg_pct_past_exams_imputed']
performances_math_features.drop(columns='avg_pct_past_exams_imputed', inplace=True)
'''

"\nfrom sklearn.neighbors import NearestNeighbors\n\n# 1) Define which features we’ll use to compute “similarity”:\nsim_features = [\n    'recent_avg_time_per_activity',\n    'days_since_last_activity',\n    'total_time_spent_on_activity_before_exam',\n    'avg_activities_per_day_recent',\n    'active_days_ratio_recent',\n    'diversity_recent'\n]\n\n# 2) Prepare a column to hold the imputed values\nperformances_math_features['avg_pct_past_exams_imputed'] = performances_math_features['average_percentage_past_exams']\n\n# 3) Group by test_id and run kNN inside each group\nfor test_id, group in performances_math_features.groupby('test_id'):\n    # indices of rows we need to fill\n    missing_idx = group[group['average_percentage_past_exams'].isna()].index\n    if len(missing_idx) == 0:\n        continue\n\n    # candidate neighbors: same test, non‐missing avg_pct\n    candidates = group[group['average_percentage_past_exams'].notna()]\n    if candidates.shape[0] == 0:\n        # no one el

Imputing the values for the average percentage on past exams actually makes the model worse, so the rows with missing values are simply dropped instead.

In [171]:
from sklearn.preprocessing import StandardScaler

# Feature columns to standardise before fitting the regression
columns_to_scale = [
    'recent_avg_time_per_activity',
    'days_since_last_activity',
    'total_time_spent_on_activity_before_exam',
    'average_percentage_past_exams',
    'avg_activities_per_day_recent',
    'active_days_ratio_recent',
    'diversity_recent',
]

# NOTE(review): the scaler is fitted before the rows with missing values are dropped
# (that happens in the next cell) — confirm this ordering is intended.
scaler = StandardScaler()
features_to_scale = performances_math_features[columns_to_scale]
scaled_values = scaler.fit_transform(features_to_scale)

# Re-attach column names and the original index to the scaled array, then put the
# scaled features in front of the untouched columns
scaled_df = pd.DataFrame(
    scaled_values,
    index=performances_math_features.index,
    columns=columns_to_scale,
)
remaining_df = performances_math_features.drop(columns=columns_to_scale)
final_df = pd.concat([scaled_df, remaining_df], axis=1)

In [172]:
# Remove every exam row that still has a missing value in any column
# (e.g. first exams, which have no 'average_percentage_past_exams').
final_df = final_df.dropna()

In [173]:
import statsmodels.formula.api as smf

# Ordinary least squares: regress `performance` on all seven scaled features
mod = smf.ols(
    formula=(
        'performance ~  recent_avg_time_per_activity'
        ' + days_since_last_activity'
        ' + total_time_spent_on_activity_before_exam'
        ' + average_percentage_past_exams'
        ' + avg_activities_per_day_recent'
        ' + active_days_ratio_recent'
        ' + diversity_recent'
    ),
    data=final_df,
)

# Estimate the coefficients and show the regression report
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:            performance   R-squared:                       0.237
Model:                            OLS   Adj. R-squared:                  0.236
Method:                 Least Squares   F-statistic:                     148.0
Date:                Thu, 17 Apr 2025   Prob (F-statistic):          1.18e-190
Time:                        16:38:07   Log-Likelihood:                -15488.
No. Observations:                3340   AIC:                         3.099e+04
Df Residuals:                    3332   BIC:                         3.104e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

## Test without the time-dependent (time-spent) activity features

In [174]:
# Reload both feature tables from disk so the second experiment starts from raw data
activity, performances = (
    pd.read_csv(csv_path)
    for csv_path in ('data/features/activity.csv', 'data/features/performances.csv')
)

In [175]:
# Keep only the math-domain rows, as independent copies of the loaded frames.
# NOTE(review): unlike the first experiment, no dropna() is applied to activity_math
# in this run, so the two models are built from different activity rows — confirm
# this is intended.
performances_math = performances.loc[performances['domain'].eq('math')].copy()
activity_math = activity.loc[activity['domain'].eq('math')].copy()

In [176]:
# Display the math-only activity table for a visual sanity check
activity_math

Unnamed: 0,activity_id,user_id,post_id,course_id,activity_type,activity_status,activity_started,activity_completed,activity_updated,domain,date_restored,times_valid,date,time_spent,time_in_minutes,time_truncated
0,1128,2533,42,42,course,0,2023-04-07 16:42:35,2023-04-07 17:35:15,2023-04-07 17:35:15,math,True,True,2023-04-07,0 days 00:52:40,52.666667,False
1,1129,2533,55,42,lesson,0,2023-04-07 16:42:35,,2023-04-07 16:42:35,math,False,True,2023-04-07,0 days 00:00:00,0.000000,False
2,1130,2533,98,42,topic,1,2023-04-07 16:42:38,2023-04-07 16:43:58,2023-04-07 16:43:58,math,False,True,2023-04-07,0 days 00:01:20,1.333333,False
3,1131,2533,100,42,topic,1,2023-04-07 16:43:59,2023-04-07 16:46:13,2023-04-07 16:46:13,math,False,True,2023-04-07,0 days 00:02:14,2.233333,False
4,1132,2533,102,42,topic,1,2023-04-07 16:46:14,2023-04-07 16:46:27,2023-04-07 16:46:27,math,False,True,2023-04-07,0 days 00:00:13,0.216667,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50137,114742,955,647,42,topic,0,2025-03-07 06:36:32,,2025-03-07 06:36:32,math,False,True,2025-03-07,0 days 00:00:00,0.000000,False
50138,114743,955,106,42,topic,0,2025-03-07 06:37:49,,2025-03-07 06:37:49,math,False,True,2025-03-07,0 days 00:00:00,0.000000,False
50139,114744,955,104,42,topic,0,2025-03-07 06:37:50,,2025-03-07 06:37:50,math,False,True,2025-03-07,0 days 00:00:00,0.000000,False
50140,114745,955,108,42,topic,0,2025-03-07 06:37:54,,2025-03-07 06:37:54,math,False,True,2025-03-07,0 days 00:00:00,0.000000,False


In [177]:
# Length (in days) of the look-back window used for the "recent activity" features
rolling_window_days = 10

# Parse the timestamp columns ('activity_updated' and 'time') so they can be
# compared and subtracted as datetimes
activity_math = activity_math.assign(
    activity_updated=pd.to_datetime(activity_math['activity_updated'])
)
performances_math = performances_math.assign(
    time=pd.to_datetime(performances_math['time'])
)

def compute_all_features_for_exam_no_time(exam_row, user_activities, user_exams, window_days=rolling_window_days):
    """Compute the features for one exam, leaving out the time-spent features.

    Same idea as the full feature set but without any feature derived from
    'time_in_minutes'. Only activities and exams that happened strictly before
    the exam timestamp are used.

    Parameters
    ----------
    exam_row : mapping / pd.Series
        The exam being described; its 'time' entry is the exam timestamp.
    user_activities : pd.DataFrame
        Activity rows with the columns 'activity_updated' and 'activity_type'.
    user_exams : pd.DataFrame
        Exam rows with the columns 'time' and 'percentage'.
    window_days : int
        Size of the look-back window (in days) for the "recent" features.

    Returns
    -------
    pd.Series
        Keys, in order: 'days_since_last_activity',
        'average_percentage_past_exams', 'avg_activities_per_day_recent',
        'active_days_ratio_recent', 'diversity_recent'.
    """
    exam_time = exam_row['time']

    # Everything logged strictly before the exam timestamp
    earlier = user_activities[user_activities['activity_updated'] < exam_time]

    # Subset of those falling inside the last `window_days` days before the exam
    window_open = exam_time - pd.Timedelta(days=window_days)
    recent = earlier[earlier['activity_updated'] >= window_open]
    n_recent = len(recent)

    # Whole days between the latest earlier activity and the exam (NaN when none exists)
    if earlier.empty:
        days_since_last = np.nan
    else:
        days_since_last = (exam_time - earlier['activity_updated'].max()).days

    # Mean score over the exams taken strictly before this one (NaN for a first exam)
    earlier_exams = user_exams[user_exams['time'] < exam_time]
    if earlier_exams.empty:
        past_average = np.nan
    else:
        past_average = earlier_exams['percentage'].mean()

    # Number of distinct calendar days with activity, and of distinct activity types,
    # inside the window
    if recent.empty:
        active_days = 0
        type_count = 0
    else:
        active_days = recent['activity_updated'].dt.normalize().nunique()
        type_count = recent['activity_type'].nunique()

    # Usage-frequency ratios are undefined for a non-positive window
    if window_days > 0:
        activities_per_day = n_recent / window_days
        active_days_ratio = active_days / window_days
    else:
        activities_per_day = np.nan
        active_days_ratio = np.nan

    # Key order is preserved because it determines the column order downstream
    return pd.Series({
        'days_since_last_activity': days_since_last,
        'average_percentage_past_exams': past_average,
        'avg_activities_per_day_recent': activities_per_day,
        'active_days_ratio_recent': active_days_ratio,
        'diversity_recent': type_count,
    })

# Compute the (time-spent-free) feature vector of every math exam, one user at a time.

# Split the activity table once so each user's (chronologically sorted) history is a
# dictionary lookup rather than a boolean scan of the whole table per user.
activities_by_user = {
    uid: user_rows.sort_values('activity_updated')
    for uid, user_rows in activity_math.groupby('user_id')
}
# Zero-row frame with the same columns/dtypes, used for users without any activity
no_activities = activity_math.iloc[0:0]

features_list = []
exam_indices = []

for user_id, user_exams in performances_math.groupby('user_id'):
    user_activities = activities_by_user.get(user_id, no_activities)
    user_exams_sorted = user_exams.sort_values('time')

    for exam_index, exam_row in user_exams_sorted.iterrows():
        features_list.append(
            compute_all_features_for_exam_no_time(exam_row, user_activities, user_exams_sorted, rolling_window_days)
        )
        # Keep the row label outside the float-valued feature Series: storing it
        # inside would coerce it to float and turn the joined index into 9.0, 10.0, ...
        exam_indices.append(exam_index)

# Output df: one row of features per exam, labelled with the exam's original index
features_df = pd.DataFrame(features_list, index=pd.Index(exam_indices, name='exam_index'))
performances_math_features = performances_math.join(features_df, how='left')

In [178]:
# Display the exam table with the newly joined feature columns for a visual sanity check
performances_math_features

Unnamed: 0,user_id,domain,test_id,course,date,time,percentage,performance,days_since_last_activity,average_percentage_past_exams,avg_activities_per_day_recent,active_days_ratio_recent,diversity_recent
9.0,6,math,42,3865,2024-11-23,2024-11-23 10:25:34,25.00,-36.04,0.0,,0.4,0.2,2.0
10.0,6,math,48,3865,2025-01-08,2025-01-08 14:48:04,50.00,-1.92,0.0,29.822500,0.6,0.3,2.0
11.0,6,math,49,3865,2025-01-08,2025-01-08 15:29:07,66.67,21.23,0.0,33.858000,0.7,0.3,2.0
12.0,6,math,50,3865,2025-02-04,2025-02-04 15:36:38,54.55,19.57,1.0,39.326667,1.2,0.1,1.0
13.0,6,math,54,3865,2024-11-23,2024-11-23 11:26:10,14.29,-47.71,0.0,25.000000,0.9,0.2,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4826.0,4095,math,49,3865,2024-10-18,2024-10-18 15:41:47,0.00,-45.44,0.0,20.625000,0.1,0.1,1.0
4827.0,4095,math,50,3865,2024-10-18,2024-10-18 15:49:00,0.00,-34.98,0.0,16.500000,0.2,0.1,1.0
4828.0,4095,math,51,3865,2024-09-24,2024-09-24 16:00:51,20.00,-22.98,0.0,,0.4,0.2,1.0
4829.0,4095,math,53,3865,2024-09-24,2024-09-24 16:07:01,0.00,-56.46,0.0,20.000000,0.5,0.2,2.0


In [179]:
from sklearn.preprocessing import StandardScaler

# Feature columns to standardise before fitting the regression
columns_to_scale = [
    'days_since_last_activity',
    'average_percentage_past_exams',
    'avg_activities_per_day_recent',
    'active_days_ratio_recent',
    'diversity_recent',
]

# NOTE(review): the scaler is fitted before the rows with missing values are dropped
# (that happens in the next cell) — confirm this ordering is intended.
scaler = StandardScaler()
features_to_scale = performances_math_features[columns_to_scale]
scaled_values = scaler.fit_transform(features_to_scale)

# Re-attach column names and the original index to the scaled array, then put the
# scaled features in front of the untouched columns
scaled_df = pd.DataFrame(
    scaled_values,
    index=performances_math_features.index,
    columns=columns_to_scale,
)
remaining_df = performances_math_features.drop(columns=columns_to_scale)
final_df = pd.concat([scaled_df, remaining_df], axis=1)

In [180]:
# Remove every exam row that still has a missing value in any column
# (e.g. first exams, which have no 'average_percentage_past_exams').
final_df = final_df.dropna()

In [182]:
import statsmodels.formula.api as smf

# Ordinary least squares: regress `performance` on the five scaled features that do
# not rely on time spent
mod = smf.ols(
    formula=(
        'performance ~  days_since_last_activity'
        ' + average_percentage_past_exams'
        ' + avg_activities_per_day_recent'
        ' + active_days_ratio_recent'
        ' + diversity_recent'
    ),
    data=final_df,
)

# Estimate the coefficients and show the regression report
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:            performance   R-squared:                       0.217
Model:                            OLS   Adj. R-squared:                  0.216
Method:                 Least Squares   F-statistic:                     184.6
Date:                Thu, 17 Apr 2025   Prob (F-statistic):          6.47e-174
Time:                        16:40:15   Log-Likelihood:                -15537.
No. Observations:                3341   AIC:                         3.109e+04
Df Residuals:                    3335   BIC:                         3.112e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Intercept     