In [64]:
### Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
import statsmodels.formula.api as smf

from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error
from IPython.display import Image

DATA_DIR = './data'

In [146]:
#### Removal of activities with a duration of 0 seconds
activity = pd.read_csv('{}/features/activity.csv'.format(DATA_DIR))
performances = pd.read_csv('{}/features/performances.csv'.format(DATA_DIR))
math_results = pd.read_csv('{}/original/math_results.csv'.format(DATA_DIR))
math_questions = pd.read_csv('{}/original/math_questions.csv'.format(DATA_DIR))

In [142]:
math_results

Unnamed: 0.1,Unnamed: 0,session_id,user_id,course_id,exam_id,question,points,max_points,time,hint_count,time_spent,correct
0,0,1,2437,42,2,1,0,1,1665067915,0,0,0
1,1,2,2437,42,2,2,0,1,1665069310,0,0,0
2,2,3,2437,42,2,3,0,1,1665070759,0,0,0
3,3,4,2437,42,2,6,0,1,1665074330,0,0,0
4,4,4,2437,42,2,7,0,1,1665074330,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
51351,56742,11982,956,3865,48,950,0,1,1741162135,1,10,0
51352,56743,11982,956,3865,48,952,1,1,1741162135,1,11,1
51353,56744,11982,956,3865,48,953,1,1,1741162135,1,11,1
51354,56745,11982,956,3865,48,955,1,1,1741162135,1,12,1


In [149]:
activity

Unnamed: 0,activity_id,user_id,post_id,course_id,activity_type,activity_status,activity_started,activity_completed,activity_updated,domain,date_restored,times_valid,date,time_spent,time_in_minutes,time_truncated
0,1128,2533,42,42,course,0,2023-04-07 16:42:35,2023-04-07 17:35:15,2023-04-07 17:35:15,math,True,True,2023-04-07,0 days 00:52:40,52.666667,False
1,1129,2533,55,42,lesson,0,2023-04-07 16:42:35,,2023-04-07 16:42:35,math,False,True,2023-04-07,0 days 00:00:00,0.000000,False
2,1130,2533,98,42,topic,1,2023-04-07 16:42:38,2023-04-07 16:43:58,2023-04-07 16:43:58,math,False,True,2023-04-07,0 days 00:01:20,1.333333,False
3,1131,2533,100,42,topic,1,2023-04-07 16:43:59,2023-04-07 16:46:13,2023-04-07 16:46:13,math,False,True,2023-04-07,0 days 00:02:14,2.233333,False
4,1132,2533,102,42,topic,1,2023-04-07 16:46:14,2023-04-07 16:46:27,2023-04-07 16:46:27,math,False,True,2023-04-07,0 days 00:00:13,0.216667,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52739,627,2516,2000627,2115,exam,1,2025-01-04 13:42:05,2025-01-04 13:42:05,2025-01-04 13:42:05,text,False,True,2025-01-04,0 days 00:00:00,0.000000,False
52740,628,1629,2000628,5009,exam,1,2024-08-26 17:44:36,2024-08-26 18:25:07,2024-08-26 18:25:07,text,False,True,2024-08-26,0 days 00:40:31,40.516667,False
52741,629,2185,2000629,2115,exam,1,2024-10-30 14:02:31,2024-10-30 14:02:34,2024-10-30 14:02:34,text,False,True,2024-10-30,0 days 00:00:03,0.050000,False
52742,630,2516,2000630,5009,exam,1,2025-01-04 13:08:50,2025-01-04 13:11:53,2025-01-04 13:11:53,text,False,True,2025-01-04,0 days 00:03:03,3.050000,False


In [151]:
performances_math = performances[performances['domain']=='math']
performances_math

Unnamed: 0,user_id,domain,test_id,course,date,time,percentage,performance
9,6,math,42,3865,2024-11-23,2024-11-23 10:25:34,25.00,-36.04
10,6,math,48,3865,2025-01-08,2025-01-08 14:48:04,50.00,-1.92
11,6,math,49,3865,2025-01-08,2025-01-08 15:29:07,66.67,21.23
12,6,math,50,3865,2025-02-04,2025-02-04 15:36:38,54.55,19.57
13,6,math,54,3865,2024-11-23,2024-11-23 11:26:10,14.29,-47.71
...,...,...,...,...,...,...,...,...
4826,4095,math,49,3865,2024-10-18,2024-10-18 15:41:47,0.00,-45.44
4827,4095,math,50,3865,2024-10-18,2024-10-18 15:49:00,0.00,-34.98
4828,4095,math,51,3865,2024-09-24,2024-09-24 16:00:51,20.00,-22.98
4829,4095,math,53,3865,2024-09-24,2024-09-24 16:07:01,0.00,-56.46


In [66]:
activity.dropna(inplace=True)

In [67]:
# Rolling window for recent activity
rolling_window_days = 10


def compute_all_features_for_exam(exam_row, user_activities, user_exams, window_days=rolling_window_days):

    exam_dt = exam_row['time']


    # Include activities up to and including exam_date
    previous_activities = user_activities[user_activities['activity_updated'] < exam_dt].copy()


    # Rolling window (activities in the last N days, including exam day)
    window_start = exam_dt - pd.Timedelta(days=window_days)
    rolling_activities = previous_activities[previous_activities['activity_updated'] >= window_start].copy()

    features = {}


    # Recent average time per activity (rolling window)
    total_time_rolling = rolling_activities['time_in_minutes'].sum()
    count_rolling = len(rolling_activities)
    features['recent_avg_time_per_activity'] = total_time_rolling / count_rolling if count_rolling > 0 else 0


    # Number of days since last activity
    if not previous_activities.empty:
        last_activity_date = previous_activities['activity_updated'].max()
        features['days_since_last_activity'] = (exam_dt - last_activity_date).days
    else:
        features['days_since_last_activity'] = np.nan


    # Total time spent on activities before the exam
    features['total_time_spent_on_activity_before_exam'] = previous_activities['time_in_minutes'].sum() if not previous_activities.empty else 0


    # Average performance on past exams
    previous_exams = user_exams[user_exams['time'] < exam_dt]
    features['average_performance_past_exams'] = previous_exams['performance'].mean() if not previous_exams.empty else np.nan


    # Usage Frequency: Average activities per day in rolling window & Active days ratio
    features['avg_activities_per_day_recent'] = count_rolling / window_days if window_days > 0 else np.nan
    if not rolling_activities.empty:
        distinct_days = rolling_activities['activity_updated'].dt.normalize().nunique()
    else:
        distinct_days = 0
    features['active_days_ratio_recent'] = distinct_days / window_days if window_days > 0 else np.nan


    # Activity diversity (rolling window)
    features['diversity_recent'] = rolling_activities['activity_type'].nunique() if not rolling_activities.empty else 0


    return pd.Series(features)

In [68]:
scaler = StandardScaler()
columns_to_scale = ['recent_avg_time_per_activity', 'days_since_last_activity', 'total_time_spent_on_activity_before_exam','average_performance_past_exams','avg_activities_per_day_recent','diversity_recent','active_days_ratio_recent']

In [69]:
performances_math = performances[performances['domain']== 'math'].copy()
activity_math = activity[activity['domain']== 'math'].copy()

# Convert the 'date' columns to datetime
activity_math['activity_updated'] = pd.to_datetime(activity_math['activity_updated'])
performances_math['time'] = pd.to_datetime(performances_math['time'])

# Loop over each exam (grouped by user) in performances_math and compute all features.
features_list = []

for user_id, user_exams in performances_math.groupby('user_id'):
    # Get corresponding activities for the user from activity_math and sort by date
    user_activities = activity_math[activity_math['user_id'] == user_id].sort_values('activity_updated')
    user_exams_sorted = user_exams.sort_values('time')

    for exam_index, exam_row in user_exams_sorted.iterrows():
        feats = compute_all_features_for_exam(exam_row, user_activities, user_exams_sorted, rolling_window_days)
        feats['exam_index'] = exam_index
        features_list.append(feats)

# Output df
features_df = pd.DataFrame(features_list).set_index('exam_index')
performances_math_features = performances_math.join(features_df, how='left')

# scaling the columns
scaled_values = scaler.fit_transform(performances_math_features[columns_to_scale])
scaled_df = pd.DataFrame(scaled_values, columns=columns_to_scale, index=performances_math_features.index)
remaining_df = performances_math_features.drop(columns=columns_to_scale)
final_df_math = pd.concat([scaled_df, remaining_df], axis=1)

In [70]:
final_df_math_drop = final_df_math.copy()
final_df_math_drop.dropna(inplace=True)


In [71]:
final_df_math_drop

Unnamed: 0,recent_avg_time_per_activity,days_since_last_activity,total_time_spent_on_activity_before_exam,average_performance_past_exams,avg_activities_per_day_recent,diversity_recent,active_days_ratio_recent,user_id,domain,test_id,course,date,time,percentage,performance
10.0,0.648468,-0.383865,0.260376,-1.133286,-0.233642,0.904798,0.161579,6,math,48,3865,2025-01-08,2025-01-08 14:48:04,50.00,-1.92
11.0,0.640372,-0.383865,0.345138,-0.918386,-0.108547,0.904798,0.161579,6,math,49,3865,2025-01-08,2025-01-08 15:29:07,66.67,21.23
12.0,-0.776035,1.328815,0.565134,-0.585662,-0.734023,-1.735784,-1.243938,6,math,50,3865,2025-02-04,2025-02-04 15:36:38,54.55,19.57
13.0,-0.324109,-0.383865,-0.819036,-1.734188,0.141643,0.904798,0.161579,6,math,54,3865,2024-11-23,2024-11-23 11:26:10,14.29,-47.71
14.0,-0.776035,3.707539,-0.088589,-2.020706,-0.734023,-1.735784,-1.243938,6,math,57,3865,2025-01-06,2025-01-06 17:40:42,40.00,-6.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4825.0,-0.776035,1.138518,-0.354619,-1.720112,-0.734023,-1.735784,-1.243938,4095,math,48,3865,2024-10-18,2024-10-18 15:21:11,50.00,-1.92
4826.0,12.032986,-0.383865,0.429848,-1.304780,-0.608928,-0.415493,-0.541179,4095,math,49,3865,2024-10-18,2024-10-18 15:41:47,0.00,-45.44
4827.0,6.098907,-0.383865,0.487470,-1.482976,-0.483832,-0.415493,-0.541179,4095,math,50,3865,2024-10-18,2024-10-18 15:49:00,0.00,-34.98
4829.0,0.230279,-0.383865,-0.436615,-1.092899,-0.108547,0.904798,0.161579,4095,math,53,3865,2024-09-24,2024-09-24 16:07:01,0.00,-56.46


In [52]:

# Linear Regression Model
mod_method0 = smf.ols(formula='performance ~  recent_avg_time_per_activity + days_since_last_activity + total_time_spent_on_activity_before_exam + average_performance_past_exams + avg_activities_per_day_recent + diversity_recent', data=final_df_math_drop)
# Fit the model
res_method0 = mod_method0.fit()

# Print regression results summary
#print(res_method0.summary())

In [53]:
# Print regression results summary
print(res_method0.summary())


final_df_math_drop['predicted_performance'] = res_method0.fittedvalues
rmse_method0 = mean_squared_error(final_df_math_drop["performance"], final_df_math_drop['predicted_performance'])
print('RMSE :', rmse_method0)

                            OLS Regression Results                            
Dep. Variable:            performance   R-squared:                       0.267
Model:                            OLS   Adj. R-squared:                  0.266
Method:                 Least Squares   F-statistic:                     173.6
Date:                Thu, 15 May 2025   Prob (F-statistic):          1.25e-219
Time:                        15:12:44   Log-Likelihood:                -15421.
No. Observations:                3340   AIC:                         3.086e+04
Df Residuals:                    3332   BIC:                         3.091e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

## can the performance in math influence predict the essay results

In [114]:
def compute_math_features_for_exam(exam_row, user_activities, user_exams, window_days=rolling_window_days):
    exam_dt = pd.to_datetime(exam_row['time'])

    # 1) Restrict to math‐domain activities & exams
    math_acts = user_activities[user_activities['domain']=='math']
    math_exs =  user_exams   [user_exams   ['domain']=='math']

    # 2) All prior math activities
    prev_act = math_acts[math_acts['activity_updated'] < exam_dt]

    # 3) Rolling window of the last N days
    window_start = exam_dt - pd.Timedelta(days=window_days)
    roll_act = prev_act[prev_act['activity_updated'] >= window_start]

    feats = {}

    # Recent avg time per activity
    total_roll = roll_act['time_in_minutes'].sum()
    cnt_roll   = len(roll_act)
    feats['recent_avg_time_per_activity'] = total_roll/cnt_roll if cnt_roll>0 else 0.0

    # Days since last activity
    if not prev_act.empty:
        last_dt = prev_act['activity_updated'].max()
        feats['days_since_last_activity'] = (exam_dt - last_dt).days
    else:
        feats['days_since_last_activity'] = np.nan

    # Total time spent on math activities before exam
    feats['total_time_spent_on_activity_before_exam'] = prev_act['time_in_minutes'].sum()

    # Average performance on past math exams
    past_exams = math_exs[pd.to_datetime(math_exs['time']) < exam_dt]
    feats['average_performance_past_exams'] = past_exams['performance'].mean() if not past_exams.empty else np.nan

    # Usage frequency & active‐days ratio
    feats['avg_activities_per_day_recent'] = cnt_roll / window_days
    distinct_days = roll_act['activity_updated'].dt.normalize().nunique() if not roll_act.empty else 0
    feats['active_days_ratio_recent']    = distinct_days / window_days

    # Activity diversity in the window
    feats['diversity_recent'] = roll_act['activity_type'].nunique() if not roll_act.empty else 0

    return pd.Series(feats)

In [115]:
scaler = StandardScaler()
columns_to_scale = ['recent_avg_time_per_activity', 'days_since_last_activity', 'total_time_spent_on_activity_before_exam','average_performance_past_exams','avg_activities_per_day_recent','diversity_recent','active_days_ratio_recent']

In [124]:
import pandas as pd
import numpy as np

# --- 0) Prepare your domain-filtered logs ---
# performances contains all domains; activity contains all domains
performances_all = performances.copy()
activity_all     = activity.copy()

# Convert once at the top
performances_all['time']             = pd.to_datetime(performances_all['time'])
activity_all['activity_updated']     = pd.to_datetime(activity_all['activity_updated'])

# Now split out math‐only logs
performances_math = performances_all[performances_all['domain']=='math'].copy()
activity_math     = activity_all[activity_all['domain']=='math'].copy()

# And convert those subsets if you like (already datetime by reference)
performances_math['time']          = performances_math['time']
activity_math['activity_updated']  = activity_math['activity_updated']

# --- 1) Loop over every exam in performances_all, but compute features from math only ---
features_list = []

for user_id, user_exams_all in performances_all.groupby('user_id'):
    # math-only history for this user
    user_acts_math = activity_math[activity_math['user_id']==user_id] \
        .sort_values('activity_updated')
    user_exs_math  = performances_math[performances_math['user_id']==user_id] \
        .sort_values('time')

    # iterate through every exam (math, text, essay)
    for exam_index, exam_row in user_exams_all.sort_values('time').iterrows():
        feats = compute_all_features_for_exam(
            exam_row,
            user_activities=user_acts_math,
            user_exams   =user_exs_math,
            window_days  =rolling_window_days
        )
        feats['exam_index'] = exam_index
        features_list.append(feats)

# --- 2) Build the features DataFrame and join back onto performances_all ---
features_df = pd.DataFrame(features_list).set_index('exam_index')
perf_all_features = performances_all.join(features_df, how='left')

# --- 3) Scale your numeric feature columns as before ---
scaled_vals = scaler.fit_transform(perf_all_features[columns_to_scale])
scaled_df   = pd.DataFrame(scaled_vals,
                           columns=columns_to_scale,
                           index=perf_all_features.index)

remaining_df = perf_all_features.drop(columns=columns_to_scale)
final_df_all = pd.concat([scaled_df, remaining_df], axis=1)

# Now `final_df_all` contains, for every exam (math/text/essay), the seven features
# computed solely from the student’s math activities and math exams.


In [125]:
final_df_all

Unnamed: 0,recent_avg_time_per_activity,days_since_last_activity,total_time_spent_on_activity_before_exam,average_performance_past_exams,avg_activities_per_day_recent,diversity_recent,active_days_ratio_recent,user_id,domain,test_id,course,date,time,percentage,performance
0.0,-0.718366,,-0.972132,,-0.685142,-1.444566,-1.102915,1,essay,eroerterung,3301,2024-11-29,2024-11-29 23:52:33,63.000000,-7.090000
1.0,-0.718366,,-0.972132,,-0.685142,-1.444566,-1.102915,1,essay,erzaehlung,5447,2024-10-26,2024-10-26 07:24:30,55.294118,-8.075882
2.0,-0.636948,0.063315,-0.936959,,0.249286,1.019169,-0.398982,4,essay,eroerterung,3301,2024-11-21,2024-11-21 17:23:46,66.000000,-4.090000
3.0,-0.718366,,-0.972132,,-0.685142,-1.444566,-1.102915,4,essay,erzaehlung,3301,2024-11-07,2024-11-07 16:13:25,71.000000,3.390000
4.0,-0.718366,,-0.972132,,-0.685142,-1.444566,-1.102915,5,essay,erzaehlung,5447,2024-10-26,2024-10-26 07:23:58,44.705882,-18.664118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4831.0,-0.718366,,-0.972132,,-0.685142,-1.444566,-1.102915,4095,text,1,2115,2024-08-21,2024-08-21 14:48:35,0.000000,0.000000
4832.0,-0.718366,,-0.972132,,-0.685142,-1.444566,-1.102915,4095,text,10,5009,2024-08-21,2024-08-21 14:47:59,32.727273,-15.982727
4833.0,-0.718366,0.215291,0.574621,-1.519912,-0.685142,-1.444566,-1.102915,4095,text,13,5009,2024-10-29,2024-10-29 19:30:38,40.298507,-12.121493
4834.0,-0.718366,0.265950,1.834834,-1.171914,-0.685142,-1.444566,-1.102915,4095,text,14,5009,2024-11-25,2024-11-25 19:12:15,52.702703,-6.777297


In [126]:
final_df_math = final_df_all[final_df_all['domain']=='math']
final_df_essay = final_df_all[final_df_all['domain']=='essay']
final_df_text = final_df_all[final_df_all['domain']=='text']

In [127]:
final_df_math.dropna(inplace=True)
final_df_essay.dropna(inplace=True)
final_df_text.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df_math.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df_essay.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df_text.dropna(inplace=True)


In [128]:
# Linear Regression Model
mod_method0 = smf.ols(formula='performance ~  recent_avg_time_per_activity + days_since_last_activity + total_time_spent_on_activity_before_exam + average_performance_past_exams + avg_activities_per_day_recent + diversity_recent', data=final_df_math)
# Fit the model
res_method0 = mod_method0.fit()

# Print regression results summary
#print(res_method0.summary())

In [129]:
# Print regression results summary
print(res_method0.summary())


final_df_math['predicted_performance'] = res_method0.fittedvalues
rmse_method0 = mean_squared_error(final_df_math["performance"], final_df_math['predicted_performance'])
print('RMSE :', rmse_method0)

                            OLS Regression Results                            
Dep. Variable:            performance   R-squared:                       0.267
Model:                            OLS   Adj. R-squared:                  0.265
Method:                 Least Squares   F-statistic:                     202.1
Date:                Thu, 15 May 2025   Prob (F-statistic):          2.41e-220
Time:                        16:21:02   Log-Likelihood:                -15422.
No. Observations:                3340   AIC:                         3.086e+04
Df Residuals:                    3333   BIC:                         3.090e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df_math['predicted_performance'] = res_method0.fittedvalues


In [130]:
# Linear Regression Model
mod_method0 = smf.ols(formula='performance ~  recent_avg_time_per_activity + days_since_last_activity + total_time_spent_on_activity_before_exam + average_performance_past_exams + avg_activities_per_day_recent + diversity_recent', data=final_df_essay)
# Fit the model
res_method0 = mod_method0.fit()

# Print regression results summary
#print(res_method0.summary())

In [131]:
# Print regression results summary
print(res_method0.summary())

final_df_essay['predicted_performance'] = res_method0.fittedvalues
rmse_method0 = mean_squared_error(final_df_essay["performance"], final_df_essay['predicted_performance'])
print('RMSE :', rmse_method0)

                            OLS Regression Results                            
Dep. Variable:            performance   R-squared:                       0.054
Model:                            OLS   Adj. R-squared:                  0.032
Method:                 Least Squares   F-statistic:                     2.421
Date:                Thu, 15 May 2025   Prob (F-statistic):             0.0271
Time:                        16:21:32   Log-Likelihood:                -1099.9
No. Observations:                 261   AIC:                             2214.
Df Residuals:                     254   BIC:                             2239.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df_essay['predicted_performance'] = res_method0.fittedvalues


In [132]:
# Linear Regression Model
mod_method0 = smf.ols(formula='performance ~  recent_avg_time_per_activity + days_since_last_activity + total_time_spent_on_activity_before_exam + average_performance_past_exams + avg_activities_per_day_recent + diversity_recent', data=final_df_text)
# Fit the model
res_method0 = mod_method0.fit()

# Print regression results summary
#print(res_method0.summary())

In [133]:
# Print regression results summary
print(res_method0.summary())

final_df_text['predicted_performance'] = res_method0.fittedvalues
rmse_method0 = mean_squared_error(final_df_text["performance"], final_df_text['predicted_performance'])
print('RMSE :', rmse_method0)

                            OLS Regression Results                            
Dep. Variable:            performance   R-squared:                       0.041
Model:                            OLS   Adj. R-squared:                  0.023
Method:                 Least Squares   F-statistic:                     2.250
Date:                Thu, 15 May 2025   Prob (F-statistic):             0.0384
Time:                        16:22:41   Log-Likelihood:                -1327.9
No. Observations:                 325   AIC:                             2670.
Df Residuals:                     318   BIC:                             2696.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df_text['predicted_performance'] = res_method0.fittedvalues


In [72]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_squared_error
from datetime import timedelta

# 1) Split your raw logs
perf_all = performances.copy()     # has domain=text,essay,math
act_all  = activity.copy()         # has all domains, but we'll filter
act_all['activity_updated'] = pd.to_datetime(act_all['activity_updated'])
perf_all['time'] = pd.to_datetime(perf_all['time'])

perf_math = perf_all[perf_all['domain']=='math']
act_math  = act_all[act_all['domain']=='math']


# 2) A function to compute math‐based features for ANY exam DataFrame
def compute_math_features_for_exams(target_perf, perf_math, act_math, window_days=10):
    feats = []
    for _, exam in target_perf.iterrows():
        uid, exam_time = exam['user_id'], pd.to_datetime(exam['time'])
        # math‐only history
        ua = act_math[act_math['user_id']==uid]
        up = perf_math[perf_math['user_id']==uid]
        # activities up to exam_time
        prev_act = ua[ua['activity_updated'] <= exam_time]
        # rolling window
        start = exam_time - timedelta(days=window_days)
        roll_act = prev_act[prev_act['activity_updated'] >= start]

        # past math exams
        past_exams = up[pd.to_datetime(up['time']) < exam_time]

        feats.append({
            'user_id': uid,
            'domain':  exam['domain'],
            'test_id': exam['test_id'],
            'course':  exam['course'],
            'date':    exam['time'],
            'percentage': exam['percentage'],
            'performance': exam['performance'],
            # the 7 features computed from math‐domain history
            'recent_avg_time_per_activity': roll_act['time_in_minutes'].sum() / len(roll_act) if len(roll_act)>0 else np.nan,
            'days_since_last_activity': (exam_time - prev_act['activity_updated'].max()).days if not prev_act.empty else np.nan,
            'total_time_spent_on_activity_before_exam': prev_act['time_in_minutes'].sum(),
            'average_performance_past_exams': past_exams['percentage'].mean() if not past_exams.empty else np.nan,
            'avg_activities_per_day_recent': len(roll_act) / window_days,
            'active_days_ratio_recent': prev_act['activity_updated'].dt.normalize().nunique() / window_days,
            'diversity_recent': roll_act['activity_type'].nunique()
        })
    return pd.DataFrame(feats)

# 3) Build feature‐DFs for each domain
df_math  = compute_math_features_for_exams(perf_math,  perf_math,  act_math)
df_text  = compute_math_features_for_exams(perf_all[perf_all['domain']=='text'],  perf_math, act_math)
df_essay = compute_math_features_for_exams(perf_all[perf_all['domain']=='essay'], perf_math, act_math)

In [73]:
# scaling the columns
scaled_values_math = scaler.fit_transform(df_math[columns_to_scale])
scaled_df_math = pd.DataFrame(scaled_values_math, columns=columns_to_scale, index=df_math.index)
remaining_df_math = df_math.drop(columns=columns_to_scale)
final_df_math = pd.concat([scaled_df_math, remaining_df_math], axis=1)
final_df_math_drop = final_df_math.copy()
final_df_math_drop.dropna(inplace=True)

In [74]:
final_df_math_drop

Unnamed: 0,recent_avg_time_per_activity,days_since_last_activity,total_time_spent_on_activity_before_exam,average_performance_past_exams,avg_activities_per_day_recent,diversity_recent,active_days_ratio_recent,user_id,domain,test_id,course,date,percentage,performance
1,0.337197,-0.154097,0.279290,-1.069232,-0.209791,0.763798,-0.468060,6,math,48,3865,2025-01-08 14:48:04,50.00,-1.92
2,0.389653,-0.154097,0.386660,-0.888050,-0.084724,0.763798,-0.468060,6,math,49,3865,2025-01-08 15:29:07,66.67,21.23
3,3.059035,-0.154097,0.771570,-0.642524,-0.710059,-0.872912,0.132718,6,math,50,3865,2025-02-04 15:36:38,54.55,19.57
4,-0.467800,-0.154097,-0.824657,-1.285748,0.165410,0.763798,-0.868578,6,math,54,3865,2024-11-23 11:26:10,14.29,-47.71
5,1.856584,-0.154097,0.044267,-1.526171,-0.710059,-0.872912,-0.668319,6,math,57,3865,2025-01-06 17:40:42,40.00,-6.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3804,10.144617,-0.154097,0.362629,-1.921789,-0.710059,-0.872912,-0.468060,4095,math,48,3865,2024-10-18 15:21:11,50.00,-1.92
3805,5.036219,-0.154097,0.419318,-1.482172,-0.584992,-0.872912,-0.468060,4095,math,49,3865,2024-10-18 15:41:47,0.00,-45.44
3806,3.153186,-0.154097,0.438163,-1.667372,-0.459925,-0.872912,-0.468060,4095,math,50,3865,2024-10-18 15:49:00,0.00,-34.98
3808,-0.128304,-0.154097,-0.476409,-1.510233,-0.084724,0.763798,-0.868578,4095,math,53,3865,2024-09-24 16:07:01,0.00,-56.46


In [76]:
# Linear Regression Model
mod_method0 = smf.ols(formula='performance ~  recent_avg_time_per_activity + days_since_last_activity + total_time_spent_on_activity_before_exam + average_performance_past_exams + avg_activities_per_day_recent + diversity_recent', data=final_df_math_drop)
# Fit the model
res_method0 = mod_method0.fit()

# Print regression results summary
#print(res_method0.summary())

In [77]:
# Print regression results summary
print(res_method0.summary())


final_df_math_drop['predicted_performance'] = res_method0.fittedvalues
rmse_method0 = mean_squared_error(final_df_math_drop["performance"], final_df_math_drop['predicted_performance'])
print('RMSE :', rmse_method0)

                            OLS Regression Results                            
Dep. Variable:            performance   R-squared:                       0.252
Model:                            OLS   Adj. R-squared:                  0.251
Method:                 Least Squares   F-statistic:                     183.7
Date:                Thu, 15 May 2025   Prob (F-statistic):          5.63e-202
Time:                        15:23:00   Log-Likelihood:                -15137.
No. Observations:                3272   AIC:                         3.029e+04
Df Residuals:                    3265   BIC:                         3.033e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------