## Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Data 

In [2]:
user_days = pd.read_csv('data/features/user_days.csv')
activity = pd.read_csv('data/features/activity.csv')
performances = pd.read_csv('data/features/performances.csv')

Let's take a quick look at the data

In [3]:
activity[activity['activity_type']=='exam'].head() # this is weird --> exam as activity and most of those have time spent = 0

Unnamed: 0,activity_id,user_id,post_id,course_id,activity_type,activity_status,activity_started,activity_completed,activity_updated,domain,date_restored,times_valid,date,time_spent,time_in_minutes,time_truncated
50145,0,1286,1000000,5447,exam,1,2024-07-12 10:21:47,,2024-07-12 10:21:47,essay,False,True,2024-07-12,0 days 00:00:00,0.0,False
50146,1,1107,1000001,3301,exam,1,2024-07-12 14:53:34,,2024-07-12 14:53:34,essay,False,True,2024-07-12,0 days 00:00:00,0.0,False
50147,2,1286,1000002,5447,exam,1,2024-08-02 16:12:49,,2024-08-02 16:12:49,essay,False,True,2024-08-02,0 days 00:00:00,0.0,False
50148,3,1091,1000003,5447,exam,1,2024-08-05 11:17:34,,2024-08-05 11:17:34,essay,False,True,2024-08-05,0 days 00:00:00,0.0,False
50149,4,1091,1000004,5447,exam,1,2024-08-07 07:53:34,,2024-08-07 07:53:34,essay,False,True,2024-08-07,0 days 00:00:00,0.0,False


In [4]:
activity[activity['activity_type']=='exam'].time_in_minutes.value_counts().head()

time_in_minutes
0.000000     2225
75.000000      49
0.083333        7
0.133333        4
0.100000        3
Name: count, dtype: int64

In [5]:
activity.activity_type.value_counts()

activity_type
topic     30850
lesson    11348
quiz       6742
exam       2599
course      717
access      488
Name: count, dtype: int64

In [56]:
performances.head()

Unnamed: 0,user_id,domain,test_id,course,date,percentage,performance
0,1,essay,eroerterung,3301,2024-11-29,63.0,-7.09
1,1,essay,erzaehlung,5447,2024-10-26,55.294118,-8.075882
2,4,essay,eroerterung,3301,2024-11-21,66.0,-4.09
3,4,essay,erzaehlung,3301,2024-11-07,71.0,3.39
4,5,essay,erzaehlung,5447,2024-10-26,44.705882,-18.664118


In [57]:
user_days.head()

Unnamed: 0,user_id,date,type,user_day,number_of_activities,domain,activity_type,time_in_minutes
0,1,2024-10-26,both,1,2,essay,lesson,0.0
1,1,2024-10-30,activity,2,0,,,
2,1,2024-10-31,activity,3,1,text,lesson,0.0
3,1,2024-11-01,activity,4,6,essay,lesson,0.0
4,1,2024-11-01,activity,4,6,text,lesson,0.0


## Feature engineering 

In [7]:
# Rolling window for recent activity
rolling_window_days = 10


def compute_all_features_for_exam(exam_row, user_activities, user_exams, window_days=rolling_window_days):

    exam_date = exam_row['date']

    # Include activities up to and including exam_date
    previous_activities = user_activities[user_activities['date'] <= exam_date].copy() # need to modify to have the exact date and time in the performance df, because here activities that happend after the exam on the same day will be accounted for.

    # Rolling window (activities in the last N days, including exam day)
    window_start = exam_date - pd.Timedelta(days=window_days)
    rolling_activities = previous_activities[previous_activities['date'] >= window_start].copy()

    features = {}

    # Recent average time per activity (rolling window)
    total_time_rolling = rolling_activities['time_in_minutes'].sum()
    count_rolling = len(rolling_activities)
    features['recent_avg_time_per_activity'] = total_time_rolling / count_rolling if count_rolling > 0 else np.nan

    # Number of days since last activity
    if not previous_activities.empty:
        last_activity_date = previous_activities['date'].max()
        features['days_since_last_activity'] = (exam_date - last_activity_date).days
    else:
        features['days_since_last_activity'] = np.nan

    # Total time spent on activities before the exam
    features['total_time_spent_on_activity_before_exam'] = previous_activities['time_in_minutes'].sum() if not previous_activities.empty else np.nan

    # Average percentage on past exams
    previous_exams = user_exams[user_exams['date'] < exam_date]
    features['average_percentage_past_exams'] = previous_exams['percentage'].mean() if not previous_exams.empty else np.nan

    # Usage Frequency: Average activities per day in rolling window & Active days ratio
    features['avg_activities_per_day_recent'] = count_rolling / window_days if window_days > 0 else np.nan
    if not rolling_activities.empty:
        distinct_days = rolling_activities['date'].dt.normalize().nunique()
    else:
        distinct_days = 0
    features['active_days_ratio_recent'] = distinct_days / window_days if window_days > 0 else np.nan

    # Activity diversity (rolling window)
    features['diversity_recent'] = rolling_activities['activity_type'].nunique() if not rolling_activities.empty else np.nan


    return pd.Series(features)

## Predicting math exam results

In [8]:
user_days_math = user_days[user_days['domain']== 'math'].copy()
performances_math = performances[performances['domain']== 'math'].copy()
activity_math = activity[activity['domain']== 'math'].copy()

In [9]:
user_days_math.head()

Unnamed: 0,user_id,date,type,user_day,number_of_activities,domain,activity_type,time_in_minutes
21,1,2025-02-13,activity,19,2,math,topic,16.116667
22,1,2025-02-15,activity,20,1,math,topic,30.0
23,1,2025-02-16,activity,21,2,math,lesson,0.0
24,1,2025-02-16,activity,21,2,math,topic,0.483333
25,1,2025-02-17,activity,22,3,math,lesson,0.0


In [24]:
performances_math.head()

Unnamed: 0,user_id,domain,test_id,course,date,percentage,performance
9,6,math,42,3865,2024-11-23,25.0,-36.04
10,6,math,48,3865,2025-01-08,50.0,-1.92
11,6,math,49,3865,2025-01-08,66.67,21.23
12,6,math,50,3865,2025-02-04,54.55,19.57
13,6,math,54,3865,2024-11-23,14.29,-47.71


In [10]:
activity_math.head()

Unnamed: 0,activity_id,user_id,post_id,course_id,activity_type,activity_status,activity_started,activity_completed,activity_updated,domain,date_restored,times_valid,date,time_spent,time_in_minutes,time_truncated
0,1128,2533,42,42,course,0,2023-04-07 16:42:35,2023-04-07 17:35:15,2023-04-07 17:35:15,math,True,True,2023-04-07,0 days 00:52:40,52.666667,False
1,1129,2533,55,42,lesson,0,2023-04-07 16:42:35,,2023-04-07 16:42:35,math,False,True,2023-04-07,0 days 00:00:00,0.0,False
2,1130,2533,98,42,topic,1,2023-04-07 16:42:38,2023-04-07 16:43:58,2023-04-07 16:43:58,math,False,True,2023-04-07,0 days 00:01:20,1.333333,False
3,1131,2533,100,42,topic,1,2023-04-07 16:43:59,2023-04-07 16:46:13,2023-04-07 16:46:13,math,False,True,2023-04-07,0 days 00:02:14,2.233333,False
4,1132,2533,102,42,topic,1,2023-04-07 16:46:14,2023-04-07 16:46:27,2023-04-07 16:46:27,math,False,True,2023-04-07,0 days 00:00:13,0.216667,False


In [13]:
activity_math.dropna(inplace=True) # not sure what to do here, drops almost 50% of the rows but would those rows would fuck up the time spent on activity feature as it was 0 for all of them

In [14]:
activity_math.activity_type.value_counts()

activity_type
topic     6699
quiz      6278
lesson     143
access      74
course      68
Name: count, dtype: int64

In [18]:
# Convert the 'date' columns to datetime
activity_math['date'] = pd.to_datetime(activity_math['date'])
performances_math['date'] = pd.to_datetime(performances_math['date'])

# Loop over each exam (grouped by user) in performances_math and compute all features.
features_list = []

for user_id, user_exams in performances_math.groupby('user_id'):
    # Get corresponding activities for the user from activity_math and sort by date
    user_activities = activity_math[activity_math['user_id'] == user_id].sort_values('date')
    user_exams_sorted = user_exams.sort_values('date')

    for exam_index, exam_row in user_exams_sorted.iterrows():
        feats = compute_all_features_for_exam(exam_row, user_activities, user_exams_sorted, rolling_window_days)
        feats['exam_index'] = exam_index
        features_list.append(feats)

# Output df
features_df = pd.DataFrame(features_list).set_index('exam_index')
performances_math_features = performances_math.join(features_df, how='left')

In [19]:
performances_math_features.head()

Unnamed: 0,user_id,domain,test_id,course,date,percentage,performance,recent_avg_time_per_activity,days_since_last_activity,total_time_spent_on_activity_before_exam,average_percentage_past_exams,avg_activities_per_day_recent,active_days_ratio_recent,diversity_recent
9.0,6,math,42,3865,2024-11-23,25.0,-36.04,22.701282,0.0,299.533333,,1.3,0.2,2.0
10.0,6,math,48,3865,2025-01-08,50.0,-1.92,28.891667,0.0,472.883333,29.8225,0.6,0.2,2.0
11.0,6,math,49,3865,2025-01-08,66.67,21.23,28.891667,0.0,472.883333,29.8225,0.6,0.2,2.0
12.0,6,math,50,3865,2025-02-04,54.55,19.57,89.533333,0.0,597.816667,39.326667,0.1,0.1,1.0
13.0,6,math,54,3865,2024-11-23,14.29,-47.71,22.701282,0.0,299.533333,,1.3,0.2,2.0


 Here not sure how we should handle the average percentage on last exams when it's the first exam a student takes do we take the average ?


In [20]:
from sklearn.preprocessing import StandardScaler


# scaling the columns
columns_to_scale = ['recent_avg_time_per_activity', 'days_since_last_activity', 'total_time_spent_on_activity_before_exam','average_percentage_past_exams','avg_activities_per_day_recent','diversity_recent']


scaler = StandardScaler()
scaled_values = scaler.fit_transform(performances_math_features[columns_to_scale])
scaled_df = pd.DataFrame(scaled_values, columns=columns_to_scale, index=performances_math_features.index)
remaining_df = performances_math_features.drop(columns=columns_to_scale)
final_df = pd.concat([scaled_df, remaining_df], axis=1)

In [21]:
final_df.isnull().sum()

recent_avg_time_per_activity                  8
days_since_last_activity                      2
total_time_spent_on_activity_before_exam      2
average_percentage_past_exams               777
avg_activities_per_day_recent                 0
diversity_recent                              8
user_id                                       0
domain                                        0
test_id                                       0
course                                        0
date                                          0
percentage                                    0
performance                                   0
active_days_ratio_recent                      0
dtype: int64

In [22]:
final_df.dropna(inplace=True)

In [23]:
import statsmodels.formula.api as smf

# Linear Regression Model
mod = smf.ols(formula='performance ~  recent_avg_time_per_activity + days_since_last_activity + total_time_spent_on_activity_before_exam + average_percentage_past_exams + avg_activities_per_day_recent + diversity_recent', data=final_df)

# Fit the model
res = mod.fit()

# Print regression results summary
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:            performance   R-squared:                       0.240
Model:                            OLS   Adj. R-squared:                  0.238
Method:                 Least Squares   F-statistic:                     158.7
Date:                Thu, 17 Apr 2025   Prob (F-statistic):          1.20e-175
Time:                        11:51:03   Log-Likelihood:                -14026.
No. Observations:                3028   AIC:                         2.807e+04
Df Residuals:                    3021   BIC:                         2.811e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

## Predicting the essay results

In [25]:
performances_essay = performances[performances['domain']== 'essay'].copy()
activity_essay= activity[activity['domain']== 'essay'].copy()

In [26]:
activity_essay.isnull().sum() # big issue with the activity completed comlumn, way too many nans.

activity_id              0
user_id                  0
post_id                  0
course_id                0
activity_type            0
activity_status          0
activity_started         0
activity_completed    9550
activity_updated         0
domain                   0
date_restored            0
times_valid              0
date                     0
time_spent               0
time_in_minutes          0
time_truncated           0
dtype: int64

In [27]:
activity_essay.dropna(inplace=True)

In [29]:
activity_essay.head()

Unnamed: 0,activity_id,user_id,post_id,course_id,activity_type,activity_status,activity_started,activity_completed,activity_updated,domain,date_restored,times_valid,date,time_spent,time_in_minutes,time_truncated
241,23817,2560,3301,3301,access,0,2023-11-26 08:58:42,2023-11-26 08:58:49,2023-11-26 08:58:49,essay,True,True,2023-11-26,0 days 00:00:07,0.116667,False
258,24116,2574,5447,5447,access,0,2023-11-27 13:33:10,2023-11-27 13:33:18,2023-11-27 13:33:18,essay,True,True,2023-11-27,0 days 00:00:08,0.133333,False
263,24131,2574,5466,5447,topic,1,2023-11-27 13:52:33,2023-11-27 15:54:13,2023-11-27 15:54:13,essay,False,True,2023-11-27,0 days 00:30:00,30.0,True
264,24132,2574,5467,5447,topic,1,2023-11-27 13:52:37,2023-11-27 15:53:51,2023-11-27 15:53:51,essay,False,True,2023-11-27,0 days 00:30:00,30.0,True
265,24133,2574,5469,5447,topic,1,2023-11-27 13:52:39,2023-11-27 15:50:05,2023-11-27 15:50:05,essay,False,True,2023-11-27,0 days 00:30:00,30.0,True


In [30]:
performances_essay.isnull().sum()

user_id        0
domain         0
test_id        0
course         0
date           0
percentage     0
performance    0
dtype: int64

In [31]:
# Convert the date columns  to datetime
activity_essay['date'] = pd.to_datetime(activity_essay['date'])
performances_essay['date'] = pd.to_datetime(performances_essay['date'])

# Loop over each exam (grouped by user) in performances_essay and compute all features.
features_list = []

for user_id, user_exams in performances_essay.groupby('user_id'):
    # Get corresponding activities for the user from activity_essay and sort by date
    user_activities = activity_essay[activity_essay['user_id'] == user_id].sort_values('date')
    user_exams_sorted = user_exams.sort_values('date')

    for exam_index, exam_row in user_exams_sorted.iterrows():
        feats = compute_all_features_for_exam(exam_row, user_activities, user_exams_sorted, rolling_window_days)
        feats['exam_index'] = exam_index
        features_list.append(feats)

# Output df
features_df = pd.DataFrame(features_list).set_index('exam_index')
performances_essay_features = performances_essay.join(features_df, how='left')

In [33]:
performances_essay_features.head()

Unnamed: 0,user_id,domain,test_id,course,date,percentage,performance,recent_avg_time_per_activity,days_since_last_activity,total_time_spent_on_activity_before_exam,average_percentage_past_exams,avg_activities_per_day_recent,active_days_ratio_recent,diversity_recent
0.0,1,essay,eroerterung,3301,2024-11-29,63.0,-7.09,,27.0,4.983333,55.294118,0.0,0.0,
1.0,1,essay,erzaehlung,5447,2024-10-26,55.294118,-8.075882,,,,,0.0,0.0,
2.0,4,essay,eroerterung,3301,2024-11-21,66.0,-4.09,11.233333,10.0,22.466667,71.0,0.2,0.1,1.0
3.0,4,essay,erzaehlung,3301,2024-11-07,71.0,3.39,,,,,0.0,0.0,
4.0,5,essay,erzaehlung,5447,2024-10-26,44.705882,-18.664118,,,,,0.0,0.0,


In [34]:
performances_essay_features.isnull().sum() # too many nans in the features for essay.

user_id                                       0
domain                                        0
test_id                                       0
course                                        0
date                                          0
percentage                                    0
performance                                   0
recent_avg_time_per_activity                434
days_since_last_activity                    177
total_time_spent_on_activity_before_exam    177
average_percentage_past_exams               328
avg_activities_per_day_recent                 0
active_days_ratio_recent                      0
diversity_recent                            434
dtype: int64

In [35]:
performances_essay_features.dropna(inplace=True)

In [37]:
performances_essay_features.head() # only 65 rows left

Unnamed: 0,user_id,domain,test_id,course,date,percentage,performance,recent_avg_time_per_activity,days_since_last_activity,total_time_spent_on_activity_before_exam,average_percentage_past_exams,avg_activities_per_day_recent,active_days_ratio_recent,diversity_recent
2.0,4,essay,eroerterung,3301,2024-11-21,66.0,-4.09,11.233333,10.0,22.466667,71.0,0.2,0.1,1.0
5.0,6,essay,bericht,3301,2024-11-05,60.0,-11.48,59.479167,10.0,475.833333,52.0,0.8,0.1,3.0
54.0,17,essay,beschreibung,5447,2024-11-17,27.058824,-43.721176,2.166667,5.0,2.166667,30.588235,0.1,0.1,1.0
189.0,81,essay,eroerterung,3301,2024-12-30,69.0,-1.09,3.756667,0.0,212.3,62.0,0.5,0.1,1.0
321.0,224,essay,beschreibung,3301,2024-11-08,65.0,-4.96,7.938889,7.0,23.816667,57.0,0.3,0.1,2.0


In [38]:
# scale the df

columns_to_scale = ['recent_avg_time_per_activity', 'days_since_last_activity', 'total_time_spent_on_activity_before_exam','average_percentage_past_exams','avg_activities_per_day_recent','diversity_recent']


scaler = StandardScaler()
scaled_values = scaler.fit_transform(performances_essay_features[columns_to_scale])
scaled_df = pd.DataFrame(scaled_values, columns=columns_to_scale, index=performances_essay_features.index)

remaining_df = performances_essay_features.drop(columns=columns_to_scale)

final_df = pd.concat([scaled_df, remaining_df], axis=1)

In [39]:
# Linear Regression Model
mod = smf.ols(formula='performance ~  recent_avg_time_per_activity + days_since_last_activity + total_time_spent_on_activity_before_exam + average_percentage_past_exams + avg_activities_per_day_recent + diversity_recent', data=final_df)

# Fit the model
res = mod.fit()

# Print regression results summary
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:            performance   R-squared:                       0.349
Model:                            OLS   Adj. R-squared:                  0.282
Method:                 Least Squares   F-statistic:                     5.190
Date:                Thu, 17 Apr 2025   Prob (F-statistic):           0.000248
Time:                        11:55:13   Log-Likelihood:                -249.48
No. Observations:                  65   AIC:                             513.0
Df Residuals:                      58   BIC:                             528.2
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

## Predicting the text results 


In [40]:
performances_text = performances[performances['domain']== 'text'].copy()
activity_text= activity[activity['domain']== 'text'].copy()

In [41]:
activity_text.isnull().sum() # issue with the activity completed comlumn, way too many nans. Idk how we should handle those.

activity_id              0
user_id                  0
post_id                  0
course_id                0
activity_type            0
activity_status          0
activity_started         0
activity_completed    7666
activity_updated         0
domain                   0
date_restored            0
times_valid              0
date                     0
time_spent               0
time_in_minutes          0
time_truncated           0
dtype: int64

In [43]:
activity_text.head()

Unnamed: 0,activity_id,user_id,post_id,course_id,activity_type,activity_status,activity_started,activity_completed,activity_updated,domain,date_restored,times_valid,date,time_spent,time_in_minutes,time_truncated
18,1207,2533,2115,2115,access,0,2023-05-02 08:49:46,2023-05-02 08:49:46,2023-05-02 08:49:46,text,True,True,2023-05-02,0 days 00:00:00,0.0,False
19,1208,2533,2115,2115,course,0,2023-05-02 08:50:00,2023-05-02 08:50:16,2023-05-02 08:50:16,text,True,True,2023-05-02,0 days 00:00:16,0.266667,False
20,1209,2533,2150,2115,lesson,0,2023-05-02 08:50:00,,2023-05-02 08:50:00,text,False,True,2023-05-02,0 days 00:00:00,0.0,False
21,1210,2533,2117,2115,topic,1,2023-05-02 08:50:00,2023-05-02 08:50:16,2023-05-02 08:50:16,text,False,True,2023-05-02,0 days 00:00:16,0.266667,False
22,1211,2533,2152,2115,topic,0,2023-05-02 08:50:16,,2023-05-02 08:50:16,text,False,True,2023-05-02,0 days 00:00:00,0.0,False


In [44]:
activity_text.dropna(inplace=True)

In [45]:
# Convert the date columns to datetime
activity_text['date'] = pd.to_datetime(activity_text['date'])
performances_text['date'] = pd.to_datetime(performances_text['date'])

# Loop over each exam (grouped by user) in performances_text and compute all features.
features_list = []

for user_id, user_exams in performances_text.groupby('user_id'):
    # Get corresponding activities for the user from activity_text and sort by date
    user_activities = activity_text[activity_text['user_id'] == user_id].sort_values('date')
    user_exams_sorted = user_exams.sort_values('date')

    for exam_index, exam_row in user_exams_sorted.iterrows():
        feats = compute_all_features_for_exam(exam_row, user_activities, user_exams_sorted, rolling_window_days)
        feats['exam_index'] = exam_index
        features_list.append(feats)

# Output df
features_df = pd.DataFrame(features_list).set_index('exam_index')
performances_text_features = performances_text.join(features_df, how='left')

In [46]:
performances_text_features.head()

Unnamed: 0,user_id,domain,test_id,course,date,percentage,performance,recent_avg_time_per_activity,days_since_last_activity,total_time_spent_on_activity_before_exam,average_percentage_past_exams,avg_activities_per_day_recent,active_days_ratio_recent,diversity_recent
192.0,81,text,1,2115,2025-01-04,0.0,0.0,,11.0,6.1,,0.0,0.0,
193.0,81,text,10,5009,2025-01-04,0.0,-48.71,,11.0,6.1,,0.0,0.0,
205.0,90,text,1,2115,2024-12-15,0.0,0.0,14.372222,0.0,43.116667,,0.3,0.2,2.0
206.0,90,text,3,2115,2024-12-17,76.086957,29.136957,9.577778,0.0,57.466667,31.0,0.6,0.3,2.0
207.0,90,text,4,2115,2024-12-15,62.0,18.72,14.372222,0.0,43.116667,,0.3,0.2,2.0


In [47]:
performances_text_features.isnull().sum() # too many nans

user_id                                       0
domain                                        0
test_id                                       0
course                                        0
date                                          0
percentage                                    0
performance                                   0
recent_avg_time_per_activity                363
days_since_last_activity                    304
total_time_spent_on_activity_before_exam    304
average_percentage_past_exams               219
avg_activities_per_day_recent                 0
active_days_ratio_recent                      0
diversity_recent                            363
dtype: int64

In [48]:
performances_text_features.dropna(inplace=True)

In [50]:
performances_text_features.head()

Unnamed: 0,user_id,domain,test_id,course,date,percentage,performance,recent_avg_time_per_activity,days_since_last_activity,total_time_spent_on_activity_before_exam,average_percentage_past_exams,avg_activities_per_day_recent,active_days_ratio_recent,diversity_recent
206.0,90,text,3,2115,2024-12-17,76.086957,29.136957,9.577778,0.0,57.466667,31.0,0.6,0.3,2.0
208.0,90,text,5,2115,2024-12-21,59.615385,2.835385,8.62963,0.0,77.666667,46.028986,0.9,0.4,2.0
209.0,90,text,6,2115,2024-12-23,61.22449,13.43449,8.62963,2.0,77.666667,49.425585,0.9,0.4,2.0
210.0,90,text,7,2115,2024-12-28,59.090909,-3.179091,6.733333,7.0,77.666667,51.785366,0.3,0.1,1.0
211.0,90,text,8,2115,2024-12-29,51.515152,6.445152,6.733333,8.0,77.666667,53.002957,0.3,0.1,1.0


In [51]:
# Scale the columns
columns_to_scale = ['recent_avg_time_per_activity', 'days_since_last_activity', 'total_time_spent_on_activity_before_exam','average_percentage_past_exams','avg_activities_per_day_recent','diversity_recent']

scaler = StandardScaler()
scaled_values = scaler.fit_transform(performances_text_features[columns_to_scale])
scaled_df = pd.DataFrame(scaled_values, columns=columns_to_scale, index=performances_text_features.index)

remaining_df = performances_text_features.drop(columns=columns_to_scale)

final_df = pd.concat([scaled_df, remaining_df], axis=1)

In [53]:
final_df.head() # only 25 rows left, not really relevant

Unnamed: 0,recent_avg_time_per_activity,days_since_last_activity,total_time_spent_on_activity_before_exam,average_percentage_past_exams,avg_activities_per_day_recent,diversity_recent,user_id,domain,test_id,course,date,percentage,performance,active_days_ratio_recent
206.0,0.388782,-0.81292,-0.323634,-0.532226,0.256319,2.0,90,text,3,2115,2024-12-17,76.086957,29.136957,0.3
208.0,0.231016,-0.81292,-0.064234,0.570346,0.968316,2.0,90,text,5,2115,2024-12-21,59.615385,2.835385,0.4
209.0,0.231016,-0.167745,-0.064234,0.819531,0.968316,2.0,90,text,6,2115,2024-12-23,61.22449,13.43449,0.4
210.0,-0.084516,1.445191,-0.064234,0.992652,-0.455678,-0.5,90,text,7,2115,2024-12-28,59.090909,-3.179091,0.1
211.0,-0.084516,1.767779,-0.064234,1.081978,-0.455678,-0.5,90,text,8,2115,2024-12-29,51.515152,6.445152,0.1


In [54]:
# Linear Regression Model
mod = smf.ols(formula='performance ~  recent_avg_time_per_activity + days_since_last_activity + total_time_spent_on_activity_before_exam + average_percentage_past_exams + avg_activities_per_day_recent + diversity_recent', data=final_df)

# Fit the model
res = mod.fit()

# Print regression results summary
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:            performance   R-squared:                       0.175
Model:                            OLS   Adj. R-squared:                 -0.100
Method:                 Least Squares   F-statistic:                    0.6372
Date:                Thu, 17 Apr 2025   Prob (F-statistic):              0.699
Time:                        11:56:14   Log-Likelihood:                -98.268
No. Observations:                  25   AIC:                             210.5
Df Residuals:                      18   BIC:                             219.1
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------