## Import Cleaned Data

In [15]:
from pathlib import Path

import pandas as pd

# Read the cleaned interaction log from the previous stage and preview the first rows.
df = pd.read_csv(filepath_or_buffer="../Data/Cleaned/cleaned_data.csv")
df.head(n=5)

Unnamed: 0,timestamp,solving_id,question_id,user_answer,user_id,elapsed_time_seconds
0,2019-08-06 12:56:30.868000+00:00,1,q5012,b,u1,0.7
1,2019-08-06 12:57:01.062000+00:00,2,q4706,c,u1,0.7
2,2019-08-06 12:58:13.432000+00:00,3,q4366,b,u1,0.7
3,2019-08-06 12:58:59.668000+00:00,4,q4829,a,u1,0.7
4,2019-08-06 13:00:01.774000+00:00,5,q6528,b,u1,0.7


## Inferring Correct Answers (Weak Supervision)

The dataset does not include official correct answers.
Therefore, we assume that **the most frequently chosen answer for each
question represents the correct answer**.

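This technique is known as **majority voting** (strictly, *plurality* voting:
the top answer does not need to exceed 50% of responses) and is commonly used
when ground-truth labels are unavailable. The inferred labels are therefore
noisy: on hard questions the most popular answer may itself be wrong.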


In [16]:
def _majority_answer(answers):
    """Return the most frequently chosen answer in ``answers``.

    Ties are resolved deterministically: ``Series.mode`` returns the tied
    values in sorted order, so the smallest one is picked regardless of the
    row order of the input. A group with no non-null answers yields NaN
    instead of raising.
    """
    modes = answers.mode()  # null answers are ignored (dropna=True by default)
    return modes.iloc[0] if not modes.empty else float("nan")


# One row per question: the most popular answer is treated as the correct one.
assumed_answers = (
    df.groupby("question_id")["user_answer"]
    .agg(_majority_answer)
    .reset_index(name="assumed_correct_answer")
)

assumed_answers.head()

Unnamed: 0,question_id,assumed_correct_answer
0,q1,b
1,q10,c
2,q100,c
3,q1000,c
4,q10000,b


## Merge the `assumed_correct_answer` Column into the Cleaned Dataset

In [17]:
# Attach each question's inferred answer to every interaction row.
# A left join keeps every row of `df`.
df = pd.merge(df, assumed_answers, how="left", on="question_id")

df.head(n=5)

Unnamed: 0,timestamp,solving_id,question_id,user_answer,user_id,elapsed_time_seconds,assumed_correct_answer
0,2019-08-06 12:56:30.868000+00:00,1,q5012,b,u1,0.7,b
1,2019-08-06 12:57:01.062000+00:00,2,q4706,c,u1,0.7,c
2,2019-08-06 12:58:13.432000+00:00,3,q4366,b,u1,0.7,b
3,2019-08-06 12:58:59.668000+00:00,4,q4829,a,u1,0.7,c
4,2019-08-06 13:00:01.774000+00:00,5,q6528,b,u1,0.7,d


## Creating the is_correct Feature

Using the inferred correct answers, we create a binary feature:

- `1` → student answer matches inferred correct answer
- `0` → otherwise

This feature is critical for estimating question difficulty and student performance.


In [18]:
# 1 when the student's choice equals the inferred answer, 0 otherwise.
df["is_correct"] = df["user_answer"].eq(df["assumed_correct_answer"]).astype(int)

# Preview only the columns involved in the comparison.
df.loc[:, ["question_id", "user_answer", "assumed_correct_answer", "is_correct"]].head()


Unnamed: 0,question_id,user_answer,assumed_correct_answer,is_correct
0,q5012,b,b,1
1,q4706,c,c,1
2,q4366,b,b,1
3,q4829,a,c,0
4,q6528,b,d,0


## Question-Level Features

We compute features that describe how difficult each question is,
based on student performance and time spent.


In [19]:
# Per question: share of attempts whose answer matched the inferred answer.
question_success = (
    df.groupby("question_id")
    .agg(question_success_rate=("is_correct", "mean"))
    .reset_index()
)

question_success.head(n=5)

Unnamed: 0,question_id,question_success_rate
0,q1,0.9
1,q10,0.355556
2,q100,1.0
3,q1000,0.679012
4,q10000,0.545455


## Average Time per Question

In [20]:
# Per question: mean of `elapsed_time_seconds` over all attempts.
avg_time_question = (
    df.groupby("question_id", as_index=False)["elapsed_time_seconds"]
    .mean()
    .rename(columns={"elapsed_time_seconds": "avg_time_per_question"})
)

avg_time_question.head(n=5)

Unnamed: 0,question_id,avg_time_per_question
0,q1,0.7
1,q10,0.692222
2,q100,0.7
3,q1000,0.7
4,q10000,0.7


In [21]:
# Broadcast both per-question aggregates back onto the interaction rows
# (default inner join on `question_id`).
df = (
    df.merge(question_success, on="question_id")
    .merge(avg_time_question, on="question_id")
)

## User-Level Features

These features capture student behavior and overall performance,
helping distinguish question difficulty from student ability.


## User Accuracy

In [22]:
# Per student: share of attempts whose answer matched the inferred answer.
user_accuracy = (
    df.groupby("user_id", as_index=False)["is_correct"]
    .mean()
    .rename(columns={"is_correct": "user_accuracy"})
)

# Copy the per-student value onto each of that student's rows.
df = pd.merge(df, user_accuracy, on="user_id")


## Average Time per User

In [23]:
# Per student: mean of `elapsed_time_seconds` over all of their attempts.
avg_user_time = (
    df.groupby("user_id")
    .agg(avg_user_time=("elapsed_time_seconds", "mean"))
    .reset_index()
)

# Copy the per-student value onto each of that student's rows.
df = pd.merge(df, avg_user_time, on="user_id")

## Attempt Order

This feature represents the order of attempts for each user,
which may capture learning or fatigue effects over time.


In [24]:
# Order each student's rows by timestamp, then number them 1, 2, 3, ...
# NOTE(review): `timestamp` looks like it is still a string column here (the
# CSV is read without date parsing), so this ordering is lexicographic —
# confirm every row uses the same timestamp format and UTC offset.
df = df.sort_values(by=["user_id", "timestamp"])
df["attempt_order"] = df.groupby("user_id").cumcount().add(1)

## Difficulty Proxy

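We define a synthetic difficulty score as the product of:
- the failure rate, `1 - question_success_rate` (low success rate → harder)
- the average solving time, `avg_time_per_question` (more time → harder)

`difficulty_proxy = (1 - question_success_rate) * avg_time_per_question`

Higher values indicate harder questions.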


In [25]:
# Failure rate (1 - success rate) multiplied by the question's mean solving time.
df["difficulty_proxy"] = (
    df["question_success_rate"].rsub(1).mul(df["avg_time_per_question"])
)

In [26]:
# Preview the frame with all engineered columns attached.
df.head(n=5)

Unnamed: 0,timestamp,solving_id,question_id,user_answer,user_id,elapsed_time_seconds,assumed_correct_answer,is_correct,question_success_rate,avg_time_per_question,user_accuracy,avg_user_time,attempt_order,difficulty_proxy
0,2019-08-06 12:56:30.868000+00:00,1,q5012,b,u1,0.7,b,1,0.4,0.7,0.721811,0.699906,1,0.42
1,2019-08-06 12:57:01.062000+00:00,2,q4706,c,u1,0.7,c,1,0.857143,0.7,0.721811,0.699906,2,0.1
2,2019-08-06 12:58:13.432000+00:00,3,q4366,b,u1,0.7,b,1,0.5,0.7,0.721811,0.699906,3,0.35
3,2019-08-06 12:58:59.668000+00:00,4,q4829,a,u1,0.7,c,0,0.608696,0.7,0.721811,0.699906,4,0.273913
4,2019-08-06 13:00:01.774000+00:00,5,q6528,b,u1,0.7,d,0,0.431818,0.7,0.721811,0.699906,5,0.397727


## Feature Selection and Column Removal

- Identifier columns (IDs) are removed to prevent memorization.
- Raw interaction columns used only for feature construction are dropped.
- Only numerical, behavior-representative features are retained.


In [27]:
# Identifiers and raw interaction columns that were only needed to build features.
columns_to_drop = [
    "solving_id", "user_id", "question_id",
    "user_answer", "assumed_correct_answer", "timestamp",
]

# `df` itself is left untouched; the modelling frame is a new object.
df_model = df.drop(labels=columns_to_drop, axis="columns")

df_model.head(n=5)


Unnamed: 0,elapsed_time_seconds,is_correct,question_success_rate,avg_time_per_question,user_accuracy,avg_user_time,attempt_order,difficulty_proxy
0,0.7,1,0.4,0.7,0.721811,0.699906,1,0.42
1,0.7,1,0.857143,0.7,0.721811,0.699906,2,0.1
2,0.7,1,0.5,0.7,0.721811,0.699906,3,0.35
3,0.7,0,0.608696,0.7,0.721811,0.699906,4,0.273913
4,0.7,0,0.431818,0.7,0.721811,0.699906,5,0.397727


## Final Model-Ready Features

The resulting dataset contains only numerical features representing:
- correctness
- time behavior
- question difficulty
- user performance

## Save Featured Dataset

In [28]:
# Make sure the output folder exists so a fresh checkout can run end to end;
# DataFrame.to_csv does not create missing parent directories.
featured_path = Path("../Data/Featured/featured_data.csv")
featured_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the row index out of the saved file.
df_model.to_csv(featured_path, index=False)
