In [119]:
from pathlib import Path

import numpy as np
import pandas as pd
# Location of the cleaned dataset produced by the earlier cleaning stage.
CLEANED_DATA_PATH = "../Data/Cleaned/cleaned_data.csv"

# Load the cleaned data and preview the first rows.
df = pd.read_csv(CLEANED_DATA_PATH)
df.head()

Unnamed: 0,timestamp,solving_id,question_id,user_answer,user_id,elapsed_time_seconds
0,2019-08-06 12:56:30.868,1,q5012,b,u1,38.0
1,2019-08-06 12:57:01.062,2,q4706,c,u1,24.0
2,2019-08-06 12:58:13.432,3,q4366,b,u1,68.0
3,2019-08-06 12:58:59.668,4,q4829,a,u1,42.0
4,2019-08-06 13:00:01.774,5,q6528,b,u1,59.0


## 1. Infer Correct Answers
```
The dataset does not contain ground-truth correct answers.
We infer them by majority vote, assuming that the most frequently selected
answer for each question is the correct one.
```

In [120]:
# Tally how often each answer option was chosen for every question.
answer_counts = df.groupby(['question_id', 'user_answer']).size()

# Per question, find the (question_id, user_answer) index entry with the
# highest tally. idxmax returns the first maximum, so ties resolve to the
# first entry in the grouped index order.
top_choice_index = answer_counts.groupby(level=0).idxmax()

# Keep only the answer half of each index tuple.
majority_answer = top_choice_index.apply(lambda key: key[1])

# Two-column lookup table: question_id -> assumed_correct_answer.
correct_answers = majority_answer.reset_index(name='assumed_correct_answer')

## 2. Control Label Noise (Minimum Attempts Filter)
```
Questions with very few attempts produce unreliable inferred answers.
We keep only questions with sufficient attempts (at least 10).
```

In [121]:
# Minimum number of recorded attempts a question needs before its
# majority-vote answer is kept; questions below this are dropped.
MIN_ATTEMPTS = 10

# Number of rows (attempts) recorded for each question_id.
attempts_per_question = df['question_id'].value_counts()

# Question ids that meet the minimum-attempts requirement.
valid_questions = attempts_per_question[
    attempts_per_question >= MIN_ATTEMPTS
].index

# Keep only the attempts that belong to those questions.
df = df[df['question_id'].isin(valid_questions)]

## 3. Create Correctness Indicator (is_correct)
```
A binary feature indicating whether the user's answer matches
the inferred correct answer.
```

In [122]:
# Attach the inferred answer to every attempt.
# Columns left over from a previous run of this cell are dropped first:
# otherwise a second merge produces assumed_correct_answer_x/_y and the
# comparison below fails with a KeyError. On a first run the drop is a no-op.
# validate='many_to_one' asserts that correct_answers has one row per
# question_id, so the merge can never duplicate attempt rows.
df = (
    df
    .drop(columns=['assumed_correct_answer', 'is_correct'], errors='ignore')
    .merge(
        correct_answers,
        on='question_id',
        how='left',
        validate='many_to_one',
    )
)

# 1 when the user's answer equals the inferred answer, otherwise 0.
df['is_correct'] = (
    df['user_answer'] == df['assumed_correct_answer']
).astype(int)


## 4. Aggregate Data to Question Level
```
Behavioral features are aggregated per question to reduce noise.
```

In [123]:
# Output column -> (source column, aggregation) for the per-question summary.
question_aggregations = {
    'attempts': ('user_id', 'count'),                    # non-null user_id rows per question
    'success_rate': ('is_correct', 'mean'),              # share of attempts matching the inferred answer
    'avg_time': ('elapsed_time_seconds', 'mean'),
    'median_time': ('elapsed_time_seconds', 'median'),
    'time_std': ('elapsed_time_seconds', 'std'),
}

# Collapse the attempt-level frame to one row per question, with
# question_id restored as an ordinary column.
question_df = (
    df
    .groupby('question_id')
    .agg(**question_aggregations)
    .reset_index()
)


## 5. Engineer Safe Question-Level Features
```
These features describe question behavior without encoding difficulty directly.
```

In [124]:
# Add three derived columns to the question-level table:
#   log_attempts            log(1 + attempts)
#   time_skew               mean time minus median time
#   relative_time_variance  time std divided by mean time (a coefficient of
#                           variation, despite the column name)
question_df = question_df.assign(
    log_attempts=lambda q: np.log1p(q['attempts']),
    time_skew=lambda q: q['avg_time'] - q['median_time'],
    relative_time_variance=lambda q: q['time_std'] / q['avg_time'],
)


## 6. Define Difficulty Target (LAST STEP)
```
Difficulty is defined only after feature creation.
```

In [125]:
from sklearn.preprocessing import StandardScaler

# Columns that feed the difficulty score.
difficulty_inputs = question_df[['success_rate', 'avg_time']]

# Standardise both inputs so they contribute on a comparable scale.
scaler = StandardScaler()
question_df[['success_z', 'time_z']] = scaler.fit_transform(difficulty_inputs)

# Higher difficulty = lower standardised success rate and/or higher
# standardised average time (time_z minus success_z).
question_df['difficulty'] = question_df['time_z'] - question_df['success_z']


In [126]:
# Preview the attempt-level frame, which now carries the
# assumed_correct_answer and is_correct columns.
df.head()

Unnamed: 0,timestamp,solving_id,question_id,user_answer,user_id,elapsed_time_seconds,assumed_correct_answer,is_correct
0,2019-08-06 12:58:13.432,3,q4366,b,u1,68.0,b,1
1,2019-08-06 12:58:59.668,4,q4829,a,u1,42.0,c,0
2,2019-08-06 13:00:01.774,5,q6528,b,u1,59.0,d,0
3,2019-08-06 13:01:03.370,6,q4793,a,u1,58.0,b,0
4,2019-08-06 13:01:41.746,7,q6488,a,u1,35.0,a,1


---------------

In [127]:
# low = question_df['difficulty'].quantile(0.35)
# high = question_df['difficulty'].quantile(0.65)

In [128]:
# Fixed cut-off on the combined z-score difficulty scale.
# (Alternative previously considered: question_df['difficulty'].mean().)
threshold = 1.0

# True where the question's difficulty is at or below the cut-off.
# A NaN difficulty compares as False and is therefore labelled 'Hard'.
is_easy = question_df['difficulty'] <= threshold

question_df['difficulty_class'] = np.where(is_easy, 'Easy', 'Hard')


-------------------------

In [129]:
# question_df['difficulty_class'] = pd.qcut(
#     question_df['difficulty'],
#     q=3,    
#     labels=['Easy', 'Medium', 'Hard']
# )

In [130]:
# question_df['difficulty_binary'] = question_df['difficulty_class'].map(
#     {
#         'Easy': 'Easy',
#         'Hard': 'Hard'
#     }
# )


In [131]:
# question_df = question_df.dropna(subset=['difficulty_binary'])


In [132]:
# Inspect the question-level table with the engineered features,
# difficulty score and difficulty_class label.
question_df

Unnamed: 0,question_id,attempts,success_rate,avg_time,median_time,time_std,log_attempts,time_skew,relative_time_variance,success_z,time_z,difficulty,difficulty_class
0,q1,20,0.900000,20.650000,19.0,8.493186,3.044522,1.650000,0.411292,1.061070,-0.422328,-1.483398,Easy
1,q10,89,0.359551,23.213483,23.0,5.142231,4.499810,0.213483,0.221519,-2.213345,-0.167095,2.046250,Hard
2,q100,24,1.000000,18.750000,19.0,3.010850,3.218876,-0.250000,0.160579,1.666939,-0.611501,-2.278441,Easy
3,q1000,81,0.679012,16.691358,16.0,4.002006,4.406719,0.691358,0.239765,-0.277826,-0.816470,-0.538644,Easy
4,q10000,22,0.545455,33.772727,27.0,19.306875,3.135494,6.772727,0.571671,-1.087011,0.884235,1.971245,Hard
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8029,q9995,15,0.666667,31.000000,28.0,19.522880,2.772589,3.000000,0.629770,-0.352624,0.608168,0.960793,Easy
8030,q9996,14,0.857143,25.214286,20.0,16.618704,2.708050,5.214286,0.659099,0.801412,0.032114,-0.769298,Easy
8031,q9997,13,0.384615,30.307692,28.0,15.282804,2.639057,2.307692,0.504255,-2.061485,0.539239,2.600724,Hard
8032,q9998,16,0.687500,20.937500,18.0,10.945128,2.833213,2.937500,0.522752,-0.226401,-0.393703,-0.167302,Easy


In [133]:
# question_df.drop(columns=["difficulty_binary"],inplace=True)

## Save Feature-Enriched Dataset

In [134]:
# Write the question-level feature table to disk without the index column.
# The target directory is created first so the cell also works on a fresh
# checkout where ../Data/Featured/ does not exist yet.
featured_data_path = Path("../Data/Featured/featured_data.csv")
featured_data_path.parent.mkdir(parents=True, exist_ok=True)
question_df.to_csv(featured_data_path, index=False)