In [252]:
import pandas as pd
import numpy as np
# Read the cleaned dataset from disk and preview its first rows.
cleaned_csv = "../Data/Cleaned/cleaned_data.csv"
df = pd.read_csv(filepath_or_buffer=cleaned_csv)
df.head(n=5)

Unnamed: 0,timestamp,solving_id,question_id,user_answer,user_id,elapsed_time_seconds
0,2019-08-06 12:56:30.868,1,q5012,b,u1,38.0
1,2019-08-06 12:57:01.062,2,q4706,c,u1,24.0
2,2019-08-06 12:58:13.432,3,q4366,b,u1,68.0
3,2019-08-06 12:58:59.668,4,q4829,a,u1,42.0
4,2019-08-06 13:00:01.774,5,q6528,b,u1,59.0


## 1. Infer Correct Answers
```
The dataset does not contain ground-truth correct answers.
We infer them by majority vote, assuming that the most frequently
selected answer for each question is the correct one.
```

In [253]:
# Count how often each answer option was chosen for every question.
answer_counts = df.groupby(['question_id', 'user_answer']).size()

# For each question, idxmax returns the (question_id, user_answer) index pair
# with the highest count; the second element of that pair is the majority answer.
top_pair_per_question = answer_counts.groupby(level=0).idxmax()
majority_answer = top_pair_per_question.apply(lambda pair: pair[1])

# One row per question: question_id, assumed_correct_answer.
correct_answers = majority_answer.reset_index(name='assumed_correct_answer')

## 2. Control Label Noise (Minimum Attempts Filter)
```
Questions with very few attempts produce unreliable inferred answers.
We keep only questions with at least 10 attempts.
```

In [254]:
# Number of recorded attempts for each question.
attempts_per_question = df['question_id'].value_counts()

# Questions that reach the 10-attempt minimum.
meets_minimum = attempts_per_question >= 10
valid_questions = attempts_per_question[meets_minimum].index

# Keep only the attempts that belong to those questions.
row_is_valid = df['question_id'].isin(valid_questions)
df = df[row_is_valid]

## 3. Create Correctness Indicator (is_correct)
```
A binary feature indicating whether the user's answer matches
the inferred correct answer.
```

In [255]:
# Attach each question's inferred answer to every attempt, then score the attempt.
#
# Columns left behind by a previous run of this cell are dropped first so the
# cell is idempotent: without this, a second run would merge
# `assumed_correct_answer` again, pandas would suffix the duplicates to
# `_x` / `_y`, and the comparison below would raise a KeyError.
df = (
    df
    .drop(columns=['assumed_correct_answer', 'is_correct'], errors='ignore')
    .merge(
        correct_answers,
        on='question_id',
        how='left',
        # Fail loudly if a question ever has more than one inferred answer,
        # instead of silently duplicating attempt rows.
        validate='many_to_one',
    )
)

# 1 when the attempt matches the inferred answer, otherwise 0.
# A missing user_answer compares unequal and would therefore be scored 0.
df['is_correct'] = (
    df['user_answer'] == df['assumed_correct_answer']
).astype(int)


## 4. Aggregate Data to Question Level
```
Behavioral features are aggregated per question to reduce noise.
```

In [256]:
# Named aggregations: output column -> (source column, reducer).
question_level_aggs = {
    'attempts': ('user_id', 'count'),
    'success_rate': ('is_correct', 'mean'),
    'avg_time': ('elapsed_time_seconds', 'mean'),
    'median_time': ('elapsed_time_seconds', 'median'),
    'time_std': ('elapsed_time_seconds', 'std'),
}

# Collapse the attempt-level frame to one row per question.
question_df = (
    df.groupby('question_id')
    .agg(**question_level_aggs)
    .reset_index()
)


## 5. Engineer Safe Question-Level Features
```
These features describe question behavior without encoding difficulty directly.
```

In [257]:
# log(1 + attempts): compresses the range of the attempt counts.
question_df['log_attempts'] = question_df['attempts'].pipe(np.log1p)

# Mean minus median solving time.
question_df['time_skew'] = question_df['avg_time'].sub(question_df['median_time'])

# Standard deviation divided by the mean (coefficient of variation).
question_df['relative_time_variance'] = question_df['time_std'].div(
    question_df['avg_time']
)


## 6. Define Difficulty Target (Last Step)
```
Difficulty is defined only after feature creation: it is the standardized
average solving time minus the standardized success rate.
```

In [258]:
from sklearn.preprocessing import StandardScaler

# Standardize success rate and average solving time across questions.
scaler = StandardScaler()
z_inputs = question_df[['success_rate', 'avg_time']]
z_scores = scaler.fit_transform(z_inputs)

# Columns of z_scores follow the order of z_inputs.
question_df['success_z'] = z_scores[:, 0]
question_df['time_z'] = z_scores[:, 1]

# Higher score = harder: slow questions score up, well-answered ones score down.
question_df['difficulty'] = question_df['time_z'] - question_df['success_z']


In [259]:
# Preview the attempt-level frame with its current columns.
df.head(n=5)

Unnamed: 0,timestamp,solving_id,question_id,user_answer,user_id,elapsed_time_seconds,assumed_correct_answer,is_correct
0,2019-08-06 12:58:13.432,3,q4366,b,u1,68.0,b,1
1,2019-08-06 12:58:59.668,4,q4829,a,u1,42.0,c,0
2,2019-08-06 13:00:01.774,5,q6528,b,u1,59.0,d,0
3,2019-08-06 13:01:03.370,6,q4793,a,u1,58.0,b,0
4,2019-08-06 13:01:41.746,7,q6488,a,u1,35.0,a,1


In [260]:
# Split questions into three quantile bins of the difficulty score;
# the lowest bin is labelled 'Easy' and the highest 'Hard'.
tercile_labels = ['Easy', 'Medium', 'Hard']
question_df['difficulty_class'] = pd.qcut(
    x=question_df['difficulty'], q=3, labels=tercile_labels
)

In [261]:
# Only the two extreme classes are mapped; anything else (e.g. 'Medium')
# has no entry and becomes NaN.
binary_label_map = {'Easy': 'Easy', 'Hard': 'Hard'}
binary_labels = question_df['difficulty_class'].map(binary_label_map)
question_df['difficulty_binary'] = binary_labels

# Discard the questions that received no binary label.
question_df = question_df.dropna(subset=['difficulty_binary'])


In [262]:
# Remove the helper column now that the unlabeled rows have been filtered out.
# Re-assigning (instead of inplace=True) avoids mutating a frame that was
# produced by row filtering, and errors="ignore" lets this cell be re-run
# after the column is already gone instead of raising a KeyError.
question_df = question_df.drop(columns=["difficulty_binary"], errors="ignore")

## Save Feature-Enriched Dataset

In [263]:
# Persist the question-level feature table without the pandas index.
# NOTE(review): success_rate, avg_time, success_z, time_z and difficulty are
# still in question_df here, and difficulty_class is derived from them.
# Exclude them downstream if difficulty_class is used as a prediction target.
featured_csv = "../Data/Featured/featured_data.csv"
question_df.to_csv(path_or_buf=featured_csv, index=False)