In [None]:
import pandas as pd
import numpy as np

In [29]:
df = pd.read_csv("../Data/Raw/Raw_Data.csv")
df.head()


Unnamed: 0,timestamp,solving_id,question_id,user_answer,elapsed_time,user_id
0,1565096190868,1,q5012,b,38000,u1
1,1565096221062,2,q4706,c,24000,u1
2,1565096293432,3,q4366,b,68000,u1
3,1565096339668,4,q4829,a,42000,u1
4,1565096401774,5,q6528,b,59000,u1


In [30]:
df.shape
df.columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260019 entries, 0 to 260018
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   timestamp     260019 non-null  int64 
 1   solving_id    260019 non-null  int64 
 2   question_id   260019 non-null  object
 3   user_answer   260008 non-null  object
 4   elapsed_time  260019 non-null  int64 
 5   user_id       260019 non-null  object
dtypes: int64(3), object(3)
memory usage: 11.9+ MB


## 1.1 Convert Timestamp
The timestamp column is stored as Unix time in milliseconds.

In [31]:
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')

## 1.2 Convert Elapsed Time to Seconds

Elapsed time is converted from milliseconds to seconds.

In [32]:
df['elapsed_time_seconds'] = df['elapsed_time'] / 1000
df.drop(columns=['elapsed_time'], inplace=True)

## 1.3 Ensure Correct String Types

In [33]:
df['question_id'] = df['question_id'].astype(str)
df['user_id'] = df['user_id'].astype(str)
df['user_answer'] = df['user_answer'].astype(str)


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260019 entries, 0 to 260018
Data columns (total 6 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   timestamp             260019 non-null  datetime64[ns]
 1   solving_id            260019 non-null  int64         
 2   question_id           260019 non-null  object        
 3   user_answer           260019 non-null  object        
 4   user_id               260019 non-null  object        
 5   elapsed_time_seconds  260019 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 11.9+ MB


In [35]:
df.head()

Unnamed: 0,timestamp,solving_id,question_id,user_answer,user_id,elapsed_time_seconds
0,2019-08-06 12:56:30.868,1,q5012,b,u1,38.0
1,2019-08-06 12:57:01.062,2,q4706,c,u1,24.0
2,2019-08-06 12:58:13.432,3,q4366,b,u1,68.0
3,2019-08-06 12:58:59.668,4,q4829,a,u1,42.0
4,2019-08-06 13:00:01.774,5,q6528,b,u1,59.0


## 2. Handle Missing Values

Missing values in core behavioral fields cannot be inferred reliably.

In [36]:
df.isna().sum()

timestamp               0
solving_id              0
question_id             0
user_answer             0
user_id                 0
elapsed_time_seconds    0
dtype: int64

In [37]:
df.dropna(inplace=True)

## 3. Remove Duplicate Interactions

Duplicate records artificially inflate attempts and bias difficulty estimation.

A duplicate is defined as:

+ Same user

+ Same question

+ Same timestamp

In [38]:
df = df.drop_duplicates(
    subset=['user_id', 'question_id', 'timestamp']
)

## 4. Standardize User Answers

User answers may contain inconsistent formatting.

In [39]:
df['user_answer'] = (
    df['user_answer']
    .str.lower()
    .str.strip()
)

In [40]:
df['user_answer'].value_counts()

user_answer
b      74194
a      70325
c      69027
d      46460
nan       11
Name: count, dtype: int64

# Handle Invalid and Extreme Time Values
## 5.1 Remove Zero or Negative Time Values

In [41]:
df = df[df['elapsed_time_seconds'] > 0]

## 5.2 Remove Unrealistically Fast Answers

Answers faster than 1 second are considered invalid.

In [42]:
df = df[df['elapsed_time_seconds'] >= 1]

## 5.3 Remove Extreme Slow Outliers (Top 1%)

Very large time values introduce noise.

In [43]:
upper_limit = df['elapsed_time_seconds'].quantile(0.99)
df = df[df['elapsed_time_seconds'] <= upper_limit]

# 6. Validate User and Question IDs

Ensure IDs follow the expected format.

## 6.1 Validate Question IDs

In [44]:
df = df[df['question_id'].str.startswith('q')]

## 6.2 Validate User IDs

In [45]:
df = df[df['user_id'].str.startswith('u')]

## 7. Sort Dataset for Behavioral Analysis

Sorting is required for later feature engineering steps

In [46]:
df = df.sort_values(['user_id', 'timestamp'])

In [47]:
df.head()

Unnamed: 0,timestamp,solving_id,question_id,user_answer,user_id,elapsed_time_seconds
0,2019-08-06 12:56:30.868,1,q5012,b,u1,38.0
1,2019-08-06 12:57:01.062,2,q4706,c,u1,24.0
2,2019-08-06 12:58:13.432,3,q4366,b,u1,68.0
3,2019-08-06 12:58:59.668,4,q4829,a,u1,42.0
4,2019-08-06 13:00:01.774,5,q6528,b,u1,59.0


In [48]:
df.describe()


Unnamed: 0,timestamp,solving_id,elapsed_time_seconds
count,256640,256640.0,256640.0
mean,2018-10-24 22:25:29.638843392,943.408911,24.531012
min,2017-11-12 10:02:45.027000,1.0,1.0
25%,2018-02-07 15:15:59.222000128,151.0,15.333
50%,2018-09-02 11:45:04.292500224,521.0,20.0
75%,2019-07-29 09:49:10.191500032,1312.0,29.0
max,2019-12-02 09:47:20.437000,7772.0,108.8
std,,1171.770549,15.837848


In [49]:
df.nunique()

timestamp               254758
solving_id                7766
question_id              11425
user_answer                  5
user_id                   1000
elapsed_time_seconds       962
dtype: int64

## 8. Save Cleaned Dataset

In [50]:
df.to_csv("../Data/Cleaned/cleaned_data.csv", index=False)