In [1]:
import pandas as pd
merged = pd.read_csv('plogs_with_metadata.csv')

print(merged.info())
print(merged.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17476374 entries, 0 to 17476373
Data columns (total 19 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   log_id                  int64  
 1   student_id              int64  
 2   assignment_id           int64  
 3   problem_id              int64  
 4   start_time              object 
 5   time_on_task            float64
 6   answer_before_tutoring  object 
 7   fraction_of_hints_used  float64
 8   attempt_count           int64  
 9   answer_given            bool   
 10  problem_completed       bool   
 11  correct                 object 
 12  content_source          object 
 13  skills                  object 
 14  problem_type            object 
 15  tutoring_types          object 
 16  student_answer_count    float64
 17  mean_correct            float64
 18  mean_time_on_task       float64
dtypes: bool(2), float64(5), int64(5), object(7)
memory usage: 2.2+ GB
None
    log_id  student_id  assignme

In [2]:
merged.isnull().sum().sort_values(ascending=False)

skills                    9012640
fraction_of_hints_used    8925835
tutoring_types            7038373
correct                   4732693
mean_correct              4575934
time_on_task               311345
answer_before_tutoring     283759
mean_time_on_task           48823
student_answer_count        19860
problem_type                19860
content_source              19860
log_id                          0
problem_completed               0
student_id                      0
attempt_count                   0
start_time                      0
problem_id                      0
assignment_id                   0
answer_given                    0
dtype: int64

In [3]:
import pandas as pd
import numpy as np
import ast

df = merged.copy()

# 1. REMOVE logs with *missing skill or correct outcome* (required for topic-learning analysis!)
df = df.dropna(subset=['skills', 'correct'])

# 2. Parse skills into a list, then explode so each row = a single skill attempt
def safe_parse(x):
    try: return ast.literal_eval(x) if isinstance(x,str) else []
    except: return []
df['skills_list'] = df['skills'].map(safe_parse)
df = df.explode('skills_list').rename(columns={'skills_list':'skill'})
df = df.dropna(subset=['skill']) # Remove logs where parsed skill is missing

# 3. Convert correct to binary
df['correct'] = df['correct'].map({True: 1, False: 0})

# 4. time_on_task: fill missing with median, cap large outliers (e.g., 20 min = 1200s)
df['time_on_task'] = df['time_on_task'].fillna(df['time_on_task'].median())
df['time_on_task'] = np.clip(df['time_on_task'], 1, 1200)

# 5. fraction_of_hints_used: fill missing with 0
df['fraction_of_hints_used'] = df['fraction_of_hints_used'].fillna(0.0)

# 6. answer_before_tutoring, answer_given, problem_completed: convert to binary numeric
def to_binary(col):
    return col.map({True:1, False:0, 'True':1, 'False':0}).fillna(0)

df['answer_before_tutoring'] = to_binary(df['answer_before_tutoring'])
df['answer_given'] = to_binary(df['answer_given'])
df['problem_completed'] = to_binary(df['problem_completed'])

# 7. problem_type, content_source: Fill NA as "Unknown" if categorical
for col in ['problem_type', 'content_source', 'tutoring_types']:
    df[col] = df[col].fillna('Unknown')
    df[col] = df[col].astype('category')

# 8. Optional: hist. stats (mean_correct, mean_time_on_task).
# Fill missing with mean of column, else leave as is depending on your model
df['mean_correct'] = df['mean_correct'].fillna(df['mean_correct'].mean())
df['mean_time_on_task'] = df['mean_time_on_task'].fillna(df['mean_time_on_task'].mean())
df['student_answer_count'] = df['student_answer_count'].fillna(0).astype(int)

print("Rows after cleaning:", len(df))
df.info()

Rows after cleaning: 7106683
<class 'pandas.core.frame.DataFrame'>
Index: 7106683 entries, 0 to 17476372
Data columns (total 20 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   log_id                  int64   
 1   student_id              int64   
 2   assignment_id           int64   
 3   problem_id              int64   
 4   start_time              object  
 5   time_on_task            float64 
 6   answer_before_tutoring  float64 
 7   fraction_of_hints_used  float64 
 8   attempt_count           int64   
 9   answer_given            int64   
 10  problem_completed       int64   
 11  correct                 int64   
 12  content_source          category
 13  skills                  object  
 14  problem_type            category
 15  tutoring_types          category
 16  student_answer_count    int64   
 17  mean_correct            float64 
 18  mean_time_on_task       float64 
 19  skill                   object  
dtypes: category(3), float

In [4]:
df.isnull().sum().sort_values(ascending=False)

log_id                    0
student_id                0
mean_time_on_task         0
mean_correct              0
student_answer_count      0
tutoring_types            0
problem_type              0
skills                    0
content_source            0
correct                   0
problem_completed         0
answer_given              0
attempt_count             0
fraction_of_hints_used    0
answer_before_tutoring    0
time_on_task              0
start_time                0
problem_id                0
assignment_id             0
skill                     0
dtype: int64

In [5]:
df.to_csv('plogs_with_metadata_cleaned.csv', index=False)

In [6]:
import pandas as pd

df = pd.read_csv("plogs_with_metadata_cleaned.csv")
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7106683 entries, 0 to 7106682
Data columns (total 20 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   log_id                  int64  
 1   student_id              int64  
 2   assignment_id           int64  
 3   problem_id              int64  
 4   start_time              object 
 5   time_on_task            float64
 6   answer_before_tutoring  float64
 7   fraction_of_hints_used  float64
 8   attempt_count           int64  
 9   answer_given            int64  
 10  problem_completed       int64  
 11  correct                 int64  
 12  content_source          object 
 13  skills                  object 
 14  problem_type            object 
 15  tutoring_types          object 
 16  student_answer_count    int64  
 17  mean_correct            float64
 18  mean_time_on_task       float64
 19  skill                   object 
dtypes: float64(5), int64(9), object(6)
memory usage: 1.1+ GB
None
    lo

In [7]:
for col in ['student_id', 'problem_id', 'skill', 'problem_type', 'content_source', 'tutoring_types']:
    print(f"{col}: {df[col].nunique()} unique values")

student_id: 174906 unique values
problem_id: 42680 unique values
skill: 372 unique values
problem_type: 10 unique values
content_source: 15 unique values
tutoring_types: 8 unique values


In [8]:
for col in ['student_id', 'problem_id', 'assignment_id', 'skill', 'problem_type', 'content_source', 'tutoring_types']:
    if col in df.columns:
        df[col] = df[col].astype('category')