In [1]:
import pandas as pd

import warnings

# Suppress all warnings for the rest of the kernel session.
# NOTE(review): a blanket "ignore" filter also hides warnings raised by pandas and
# other libraries in later cells; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Load the sample CSV from the csv_oct09_24 folder and display its column labels
# (in the recorded output they carry a `responses_lo_1.2.` prefix).
old_df = pd.read_csv("../data/csv_oct09_24/sample_sep_1.csv")
old_df.columns

Index(['Unnamed: 0', 'responses_lo_1.2.institution_id',
       'responses_lo_1.2.class_id', 'responses_lo_1.2.student_id',
       'responses_lo_1.2.course_name', 'responses_lo_1.2.release',
       'responses_lo_1.2.book', 'responses_lo_1.2.response',
       'responses_lo_1.2.points_earned', 'responses_lo_1.2.completes_page',
       'responses_lo_1.2.dt_submitted', 'responses_lo_1.2.attempt',
       'responses_lo_1.2.id_p', 'responses_lo_1.2.chapter',
       'responses_lo_1.2.page', 'responses_lo_1.2.item_id',
       'responses_lo_1.2.item_type', 'responses_lo_1.2.lrn_type',
       'responses_lo_1.2.lrn_question_position',
       'responses_lo_1.2.lrn_question_reference', 'responses_lo_1.2.prompt',
       'responses_lo_1.2.lrn_option_0', 'responses_lo_1.2.lrn_option_1',
       'responses_lo_1.2.lrn_option_2', 'responses_lo_1.2.lrn_option_3',
       'responses_lo_1.2.lrn_option_4', 'responses_lo_1.2.lrn_option_5',
       'responses_lo_1.2.lrn_option_6', 'responses_lo_1.2.lrn_option_7',
 

In [68]:
# Load the level-2 assessment responses file for 3.1 from the csv_oct15_24 folder
# with the default index (so `id_p` stays a regular column) and display its columns.
new_df = pd.read_csv("../data/csv_oct15_24/responses_assessment_level2_3.1.csv")
new_df.columns

Index(['id_p', 'institution_id', 'class_id', 'course_name', 'release', 'book',
       'branch', 'student_id', 'item_id', 'item_type', 'chapter', 'page',
       'response', 'prompt', 'points_possible', 'points_earned',
       'dt_submitted', 'completes_page', 'attempt', 'user_agent',
       'lrn_session_id', 'lrn_response_id', 'lrn_activity_reference',
       'lrn_question_reference', 'lrn_question_position', 'lrn_type',
       'lrn_dt_started', 'lrn_dt_saved', 'lrn_status', 'lrn_response_json',
       'lrn_option_0', 'lrn_option_1', 'lrn_option_2', 'lrn_option_3',
       'lrn_option_4', 'lrn_option_5', 'lrn_option_6', 'lrn_option_7',
       'lrn_option_8', 'lrn_option_9', 'lrn_option_10', 'lrn_option_11',
       'chapter_num', 'page_num', 'level1', 'level2', 'level3', 'level4'],
      dtype='object')

In [69]:
# Row count of the export as loaded, before any de-duplication
len(new_df)

317633

In [70]:
# Drop rows that are exact duplicates across every column.
# NOTE(review): `deduped_new_df` is not referenced again in the visible cells.
deduped_new_df = new_df.drop_duplicates()

In [76]:
import pandas as pd
import re
import rich
import os

from pyBKT.models import Model


def _extract_chapter(path: str) -> str:
    """
    Extract the chapter name from the path
    """
    # The path is in the format of responses_assessment_level2_1.1.csv
    # We want to extract the chapter number from the file name, i.e. 1.1 in this case
    chapter_pattern = re.compile(r".*responses_assessment_level2_(\d+\.\d+)\.csv")

    # Find column from data.columns that match the above pattern and extract out the chapter
    match = chapter_pattern.match(path)
    chapter = match.group(1)
    return chapter


def process_data(path: str, output_folder: str) -> "pd.DataFrame | None":
    """
    Fit a single-skill pyBKT model on one responses file and save each student's
    final predictions to ``<output_folder>/predictions_<chapter>.csv``.

    Args:
        path: CSV file to read. Its name must match the pattern understood by
            ``_extract_chapter``. The first CSV column is used as the index.
        output_folder: Directory for the predictions CSV; created if missing.

    Returns:
        The per-student frame that was written (indexed by ``user_id``, with
        ``final_correct_predictions`` and ``final_state_predictions`` columns),
        or ``None`` when the file has no rows or no rows remain after dropping
        those without ``points_earned``.
    """

    data = pd.read_csv(path, index_col=0, low_memory=False)
    chapter = _extract_chapter(path)

    rich.print(f"[yellow bold]Processing chapter: [/yellow bold]{chapter}")

    if len(data) == 0:
        rich.print("[red bold]No data found, skipped[/red bold]")
        return None

    # Columns we're interested in
    points_earned_col = "points_earned"
    submitted_col = "dt_submitted"
    student_id_col = "student_id"

    # Build the cleaned frame as one chain so no column is assigned onto the
    # result of a filter (the chained-assignment pattern).
    data = (
        # Rows without a score cannot be used as observations
        data.dropna(subset=[points_earned_col])
        # Parse timestamps so the sort below is chronological, not lexical
        .assign(**{submitted_col: lambda frame: pd.to_datetime(frame[submitted_col])})
        # Exact duplicate rows would inflate each student's attempt sequence
        .drop_duplicates()
        # Order each student's observations by submission time
        .sort_values(by=[student_id_col, submitted_col])
    )

    # order_id: 1-based sequence number of observations per student
    data["order_id"] = data.groupby(student_id_col).cumcount() + 1

    # Rename columns to the names used for the pyBKT input frame.
    # NOTE(review): points_earned is passed through unchanged as `correct`;
    # presumably it is always 0/1 -- confirm against the data.
    bkt_data = data.rename(
        columns={
            student_id_col: "user_id",
            points_earned_col: "correct",
        }
    )

    # Only one skill is tracked, so the skill name is a constant
    bkt_data["skill_name"] = 0

    # Keep only the model inputs; drop=True keeps the CSV index column from
    # being re-added to the frame handed to the model.
    bkt_data = bkt_data[["user_id", "order_id", "correct", "skill_name"]].reset_index(
        drop=True
    )

    if len(bkt_data) == 0:
        rich.print("[red bold]No BKT data built, skipped[/red bold]")
        return None

    # Fit the model
    model = Model(seed=42, num_fits=1)
    model.fit(data=bkt_data)

    # Generate predictions for each attempt
    predictions = model.predict(data=bkt_data)

    # Keep the last prediction row of each student (rows are in order_id order)
    final_predictions = predictions.groupby("user_id").last()[
        ["correct_predictions", "state_predictions"]
    ]

    # Rename the aggregated columns
    final_predictions = final_predictions.rename(
        columns={
            "state_predictions": "final_state_predictions",
            "correct_predictions": "final_correct_predictions",
        }
    )

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(output_folder, exist_ok=True)

    final_predictions.to_csv(os.path.join(output_folder, f"predictions_{chapter}.csv"))

    return final_predictions

In [80]:
path = "../data/csv_oct15_24/responses_assessment_level2_3.1.csv"

# Column names used throughout this cell
points_earned_col = "points_earned"
submitted_col = "dt_submitted"
student_id_col = "student_id"

# Read the responses (first CSV column becomes the index) and derive the chapter
data = pd.read_csv(path, index_col=0, low_memory=False)
chapter = _extract_chapter(path)

rich.print(f"[yellow bold]Processing chapter: [/yellow bold]{chapter}")

# Only reports the empty case; the rest of the cell still runs
if not len(data):
    rich.print("[red bold]No data found, skipped[/red bold]")

# Clean-up pipeline: drop unscored rows, parse timestamps, remove exact
# duplicates, then order every student's rows chronologically
data = (
    data.dropna(subset=[points_earned_col])
    .assign(**{submitted_col: lambda frame: pd.to_datetime(frame[submitted_col])})
    .drop_duplicates()
    .sort_values(by=[student_id_col, submitted_col])
)

# 1-based position of each observation within its student's sequence
data["order_id"] = data.groupby(student_id_col).cumcount() + 1

# pyBKT input: renamed columns, fresh integer index, one constant skill,
# and only the four model columns
bkt_data = (
    data.rename(columns={student_id_col: "user_id", points_earned_col: "correct"})
    .reset_index()
    .assign(skill_name=0)
    .loc[:, ["user_id", "order_id", "correct", "skill_name"]]
)

In [86]:
# Inspect one student's rows in the BKT input frame.
# NOTE(review): the recorded output also shows `correct_predictions` and
# `state_predictions`, which the data-preparation cell does not create. This cell
# ran as In [86], after the fit/predict cells, so the output presumably reflects
# out-of-order kernel state -- confirm with Restart & Run All.
bkt_data[bkt_data['user_id'] == '0281beb5-c599-4da7-865f-af93088acfd9']

Unnamed: 0,user_id,order_id,correct,skill_name,correct_predictions,state_predictions
244,0281beb5-c599-4da7-865f-af93088acfd9,1,0,0,0.57407,0.44480
245,0281beb5-c599-4da7-865f-af93088acfd9,2,0,0,0.48510,0.21820
246,0281beb5-c599-4da7-865f-af93088acfd9,3,1,0,0.43450,0.08933
247,0281beb5-c599-4da7-865f-af93088acfd9,4,0,0,0.46380,0.16395
248,0281beb5-c599-4da7-865f-af93088acfd9,5,1,0,0.42488,0.06482
...,...,...,...,...,...,...
283,0281beb5-c599-4da7-865f-af93088acfd9,40,0,0,0.45599,0.14407
284,0281beb5-c599-4da7-865f-af93088acfd9,41,0,0,0.42154,0.05632
285,0281beb5-c599-4da7-865f-af93088acfd9,42,0,0,0.40789,0.02154
286,0281beb5-c599-4da7-865f-af93088acfd9,43,0,0,0.40291,0.00888


In [81]:
# Columns of the prepared frame: the first CSV column now serves as the index,
# and `order_id` is appended as the last column.
data.columns

Index(['institution_id', 'class_id', 'course_name', 'release', 'book',
       'branch', 'student_id', 'item_id', 'item_type', 'chapter', 'page',
       'response', 'prompt', 'points_possible', 'points_earned',
       'dt_submitted', 'completes_page', 'attempt', 'user_agent',
       'lrn_session_id', 'lrn_response_id', 'lrn_activity_reference',
       'lrn_question_reference', 'lrn_question_position', 'lrn_type',
       'lrn_dt_started', 'lrn_dt_saved', 'lrn_status', 'lrn_response_json',
       'lrn_option_0', 'lrn_option_1', 'lrn_option_2', 'lrn_option_3',
       'lrn_option_4', 'lrn_option_5', 'lrn_option_6', 'lrn_option_7',
       'lrn_option_8', 'lrn_option_9', 'lrn_option_10', 'lrn_option_11',
       'chapter_num', 'page_num', 'level1', 'level2', 'level3', 'level4',
       'order_id'],
      dtype='object')

In [87]:
# Select one student's rows, keeping only the id, score and submission time
is_target_student = data['student_id'] == '0281beb5-c599-4da7-865f-af93088acfd9'
student_data = data.loc[is_target_student, ['student_id', 'points_earned', 'dt_submitted']]
student_data

Unnamed: 0_level_0,student_id,points_earned,dt_submitted
id_p,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B1_Review1_09 1,0281beb5-c599-4da7-865f-af93088acfd9,0.00000,2023-12-04 17:54:37.109867
B1_Review1_11 1,0281beb5-c599-4da7-865f-af93088acfd9,0.00000,2023-12-04 17:55:11.702449
B1_Review1_12 1,0281beb5-c599-4da7-865f-af93088acfd9,1.00000,2023-12-04 17:55:23.662697
B1_Review1_13 1,0281beb5-c599-4da7-865f-af93088acfd9,0.00000,2023-12-04 17:55:42.314275
B1_Review2_05 1,0281beb5-c599-4da7-865f-af93088acfd9,1.00000,2023-12-04 18:11:01.373617
...,...,...,...
B4_Review1_16 1,0281beb5-c599-4da7-865f-af93088acfd9,0.00000,2024-03-19 16:34:34.053801
B4_Review1_19 1,0281beb5-c599-4da7-865f-af93088acfd9,0.00000,2024-03-19 16:34:52.080108
B4_Review2_06 1,0281beb5-c599-4da7-865f-af93088acfd9,0.00000,2024-03-19 16:36:11.040208
B4_Review2_10 1,0281beb5-c599-4da7-865f-af93088acfd9,0.00000,2024-03-19 16:39:23.029598


In [88]:
# Check this student's rows for exact duplicates over the three selected columns;
# the de-duplicated frame is only displayed, not stored.
student_data.drop_duplicates()

Unnamed: 0_level_0,student_id,points_earned,dt_submitted
id_p,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B1_Review1_09 1,0281beb5-c599-4da7-865f-af93088acfd9,0.00000,2023-12-04 17:54:37.109867
B1_Review1_11 1,0281beb5-c599-4da7-865f-af93088acfd9,0.00000,2023-12-04 17:55:11.702449
B1_Review1_12 1,0281beb5-c599-4da7-865f-af93088acfd9,1.00000,2023-12-04 17:55:23.662697
B1_Review1_13 1,0281beb5-c599-4da7-865f-af93088acfd9,0.00000,2023-12-04 17:55:42.314275
B1_Review2_05 1,0281beb5-c599-4da7-865f-af93088acfd9,1.00000,2023-12-04 18:11:01.373617
...,...,...,...
B4_Review1_16 1,0281beb5-c599-4da7-865f-af93088acfd9,0.00000,2024-03-19 16:34:34.053801
B4_Review1_19 1,0281beb5-c599-4da7-865f-af93088acfd9,0.00000,2024-03-19 16:34:52.080108
B4_Review2_06 1,0281beb5-c599-4da7-865f-af93088acfd9,0.00000,2024-03-19 16:36:11.040208
B4_Review2_10 1,0281beb5-c599-4da7-865f-af93088acfd9,0.00000,2024-03-19 16:39:23.029598


In [82]:
# Create the pyBKT model with a fixed seed (42) and num_fits=1
model = Model(seed=42, num_fits=1)

In [83]:
import warnings

# Silence RuntimeWarnings
# NOTE(review): the first cell already imports `warnings` and installs a blanket
# "ignore" filter, so this import and filter look redundant.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Fit the model on the prepared BKT frame
model.fit(data=bkt_data)

In [84]:
# Per-row predictions for the same frame the model was fitted on
prediction_df = model.predict(data=bkt_data)

In [89]:
# Set pandas' display.max_rows option to 50 (applies to later cells as well)
pd.set_option('display.max_rows', 50)

# Prediction rows for a single student
prediction_df[prediction_df['user_id'] == '0281beb5-c599-4da7-865f-af93088acfd9']

Unnamed: 0,user_id,order_id,correct,skill_name,correct_predictions,state_predictions
244,0281beb5-c599-4da7-865f-af93088acfd9,1,0,0,0.57407,0.4448
245,0281beb5-c599-4da7-865f-af93088acfd9,2,0,0,0.4851,0.2182
246,0281beb5-c599-4da7-865f-af93088acfd9,3,1,0,0.4345,0.08933
247,0281beb5-c599-4da7-865f-af93088acfd9,4,0,0,0.4638,0.16395
248,0281beb5-c599-4da7-865f-af93088acfd9,5,1,0,0.42488,0.06482
249,0281beb5-c599-4da7-865f-af93088acfd9,6,1,0,0.44733,0.122
250,0281beb5-c599-4da7-865f-af93088acfd9,7,0,0,0.48465,0.21706
251,0281beb5-c599-4da7-865f-af93088acfd9,8,0,0,0.43429,0.08879
252,0281beb5-c599-4da7-865f-af93088acfd9,9,1,0,0.41274,0.03392
253,0281beb5-c599-4da7-865f-af93088acfd9,10,1,0,0.42547,0.06633


In [11]:
# Fitted parameters, keyed by skill ('0' in the recorded output).
# NOTE(review): this cell's execution count (In [11]) predates the fit cell
# (In [83]) and its recorded values differ slightly from the next cell's output,
# so this output is presumably from an earlier fit.
model.coef_

{'0': {'prior': np.float64(0.2596580956384623),
  'learns': array([0.00368407]),
  'guesses': array([0.36249751]),
  'slips': array([0.17124273]),
  'forgets': array([0.])}}

In [17]:
# NOTE(review): `coefs` is initialised but not used in the visible cells.
coefs = []

# Parameters of skill "0", the only skill assigned in the data-preparation cell
coef = model.coef_["0"]
coef

{'prior': np.float64(0.259701541449756),
 'learns': array([0.00368414]),
 'guesses': array([0.36248229]),
 'slips': array([0.17126229]),
 'forgets': array([0.])}

In [18]:
# `prior` is a NumPy float scalar; convert it to a built-in float
float(coef["prior"])

0.259701541449756

In [20]:
# `learns` is a one-element array; take its single value
coef["learns"][0]

np.float64(0.003684136079713055)

In [21]:
# The same value as a built-in float
float(coef["learns"][0])

0.003684136079713055

In [23]:
# Flatten the fitted parameters into one record of built-in floats for this
# chapter; `prior` is a scalar, the other three are one-element arrays.
{
    "chapter": chapter,
    "prior": float(coef["prior"]),
    **{name: float(coef[name][0]) for name in ("learns", "guesses", "slips")},
}

{'chapter': '3.2',
 'prior': 0.259701541449756,
 'learns': 0.003684136079713055,
 'guesses': 0.3624822871232267,
 'slips': 0.17126229068293614}

In [26]:
# One record per chapter label; every record reuses the same fitted parameters
# held in `coef`, so only the "chapter" value differs between rows.
coef_df = pd.DataFrame(
    [
        {
            "chapter": chapter_label,
            "prior": float(coef["prior"]),
            "learns": float(coef["learns"][0]),
            "guesses": float(coef["guesses"][0]),
            "slips": float(coef["slips"][0]),
        }
        for chapter_label in (chapter, "1.1", "1.2")
    ]
)

In [27]:
# Display the rows ordered by chapter label (the labels are strings, so the
# comparison is textual); `coef_df` itself is left unchanged.
coef_df.sort_values(by="chapter")

Unnamed: 0,chapter,prior,learns,guesses,slips
1,1.1,0.2597,0.00368,0.36248,0.17126
2,1.2,0.2597,0.00368,0.36248,0.17126
0,3.2,0.2597,0.00368,0.36248,0.17126
