# Import libraries

In [2]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score,roc_curve
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import xgboost as xgb
import lightgbm as lgb
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
import joblib

2023-07-10 23:07:38.765749: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load Data

In [7]:
# Read every raw table from the local ./data directory (one CSV per table).
df_assignment_rel = pd.read_csv('./data/assignment_relationships.csv')
df_training = pd.read_csv('./data/training_unit_test_scores.csv')
df_eval = pd.read_csv('./data/evaluation_unit_test_scores.csv')
df_actionlogs = pd.read_csv('./data/files/action_logs.csv')
df_assignment = pd.read_csv('./data/assignment_details.csv')
df_sequence = pd.read_csv('./data/sequence_details.csv')
df_problem = pd.read_csv('./data/problem_details.csv')
df_total = pd.read_csv('./data/unit_test_scores.csv')

# Feature Engineering

In [8]:
# Stack the training rows and the evaluation rows so features are built once for both,
# attach the related in-unit assignment log ids through the relationship table,
# and keep only the columns the feature steps below need.
df_final = (
    pd.concat([df_training, df_eval[['assignment_log_id', 'problem_id', 'score']]])
    .merge(
        df_assignment_rel,
        left_on='assignment_log_id',
        right_on='unit_test_assignment_log_id',
        how='left',
    )
    [['unit_test_assignment_log_id', 'problem_id', 'score', 'in_unit_assignment_log_id']]
)

## Action log features

In [9]:
# Per-group mode helper: returns the most repeated value of one column within each group.
def get_max_value_counts(df, group_cols, count_col):
    """Return the most frequent value of ``count_col`` within each group.

    Args:
        df: DataFrame containing ``group_cols`` and ``count_col``.
        group_cols: Column name or list of column names that define the groups.
            Any number of grouping columns is supported.
        count_col: Column whose most frequent value is wanted per group.

    Returns:
        DataFrame with one row per group: the grouping columns plus
        ``count_col`` holding that group's most frequent value. A group only
        appears if ``value_counts`` produced at least one entry for it
        (``value_counts`` ignores NaN by default).
    """
    counts = df.groupby(group_cols)[count_col].value_counts()
    # idxmax returns, per group, the full index tuple (*group keys, value)
    # of the entry with the highest count.
    max_indices = counts.groupby(group_cols).idxmax().reset_index(name=count_col)
    # The counted value is always the LAST tuple element, whatever the number
    # of grouping columns, so take position -1 instead of a fixed position.
    max_indices[count_col] = max_indices[count_col].str[-1]
    return max_indices

In [10]:
def preprocess_action_logs(df_actionlogs, df_final):
    """Aggregate action-log rows into one feature row per (unit test assignment, problem).

    The caller's ``df_actionlogs`` is not modified; all normalisation happens
    on a derived frame.

    Args:
        df_actionlogs: Action-log table. Must contain ``assignment_log_id``,
            ``problem_id``, ``action``, ``timestamp``, ``hint_id``,
            ``explanation_id``, ``max_attempts``, ``available_core_tutoring``,
            ``score_viewable`` and ``continuous_score_viewable``.
        df_final: Table with ``unit_test_assignment_log_id``, ``problem_id``,
            ``score`` and ``in_unit_assignment_log_id``.

    Returns:
        DataFrame keyed by ``unit_test_assignment_log_id`` and ``problem_id``
        with summed one-hot action / tutoring indicators, the first ``score``
        of the group, and the most frequent ``max_attempts``,
        ``score_viewable`` and ``continuous_score_viewable``.

    Raises:
        KeyError: If one of the hard-coded one-hot columns below is absent
            because the corresponding category never occurs in the data.
    """
    keys = ['unit_test_assignment_log_id', 'problem_id']
    problem_level_cols = ['max_attempts', 'available_core_tutoring',
                          'score_viewable', 'continuous_score_viewable']
    mode_cols = ['max_attempts', 'score_viewable', 'continuous_score_viewable']

    # Give every log row the first value recorded for its problem_id.
    by_problem = df_actionlogs.groupby('problem_id')
    problem_level = {col: by_problem[col].transform('first') for col in problem_level_cols}

    # Fill null values with -1 ('available_core_tutoring' is left as-is; it is one-hot encoded).
    for col in mode_cols:
        problem_level[col] = problem_level[col].fillna(-1)

    # assign() builds a new frame, so the caller's df_actionlogs stays untouched.
    logs = df_actionlogs.assign(**problem_level)

    # One-hot encode 'action' and 'available_core_tutoring' and append the indicators.
    one_hot = pd.get_dummies(logs[['action', 'available_core_tutoring']])
    logs = pd.concat([logs, one_hot], axis=1)

    # Drop columns that are not used as features.
    logs = logs.drop(columns=['timestamp', 'problem_id', 'available_core_tutoring',
                              'action', 'hint_id', 'explanation_id'])

    # Attach the log rows of each related in-unit assignment.
    merged = df_final.merge(logs, left_on='in_unit_assignment_log_id',
                            right_on='assignment_log_id', how='left')

    # Sum the indicator columns per group; keep the group's first score.
    summed_cols = [
        'action_answer_requested', 'action_assignment_finished',
        'action_assignment_resumed', 'action_assignment_started',
        'action_continue_selected', 'action_correct_response',
        'action_explanation_requested', 'action_hint_requested',
        'action_live_tutor_requested', 'action_open_response',
        'action_problem_finished', 'action_problem_started',
        'action_skill_related_video_requested', 'action_wrong_response',
        'available_core_tutoring_answer', 'available_core_tutoring_explanation',
        'available_core_tutoring_hint', 'available_core_tutoring_no_tutoring',
    ]
    aggregations = {col: 'sum' for col in summed_cols}
    aggregations['score'] = 'first'
    df_final_action = merged.groupby(keys, as_index=False).agg(aggregations)

    # Append the per-group mode (most repeated value) of each remaining column.
    # NOTE(review): these are inner merges and value_counts skips NaN, so a group
    # with no matching action-log rows appears to be dropped here — confirm intended.
    result = df_final_action
    for col in mode_cols:
        result = result.merge(get_max_value_counts(merged, keys, col), on=keys)

    return result


In [11]:
# Replace the row-level table with the per-(unit test assignment, problem) action-log features.
df_final = preprocess_action_logs(df_actionlogs, df_final)

## Assignment Details features

In [12]:
def merge_assignment_data(df_final, df_assignment, df_assignment_rel):
    """Add ``sequence_id`` and the share of unfinished in-unit assignments to ``df_final``.

    The caller's ``df_assignment`` is not modified.

    Args:
        df_final: Table containing ``unit_test_assignment_log_id``.
        df_assignment: Assignment table with ``assignment_log_id``,
            ``sequence_id`` and ``assignment_end_time``.
        df_assignment_rel: Relationship table with
            ``unit_test_assignment_log_id`` and ``in_unit_assignment_log_id``.

    Returns:
        ``df_final`` with two extra columns: ``sequence_id`` (looked up by
        ``unit_test_assignment_log_id``) and ``notfinish_percent`` (rows with a
        null ``assignment_end_time`` divided by the number of distinct in-unit
        assignments, rounded to 4 decimals).
    """
    # Flag unfinished assignments on a derived frame rather than on the caller's table.
    assignment_flags = df_assignment.assign(
        notfinish=df_assignment['assignment_end_time'].isnull().astype(int)
    )

    # One row per (unit test assignment, in-unit assignment) with the unfinished flag.
    ad_ar = df_assignment_rel.merge(
        assignment_flags,
        how='left',
        left_on='in_unit_assignment_log_id',
        right_on='assignment_log_id',
    )

    per_unit_test = ad_ar.groupby('unit_test_assignment_log_id')
    # Number of distinct in-unit assignments per unit test assignment.
    assignment_total = per_unit_test['in_unit_assignment_log_id'].nunique()
    # Number of unfinished in-unit assignments; only the flag is summed,
    # not the identifier columns.
    unfinished_total = per_unit_test['notfinish'].sum()

    notfinish_percent = (unfinished_total / assignment_total).round(4).rename('notfinish_percent')

    # Look up the sequence of the unit test assignment itself.
    sequence_lookup = df_assignment[['assignment_log_id', 'sequence_id']]
    df_final = df_final.merge(sequence_lookup, left_on='unit_test_assignment_log_id',
                              right_on='assignment_log_id', how='left')

    # Attach the unfinished share; the Series is indexed by unit_test_assignment_log_id.
    df_final = df_final.merge(notfinish_percent, how='left',
                              left_on='assignment_log_id', right_index=True)

    # The helper join key is no longer needed.
    return df_final.drop(columns=['assignment_log_id'])


In [13]:
# Add sequence_id and the unfinished-assignment share to the feature table.
df_final = merge_assignment_data(df_final, df_assignment, df_assignment_rel)

## Sequence Details features

In [14]:
def merge_sequence_data(df_final, df_sequence):
    """Replace ``sequence_id`` with one-hot indicators of its folder path levels 1-4.

    Args:
        df_final: Feature table containing ``sequence_id``.
        df_sequence: Sequence table with ``sequence_id`` and
            ``sequence_folder_path_level_1`` .. ``sequence_folder_path_level_4``.

    Returns:
        ``df_final`` without ``sequence_id`` and with the one-hot columns
        appended at the end.
    """
    path_levels = [
        'sequence_folder_path_level_1',
        'sequence_folder_path_level_2',
        'sequence_folder_path_level_3',
        'sequence_folder_path_level_4',
    ]

    # Only the id and the four folder levels are taken from the sequence table.
    sequence_paths = df_sequence[['sequence_id'] + path_levels]
    merged = df_final.merge(sequence_paths, on='sequence_id', how='left')

    # Encode the folder levels and append the indicators next to the existing columns.
    indicators = pd.get_dummies(merged[path_levels])
    combined = pd.concat([merged, indicators], axis=1)

    # The raw folder levels and the join key are not features themselves.
    return combined.drop(columns=path_levels + ['sequence_id'])


In [15]:
# Swap sequence_id for one-hot folder-path indicators.
df_final = merge_sequence_data(df_final, df_sequence)

## Problem features

In [16]:

# Define a function to clean the text
def clean_text(text):
    """Normalise a string: lowercase, strip non-alphanumerics, collapse whitespace.

    Args:
        text: The string to normalise.

    Returns:
        The lowercased text containing only letters, digits and single spaces,
        with no leading or trailing whitespace.
    """
    lowered = text.lower()
    # Keep only letters, digits and whitespace.
    alphanumeric = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    # Squeeze every whitespace run into one space and trim the ends.
    return re.sub(r'\s+', ' ', alphanumeric).strip()

In [17]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer

def merge_problem_data(df_final, df_problem):
    """Join problem metadata onto ``df_final`` and turn it into numeric features.

    Steps, in order:
      1. Left-merge ``df_problem`` on ``problem_id``.
      2. Split ``problem_skill_code`` on '.' into four parts and one-hot encode
         them together with ``problem_type`` and ``problem_multipart_position``.
      3. Expand the bracketed, comma-separated ``problem_text_bert_pca`` string
         into 32 float columns.
      4. Embed the cleaned ``problem_skill_description`` with a
         SentenceTransformer, reduce to 32 PCA components and standard-scale.
      5. Frequency-encode ``problem_multipart_id``.
      6. Drop the raw source columns.

    Args:
        df_final: Feature table containing ``problem_id``.
        df_problem: Problem table with ``problem_id``, ``problem_type``,
            ``problem_multipart_id``, ``problem_multipart_position``,
            ``problem_skill_code``, ``problem_skill_description`` and
            ``problem_text_bert_pca``.

    Returns:
        The merged DataFrame with the engineered columns added and the raw
        problem columns removed.
    """
    n_components = 32
    skill_code_cols = ['skill_code1', 'skill_code2', 'skill_code3', 'skill_code4']
    bert_cols = ['problem_text_bert_pca_' + str(i) for i in range(n_components)]
    skill_desc_cols = [f"problem_skill_description{i}" for i in range(n_components)]

    # Merge the dataframes
    df_final = df_final.merge(df_problem, on='problem_id', how='left')

    # Split problem_skill_code into separate columns
    df_final[skill_code_cols] = df_final.problem_skill_code.str.split('.', expand=True)

    # Perform one-hot encoding on selected columns
    one_hot = pd.get_dummies(df_final[['problem_type', 'problem_multipart_position'] + skill_code_cols])

    # Concatenate the original DataFrame with the one-hot encoded DataFrame
    df_final = pd.concat([df_final, one_hot], axis=1)

    # Split problem_text_bert_pca into separate float columns.
    # regex=False: the brackets are literal characters, not regex patterns; this
    # avoids depending on the changing default of the ``regex`` argument.
    bert_text = (
        df_final['problem_text_bert_pca']
        .str.replace('[', '', regex=False)
        .str.replace(']', '', regex=False)
    )
    df_final[bert_cols] = bert_text.str.split(',', expand=True)
    df_final[bert_cols] = df_final[bert_cols].astype(float)

    # Fill missing values and clean problem_skill_description
    descriptions = df_final.problem_skill_description.fillna('NA').apply(clean_text)

    # Load the BERT model
    model = SentenceTransformer('bert-base-uncased')

    # Encode the text data using BERT
    embeddings_skill = model.encode(descriptions.tolist())

    # Reduce the embeddings to 32 dimensions; random_state pins the result when
    # scikit-learn selects a randomized SVD solver, so reruns are reproducible.
    pca = PCA(n_components=n_components, random_state=0)
    reduced_embeddings_skill = pca.fit_transform(embeddings_skill)

    # Scale the reduced embeddings using StandardScaler
    scaler = StandardScaler()
    scaled_embeddings_skill = scaler.fit_transform(reduced_embeddings_skill)

    # Build the embedding frame on df_final's own index so the column assignment
    # below lines rows up explicitly.
    embeddings_df = pd.DataFrame(scaled_embeddings_skill[:, :n_components], index=df_final.index)

    # Assign the embeddings to the desired columns in df_final
    df_final[skill_desc_cols] = embeddings_df

    # Frequency-encode problem_multipart_id: map each id to how often it occurs
    frequency_map = df_final['problem_multipart_id'].value_counts().to_dict()
    df_final['Multipart_ID_Frequency'] = df_final['problem_multipart_id'].map(frequency_map)

    # Drop the raw source columns
    df_final = df_final.drop(columns=['problem_multipart_id', 'problem_multipart_position', 'problem_type', 'problem_skill_code', 'problem_skill_description', 'problem_text_bert_pca', 'skill_code1', 'skill_code2', 'skill_code3', 'skill_code4'])

    return df_final


In [18]:
# Add the problem-level features (one-hot codes, text embeddings, frequency encoding).
df_final = merge_problem_data(df_final, df_problem)

  df_final['problem_text_bert_pca']=df_final['problem_text_bert_pca'].str.replace('[','').str.replace(']','')
No sentence-transformers model found with name /home/aswani/.cache/torch/sentence_transformers/bert-base-uncased. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /home/aswani/.cache/torch/sentence_transformers/bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the chec

## Training and evaluation data

In [None]:
def get_eval_train_data(df_final, df_total, df_eval):
    """Split the feature table into an evaluation set and a training set.

    Args:
        df_final: Feature table with ``unit_test_assignment_log_id``,
            ``problem_id`` and ``score`` (null for evaluation rows).
        df_total: Table with ``assignment_log_id``, ``problem_id`` and ``score``
            that supplies the score for the evaluation rows.
        df_eval: Evaluation table with ``id``, ``assignment_log_id`` and
            ``problem_id``.

    Returns:
        Tuple ``(df_eval_final, df_training)``. Both have the two identifier
        columns removed and remaining nulls filled with -1. The evaluation
        frame holds the rows whose score was null and that match both
        ``df_eval`` and ``df_total``; its ``score`` comes from ``df_total``.
    """
    left_keys = ['unit_test_assignment_log_id', 'problem_id']
    right_keys = ['assignment_log_id', 'problem_id']

    # Training rows: score present, identifiers removed, gaps filled with -1.
    train_rows = (
        df_final
        .dropna(subset=['score'])
        .drop(columns=left_keys)
        .fillna(-1)
    )

    # Evaluation rows: score missing in df_final, taken from df_total instead.
    unlabeled = df_final[df_final['score'].isna()]
    eval_rows = (
        unlabeled
        .merge(df_eval[['id', 'assignment_log_id', 'problem_id']],
               left_on=left_keys, right_on=right_keys)
        .merge(df_total[['assignment_log_id', 'problem_id', 'score']],
               left_on=left_keys, right_on=right_keys)
        .drop(columns=left_keys)
        .fillna(-1)
        # The two merges leave suffixed duplicates; keep only df_total's score.
        .drop(columns=['assignment_log_id_x', 'assignment_log_id_y', 'score_x'])
        .rename(columns={'score_y': 'score'})
        .drop(columns=['id'])
    )

    return eval_rows, train_rows



In [None]:
# From here on df_eval / df_training hold the model-ready feature tables, not the raw CSV contents.
df_eval, df_training = get_eval_train_data(df_final, df_total, df_eval)

In [114]:
# (rows, columns) of the evaluation feature table returned by get_eval_train_data.
df_eval.shape

(124455, 404)

In [115]:
# (rows, columns) of the training feature table returned by get_eval_train_data.
df_training.shape

(452439, 404)

In [None]:
# Save the engineered feature tables.
# pd.to_pickle(obj, path) needs the object to store; calling it with only a path
# raises TypeError, so pickle each frame through its own to_pickle method.
# NOTE: the files are pickles despite the '.csv' suffix; the paths are left
# unchanged in case other code loads them by these names.
df_training.to_pickle('./saved_files/train_data.csv')
df_eval.to_pickle('./saved_files/eval_data.csv')