In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/map-charting-student-math-misunderstandings/sample_submission.csv
/kaggle/input/map-charting-student-math-misunderstandings/train.csv
/kaggle/input/map-charting-student-math-misunderstandings/test.csv
/kaggle/input/make-data-count-pre-downloaded-transformer-model/config.json
/kaggle/input/make-data-count-pre-downloaded-transformer-model/config (1).json
/kaggle/input/make-data-count-pre-downloaded-transformer-model/tokenizer.json
/kaggle/input/make-data-count-pre-downloaded-transformer-model/tokenizer_config.json
/kaggle/input/make-data-count-pre-downloaded-transformer-model/pytorch_model.bin
/kaggle/input/make-data-count-pre-downloaded-transformer-model/special_tokens_map.json


In [2]:
import pandas as pd
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import StratifiedKFold
from torch import nn
from tqdm.auto import tqdm
import numpy as np
import os
import gc

# Set a random seed for reproducibility
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed(42)

In [3]:
# --- 1. Data Preparation and Preprocessing ---

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\[math\][^\[]*\[/math\]', ' math_expression ', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def prepare_data(df, is_train=True):
    df['QuestionText'] = df['QuestionText'].fillna('')
    df['StudentExplanation'] = df['StudentExplanation'].fillna('')
    df['text'] = 'Question: ' + df['QuestionText'] + ' Explanation: ' + df['StudentExplanation']
    df['text'] = df['text'].apply(clean_text)

    if is_train:
        df['Category'] = df['Category'].fillna('False_Neither')
        df['Misconception'] = df['Misconception'].fillna('NA')

        df['is_correct'] = df['Category'].str.startswith('True').astype(int)
        df['has_misconception'] = (~df['Misconception'].isin(['NA'])).astype(int)
        
        # Combined label for stratified splitting
        df['combined_label'] = df['Category'] + ':' + df['Misconception']
        
        # Encode misconceptions
        misconception_labels = sorted(df.loc[df['Misconception'] != 'NA', 'Misconception'].unique())
        misconception_map = {label: i for i, label in enumerate(misconception_labels)}
        df['misconception_id'] = df['Misconception'].apply(lambda x: misconception_map.get(x, -1))
        
        return df, misconception_map
    
    return df, None

try:
    train_df = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/train.csv')
    test_df = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/test.csv')
except FileNotFoundError:
    print("Ensure train.csv and test.csv are in the current directory.")
    exit()

train_df, misconception_map = prepare_data(train_df, is_train=True)
test_df, _ = prepare_data(test_df, is_train=False)

In [4]:
# --- 2. Model Architecture and Dataset Class ---

MODEL_PATH = '/kaggle/input/make-data-count-pre-downloaded-transformer-model'
MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS = 3
LR = 2e-5
N_SPLITS = 5

class MathDataset(Dataset):
    def __init__(self, df, tokenizer, is_train=True):
        self.df = df
        self.tokenizer = tokenizer
        self.is_train = is_train

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = str(row['text'])
        
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=MAX_LEN,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        if self.is_train:
            is_correct = torch.tensor(row['is_correct'], dtype=torch.long)
            has_misconception = torch.tensor(row['has_misconception'], dtype=torch.long)
            misconception_id = torch.tensor(row['misconception_id'], dtype=torch.long)
            return input_ids, attention_mask, is_correct, has_misconception, misconception_id
        else:
            return input_ids, attention_mask

class MultiTaskModel(nn.Module):
    def __init__(self, pretrained_model, num_correct, num_misconception, num_specific_misconceptions):
        super(MultiTaskModel, self).__init__()
        self.encoder = pretrained_model
        self.correct_head = nn.Linear(self.encoder.config.hidden_size, num_correct)
        self.misconception_head = nn.Linear(self.encoder.config.hidden_size, num_misconception)
        self.specific_misconception_head = nn.Linear(self.encoder.config.hidden_size, num_specific_misconceptions)

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[0][:, 0, :]
        correct_logits = self.correct_head(pooled_output)
        misconception_logits = self.misconception_head(pooled_output)
        specific_misconception_logits = self.specific_misconception_head(pooled_output)
        return correct_logits, misconception_logits, specific_misconception_logits

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    pretrained_model = AutoModel.from_pretrained(MODEL_PATH)
except OSError:
    print(f"Pre-trained model '{MODEL_PATH}' not found. Please ensure it is correctly added as a Kaggle dataset.")
    exit()

num_specific_misconceptions = len(misconception_map)
reverse_misconception_map = {i: label for label, i in misconception_map.items()}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  return self.fget.__get__(instance, owner)()
Some weights of DistilBertModel were not initialized from the model checkpoint at /kaggle/input/make-data-count-pre-downloaded-transformer-model and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.word_embeddings.weight', 'transformer.layer.0.attention.k_lin.bias', 'transformer.layer.0.attention.k_lin.weight', 'transformer.layer.0.attention.out_lin.bias', 'transformer.layer.0.attention.out_lin.weight', 'transformer.layer.0.attention.q_lin.bias', 'transformer.layer.0.attention.q_lin.weight', 'transformer.layer.0.attention.v_lin.bias', 'transformer.layer.0.attention.v_lin.weight', 'transformer.layer.0.ffn.lin1.bias', 'transformer.layer.0.ffn.lin1.weight', 'transformer.layer.0.ffn.lin2.bias', 'transformer.layer.0.ffn.lin2.weight', 'transformer.layer.0.output_layer_norm.bias', 'transformer.layer.0.output_layer_norm.weight', 'transformer.layer.0.sa_layer_nor

In [5]:
# --- 3. Training and Validation ---

def train_model(model, data_loader, optimizer, scheduler, correct_criterion, misconception_criterion, specific_criterion):
    model.train()
    total_loss = 0
    for batch in tqdm(data_loader, desc="Training"):
        input_ids, attention_mask, is_correct, has_misconception, misconception_id = [t.to(device) for t in batch]
        
        optimizer.zero_grad()
        correct_logits, misconception_logits, specific_misconception_logits = model(input_ids, attention_mask)
        
        loss1 = correct_criterion(correct_logits, is_correct)
        loss2 = misconception_criterion(misconception_logits, has_misconception)
        
        mask = has_misconception == 1
        if mask.sum() > 0:
            specific_loss = specific_criterion(specific_misconception_logits[mask], misconception_id[mask])
            loss = loss1 + loss2 + specific_loss
        else:
            loss = loss1 + loss2
        
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

    return total_loss / len(data_loader)

def evaluate_model(model, data_loader, correct_criterion, misconception_criterion, specific_criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids, attention_mask, is_correct, has_misconception, misconception_id = [t.to(device) for t in batch]
            
            correct_logits, misconception_logits, specific_misconception_logits = model(input_ids, attention_mask)
            
            loss1 = correct_criterion(correct_logits, is_correct)
            loss2 = misconception_criterion(misconception_logits, has_misconception)
            mask = has_misconception == 1
            if mask.sum() > 0:
                specific_loss = specific_criterion(specific_misconception_logits[mask], misconception_id[mask])
                loss = loss1 + loss2 + specific_loss
            else:
                loss = loss1 + loss2

            total_loss += loss.item()
    return total_loss / len(data_loader)

# Total number of logits/outputs from the multi-task model
TOTAL_OUTPUT_DIM = 2 + 2 + num_specific_misconceptions

# Stratified K-Fold Cross-Validation
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
test_dataset = MathDataset(test_df, tokenizer, is_train=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
oof_predictions = np.zeros((len(train_df), TOTAL_OUTPUT_DIM))
test_predictions = []

for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df['combined_label'])):
    print(f"--- Fold {fold+1}/{N_SPLITS} ---")
    
    train_fold_df = train_df.iloc[train_idx].reset_index(drop=True)
    val_fold_df = train_df.iloc[val_idx].reset_index(drop=True)
    
    train_dataset = MathDataset(train_fold_df, tokenizer)
    val_dataset = MathDataset(val_fold_df, tokenizer)
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

    model = MultiTaskModel(pretrained_model, num_correct=2, num_misconception=2, num_specific_misconceptions=num_specific_misconceptions)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=LR)
    total_steps = len(train_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    correct_criterion = nn.CrossEntropyLoss()
    misconception_criterion = nn.CrossEntropyLoss()
    specific_criterion = nn.CrossEntropyLoss(ignore_index=-1)

    for epoch in range(EPOCHS):
        print(f"Epoch {epoch+1}/{EPOCHS}")
        train_loss = train_model(model, train_loader, optimizer, scheduler, correct_criterion, misconception_criterion, specific_criterion)
        val_loss = evaluate_model(model, val_loader, correct_criterion, misconception_criterion, specific_criterion)
        print(f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
    
    model.eval()
    val_preds = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, _, _, _ = [t.to(device) for t in batch]
            correct_logits, misconception_logits, specific_misconception_logits = model(input_ids, attention_mask)
            val_preds.append(torch.cat([correct_logits, misconception_logits, specific_misconception_logits], dim=1).cpu().numpy())
    
    oof_predictions[val_idx] = np.vstack(val_preds)
    
    fold_test_preds = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask = [t.to(device) for t in batch]
            correct_logits, misconception_logits, specific_misconception_logits = model(input_ids, attention_mask)
            fold_test_preds.append(torch.cat([correct_logits, misconception_logits, specific_misconception_logits], dim=1).cpu().numpy())
    test_predictions.append(np.vstack(fold_test_preds))
    
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()



--- Fold 1/5 ---




Epoch 1/3


Training:   0%|          | 0/1835 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/459 [00:00<?, ?it/s]

Train Loss: 1.7670, Val Loss: 1.1203
Epoch 2/3


Training:   0%|          | 0/1835 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/459 [00:00<?, ?it/s]

Train Loss: 0.9668, Val Loss: 0.8785
Epoch 3/3


Training:   0%|          | 0/1835 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/459 [00:00<?, ?it/s]

Train Loss: 0.7741, Val Loss: 0.7994
--- Fold 2/5 ---
Epoch 1/3




Training:   0%|          | 0/1835 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/459 [00:00<?, ?it/s]

Train Loss: 0.9993, Val Loss: 0.7728
Epoch 2/3


Training:   0%|          | 0/1835 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/459 [00:00<?, ?it/s]

Train Loss: 0.7004, Val Loss: 0.6616
Epoch 3/3


Training:   0%|          | 0/1835 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/459 [00:00<?, ?it/s]

Train Loss: 0.5778, Val Loss: 0.6396
--- Fold 3/5 ---
Epoch 1/3


Training:   0%|          | 0/1835 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/459 [00:00<?, ?it/s]

Train Loss: 0.8255, Val Loss: 0.6614
Epoch 2/3


Training:   0%|          | 0/1835 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/459 [00:00<?, ?it/s]

Train Loss: 0.5751, Val Loss: 0.5758
Epoch 3/3


Training:   0%|          | 0/1835 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/459 [00:00<?, ?it/s]

Train Loss: 0.4671, Val Loss: 0.5384
--- Fold 4/5 ---
Epoch 1/3


Training:   0%|          | 0/1835 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/459 [00:00<?, ?it/s]

Train Loss: 0.7212, Val Loss: 0.5087
Epoch 2/3


Training:   0%|          | 0/1835 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/459 [00:00<?, ?it/s]

Train Loss: 0.4833, Val Loss: 0.4652
Epoch 3/3


Training:   0%|          | 0/1835 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/459 [00:00<?, ?it/s]

Train Loss: 0.3729, Val Loss: 0.4325
--- Fold 5/5 ---
Epoch 1/3


Training:   0%|          | 0/1835 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/459 [00:00<?, ?it/s]

Train Loss: 0.6390, Val Loss: 0.4584
Epoch 2/3


Training:   0%|          | 0/1835 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/459 [00:00<?, ?it/s]

Train Loss: 0.4021, Val Loss: 0.3744
Epoch 3/3


Training:   0%|          | 0/1835 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/459 [00:00<?, ?it/s]

Train Loss: 0.3042, Val Loss: 0.3565


In [6]:
# --- 4. Generate Submission File ---
print("--- Generating Submission File ---")

# Average predictions from all folds for the test set
avg_test_predictions = np.mean(test_predictions, axis=0)

# Extract predictions for each task
correct_preds_logits = avg_test_predictions[:, :2] # First 2 columns for is_correct
misconception_preds_logits = avg_test_predictions[:, 2:4] # Next 2 columns for has_misconception
specific_misconception_preds_logits = avg_test_predictions[:, 4:] # Remaining columns for specific misconceptions

# Convert logits to probabilities (if needed) or directly to predicted classes
# For classification, usually, argmax on logits is sufficient to get predicted class
predicted_is_correct = np.argmax(correct_preds_logits, axis=1) # 0 or 1
predicted_has_misconception = np.argmax(misconception_preds_logits, axis=1) # 0 or 1
predicted_misconception_id = np.argmax(specific_misconception_preds_logits, axis=1)

# Initialize columns for submission DataFrame
test_df['is_correct'] = predicted_is_correct
test_df['has_misconception'] = predicted_has_misconception
test_df['misconception_id'] = predicted_misconception_id

# Map numeric misconception IDs back to original labels
test_df['Misconception'] = test_df.apply(
    lambda row: reverse_misconception_map[row['misconception_id']] if row['has_misconception'] == 1 else 'NA',
    axis=1
)

# Determine the final Category based on is_correct and Misconception
def determine_category(row):
    if row['is_correct'] == 1: # True
        if row['has_misconception'] == 1 and row['Misconception'] != 'NA':
            return f"True_{row['Misconception']}"
        else:
            return "True_Neither"
    else: # False
        if row['has_misconception'] == 1 and row['Misconception'] != 'NA':
            return f"False_{row['Misconception']}"
        else:
            return "False_Neither"

test_df['Category'] = test_df.apply(determine_category, axis=1)

# Create the submission DataFrame
# The error "KeyError: 'Id'" implies test_df['Id'] was attempted.
# Kaggle datasets often use 'id' (lowercase) or 'student_id'. Assuming 'id' is correct.
submission_df = pd.DataFrame({
    'id': test_df['row_id'], # Corrected from 'Id' to 'id'
    'Category': test_df['Category']
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' created successfully.")

--- Generating Submission File ---
Submission file 'submission.csv' created successfully.
