In [None]:
import os


!pip install --no-index --find-links=/kaggle/input/download-compatible-offline-dependencies/offline-packages nltk sentencepiece wordcloud

import nltk
nltk.data.path.append("/kaggle/input/download-compatible-offline-dependencies/nltk_data")

from nltk.corpus import stopwords, wordnet
# print("Stopwords example:", stopwords.words("english")[:5])
# print("WordNet example:", wordnet.synsets("data")[0].definition())


print("Enhanced MAP2025 Complete Solution with Deep Learning Ensemble - OFFLINE VERSION")
print("Model Training + EDA Visualization + Validation and final Submission")

# Core libraries
import numpy as np
import pandas as pd
import re
import pickle
import warnings
import nltk
from tqdm import tqdm
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score, classification_report, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

# Set paths for offline model
OFFLINE_MODEL_PATH = "/kaggle/input/model-downloader/deberta-v3-small-offline"


# nltk.download('wordnet')
# nltk.download('stopwords')
from nltk.corpus import stopwords

# Suppress warnings
# NOTE: this silences every warning category process-wide, including
# deprecation and convergence warnings from the libraries imported above.
warnings.filterwarnings('ignore')

# Create directories

# Output folders written to later in this file: model checkpoints and
# pickled encoders, EDA figures, and validation CSVs/plots respectively.
os.makedirs('models', exist_ok=True)
os.makedirs('eda_visualizations', exist_ok=True)
os.makedirs('validation_results', exist_ok=True)

# Configuration
def get_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    return torch.device('cuda' if torch.cuda.is_available() else 'cpu')

DEVICE = get_device()  # device that models and batches are moved to
MAX_LENGTH = 512  # max_length passed to the tokenizer (inputs are truncated to it)
BATCH_SIZE = 32  # batch size for the train/validation/test DataLoaders
N_FOLDS = 10  # number of StratifiedKFold splits
EPOCHS = 10  # training epochs per fold

print(f"Using device: {DEVICE}")

 
# SECTION 1: DATA PREPROCESSING & FEATURE ENGINEERING
 

def extract_math_features(text):
    """Count simple math-related surface patterns in ``text``.

    Args:
        text: The string to scan. Any non-string value (e.g. NaN) is
            treated as having no features.

    Returns:
        A dict with the keys ``frac_count``, ``number_count``,
        ``operator_count``, ``decimal_count``, ``question_mark`` and
        ``math_keyword_count``, in that order. ``question_mark`` is 0/1;
        every other value is a match count.
    """
    feature_names = (
        'frac_count', 'number_count', 'operator_count',
        'decimal_count', 'question_mark', 'math_keyword_count',
    )
    if not isinstance(text, str):
        return dict.fromkeys(feature_names, 0)

    def n_matches(pattern, haystack):
        return len(re.findall(pattern, haystack))

    counts = {}
    counts['frac_count'] = n_matches(r'FRAC_\d+_\d+|\\frac', text)
    counts['number_count'] = n_matches(r'\b\d+\b', text)
    counts['operator_count'] = n_matches(r'[\+\-\*\/\=]', text)
    counts['decimal_count'] = n_matches(r'\d+\.\d+', text)
    counts['question_mark'] = 1 if '?' in text else 0
    # Keywords are matched case-insensitively by lowering the text first.
    counts['math_keyword_count'] = n_matches(
        r'solve|calculate|equation|fraction|decimal', text.lower()
    )
    return counts

def create_features(df):
    """Build hand-crafted numeric features and the combined transformer input.

    The input frame is copied first, so the caller's DataFrame is left
    untouched; use the returned frame.

    Added columns:
        * ``mc_answer_len``, ``explanation_len``, ``question_len`` and
          ``explanation_to_question_ratio`` (character-length features);
        * the six ``extract_math_features`` counts for each text column,
          unprefixed for ``QuestionText``, prefixed ``mc_`` for
          ``MC_Answer`` and ``exp_`` for ``StudentExplanation``;
        * ``sentence``: the "Question: ... Answer: ... Explanation: ..."
          string fed to the transformer.

    Args:
        df: DataFrame containing ``QuestionText``, ``MC_Answer`` and
            ``StudentExplanation`` columns. Missing text is replaced by ''.

    Returns:
        A new DataFrame with the original columns followed by the features.
    """
    text_cols = ['QuestionText', 'MC_Answer', 'StudentExplanation']
    prefixes = {'QuestionText': '', 'MC_Answer': 'mc_', 'StudentExplanation': 'exp_'}

    # Work on a copy so the caller's frame is never partially modified.
    df = df.copy()

    # Fill missing text
    for col in text_cols:
        df[col] = df[col].fillna('')

    # Basic length features
    df['mc_answer_len'] = df['MC_Answer'].str.len()
    df['explanation_len'] = df['StudentExplanation'].str.len()
    df['question_len'] = df['QuestionText'].str.len()
    # +1 keeps the ratio finite for empty questions.
    df['explanation_to_question_ratio'] = df['explanation_len'] / (df['question_len'] + 1)

    # Math-specific features. Building one DataFrame from a list of dicts
    # avoids constructing a Series per row (as ``.apply(pd.Series)`` does)
    # and still yields the right columns when the frame is empty.
    feature_names = list(extract_math_features(None))
    pieces = [df]
    for col in text_cols:
        records = [extract_math_features(value) for value in df[col]]
        mf = pd.DataFrame(records, index=df.index, columns=feature_names)
        mf.columns = [f'{prefixes[col]}{name}' for name in feature_names]
        pieces.append(mf)
    df = pd.concat(pieces, axis=1)

    # Combined text for transformer
    df['sentence'] = (
        "Question: " + df['QuestionText'] +
        " Answer: " + df['MC_Answer'] +
        " Explanation: " + df['StudentExplanation']
    )
    return df

 
# (EDA)
 

def perform_eda(data):
    """Run the exploratory analysis and save its figures.

    Four analyses are produced, each printed as a summary and saved as a PNG
    under ``eda_visualizations/`` (the directory must already exist):

    1. class distribution of ``Category`` and the top 10 ``Misconception``s;
    2. explanation length distribution and length by top 5 categories;
    3. correlation of the numeric features from ``create_features`` with
       the label-encoded targets;
    4. word clouds of cleaned explanations for the 6 most frequent
       misconceptions.

    Side effects: the columns ``explanation_len`` and ``clean_explanation``
    are added to ``data`` in place.

    Args:
        data: DataFrame with ``Category``, ``Misconception``,
            ``QuestionText``, ``MC_Answer`` and ``StudentExplanation``.
            ``Misconception`` is label-encoded here, so it is expected to
            have no missing values.
    """
    print("\n" + "="*60)
    print("PERFORMING EXPLORATORY DATA ANALYSIS")
    print("="*60)

    plt.style.use('ggplot')

    # 1. Class Distribution Analysis
    print("1. Analyzing class distributions...")
    fig, ax = plt.subplots(1, 2, figsize=(18, 7))

    # Category distribution
    cat_counts = data['Category'].value_counts()
    ax[0].bar(cat_counts.index, cat_counts.values, color='skyblue')
    ax[0].set_title('Category Distribution', fontsize=14)
    ax[0].set_ylabel('Count', fontsize=12)
    ax[0].tick_params(axis='x', rotation=45)

    # Misconception distribution
    misc_counts = data['Misconception'].value_counts()
    misc_counts = misc_counts.sort_values(ascending=False)
    top_10_misc = misc_counts.head(10)
    ax[1].bar(top_10_misc.index, top_10_misc.values, color='lightcoral')
    ax[1].set_title('Top 10 Misconceptions', fontsize=14)
    ax[1].set_ylabel('Count', fontsize=12)
    ax[1].tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.savefig('eda_visualizations/class_distribution.png', dpi=300)
    plt.show()

    print("Class Distribution Summary:")
    print("Categories:")
    print(cat_counts)
    print("\nTop Misconceptions:")
    print(top_10_misc)

    # 2. Text Length Analysis
    print("\n2. Analyzing text lengths...")
    data['explanation_len'] = data['StudentExplanation'].str.len()

    fig, ax = plt.subplots(1, 2, figsize=(18, 6))

    # Distribution of explanation lengths
    sns.histplot(data['explanation_len'], bins=50, kde=True, ax=ax[0])
    ax[0].set_title('Distribution of Explanation Lengths', fontsize=14)
    ax[0].set_xlabel('Character Count')
    ax[0].set_ylabel('Frequency')

    # Length vs Category
    top_categories = data['Category'].value_counts().index[:5]
    sns.boxplot(
        x='Category',
        y='explanation_len',
        data=data[data['Category'].isin(top_categories)],
        ax=ax[1]
    )
    ax[1].set_title('Explanation Length by Top 5 Categories', fontsize=14)
    ax[1].set_ylabel('Character Count')
    ax[1].tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.savefig('eda_visualizations/text_length_analysis.png', dpi=300)
    plt.show()

    print("Text Length Statistics:")
    print(data['explanation_len'].describe())

    # 3. Feature Correlation Analysis
    print("\n3. Analyzing feature correlations...")
    # Features are computed on a copy so ``data`` only gains the two
    # columns documented above.
    temp_df = create_features(data.copy())

    # Select numeric features only for correlation analysis
    numeric_features = temp_df.select_dtypes(include=[np.number]).columns
    feature_cols_eda = [c for c in numeric_features if c not in ['row_id', 'QuestionId']]

    if len(feature_cols_eda) > 0:
        target_cols = ['category_encoded', 'misconception_encoded']

        # Add target encodings
        le_cat = LabelEncoder()
        le_misc = LabelEncoder()
        temp_df['category_encoded'] = le_cat.fit_transform(temp_df['Category'])
        temp_df['misconception_encoded'] = le_misc.fit_transform(temp_df['Misconception'])

        # Correlation analysis
        features_df = temp_df[feature_cols_eda + target_cols].fillna(0)
        corr = features_df.corr()

        # Keep only feature rows: the targets' correlations with themselves
        # and with each other are not features.
        target_corrs = corr[target_cols].drop(target_cols, axis=0)

        plt.figure(figsize=(12, 8))
        sns.heatmap(
            target_corrs,
            cmap='coolwarm',
            annot=True,
            fmt=".2f",
            square=True
        )
        plt.title('Feature Correlation with Targets', fontsize=16)
        plt.tight_layout()
        plt.savefig('eda_visualizations/feature_correlation.png', dpi=300)
        plt.show()

        # Top correlations, ranked over feature rows only. Slicing the full
        # column with [1:6] relied on the self-correlation sorting first and
        # could report the other target as a "feature".
        top_cat_corrs = target_corrs['category_encoded'].sort_values(ascending=False).head(5)
        top_misc_corrs = target_corrs['misconception_encoded'].sort_values(ascending=False).head(5)

        print("Top Features Correlated with Category:")
        print(top_cat_corrs)
        print("\nTop Features Correlated with Misconception:")
        print(top_misc_corrs)

    # 4. Word Cloud Analysis
    print("\n4. Creating word clouds...")
    base_stopwords = set(stopwords.words('english'))
    custom_stopwords = {'would', 'could', 'one', 'two', 'three'}
    stop_words = base_stopwords.union(custom_stopwords)

    def clean_text(text):
        """Lowercase, strip punctuation/digits, drop stopwords and short words."""
        if not isinstance(text, str):
            return ""
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\d+', '', text)
        words = text.split()
        return ' '.join([w for w in words if w not in stop_words and len(w) > 2])

    data['clean_explanation'] = data['StudentExplanation'].apply(clean_text)

    # Generate word clouds for top misconceptions
    fig, axes = plt.subplots(2, 3, figsize=(20, 12))
    axes = axes.flatten()

    # Hide every panel up front so skipped or unused ones do not show an
    # empty set of axes in the saved figure.
    for panel in axes:
        panel.axis('off')

    top_misconceptions = data['Misconception'].value_counts().index[:6]

    for i, misc in enumerate(top_misconceptions):
        text = " ".join(data[data['Misconception'] == misc]['clean_explanation'])
        # Joining several empty cleaned explanations yields only spaces,
        # which is truthy but makes WordCloud.generate raise; strip first.
        if not text.strip():
            continue

        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            max_words=50
        ).generate(text)

        axes[i].imshow(wordcloud, interpolation='bilinear')
        axes[i].set_title(f"Keywords: {misc}", fontsize=14)
        axes[i].axis('off')

    plt.tight_layout()
    plt.savefig('eda_visualizations/keyword_wordclouds.png', dpi=300)
    plt.show()

    print("EDA completed! Visualizations saved to /eda_visualizations")

 
# DEEP LEARNING MODEL
 

# Transformers classes; the model and tokenizer are loaded from OFFLINE_MODEL_PATH, so no internet access is needed
from transformers import DebertaV2Model, DebertaV2Tokenizer

class MathMisconceptionModel(nn.Module):
    """DeBERTa text encoder fused with tabular features, with two heads.

    The encoder and its tokenizer are loaded from ``OFFLINE_MODEL_PATH``.
    The first-token embedding of the encoder output is concatenated with a
    64-dimensional projection of the numeric features, and the result feeds
    two independent MLP heads (category and misconception).

    Args:
        n_categories: Number of category classes (size of the first output).
        n_misconceptions: Number of misconception classes (second output).
        feature_dim: Width of the numeric feature vector per sample.
    """

    def __init__(self, n_categories, n_misconceptions, feature_dim):
        super().__init__()
        # Load from offline path
        self.bert = DebertaV2Model.from_pretrained(OFFLINE_MODEL_PATH)
        self.tokenizer = DebertaV2Tokenizer.from_pretrained(OFFLINE_MODEL_PATH)

        # Read the encoder width from its config instead of hard-coding it,
        # so a checkpoint with a different hidden size still works.
        hidden_size = self.bert.config.hidden_size
        feature_emb_dim = 64
        fused_dim = hidden_size + feature_emb_dim

        self.feature_processor = nn.Sequential(
            nn.Linear(feature_dim, feature_emb_dim),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        self.category_head = nn.Sequential(
            nn.Linear(fused_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, n_categories)
        )
        self.misconception_head = nn.Sequential(
            nn.Linear(fused_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, n_misconceptions)
        )

    def forward(self, input_texts, features):
        """Tokenize ``input_texts``, encode them and classify.

        Args:
            input_texts: Batch of raw strings; tokenized here with padding
                and truncation to ``MAX_LENGTH``.
            features: Float tensor of numeric features, one row per text,
                already on the same device as the model.

        Returns:
            Tuple ``(category_logits, misconception_logits)``.
        """
        tokens = self.tokenizer(
            input_texts,
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH,
            return_tensors="pt"
        ).to(features.device)  # follow the batch's device, not a global
        outputs = self.bert(**tokens)
        text_emb = outputs.last_hidden_state[:, 0, :]  # first-token embedding
        feat_emb = self.feature_processor(features)
        combined = torch.cat([text_emb, feat_emb], dim=1)

        return self.category_head(combined), self.misconception_head(combined)

 
# DATASET & TRAINING
 
class MathDataset(Dataset):
    """Dataset pairing each raw text with its feature row and optional labels.

    Items are dicts with ``text`` and ``features`` (float tensor); when
    category labels were supplied, ``cat_labels`` and ``misc_labels`` (long
    tensors) are included as well.
    """

    def __init__(self, texts, features, cat_labels=None, misc_labels=None):
        self.texts, self.features = texts, features
        self.cat_labels, self.misc_labels = cat_labels, misc_labels
        # Labels count as present whenever category labels were given.
        self.has_labels = cat_labels is not None

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        sample = dict(
            text=self.texts[idx],
            features=torch.tensor(self.features[idx], dtype=torch.float),
        )
        if not self.has_labels:
            return sample
        label_sources = (
            ('cat_labels', self.cat_labels),
            ('misc_labels', self.misc_labels),
        )
        for key, labels in label_sources:
            sample[key] = torch.tensor(labels[idx], dtype=torch.long)
        return sample

def focal_loss(logits, targets, alpha=0.75, gamma=2.0):
    """Mean focal loss over a batch of class logits.

    Each sample's cross-entropy is scaled by ``alpha * (1 - p) ** gamma``,
    where ``p = exp(-cross_entropy)`` is the probability given to the true
    class, so confidently-correct samples contribute less.

    Args:
        logits: Tensor of unnormalized class scores, one row per sample.
        targets: Long tensor of true class indices.
        alpha: Constant scale applied to every sample's loss.
        gamma: Exponent of the ``(1 - p)`` down-weighting term.

    Returns:
        Scalar tensor with the batch mean.
    """
    per_sample_ce = nn.functional.cross_entropy(logits, targets, reduction='none')
    prob_true = torch.exp(-per_sample_ce)
    weighted = alpha * (1 - prob_true) ** gamma * per_sample_ce
    return weighted.mean()

def train_model(model, loader, optimizer):
    """Train ``model`` for one pass over ``loader``.

    Each batch is scored with focal loss on both heads, combined as
    ``0.6 * category + 0.4 * misconception``, followed by one optimizer step.

    Args:
        model: Callable as ``model(texts, features)`` returning
            ``(category_logits, misconception_logits)``.
        loader: Iterable of batches with ``text``, ``features``,
            ``cat_labels`` and ``misc_labels`` entries.
        optimizer: Optimizer over the model's parameters.

    Returns:
        Tuple ``(mean_batch_loss, category_f1, misconception_f1)`` where the
        F1 scores are weighted F1 over the predictions made during training.
    """
    model.train()
    running_loss = 0
    cat_pred_all, cat_true_all = [], []
    misc_pred_all, misc_true_all = [], []

    for batch in tqdm(loader, desc='Training'):
        cat_true = batch['cat_labels']
        misc_true = batch['misc_labels']

        optimizer.zero_grad()
        cat_logits, misc_logits = model(batch['text'], batch['features'].to(DEVICE))

        cat_term = focal_loss(cat_logits, cat_true.to(DEVICE))
        misc_term = focal_loss(misc_logits, misc_true.to(DEVICE))
        batch_loss = 0.6 * cat_term + 0.4 * misc_term
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()

        # Labels stay on the CPU in the batch, so only predictions need moving.
        cat_pred_all.extend(torch.argmax(cat_logits, 1).cpu().numpy())
        cat_true_all.extend(cat_true.numpy())
        misc_pred_all.extend(torch.argmax(misc_logits, 1).cpu().numpy())
        misc_true_all.extend(misc_true.numpy())

    f1_cat = f1_score(cat_true_all, cat_pred_all, average='weighted')
    f1_misc = f1_score(misc_true_all, misc_pred_all, average='weighted')
    return running_loss / len(loader), f1_cat, f1_misc

def validate_model(model, loader):
    """Run ``model`` in eval mode over ``loader`` and collect class probabilities.

    Args:
        model: Callable as ``model(texts, features)`` returning
            ``(category_logits, misconception_logits)``.
        loader: Iterable of batches with ``text`` and ``features`` entries.

    Returns:
        Tuple ``(category_probs, misconception_probs)`` of NumPy arrays with
        one softmax-normalized row per sample, in loader order.
    """
    model.eval()
    cat_chunks = []
    misc_chunks = []
    with torch.no_grad():
        for batch in tqdm(loader, desc='Validating'):
            feats = batch['features'].to(DEVICE)
            cat_logits, misc_logits = model(batch['text'], feats)
            cat_chunks.append(torch.softmax(cat_logits, 1).cpu().numpy())
            misc_chunks.append(torch.softmax(misc_logits, 1).cpu().numpy())
    cat_probs = np.vstack(cat_chunks)
    misc_probs = np.vstack(misc_chunks)
    return cat_probs, misc_probs

 
#VALIDATION & ACCURACY CALCULATION
 

def evaluate_fold_predictions(true_categories, pred_categories, true_misconceptions, pred_misconceptions, fold_num):
    """Compute and print accuracy/F1 metrics for one validation fold.

    Args:
        true_categories: True category labels.
        pred_categories: Predicted category labels, aligned with the above.
        true_misconceptions: True misconception labels.
        pred_misconceptions: Predicted misconception labels.
        fold_num: Fold identifier, echoed in the output and the result.

    Returns:
        Dict with ``fold``, ``category_accuracy``, ``category_f1``,
        ``misconception_accuracy``, ``misconception_f1`` (weighted F1) and
        ``combined_accuracy`` (exact match of the joined
        ``"category:misconception"`` label).
    """
    def joined(cats, miscs):
        return [f"{cat}:{misc}" for cat, misc in zip(cats, miscs)]

    # Per-target metrics
    acc_cat = accuracy_score(true_categories, pred_categories)
    f1_cat = f1_score(true_categories, pred_categories, average='weighted')
    acc_misc = accuracy_score(true_misconceptions, pred_misconceptions)
    f1_misc = f1_score(true_misconceptions, pred_misconceptions, average='weighted')

    # Joint metric: a row counts only if both parts are right.
    pred_pairs = joined(pred_categories, pred_misconceptions)
    true_pairs = joined(true_categories, true_misconceptions)
    acc_joint = accuracy_score(true_pairs, pred_pairs)

    print(f"Fold {fold_num} Results:")
    print(f"  Category Accuracy: {acc_cat:.4f}")
    print(f"  Category F1: {f1_cat:.4f}")
    print(f"  Misconception Accuracy: {acc_misc:.4f}")
    print(f"  Misconception F1: {f1_misc:.4f}")
    print(f"  Combined Accuracy: {acc_joint:.4f}")

    return {
        'fold': fold_num,
        'category_accuracy': acc_cat,
        'category_f1': f1_cat,
        'misconception_accuracy': acc_misc,
        'misconception_f1': f1_misc,
        'combined_accuracy': acc_joint,
    }

def save_detailed_predictions(df, fold_predictions, cat_enc, misc_enc):
    """Save per-row validation predictions and a per-fold accuracy summary.

    Writes ``validation_results/detailed_predictions.csv`` and
    ``validation_results/fold_summary.csv`` (the directory is created if
    missing).

    Args:
        df: Training DataFrame with ``row_id``, ``QuestionText``,
            ``MC_Answer``, ``StudentExplanation``, ``Category`` and
            ``Misconception`` columns; rows are addressed positionally.
        fold_predictions: Mapping ``fold -> (indices, cat_probs, misc_probs)``
            where ``indices`` are positional row indices into ``df`` and the
            probability arrays have one row per index.
        cat_enc: Fitted encoder exposing ``inverse_transform`` for categories.
        misc_enc: Fitted encoder exposing ``inverse_transform`` for
            misconceptions.

    Returns:
        Tuple ``(detailed_df, summary_df)``. Both are empty when
        ``fold_predictions`` contains no rows.
    """
    os.makedirs('validation_results', exist_ok=True)

    detailed_results = []

    for fold_num, (indices, cat_probs, misc_probs) in fold_predictions.items():
        # Arg-max predictions converted back to their original labels.
        pred_categories = cat_enc.inverse_transform(np.argmax(cat_probs, axis=1))
        pred_misconceptions = misc_enc.inverse_transform(np.argmax(misc_probs, axis=1))

        for i, idx in enumerate(indices):
            # Fetch the row once; repeated ``df.iloc[idx]`` builds a new
            # Series on every access.
            row = df.iloc[idx]
            cat_ok = row['Category'] == pred_categories[i]
            misc_ok = row['Misconception'] == pred_misconceptions[i]

            result = {
                'fold': fold_num,
                'row_id': row['row_id'],
                'QuestionText': row['QuestionText'][:100] + "...",  # Truncate for readability
                'MC_Answer': row['MC_Answer'],
                'StudentExplanation': row['StudentExplanation'][:150] + "...",  # Truncate
                'true_category': row['Category'],
                'predicted_category': pred_categories[i],
                'category_correct': cat_ok,
                'true_misconception': row['Misconception'],
                'predicted_misconception': pred_misconceptions[i],
                'misconception_correct': misc_ok,
                'combined_correct': bool(cat_ok) and bool(misc_ok),
            }

            # Top 3 (or fewer, if there are fewer classes) by probability,
            # highest first; labels are decoded in one call per row.
            top_cat_indices = np.argsort(cat_probs[i])[-3:][::-1]
            top_cat_labels = cat_enc.inverse_transform(top_cat_indices)
            for j, (cat_idx, label) in enumerate(zip(top_cat_indices, top_cat_labels)):
                result[f'top_{j+1}_category'] = label
                result[f'top_{j+1}_category_prob'] = cat_probs[i][cat_idx]

            top_misc_indices = np.argsort(misc_probs[i])[-3:][::-1]
            top_misc_labels = misc_enc.inverse_transform(top_misc_indices)
            for j, (misc_idx, label) in enumerate(zip(top_misc_indices, top_misc_labels)):
                result[f'top_{j+1}_misconception'] = label
                result[f'top_{j+1}_misconception_prob'] = misc_probs[i][misc_idx]

            detailed_results.append(result)

    # Save to CSV
    detailed_df = pd.DataFrame(detailed_results)
    detailed_df.to_csv('validation_results/detailed_predictions.csv', index=False)

    # Create summary by fold. With no predictions there is no 'fold' column
    # to iterate over, so the summary is simply left empty.
    summary_columns = [
        'fold', 'total_samples', 'category_accuracy',
        'misconception_accuracy', 'combined_accuracy',
    ]
    summary_results = []
    if not detailed_df.empty:
        for fold_num in detailed_df['fold'].unique():
            fold_data = detailed_df[detailed_df['fold'] == fold_num]
            summary_results.append({
                'fold': fold_num,
                'total_samples': len(fold_data),
                'category_accuracy': fold_data['category_correct'].mean(),
                'misconception_accuracy': fold_data['misconception_correct'].mean(),
                'combined_accuracy': fold_data['combined_correct'].mean(),
            })

    summary_df = pd.DataFrame(summary_results, columns=summary_columns)
    summary_df.to_csv('validation_results/fold_summary.csv', index=False)

    print("Detailed predictions saved to validation_results/detailed_predictions.csv")
    print("Fold summary saved to validation_results/fold_summary.csv")

    return detailed_df, summary_df

 
#MAIN WORKFLOW
 

def main():
    print("Starting Complete MAP2025 Solution Pipeline - OFFLINE VERSION")
    
    # Load datasets
    print("\nLoading datasets...")
    train = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/train.csv')
    test = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/test.csv')
    
    print(f"Train shape: {train.shape}")
    print(f"Test shape: {test.shape}")
    
    # Fill missing values
    train['Misconception'] = train['Misconception'].fillna('NA')
    
    # PERFORM EDA
    perform_eda(train)
    
    # Feature engineering
    print("\nCreating features...")
    train = create_features(train)
    test = create_features(test)
    
    # Encode targets
    cat_enc = LabelEncoder().fit(train['Category'])
    misc_enc = LabelEncoder().fit(train['Misconception'])
    train['category_encoded'] = cat_enc.transform(train['Category'])
    train['misconception_encoded'] = misc_enc.transform(train['Misconception'])
    
    # Save encoders
    with open('models/cat_encoder.pkl', 'wb') as f:
        pickle.dump(cat_enc, f)
    with open('models/misc_encoder.pkl', 'wb') as f:
        pickle.dump(misc_enc, f)
    print("Label encoders saved.")
    
    # Select features - only numeric columns
    drop_cols = [
        'Category', 'Misconception', 'sentence',
        'QuestionText', 'MC_Answer', 'StudentExplanation',
        'category_encoded', 'misconception_encoded',
        'clean_explanation'
    ]
    
    # Get only numeric columns and exclude unwanted ones
    numeric_cols = train.select_dtypes(include=[np.number]).columns
    feature_cols = [c for c in numeric_cols if c not in drop_cols and not c.startswith('Unnamed')]
    
    print(f"Selected {len(feature_cols)} features: {feature_cols[:10]}...")
    
    # Save feature columns
    with open('models/feature_cols.pkl', 'wb') as f:
        pickle.dump(feature_cols, f)
    print("Feature columns saved.")
    
    # Scale features
    for col in feature_cols:
        if col not in test.columns:
            test[col] = 0
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train[feature_cols].fillna(0))
    X_test = scaler.transform(test[feature_cols].fillna(0))
    
    # Prepare data
    texts_train = train['sentence'].tolist()
    texts_test = test['sentence'].tolist()
    y_cat = train['category_encoded'].values
    y_misc = train['misconception_encoded'].values
    
    # Cross-validation setup
    kf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
    oof_cat_preds = np.zeros((len(train), len(cat_enc.classes_)))
    oof_misc_preds = np.zeros((len(train), len(misc_enc.classes_)))
    
    # For validation tracking
    fold_predictions = {}
    fold_metrics = []
    
    # Test predictions
    test_cat_trad = np.zeros((len(test), len(cat_enc.classes_)))
    test_cat_dl = np.zeros((len(test), len(cat_enc.classes_)))
    test_misc_trad = np.zeros((len(test), len(misc_enc.classes_)))
    test_misc_dl = np.zeros((len(test), len(misc_enc.classes_)))
    
    # Cross-validation training
    print(f"\nStarting {N_FOLDS}-Fold Cross-Validation Training...")
    
    for fold, (tr_idx, va_idx) in enumerate(kf.split(X_train, y_cat)):
        print(f"\n====== Fold {fold+1}/{N_FOLDS} ======")
        X_tr, X_va = X_train[tr_idx], X_train[va_idx]
        ycat_tr, ycat_va = y_cat[tr_idx], y_cat[va_idx]
        ymis_tr, ymis_va = y_misc[tr_idx], y_misc[va_idx]
        txt_tr = [texts_train[i] for i in tr_idx]
        txt_va = [texts_train[i] for i in va_idx]
        
        # Traditional models
        print("Training traditional models...")
        cat_model = LogisticRegression(class_weight='balanced', max_iter=1000, solver='lbfgs')
        misc_model = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)
        cat_model.fit(X_tr, ycat_tr)
        misc_model.fit(X_tr, ymis_tr)
        
        # Traditional predictions
        test_cat_trad += cat_model.predict_proba(X_test) / N_FOLDS
        test_misc_trad += misc_model.predict_proba(X_test) / N_FOLDS
        
        # Deep Learning
        print("Training deep learning model...")
        train_ds = MathDataset(txt_tr, X_tr, ycat_tr, ymis_tr)
        val_ds = MathDataset(txt_va, X_va, ycat_va, ymis_va)
        train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
        val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
        test_ds = MathDataset(texts_test, X_test)
        test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)
        
        model = MathMisconceptionModel(
            n_categories=len(cat_enc.classes_),
            n_misconceptions=len(misc_enc.classes_),
            feature_dim=X_train.shape[1]
        ).to(DEVICE)
        optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=1e-4)
        
        best_score = 0
        for epoch in range(1, EPOCHS+1):
            tr_loss, tr_f1_cat, tr_f1_misc = train_model(model, train_loader, optimizer)
            val_cat_probs, _ = validate_model(model, val_loader)
            val_preds = np.argmax(val_cat_probs, axis=1)
            val_score = f1_score(ycat_va, val_preds, average='weighted')
            
            print(f"Epoch {epoch}/{EPOCHS} | Loss: {tr_loss:.4f} | Cat F1: {tr_f1_cat:.4f} | Val Cat F1: {val_score:.4f}")
            
            if val_score > best_score:
                best_score = val_score
                torch.save(model.state_dict(), f'models/map_2025_best_model_fold{fold}.pt')
        
        # Load best model and get predictions
        model.load_state_dict(torch.load(f'models/map_2025_best_model_fold{fold}.pt'))
        
        # OOF validation predictions
        val_cat_probs, val_misc_probs = validate_model(model, val_loader)
        oof_cat_preds[va_idx] = val_cat_probs
        oof_misc_preds[va_idx] = val_misc_probs
        
        # Store fold predictions for validation
        fold_predictions[fold] = (va_idx, val_cat_probs, val_misc_probs)
        
        # Calculate fold metrics
        true_cats = train.iloc[va_idx]['Category'].values
        true_miscs = train.iloc[va_idx]['Misconception'].values
        pred_cats = cat_enc.inverse_transform(np.argmax(val_cat_probs, axis=1))
        pred_miscs = misc_enc.inverse_transform(np.argmax(val_misc_probs, axis=1))
        
        fold_result = evaluate_fold_predictions(true_cats, pred_cats, true_miscs, pred_miscs, fold)
        fold_metrics.append(fold_result)
        
        # Test predictions
        te_cat_probs, te_misc_probs = validate_model(model, test_loader)
        test_cat_dl += te_cat_probs / N_FOLDS
        test_misc_dl += te_misc_probs / N_FOLDS
    
     
    # VALIDATION ANALYSIS & DETAILED REPORTING
     
    
    print("\n" + "="*60)
    print("VALIDATION ANALYSIS & DETAILED REPORTING")
    print("="*60)
    
    # Save detailed predictions
    detailed_df, summary_df = save_detailed_predictions(train, fold_predictions, cat_enc, misc_enc)
    
    # Overall validation metrics
    print("\nOverall Cross-Validation Results:")
    metrics_df = pd.DataFrame(fold_metrics)
    
    overall_metrics = {
        'Mean Category Accuracy': metrics_df['category_accuracy'].mean(),
        'Std Category Accuracy': metrics_df['category_accuracy'].std(),
        'Mean Category F1': metrics_df['category_f1'].mean(),
        'Std Category F1': metrics_df['category_f1'].std(),
        'Mean Misconception Accuracy': metrics_df['misconception_accuracy'].mean(),
        'Std Misconception Accuracy': metrics_df['misconception_accuracy'].std(),
        'Mean Misconception F1': metrics_df['misconception_f1'].mean(),
        'Std Misconception F1': metrics_df['misconception_f1'].std(),
        'Mean Combined Accuracy': metrics_df['combined_accuracy'].mean(),
        'Std Combined Accuracy': metrics_df['combined_accuracy'].std()
    }
    
    for metric, value in overall_metrics.items():
        print(f"{metric}: {value:.4f}")
    
    # Save metrics
    metrics_df.to_csv('validation_results/cv_metrics.csv', index=False)
    
    # Create confusion matrices for best fold
    best_fold_idx = metrics_df['combined_accuracy'].idxmax()
    best_fold_num = metrics_df.iloc[best_fold_idx]['fold']
    best_fold_data = detailed_df[detailed_df['fold'] == best_fold_num]
    
    # Category confusion matrix
    plt.figure(figsize=(12, 8))
    cm_cat = confusion_matrix(best_fold_data['true_category'], best_fold_data['predicted_category'])
    sns.heatmap(cm_cat, annot=True, fmt='d', cmap='Blues', 
                xticklabels=cat_enc.classes_, yticklabels=cat_enc.classes_)
    plt.title(f'Category Confusion Matrix - Best Fold ({best_fold_num})')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig('validation_results/category_confusion_matrix.png', dpi=300)
    plt.show()
    
    # Top misconceptions confusion matrix (limit to top 10)
    top_misconceptions = train['Misconception'].value_counts().head(10).index
    best_fold_top_misc = best_fold_data[
        (best_fold_data['true_misconception'].isin(top_misconceptions)) |
        (best_fold_data['predicted_misconception'].isin(top_misconceptions))
    ]
    
    if len(best_fold_top_misc) > 0:
        plt.figure(figsize=(15, 12))
        cm_misc = confusion_matrix(best_fold_top_misc['true_misconception'], 
                                   best_fold_top_misc['predicted_misconception'])
        
        # Get unique labels from both true and predicted
        unique_labels = sorted(list(set(best_fold_top_misc['true_misconception'].unique()) | 
                                   set(best_fold_top_misc['predicted_misconception'].unique())))
        
        sns.heatmap(cm_misc, annot=True, fmt='d', cmap='Reds',
                    xticklabels=unique_labels[:len(cm_misc[0])], 
                    yticklabels=unique_labels[:len(cm_misc)])
        plt.title(f'Top Misconceptions Confusion Matrix - Best Fold ({best_fold_num})')
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.xticks(rotation=45)
        plt.yticks(rotation=0)
        plt.tight_layout()
        plt.savefig('validation_results/misconception_confusion_matrix.png', dpi=300)
        plt.show()
    
    # Error analysis 
    print("\nError Analysis:")
    
    # Category errors
    cat_errors = detailed_df[detailed_df['category_correct'] == False]
    if len(cat_errors) > 0:
        print("\nMost Common Category Prediction Errors:")
        error_summary = cat_errors.groupby(['true_category', 'predicted_category']).size().sort_values(ascending=False).head(10)
        for (true_cat, pred_cat), count in error_summary.items():
            print(f"  {true_cat} → {pred_cat}: {count} times")
    
    # Misconception errors
    misc_errors = detailed_df[detailed_df['misconception_correct'] == False]
    if len(misc_errors) > 0:
        print("\nMost Common Misconception Prediction Errors:")
        error_summary = misc_errors.groupby(['true_misconception', 'predicted_misconception']).size().sort_values(ascending=False).head(10)
        for (true_misc, pred_misc), count in error_summary.items():
            print(f"  {true_misc} → {pred_misc}: {count} times")
    
    # Performance by category
    print("\nPerformance by Category:")
    category_performance = detailed_df.groupby('true_category').agg({
        'category_correct': 'mean',
        'misconception_correct': 'mean',
        'combined_correct': 'mean'
    }).round(4)
    category_performance.columns = ['Category_Accuracy', 'Misconception_Accuracy', 'Combined_Accuracy']
    print(category_performance)
    
    category_performance.to_csv('validation_results/category_performance.csv')
        
    
     
    
    print("\n" + "="*60)
    print("CREATING FINAL ENSEMBLE & SUBMISSION")
    print("="*60)
    
    # Ensemble predictions for test set
    test_cat_probs = 0.7 * test_cat_dl + 0.3 * test_cat_trad
    test_misc_probs = 0.7 * test_misc_dl + 0.3 * test_misc_trad
    
    test_cat = np.argmax(test_cat_probs, axis=1)
    test_misc = np.argmax(test_misc_probs, axis=1)
    
    # Convert to original labels
    test['Category'] = cat_enc.inverse_transform(test_cat)
    test['Misconception'] = misc_enc.inverse_transform(test_misc)
    test['Category:Misconception'] = test['Category'] + ':' + test['Misconception']
    
    # Create submission
    submission = test[['row_id', 'Category:Misconception']]
    submission.to_csv('submission.csv', index=False)
    print('Submission saved to submission.csv')
    
    # Show sample predictions
    print("\nSample Test Predictions:")
    sample_preds = test[['row_id', 'QuestionText', 'MC_Answer', 'StudentExplanation', 
                        'Category', 'Misconception', 'Category:Misconception']].head(10)
    
    for idx, row in sample_preds.iterrows():
        print(f"\nRow {row['row_id']}:")
        print(f"  Question: {row['QuestionText'][:100]}...")
        print(f"  Answer: {row['MC_Answer']}")
        print(f"  Explanation: {row['StudentExplanation'][:100]}...")
        print(f"  Predicted: {row['Category:Misconception']}")
    
     
    # FINAL VALIDATION WITH TRAIN.CSV     
    
    print("\n" + "="*60)
    print("FINAL VALIDATION WITH TRAIN.CSV")
    print("="*60)
    
    # Load original training data for comparison
    original_train = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/train.csv')
    original_train['Misconception'] = original_train['Misconception'].fillna('NA')
    
    # Get OOF predictions for entire training set
    oof_cat_preds_labels = cat_enc.inverse_transform(np.argmax(oof_cat_preds, axis=1))
    oof_misc_preds_labels = misc_enc.inverse_transform(np.argmax(oof_misc_preds, axis=1))
    oof_combined = [f"{cat}:{misc}" for cat, misc in zip(oof_cat_preds_labels, oof_misc_preds_labels)]
    
    # True labels
    true_combined = [f"{cat}:{misc}" for cat, misc in zip(original_train['Category'], original_train['Misconception'])]
    
    # Final validation metrics
    final_cat_accuracy = accuracy_score(original_train['Category'], oof_cat_preds_labels)
    final_misc_accuracy = accuracy_score(original_train['Misconception'], oof_misc_preds_labels)
    final_combined_accuracy = accuracy_score(true_combined, oof_combined)
    
    final_cat_f1 = f1_score(original_train['Category'], oof_cat_preds_labels, average='weighted')
    final_misc_f1 = f1_score(original_train['Misconception'], oof_misc_preds_labels, average='weighted')
    final_combined_f1 = f1_score(true_combined, oof_combined, average='weighted')
    
    print("FINAL OUT-OF-FOLD VALIDATION RESULTS:")
    print(f"Category Accuracy: {final_cat_accuracy:.4f}")
    print(f"Category F1 Score: {final_cat_f1:.4f}")
    print(f"Misconception Accuracy: {final_misc_accuracy:.4f}")
    print(f"Misconception F1 Score: {final_misc_f1:.4f}")
    print(f"Combined Accuracy: {final_combined_accuracy:.4f}")
    print(f"Combined F1 Score: {final_combined_f1:.4f}")
    
    # Create final comparison dataframe
    final_comparison = pd.DataFrame({
        'row_id': original_train['row_id'],
        'QuestionText': original_train['QuestionText'].str[:100] + "...",
        'MC_Answer': original_train['MC_Answer'],
        'StudentExplanation': original_train['StudentExplanation'].str[:150] + "...",
        'true_category': original_train['Category'],
        'predicted_category': oof_cat_preds_labels,
        'category_correct': original_train['Category'] == oof_cat_preds_labels,
        'true_misconception': original_train['Misconception'],
        'predicted_misconception': oof_misc_preds_labels,
        'misconception_correct': original_train['Misconception'] == oof_misc_preds_labels,
        'true_combined': true_combined,
        'predicted_combined': oof_combined,
        'combined_correct': np.array(true_combined) == np.array(oof_combined)
    })
    
    # Add prediction probabilities
    for i, class_name in enumerate(cat_enc.classes_):
        final_comparison[f'prob_cat_{class_name}'] = oof_cat_preds[:, i]
    
    for i, class_name in enumerate(misc_enc.classes_):
        final_comparison[f'prob_misc_{class_name}'] = oof_misc_preds[:, i]
    
    final_comparison.to_csv('validation_results/final_train_comparison.csv', index=False)
    print("Final train comparison saved to validation_results/final_train_comparison.csv")
    
    # Classification reports
    print("\nDETAILED CLASSIFICATION REPORTS:")
    
    print("\nCategory Classification Report:")
    cat_report = classification_report(original_train['Category'], oof_cat_preds_labels)
    print(cat_report)
    
    print("\nMisconception Classification Report (Top 10):")
    top_misconceptions = original_train['Misconception'].value_counts().head(10).index
    mask = original_train['Misconception'].isin(top_misconceptions)
    misc_report = classification_report(
        original_train[mask]['Misconception'], 
        np.array(oof_misc_preds_labels)[mask]
    )
    print(misc_report)
    
    # Save classification reports
    with open('validation_results/classification_reports.txt', 'w') as f:
        f.write("CATEGORY CLASSIFICATION REPORT\n")
        f.write("="*50 + "\n")
        f.write(cat_report)
        f.write("\n\nMISCONCEPTION CLASSIFICATION REPORT (TOP 10)\n")
        f.write("="*50 + "\n")
        f.write(misc_report)
         
    # FINAL SUMMARY     
    
    print("\n" + "="*80)
    print(" COMPLETE MAP2025 SOLUTION PIPELINE FINISHED!")
    print("="*80)    
        
    print(f"\n Final Model Performance:")
    print(f"    • Combined Accuracy: {final_combined_accuracy:.4f}")
    print(f"    • Combined F1 Score: {final_combined_f1:.4f}")
    print(f"    • Category F1 Score: {final_cat_f1:.4f}")
    print(f"    • Misconception F1 Score: {final_misc_f1:.4f}")
    
    print("\n All tasks completed successfully!")
    

# Script entry point: invoke main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()