<a href="https://colab.research.google.com/github/EM-Sanmaya/autojudge/blob/main/Copy_of_AutoJudge_Training_Reference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =====================================================
# ROBUST REGRESSION: CLASS-CONDITIONAL LINEAR SVR
# =====================================================

import numpy as np
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Reuse features from classification.
# NOTE: X_selected is created by the classification cell that appears later in
# this notebook, so that cell must have been executed before this one.
X = X_selected                     # already built, stable features
# Force a float target so the prediction buffer below can never inherit an
# integer dtype (which would silently truncate the regressor outputs).
y_score = np.asarray(df["problem_score"].values, dtype=float)
y_class = df["problem_class"].values

# Train-test split (same seed and stratification as the classification cell)
idx = np.arange(len(y_score))
train_idx, test_idx = train_test_split(
    idx,
    test_size=0.2,
    random_state=42,
    stratify=y_class
)

X_train, X_test = X[train_idx], X[test_idx]
y_train_score, y_test_score = y_score[train_idx], y_score[test_idx]
y_train_class, y_test_class = y_class[train_idx], y_class[test_idx]

# Containers
# Every test row starts at the global training mean. Rows whose class has too
# few training samples (or a label outside the list below) keep this fallback
# instead of a misleading prediction of 0.
fallback_score = float(y_train_score.mean())
y_pred = np.full(y_test_score.shape, fallback_score, dtype=float)

# ----------------------------
# Train separate regressors
# ----------------------------
# NOTE(review): test rows are routed by their TRUE class label, so the metrics
# below are an upper bound on what a pipeline using predicted classes achieves.
for cls in ["easy", "medium", "hard"]:
    mask_train = (y_train_class == cls)
    mask_test  = (y_test_class == cls)

    # Nothing to predict for this class in the test split.
    if not mask_test.any():
        continue

    # Too few samples to fit a per-class model: keep the fallback prediction.
    if np.sum(mask_train) < 20:
        continue

    svr = LinearSVR(
        C=5.0,
        epsilon=0.1,
        max_iter=5000,
        random_state=42
    )

    svr.fit(X_train[mask_train], y_train_score[mask_train])
    y_pred[mask_test] = svr.predict(X_test[mask_test])

# ----------------------------
# Evaluation
# ----------------------------
mae = mean_absolute_error(y_test_score, y_pred)
rmse = np.sqrt(mean_squared_error(y_test_score, y_pred))
r2 = r2_score(y_test_score, y_pred)

print("CLASS-CONDITIONAL REGRESSION RESULTS:")
print(f"MAE  : {mae:.3f}")
print(f"RMSE : {rmse:.3f}")
print(f"R²   : {r2:.3f}")

CLASS-CONDITIONAL REGRESSION RESULTS:
MAE  : 0.793
RMSE : 1.010
R²   : 0.790


In [None]:
# =====================================================
# EASY-AWARE FEATURE ENGINEERING + COST-SENSITIVE HIERARCHICAL SVM
# =====================================================

import re
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# -------------------------------
# 1. TF-IDF FEATURES
# -------------------------------
# Unigram + bigram TF-IDF over the combined problem text, with English stop
# words removed and the vocabulary limited to at most 12,000 features.
tfidf = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_features=12000,
)

# Learn the vocabulary and build the document-term matrix in one pass.
corpus = df["combined_text"]
X_tfidf = tfidf.fit_transform(corpus)

# -------------------------------
# 2. FEATURE EXTRACTION (HINTS + EASY SIGNALS)
# -------------------------------

ALGO_KEYWORDS = [
    "dp", "dynamic programming", "graph", "dfs", "bfs", "tree",
    "segment tree", "bitmask", "mask", "flow", "geometry",
    "binary search", "greedy", "recursion"
]

# Whole-word matcher for ALGO_KEYWORDS (an optional plural "s" is allowed).
# Plain substring counting would treat "paragraph" as "graph" and "street" as
# "tree", and would count "segment tree" twice (once more for "tree").
# Longer phrases come first so a multi-word keyword is consumed as one hit.
_ALGO_KEYWORD_RE = re.compile(
    r"\b(?:"
    + "|".join(
        re.escape(k) for k in sorted(ALGO_KEYWORDS, key=len, reverse=True)
    )
    + r")s?\b"
)


def extract_features(text):
    """Compute eight hand-crafted numeric features for one problem text.

    Parameters
    ----------
    text : str
        Problem statement text. Any non-string value (e.g. None or NaN from a
        missing cell) is treated as empty text instead of raising.

    Returns
    -------
    list
        ``[text_length, math_symbols, keyword_freq, avg_word_len,
        sentence_count, has_algorithm, max_constraint, control_tokens]``
        computed on the lower-cased text.
    """
    if not isinstance(text, str):
        text = ""

    text = text.lower()
    words = text.split()

    # ----- Basic (from hints) -----
    text_length = len(text)
    math_symbols = len(re.findall(r"[+\-*/%=<>]", text))
    # Non-overlapping whole-word keyword hits (see _ALGO_KEYWORD_RE).
    keyword_freq = len(_ALGO_KEYWORD_RE.findall(text))

    # ----- Easy-specific simplicity signals -----
    avg_word_len = np.mean([len(w) for w in words]) if words else 0
    sentence_count = text.count(".")
    has_algorithm = int(keyword_freq > 0)

    # ----- Constraint magnitude -----
    # Largest integer literal in the text; exponent notation such as "10^9"
    # is seen as the separate numbers 10 and 9.
    numbers = [int(n) for n in re.findall(r"\d+", text)]
    max_constraint = max(numbers) if numbers else 0

    # ----- Control-flow density -----
    control_tokens = len(re.findall(r"\b(if|for|while)\b", text))

    return [
        text_length,
        math_symbols,
        keyword_freq,
        avg_word_len,
        sentence_count,
        has_algorithm,
        max_constraint,
        control_tokens
    ]

extra_features = np.array([extract_features(t) for t in df["combined_text"]])

y = df["problem_class"].values

# -------------------------------
# ONE SHARED TRAIN / TEST SPLIT
# -------------------------------
# Every fitted object below (scaler, selector, stage-1 and stage-2 SVMs) sees
# only the training rows of this single stratified split, and the final
# evaluation uses only its test rows. Splitting separately per stage would put
# final-test rows into the stages' training sets and inflate the reported
# accuracy.
# Remaining caveat: X_tfidf was fit on the full corpus before this point, so
# IDF weights (not labels) still see the test documents.
split_idx = np.arange(len(y))
clf_train_idx, clf_test_idx = train_test_split(
    split_idx,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# -------------------------------
# 3. SCALE (chi² safe)
# -------------------------------
# Fit the range on training rows only; clip=True keeps unseen rows inside
# [0, 1] so the combined matrix stays non-negative.
scaler = MinMaxScaler(clip=True)
scaler.fit(extra_features[clf_train_idx])
extra_scaled = scaler.transform(extra_features)

# -------------------------------
# 4. COMBINE FEATURES
# -------------------------------
# CSR format so the matrix can be row-indexed by the split indices.
X_full = hstack([X_tfidf, extra_scaled], format="csr")
print("Combined feature shape:", X_full.shape)

# -------------------------------
# 5. FEATURE SELECTION
# -------------------------------
# chi2 scores are computed from training rows and training labels only; the
# chosen columns are then applied to every row.
selector = SelectKBest(chi2, k=5000)
selector.fit(X_full[clf_train_idx], y[clf_train_idx])
X_selected = selector.transform(X_full)
print("Selected feature shape:", X_selected.shape)

# =====================================================
# HIERARCHICAL CLASSIFICATION (EASY-FOCUSED)
# =====================================================

# -------- Stage 1: Easy vs Non-Easy (COST-SENSITIVE) --------
y_stage1 = np.where(y == "easy", "easy", "non_easy")

X1_train, X1_test = X_selected[clf_train_idx], X_selected[clf_test_idx]
y1_train, y1_test = y_stage1[clf_train_idx], y_stage1[clf_test_idx]

stage1 = LinearSVC(
    class_weight={"easy": 2.5, "non_easy": 1.0},  # boost Easy
    C=2.5,
    max_iter=8000,
    random_state=42
)

stage1.fit(X1_train, y1_train)
y1_pred = stage1.predict(X1_test)

print("\nStage-1 Accuracy (Easy vs Non-Easy):",
      accuracy_score(y1_test, y1_pred))

# -------- Stage 2: Medium vs Hard --------
# Non-easy rows of the SAME split: train rows for fitting, test rows kept
# under the original variable names.
y_train_all = y[clf_train_idx]
y_final_true = y[clf_test_idx]

train_non_easy = (y_train_all != "easy")
test_non_easy = (y_final_true != "easy")

X2_train, y2_train = X1_train[train_non_easy], y_train_all[train_non_easy]
X2_test, y2_test = X1_test[test_non_easy], y_final_true[test_non_easy]

stage2 = LinearSVC(
    class_weight="balanced",
    C=2.5,
    max_iter=8000,
    random_state=42
)

stage2.fit(X2_train, y2_train)

# -------- Final Combined Prediction --------
X_final_test = X1_test

# Vectorised routing: rows stage 1 calls "easy" stay "easy"; every other row
# takes the stage-2 (medium vs hard) prediction.
stage2_pred = stage2.predict(X_final_test)
final_preds = np.where(y1_pred == "easy", "easy", stage2_pred).tolist()

print("\nFINAL HIERARCHICAL ACCURACY:",
      accuracy_score(y_final_true, final_preds))

print("\nFINAL CLASSIFICATION REPORT:")
print(classification_report(y_final_true, final_preds))

Combined feature shape: (4112, 12008)
Selected feature shape: (4112, 5000)

Stage-1 Accuracy (Easy vs Non-Easy): 0.8080194410692588

FINAL HIERARCHICAL ACCURACY: 0.8031591737545565

FINAL CLASSIFICATION REPORT:
              precision    recall  f1-score   support

        easy       0.65      0.41      0.50       153
        hard       0.85      0.90      0.87       389
      medium       0.79      0.89      0.83       281

    accuracy                           0.80       823
   macro avg       0.76      0.73      0.74       823
weighted avg       0.79      0.80      0.79       823

