In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report, accuracy_score
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [None]:
# APPROACH 1: FROZEN ENCODER + LOGISTIC REGRESSION

class FrozenEncoderClassifier:
    def __init__(self, model_name="microsoft/unixcoder-base", max_length=512):
        self.model_name = model_name
        self.max_length = max_length
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        print(f"Loading FROZEN encoder: {model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.encoder = AutoModel.from_pretrained(model_name)
        self.encoder = self.encoder.to(self.device)
        self.encoder.eval()

        for param in self.encoder.parameters():
            param.requires_grad = False

        self.classifier = None

    def get_embeddings(self, texts, batch_size=32):
        """get embeddings without fine-tuning the encoder"""
        all_embeddings = []

        for i in tqdm(range(0, len(texts), batch_size), desc="Extracting embeddings"):
            batch_texts = texts[i:i+batch_size]

            inputs = self.tokenizer(
                batch_texts,
                truncation=True,
                max_length=self.max_length,
                padding='max_length',
                return_tensors='pt'
            ).to(self.device)

            with torch.no_grad():
                outputs = self.encoder(**inputs)
                embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

            all_embeddings.append(embeddings)

        return np.vstack(all_embeddings)

    def train(self, train_texts, train_labels, val_texts=None, val_labels=None):
        """Train ONLY the classifier, NOT the encoder"""
        print("\nExtracting training embeddings...")
        train_embeddings = self.get_embeddings(train_texts)

        print(f"Training classifier on {len(train_embeddings)} examples...")
        self.classifier = LogisticRegression(
            C=0.1,  
            max_iter=1000,
            class_weight='balanced',
            random_state=42
        )
        self.classifier.fit(train_embeddings, train_labels)

        if val_texts is not None:
            print("\nValidating...")
            val_embeddings = self.get_embeddings(val_texts)
            val_preds = self.classifier.predict(val_embeddings)
            val_f1 = f1_score(val_labels, val_preds, average='macro')
            print(f"Validation Macro F1: {val_f1:.4f}")
            print(classification_report(val_labels, val_preds, target_names=['Human', 'AI']))

        return self

    def predict(self, texts):
        """Predict"""
        embeddings = self.get_embeddings(texts)
        return self.classifier.predict(embeddings)

    def predict_proba(self, texts):
        """Predict probabilities"""
        embeddings = self.get_embeddings(texts)
        return self.classifier.predict_proba(embeddings)

# APPROACH 2: MANUAL FEATURE ENGINEERING

import re
from collections import Counter

class CodeFeatureExtractor:
    """Extract universal features that distinguish AI from Human"""

    @staticmethod
    def extract_features(code):
        """Features that do NOT depend on the specific language"""
        features = {}

        # 1. LENGTH AND STRUCTURE
        features['length'] = len(code)
        features['lines'] = code.count('\n') + 1
        features['avg_line_length'] = features['length'] / max(features['lines'], 1)

        # 2. COMMENTS
        features['comment_lines'] = len(re.findall(r'^\s*[#//]', code, re.MULTILINE))
        features['comment_ratio'] = features['comment_lines'] / max(features['lines'], 1)

        # 3. WHITESPACE PATTERNS
        lines = code.split('\n')
        indents = [len(line) - len(line.lstrip()) for line in lines if line.strip()]
        features['avg_indent'] = np.mean(indents) if indents else 0
        features['indent_std'] = np.std(indents) if indents else 0

        # 4. NAMING PATTERNS
        words = re.findall(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', code)
        features['unique_words'] = len(set(words)) / max(len(words), 1)
        features['avg_word_length'] = np.mean([len(w) for w in words]) if words else 0

        # 5. SYNTACTIC COMPLEXITY
        features['parentheses'] = code.count('(')
        features['brackets'] = code.count('[')
        features['braces'] = code.count('{')

        # 6. SPECIFIC PATTERNS
        features['has_docstring'] = int('"""' in code or "'''" in code)
        features['has_type_hints'] = int(':' in code and '->' in code)

        # 7. REPETITION
        line_hashes = [hash(line.strip()) for line in lines if line.strip()]
        features['repeated_lines'] = 1 - len(set(line_hashes)) / max(len(line_hashes), 1)

        return features

class FeatureBasedClassifier:
    """Classifier based on manual features"""

    def __init__(self):
        self.feature_extractor = CodeFeatureExtractor()
        self.classifier = RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            min_samples_split=20,
            class_weight='balanced',
            random_state=42
        )

    def extract_all_features(self, codes):
        all_features = []
        for code in tqdm(codes, desc="Extracting features"):
            features = self.feature_extractor.extract_features(code)
            all_features.append(list(features.values()))
        return np.array(all_features)

    def train(self, train_codes, train_labels, val_codes=None, val_labels=None):
        print("Extracting training features...")
        X_train = self.extract_all_features(train_codes)

        print("Training classifier...")
        self.classifier.fit(X_train, train_labels)

        if val_codes is not None:
            print("\nValidating...")
            X_val = self.extract_all_features(val_codes)
            val_preds = self.classifier.predict(X_val)
            val_f1 = f1_score(val_labels, val_preds, average='macro')
            print(f"\nValidation Macro F1: {val_f1:.4f}")
            print(classification_report(val_labels, val_preds, target_names=['Human', 'AI']))

        return self

    def predict(self, codes):
        X = self.extract_all_features(codes)
        return self.classifier.predict(X)

    def predict_proba(self, codes):
        X = self.extract_all_features(codes)
        return self.classifier.predict_proba(X)

# APPROACH 3: ENSEMBLE OF FROZEN MODELS

class MultiModelFrozenEnsemble:
    """Ensemble of different FROZEN models"""

    def __init__(self):
        self.models = [
            FrozenEncoderClassifier("microsoft/unixcoder-base"),
            FrozenEncoderClassifier("microsoft/graphcodebert-base"),
            FrozenEncoderClassifier("microsoft/codebert-base"),
        ]
        self.weights = [0.4, 0.4, 0.2] 

    def train(self, train_texts, train_labels, train_size=20000):
        """Train all models on a small subset"""
        indices = np.random.choice(len(train_texts), min(train_size, len(train_texts)), replace=False)
        subset_texts = [train_texts[i] for i in indices]
        subset_labels = [train_labels[i] for i in indices]

        for i, model in enumerate(self.models):
            print(f"\nTraining model {i+1}/{len(self.models)}")
            model.train(subset_texts, subset_labels)

    def predict_proba(self, texts):
        """Ensemble prediction"""
        all_probs = []
        for model, weight in zip(self.models, self.weights):
            probs = model.predict_proba(texts)[:, 1]
            all_probs.append(probs * weight)

        return np.sum(all_probs, axis=0)

    def predict(self, texts, threshold=0.5):
        probs = self.predict_proba(texts)
        return (probs >= threshold).astype(int)

# MAIN: TRAINING AND EVALUATION

def main():
    # Load data
    print("Loading datasets...")
    train_dataset = load_dataset("parquet", data_files=TRAIN_PARQUET, split="train")
    val_dataset = load_dataset("parquet", data_files=VAL_PARQUET, split="train")
    test_dataset = load_dataset("parquet", data_files=TEST_PARQUET, split="train")

    TRAIN_SIZE = 20000

    print(f"\n USING ONLY {TRAIN_SIZE} TRAINING EXAMPLES")
    train_subset = train_dataset.shuffle(seed=42).select(range(TRAIN_SIZE))

    train_texts = [ex['code'] for ex in train_subset]
    train_labels = [ex['label'] for ex in train_subset]

    val_texts = [ex['code'] for ex in val_dataset.select(range(min(10000, len(val_dataset))))]
    val_labels = [ex['label'] for ex in val_dataset.select(range(min(10000, len(val_dataset))))]

    test_texts = [ex['code'] for ex in test_dataset]
    test_labels = [ex['label'] for ex in test_dataset]

    # FROZEN ENCODER + LOGISTIC REGRESSION

    print("\n" + "="*70)
    print("APPROACH 1: Frozen Encoder + Logistic Regression")
    print("="*70)

    model1 = FrozenEncoderClassifier("microsoft/unixcoder-base")
    model1.train(train_texts, train_labels, val_texts, val_labels)

    # Test
    test_preds1 = model1.predict(test_texts)
    test_f1_1 = f1_score(test_labels, test_preds1, average='macro')
    print(f"\nTest Macro F1: {test_f1_1:.4f}")
    print(classification_report(test_labels, test_preds1, target_names=['Human', 'AI']))

    # FEATURE-BASED
 
    print("\n" + "="*70)
    print("APPROACH 2: Feature-Based Classifier")
    print("="*70)

    model2 = FeatureBasedClassifier()
    model2.train(train_texts, train_labels, val_texts, val_labels)

    # Test
    test_preds2 = model2.predict(test_texts)
    test_f1_2 = f1_score(test_labels, test_preds2, average='macro')
    print(f"\nTest Macro F1: {test_f1_2:.4f}")
    print(classification_report(test_labels, test_preds2, target_names=['Human', 'AI']))

    # ENSEMBLE

    print("\n" + "="*70)
    print("APPROACH 3: Ensemble (Feature-Based + Frozen Encoders)")
    print("="*70)

    # Validation
    print("\nValidating...")
    val_probs1 = model1.predict_proba(val_texts)[:, 1]
    val_probs2 = model2.predict_proba(val_texts)[:, 1]

    val_ensemble_probs = 0.6 * val_probs1 + 0.4 * val_probs2
    val_ensemble_preds = (val_ensemble_probs >= 0.5).astype(int)

    val_f1_ensemble = f1_score(val_labels, val_ensemble_preds, average='macro')
    print(f"\nValidation Macro F1 (Ensemble): {val_f1_ensemble:.4f}")
    print(classification_report(val_labels, val_ensemble_preds, target_names=['Human', 'AI']))

    # Combine predictions
    probs1 = model1.predict_proba(test_texts)[:, 1]
    probs2 = model2.predict_proba(test_texts)[:, 1]

    # Weighted average
    ensemble_probs = 0.6 * probs1 + 0.4 * probs2
    ensemble_preds = (ensemble_probs >= 0.5).astype(int)

    test_f1_ensemble = f1_score(test_labels, ensemble_preds, average='macro')
    print(f"\nTest Macro F1 (Ensemble): {test_f1_ensemble:.4f}")
    print(classification_report(test_labels, ensemble_preds, target_names=['Human', 'AI']))

    # BEST MODEL

    print("\n" + "="*70)
    print("SUMMARY")
    print("="*70)
    print(f"Frozen Encoder:     {test_f1_1:.4f}")
    print(f"Feature-Based:      {test_f1_2:.4f}")
    print(f"Ensemble:           {test_f1_ensemble:.4f}")

    best_f1 = max(test_f1_1, test_f1_2, test_f1_ensemble)
    print(f"\nBEST MACRO F1: {best_f1:.4f}")


if __name__ == "__main__":
    TRAIN_PARQUET = "/content/drive/MyDrive/SemEval-2026-Task13/task_A/task_a_training_set_1.parquet"
    VAL_PARQUET = "/content/drive/MyDrive/SemEval-2026-Task13/task_A/task_a_validation_set.parquet"
    TEST_PARQUET = "/content/drive/MyDrive/SemEval-2026-Task13/task_A/task_a_test_set_sample.parquet"

    main()

Loading datasets...

 USING ONLY 20000 TRAINING EXAMPLES

APPROACH 1: Frozen Encoder + Logistic Regression
Loading FROZEN encoder: microsoft/unixcoder-base

Extracting training embeddings...


Extracting embeddings: 100%|██████████| 625/625 [09:00<00:00,  1.16it/s]


Training classifier on 20000 examples...

Validating...


Extracting embeddings: 100%|██████████| 313/313 [04:30<00:00,  1.16it/s]


Validation Macro F1: 0.9307
              precision    recall  f1-score   support

       Human       0.93      0.93      0.93      4883
          AI       0.93      0.93      0.93      5117

    accuracy                           0.93     10000
   macro avg       0.93      0.93      0.93     10000
weighted avg       0.93      0.93      0.93     10000



Extracting embeddings: 100%|██████████| 32/32 [00:27<00:00,  1.16it/s]



Test Macro F1: 0.4641
              precision    recall  f1-score   support

       Human       0.91      0.35      0.50       777
          AI       0.28      0.88      0.43       223

    accuracy                           0.47      1000
   macro avg       0.60      0.62      0.46      1000
weighted avg       0.77      0.47      0.49      1000


APPROACH 2: Feature-Based Classifier
Extracting training features...


Extracting features: 100%|██████████| 20000/20000 [00:03<00:00, 6496.49it/s]


Training classifier...

Validating...


Extracting features: 100%|██████████| 10000/10000 [00:01<00:00, 5720.24it/s]



Validation Macro F1: 0.9405
              precision    recall  f1-score   support

       Human       0.95      0.93      0.94      4883
          AI       0.93      0.95      0.94      5117

    accuracy                           0.94     10000
   macro avg       0.94      0.94      0.94     10000
weighted avg       0.94      0.94      0.94     10000



Extracting features: 100%|██████████| 1000/1000 [00:00<00:00, 5836.37it/s]



Test Macro F1: 0.3886
              precision    recall  f1-score   support

       Human       0.92      0.23      0.37       777
          AI       0.26      0.93      0.40       223

    accuracy                           0.39      1000
   macro avg       0.59      0.58      0.39      1000
weighted avg       0.77      0.39      0.38      1000


APPROACH 3: Ensemble (Feature-Based + Frozen Encoders)

Validating...


Extracting embeddings: 100%|██████████| 313/313 [04:30<00:00,  1.16it/s]
Extracting features: 100%|██████████| 10000/10000 [00:02<00:00, 4713.44it/s]



Validation Macro F1 (Ensemble): 0.9497
              precision    recall  f1-score   support

       Human       0.95      0.95      0.95      4883
          AI       0.95      0.95      0.95      5117

    accuracy                           0.95     10000
   macro avg       0.95      0.95      0.95     10000
weighted avg       0.95      0.95      0.95     10000



Extracting embeddings: 100%|██████████| 32/32 [00:27<00:00,  1.17it/s]
Extracting features: 100%|██████████| 1000/1000 [00:00<00:00, 4468.87it/s]


Test Macro F1 (Ensemble): 0.4523
              precision    recall  f1-score   support

       Human       0.94      0.32      0.47       777
          AI       0.28      0.93      0.43       223

    accuracy                           0.45      1000
   macro avg       0.61      0.62      0.45      1000
weighted avg       0.79      0.45      0.46      1000


SUMMARY
Frozen Encoder:     0.4641
Feature-Based:      0.3886
Ensemble:           0.4523

BEST MACRO F1: 0.4641



