# Full 900K Training - XGBoost + Character N-grams

**Based on 10K experiments**: Best config is `n_estimators=200` (n), `max_depth=6` (d), character n-grams of length 3-5

**Tests**:
1. char(3-5) + XGB(n=200, d=6, lr=0.1) - best from 10K
2. char(3-6) + XGB(n=200, d=6, lr=0.1) - wider range

**Expected**: ~20 min per model, Val F1: 0.70-0.75

In [1]:
import os
import pickle
from time import time

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
from xgboost import XGBClassifier

## Load Data (train/val capped at the first 50,000 rows)

In [2]:
# Load the Task C splits and pull out the raw code strings and labels.
#
# Row caps: train and validation are truncated to their first N rows. The cap
# is named here (instead of a bare slice literal) so it is visible to readers
# and can be lifted by setting it to None, which keeps every row.
# NOTE(review): this is a head slice, not a random sample. If the parquet
# files are ordered (e.g. by label or source) the subset is biased -- confirm.
MAX_TRAIN_ROWS = 50_000  # None -> use every row
MAX_VAL_ROWS = 50_000    # None -> use every row

print("Loading datasets...")
train_df = pd.read_parquet('../Task_C/train.parquet')[:MAX_TRAIN_ROWS]
val_df = pd.read_parquet('../Task_C/validation.parquet')[:MAX_VAL_ROWS]
test_sample_df = pd.read_parquet('../Task_C/test_sample.parquet')
test_df = pd.read_parquet('../Task_C/test.parquet')

# Training split: fitted on by the vectorizers and models.
X_train = train_df['code'].values
y_train = train_df['label'].values

# Validation split: used for model selection.
X_val = val_df['code'].values
y_val = val_df['label'].values

# Labelled test sample: used only for reporting.
X_test = test_sample_df['code'].values
y_test = test_sample_df['label'].values

# Final test set: only 'code' and 'ID' are read here; predictions for it
# go into the submission files.
X_test_final = test_df['code'].values
test_ids = test_df['ID'].values

print(f"Train: {len(X_train):,}")
print(f"Val: {len(X_val):,}")
print(f"Test (with labels): {len(X_test):,}")
print(f"Test (final): {len(X_test_final):,}")

Loading datasets...
Train: 50,000
Val: 50,000
Test (with labels): 1,000
Test (final): 1,000


## Experiment 1: char(3-5) + XGB(n=200, d=6)

In [6]:
# Experiment 1: character 3-5-gram TF-IDF features fed to an XGBoost classifier.
# Produces: tfidf_35, xgb_35, the *_35 feature matrices, and exp1_results.
print("\n" + "="*80)
print("EXPERIMENT 1: char(3-5) + XGBoost(n=200, d=6, lr=0.1)")
print("="*80)

# Fail fast: create the artifact directory before the slow work so a missing
# folder cannot throw away a finished training run at the pickle step.
os.makedirs('../models', exist_ok=True)

# Features
print("\nCreating TF-IDF features...")
tfidf_35 = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 5),
    max_features=2000,
    min_df=2,
    sublinear_tf=True
)

# The vocabulary is fitted on the training split only; every other split is
# transformed with that fitted vocabulary.
start = time()
X_train_35 = tfidf_35.fit_transform(X_train)
X_val_35 = tfidf_35.transform(X_val)
X_test_35 = tfidf_35.transform(X_test)
X_test_final_35 = tfidf_35.transform(X_test_final)
print(f"Feature extraction time: {time()-start:.2f}s")
print(f"Shape: {X_train_35.shape}")

# Model
print("\nTraining XGBoost...")
xgb_35 = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    n_jobs=-1,
    eval_metric='mlogloss'
)

start = time()
xgb_35.fit(X_train_35, y_train)
train_time = time() - start
print(f"Training time: {train_time/60:.2f} min")

# Evaluate
y_val_pred = xgb_35.predict(X_val_35)
y_test_pred = xgb_35.predict(X_test_35)

val_f1 = f1_score(y_val, y_val_pred, average='macro')
test_f1 = f1_score(y_test, y_test_pred, average='macro')

print(f"\nResults:")
print(f"  Val Macro F1: {val_f1:.4f}")
print(f"  Test Macro F1: {test_f1:.4f}")

# `labels` pins each display name to its label id. Without it the report
# infers labels from the data and raises ValueError when a class is absent
# from both y_test and the predictions (4 names vs. fewer observed classes).
# Assumes ids 0..3 map to the names in this order, as the original call
# already did implicitly -- TODO confirm against the dataset card.
print("\nPer-class (Test):")
print(classification_report(
    y_test,
    y_test_pred,
    labels=[0, 1, 2, 3],
    target_names=['Human', 'AI', 'Hybrid', 'Adversarial'],
    digits=4,
))

# Save
# NOTE(review): the '900k' in these file names is only accurate if X_train
# holds the full training set; check the Shape printed above.
with open('../models/xgb_char35_900k.pkl', 'wb') as f:
    pickle.dump(xgb_35, f)
with open('../models/tfidf_char35_900k.pkl', 'wb') as f:
    pickle.dump(tfidf_35, f)

# Predictions on final test
y_test_final_35 = xgb_35.predict(X_test_final_35)
exp1_results = {'val_f1': val_f1, 'test_f1': test_f1, 'predictions': y_test_final_35}


EXPERIMENT 1: char(3-5) + XGBoost(n=200, d=6, lr=0.1)

Creating TF-IDF features...
Feature extraction time: 74.17s
Shape: (50000, 2000)

Training XGBoost...
Training time: 0.76 min

Results:
  Val Macro F1: 0.5881
  Test Macro F1: 0.5827

Per-class (Test):
              precision    recall  f1-score   support

       Human     0.8419    0.9513    0.8932       554
          AI     0.6496    0.7237    0.6846       228
      Hybrid     0.4444    0.1905    0.2667        84
 Adversarial     0.6310    0.3955    0.4862       134

    accuracy                         0.7610      1000
   macro avg     0.6417    0.5652    0.5827      1000
weighted avg     0.7364    0.7610    0.7385      1000



## Experiment 2: char(3-6) + XGB(n=200, d=6)

In [5]:
# Experiment 2: character 3-6-gram TF-IDF features fed to an XGBoost classifier.
# Produces: tfidf_36, xgb_36, the *_36 feature matrices, and exp2_results.
print("\n" + "="*80)
print("EXPERIMENT 2: char(3-6) + XGBoost(n=200, d=6, lr=0.1)")
print("="*80)

# Fail fast: create the artifact directory before the slow work so a missing
# folder cannot throw away a finished training run at the pickle step.
os.makedirs('../models', exist_ok=True)

# Features
# NOTE(review): Experiment 1's vectorizer uses max_features=2000; here it is
# 1000, so the two runs differ in vocabulary size as well as n-gram range.
# Confirm this is intentional before attributing the F1 gap to the n-gram
# range alone.
print("\nCreating TF-IDF features...")
tfidf_36 = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 6),
    max_features=1000,
    min_df=2,
    sublinear_tf=True
)

# The vocabulary is fitted on the training split only; every other split is
# transformed with that fitted vocabulary.
start = time()
X_train_36 = tfidf_36.fit_transform(X_train)
X_val_36 = tfidf_36.transform(X_val)
X_test_36 = tfidf_36.transform(X_test)
X_test_final_36 = tfidf_36.transform(X_test_final)
print(f"Feature extraction time: {time()-start:.2f}s")
print(f"Shape: {X_train_36.shape}")

# Model
print("\nTraining XGBoost...")
xgb_36 = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    n_jobs=-1,
    eval_metric='mlogloss'
)

start = time()
xgb_36.fit(X_train_36, y_train)
train_time = time() - start
print(f"Training time: {train_time/60:.2f} min")

# Evaluate
y_val_pred = xgb_36.predict(X_val_36)
y_test_pred = xgb_36.predict(X_test_36)

val_f1 = f1_score(y_val, y_val_pred, average='macro')
test_f1 = f1_score(y_test, y_test_pred, average='macro')

print(f"\nResults:")
print(f"  Val Macro F1: {val_f1:.4f}")
print(f"  Test Macro F1: {test_f1:.4f}")

# `labels` pins each display name to its label id. Without it the report
# infers labels from the data and raises ValueError when a class is absent
# from both y_test and the predictions (4 names vs. fewer observed classes).
# Assumes ids 0..3 map to the names in this order, as the original call
# already did implicitly -- TODO confirm against the dataset card.
print("\nPer-class (Test):")
print(classification_report(
    y_test,
    y_test_pred,
    labels=[0, 1, 2, 3],
    target_names=['Human', 'AI', 'Hybrid', 'Adversarial'],
    digits=4,
))

# Save
# NOTE(review): the '900k' in these file names is only accurate if X_train
# holds the full training set; check the Shape printed above.
with open('../models/xgb_char36_900k.pkl', 'wb') as f:
    pickle.dump(xgb_36, f)
with open('../models/tfidf_char36_900k.pkl', 'wb') as f:
    pickle.dump(tfidf_36, f)

# Predictions on final test
y_test_final_36 = xgb_36.predict(X_test_final_36)
exp2_results = {'val_f1': val_f1, 'test_f1': test_f1, 'predictions': y_test_final_36}


EXPERIMENT 2: char(3-6) + XGBoost(n=200, d=6, lr=0.1)

Creating TF-IDF features...
Feature extraction time: 124.65s
Shape: (50000, 1000)

Training XGBoost...
Training time: 0.40 min

Results:
  Val Macro F1: 0.5531
  Test Macro F1: 0.5309

Per-class (Test):
              precision    recall  f1-score   support

       Human     0.7960    0.9368    0.8607       554
          AI     0.6379    0.6798    0.6582       228
      Hybrid     0.3704    0.1190    0.1802        84
 Adversarial     0.5769    0.3358    0.4245       134

    accuracy                         0.7290      1000
   macro avg     0.5953    0.5179    0.5309      1000
weighted avg     0.6948    0.7290    0.6989      1000



## Compare & Generate Submission

In [13]:
# Compare the two experiments and write one submission CSV per model plus a
# "best" submission. Reads exp1_results, exp2_results and test_ids, which are
# produced by earlier cells.
print("\n" + "="*80)
print("FINAL COMPARISON")
print("="*80)

results_df = pd.DataFrame([
    {'Experiment': 'char(3-5)', 'Val F1': exp1_results['val_f1'], 'Test F1': exp1_results['test_f1']},
    {'Experiment': 'char(3-6)', 'Val F1': exp2_results['val_f1'], 'Test F1': exp2_results['test_f1']}
])

print(results_df.to_string(index=False))

# Choose best
# Selection uses the validation score only; the test score is reported but
# never used to pick a model. On a tie idxmax keeps the first row.
best_idx = results_df['Val F1'].idxmax()
best_exp = results_df.loc[best_idx, 'Experiment']
print(f"\nBest model: {best_exp}")

# Save both model predictions
submission_35 = pd.DataFrame({
    'ID': test_ids,
    'label': exp1_results['predictions']
})

submission_36 = pd.DataFrame({
    'ID': test_ids,
    'label': exp2_results['predictions']
})

# Generate submission with best model
# Row 0 of results_df is Experiment 1, row 1 is Experiment 2 (see the
# construction above), so the index doubles as the experiment selector.
best_predictions = exp1_results['predictions'] if best_idx == 0 else exp2_results['predictions']

submission = pd.DataFrame({
    'ID': test_ids,
    'label': best_predictions
})

# Save all submissions
# to_csv does not create missing parent directories, so make sure the output
# folder exists first (a no-op when it is already there).
os.makedirs('predictions', exist_ok=True)
submission_35.to_csv('predictions/submission_char35_900k.csv', index=False)
submission_36.to_csv('predictions/submission_char36_900k.csv', index=False)
submission.to_csv('predictions/submission_900k.csv', index=False)

print(f"\nSubmissions saved:")
print(f"  char(3-5): predictions/submission_char35_900k.csv")
print(f"  char(3-6): predictions/submission_char36_900k.csv")
print(f"  Best ({best_exp}): predictions/submission_900k.csv")
print(f"\nBest submission preview:")
print(submission.head())


FINAL COMPARISON
Experiment   Val F1  Test F1
 char(3-5) 0.682161 0.655737
 char(3-6) 0.680291 0.659286

Best model: char(3-5)

Submissions saved:
  char(3-5): predictions/submission_char35_900k.csv
  char(3-6): predictions/submission_char36_900k.csv
  Best (char(3-5)): predictions/submission_900k.csv

Best submission preview:
     ID  label
0   437      3
1   721      0
2   911      0
3  1414      0
4  1856      3
