# 02. Level Predictor Model

Сурагчийн түвшинг тодорхойлох модел сургах

**Input features:**
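- Зөв хариултын харьцаа (`correct_ratio`)
- Дундаж хугацаа, секундээр (`avg_time_seconds`)
- Хүнд асуултад зөв хариулсан харьцаа (`hard_questions_correct`)
- Сэдэв тус бүрийн оноо (`math_score`, `physics_score`, `chemistry_score`)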

**Output:**
- Түвшин (1-10)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Generate synthetic data (stand-in used while real data is not available)
np.random.seed(42)

n_samples = 1000

# Every feature is sampled uniformly between its (low, high) bounds.
# The dict preserves insertion order, so the random draws happen in the
# same sequence as the columns are listed here.
feature_ranges = {
    'correct_ratio': (0.1, 1.0),
    'avg_time_seconds': (30, 300),
    'hard_questions_correct': (0, 1),
    'math_score': (0, 100),
    'physics_score': (0, 100),
    'chemistry_score': (0, 100),
}
data = {
    column: np.random.uniform(low, high, n_samples)
    for column, (low, high) in feature_ranges.items()
}

df = pd.DataFrame(data)

# Target construction: a weighted blend of the answer ratios and the mean
# subject score. Note that avg_time_seconds does not contribute to the target.
subject_total = df['math_score'] + df['physics_score'] + df['chemistry_score']
signal = (
    df['correct_ratio'] * 40
    + df['hard_questions_correct'] * 30
    + subject_total / 3 * 0.3
)

# Gaussian noise (mean 0, std 5) so the levels are not a deterministic
# function of the features.
noise = np.random.normal(0, 5, n_samples)
weighted_score = signal + noise

# Split the observed score range into 10 equal-width bins labelled 1..10.
level_labels = range(1, 11)
df['level'] = pd.cut(weighted_score, bins=10, labels=level_labels).astype(int)

print(df.head())
level_counts = df["level"].value_counts().sort_index()
print(f'\nLevel distribution:\n{level_counts}')

In [None]:
# Train/Test split: separate the target column from the feature matrix
y = df['level']
X = df.drop(columns='level')

# Hold out 20% of the rows for testing, stratified on the level so both
# splits see a similar class mix.
# NOTE(review): stratified splitting needs at least two samples of every
# level; equal-width binning of the score may leave the extreme levels very
# sparse — confirm this holds for the generated data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise features using statistics learned from the training split only,
# then apply the same transformation to the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f'Train size: {len(X_train)}')
print(f'Test size: {len(X_test)}')

In [None]:
# Candidate classifiers to compare, keyed by display name.
# NOTE(review): y_train holds levels 1-10, while XGBClassifier documents a
# requirement for class labels 0..n_classes-1 in recent releases — confirm
# this cell runs with the installed xgboost version.
shared_params = {'n_estimators': 100, 'random_state': 42}
models = {
    'Random Forest': RandomForestClassifier(**shared_params),
    'Gradient Boosting': GradientBoostingClassifier(**shared_params),
    'XGBoost': XGBClassifier(
        use_label_encoder=False,
        eval_metric='mlogloss',
        **shared_params,
    ),
}

# Per-model summary: mean CV accuracy, test accuracy and the fitted estimator
results = {}

for model_name, estimator in models.items():
    print(f'\n--- {model_name} ---')

    # 5-fold cross-validation on the (already scaled) training split.
    # NOTE(review): the scaler was fitted on the whole training split before
    # CV, so the folds are not fully independent of the scaling statistics.
    cv_scores = cross_val_score(estimator, X_train_scaled, y_train, cv=5)
    cv_mean = cv_scores.mean()
    print(f'CV Score: {cv_mean:.4f} (+/- {cv_scores.std():.4f})')

    # Fit on the full training split, then score on the held-out test split
    estimator.fit(X_train_scaled, y_train)
    test_score = estimator.score(X_test_scaled, y_test)
    print(f'Test Score: {test_score:.4f}')

    results[model_name] = dict(
        cv_score=cv_mean,
        test_score=test_score,
        model=estimator,
    )

In [None]:
# Best model selection
# Rank the candidates by their mean cross-validation score, not by test score:
# picking the winner on the test split would turn the test set into a tuning
# set and make the report below optimistically biased.
best_model_name = max(results, key=lambda name: results[name]['cv_score'])
best_model = results[best_model_name]['model']

print(f'Best Model: {best_model_name}')

# Detailed evaluation of the selected model on the held-out test split
y_pred = best_model.predict(X_test_scaled)
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

In [None]:
# Feature importance: plotted only when the selected estimator exposes
# a `feature_importances_` attribute.
if hasattr(best_model, 'feature_importances_'):
    # Pair each feature name with its importance, most important first
    ranked = pd.DataFrame(
        {'feature': X.columns, 'importance': best_model.feature_importances_}
    )
    importance = ranked.sort_values('importance', ascending=False)

    # Horizontal bar chart drawn on an explicit Figure/Axes pair
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=importance, x='importance', y='feature', ax=ax)
    ax.set_title('Feature Importance')
    fig.tight_layout()
    plt.show()

In [None]:
# Save model
from pathlib import Path

# Single definition of the artifact location, reused for the write and the log
model_path = '../trained_models/level_predictor.pkl'

# Bundle everything needed at inference time: the fitted estimator, the fitted
# scaler, and the feature names in training column order.
model_data = {
    'model': best_model,
    'scaler': scaler,
    'features': list(X.columns)
}

# joblib.dump does not create missing directories, so make sure the target
# folder exists first (no-op when it is already there).
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(model_data, model_path)
print(f'Model saved to: {model_path}')

In [None]:
# Test loading
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that were produced by a trusted source.
loaded = joblib.load('../trained_models/level_predictor.pkl')

# Predict on new data
sample = {
    'correct_ratio': 0.75,
    'avg_time_seconds': 120,
    'hard_questions_correct': 0.6,
    'math_score': 80,
    'physics_score': 70,
    'chemistry_score': 75
}

# Select the columns through the feature list stored with the model so the
# input always has the training column order, regardless of the dict's key
# order; a missing feature raises KeyError instead of being silently misread.
sample_df = pd.DataFrame([sample])[loaded['features']]
sample_scaled = loaded['scaler'].transform(sample_df)
prediction = loaded['model'].predict(sample_scaled)

print(f'Sample prediction: Level {prediction[0]}')