# XGBoost Model for Salary Prediction

This notebook tunes and trains an XGBoost classifier on the salary prediction dataset, targeting a test-set F1 score of at least 0.90.


In [None]:
# Import libraries
import json
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
warnings.filterwarnings('ignore')

%matplotlib inline


## 1. Load and Prepare Data


In [None]:
# Read the three pre-processed splits; every file is indexed by its 'id' column.
train_data, test_data, live_data = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        './for_cursur/Data/salary.train.processed.csv',
        './for_cursur/Data/salary.test.processed.csv',
        './for_cursur/Data/salary.live.processed.csv',
    )
)

# Report the (rows, columns) shape of each split.
print(f"Training data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")
print(f"Live data shape: {live_data.shape}")

# Summarize the training target: per-value counts, then the mean of 'label'
# (presumably the positive-class share, if labels are encoded 0/1 -- confirm).
train_labels = train_data['label']
print("\nTarget distribution in training data:")
print(train_labels.value_counts())
print(f"\nClass balance: {train_labels.mean():.3f}")


## 2. Prepare Features and Target


In [None]:
# Split each labelled frame into its target column and the remaining features.
target_column = 'label'
y_train = train_data[target_column]
y_test = test_data[target_column]
X_train = train_data.drop(columns=[target_column])
X_test = test_data.drop(columns=[target_column])
# The live frame is used as-is; nothing is dropped from it.
X_live = live_data

print(f"Feature matrix shape: {X_train.shape}")
print(f"Features: {list(X_train.columns)}")

# scikit-learn 'balanced' class weights for the training labels. They are only
# printed here; the cells visible in this notebook do not pass them to the model.
label_values = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=label_values, y=y_train)
weight_for_0, weight_for_1 = class_weights[0], class_weights[1]
class_weight_dict = {0: weight_for_0, 1: weight_for_1}
print(f"\nClass weights: {class_weight_dict}")


## 3. Hyperparameter Tuning for XGBoost


In [None]:
# Manual hyperparameter tuning (since GridSearchCV can be slow)
def evaluate_xgb_params(params, X, y, cv_folds=5):
    """Score one XGBoost hyperparameter set with stratified k-fold cross-validation.

    Args:
        params: Keyword arguments forwarded to ``xgb.XGBClassifier``.
        X: Feature matrix passed to ``cross_val_score``.
        y: Target labels; folds are stratified on them and scored with the
            ``'f1'`` scorer.
        cv_folds: Number of folds (shuffled, seeded with 42).

    Returns:
        Tuple ``(mean_f1, std_f1)`` computed over the fold scores.

    Raises:
        Whatever exception a fold raises while fitting or scoring; failures are
        propagated (``error_score='raise'``) rather than recorded as NaN.
    """
    # early_stopping_rounds is deliberately NOT set: cross_val_score calls
    # fit(X, y) without an eval_set, and XGBoost refuses to train with early
    # stopping enabled when no validation data is supplied.
    model = xgb.XGBClassifier(
        **params,
        random_state=42,
        eval_metric='logloss',
        verbosity=0
    )

    folds = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
    cv_scores = cross_val_score(
        model, X, y,
        cv=folds,
        scoring='f1',
        n_jobs=-1,
        # Surface fold failures instead of silently turning them into NaN,
        # which the notebook's global warnings filter would otherwise hide.
        error_score='raise'
    )

    return cv_scores.mean(), cv_scores.std()

# Hand-picked candidate hyperparameter sets (evaluated one by one with
# evaluate_xgb_params instead of an exhaustive grid search).
test_params = [
    {'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.05, 'subsample': 0.9, 
     'colsample_bytree': 0.9, 'reg_alpha': 0.1, 'reg_lambda': 1.5, 'scale_pos_weight': 2},
    {'n_estimators': 500, 'max_depth': 8, 'learning_rate': 0.01, 'subsample': 0.8, 
     'colsample_bytree': 0.8, 'reg_alpha': 1, 'reg_lambda': 2, 'scale_pos_weight': 3},
    {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.1, 'subsample': 1.0, 
     'colsample_bytree': 1.0, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1},
    {'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.03, 'subsample': 0.85, 
     'colsample_bytree': 0.85, 'reg_alpha': 0.5, 'reg_lambda': 1.5, 'scale_pos_weight': 2.5},
    {'n_estimators': 600, 'max_depth': 7, 'learning_rate': 0.02, 'subsample': 0.9, 
     'colsample_bytree': 0.9, 'reg_alpha': 0.2, 'reg_lambda': 1.8, 'scale_pos_weight': 2.2}
]

# Start below any attainable F1 so that even a 0.0 score can be selected;
# starting at 0 with a strict '>' would leave best_params as None in that case.
best_score = float('-inf')
best_params = None
results = []  # (params, mean_f1, std_f1) for every candidate, in order

print("Testing parameter combinations...")
for i, params in enumerate(test_params, start=1):
    print(f"Testing combination {i}/{len(test_params)}...")
    mean_score, std_score = evaluate_xgb_params(params, X_train, y_train)
    results.append((params, mean_score, std_score))
    print(f"F1 Score: {mean_score:.4f} (+/- {std_score:.4f})")

    # A NaN mean means cross-validation produced no usable score; NaN compares
    # False against everything, so call it out instead of skipping silently.
    if np.isnan(mean_score):
        print("Skipping this combination: cross-validation returned NaN.")
        continue

    if mean_score > best_score:
        best_score = mean_score
        best_params = params

# Fail here, with a clear message, rather than later when best_params is unpacked.
if best_params is None:
    raise RuntimeError(
        "No parameter combination produced a valid cross-validated F1 score; "
        "check evaluate_xgb_params and the training data."
    )

print(f"\nBest F1 Score: {best_score:.4f}")
print(f"Best Parameters: {best_params}")


## 4. Train Final XGBoost Model


In [None]:
# Train final model with best parameters
if best_params is None:
    # Unpacking None below would raise an unrelated-looking TypeError.
    raise RuntimeError(
        "best_params is None: the hyperparameter search did not select any "
        "parameter combination, so there is nothing to train."
    )

# early_stopping_rounds is deliberately NOT set: fit() below is called without
# an eval_set, and XGBoost raises when early stopping is enabled but no
# validation data is provided. The model therefore trains all n_estimators
# trees, matching how the candidates were meant to be cross-validated.
final_model = xgb.XGBClassifier(
    **best_params,
    random_state=42,
    eval_metric='logloss',
    verbosity=0
)

# Fit on the full training split.
final_model.fit(X_train, y_train)

print("Model trained successfully!")
print(f"Number of features used: {final_model.n_features_in_}")
print(f"Number of estimators: {final_model.n_estimators}")


## 5. Model Evaluation


In [None]:
# Hard class predictions for both labelled splits, plus column 1 of
# predict_proba on the test split (used for AUC).
y_pred_test = final_model.predict(X_test)
y_pred_train = final_model.predict(X_train)
y_pred_proba_test = final_model.predict_proba(X_test)[:, 1]

# F1 on train and test, ROC AUC on the test probabilities.
test_auc = roc_auc_score(y_test, y_pred_proba_test)
test_f1 = f1_score(y_test, y_pred_test)
train_f1 = f1_score(y_train, y_pred_train)

print("=== MODEL PERFORMANCE ===")
print(f"Training F1 Score: {train_f1:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")
print(f"Test AUC Score: {test_auc:.4f}")

# Per-class precision / recall / F1 on the test split.
print("\n=== DETAILED CLASSIFICATION REPORT ===")
test_report = classification_report(y_test, y_pred_test)
print(test_report)

# Confusion matrix heatmap: rows are true labels, columns are predictions.
class_names = ['Low Income', 'High Income']
cm = confusion_matrix(y_test, y_pred_test)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix - XGBoost Model')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()


## 6. Make Predictions on Live Data


In [None]:
# Score the live split: column 1 of predict_proba and the hard class labels.
live_probabilities = final_model.predict_proba(X_live)[:, 1]
live_predictions = final_model.predict(X_live)

# One row per live record, keyed by the frame's index values.
live_columns = {
    'id': X_live.index,
    'predicted_label': live_predictions,
    'probability_high_income': live_probabilities
}
live_results = pd.DataFrame(live_columns)

# Counts: predictions summing to the number of 1s, the remainder being 0s.
high_income_count = live_predictions.sum()
low_income_count = len(live_predictions) - high_income_count

print(f"Live data predictions completed!")
print(f"Number of predictions: {len(live_results)}")
print(f"High income predictions: {high_income_count}")
print(f"Low income predictions: {low_income_count}")

# Preview the first ten rows.
print("\nSample predictions:")
print(live_results.head(10))


## 7. Save Model and Results


In [None]:
# Persist the fitted model with joblib.
joblib.dump(final_model, './for_cursur/xgb_model.joblib')

# Write the live predictions (id, predicted label, probability) without the
# DataFrame's positional index.
live_results.to_csv('./for_cursur/xgb_predictions.csv', index=False)

# Summary of the run. Metrics and counts are cast to native Python numbers so
# json.dump never trips over a NumPy scalar type it cannot serialize.
model_config = {
    'model_type': 'XGBoost',
    'parameters': best_params,
    'test_f1_score': float(test_f1),
    'test_auc_score': float(test_auc),
    'cv_mean_f1': float(best_score),
    'feature_count': int(final_model.n_features_in_),
    'training_samples': len(X_train),
    'test_samples': len(X_test)
}

# `json` is imported in the notebook's top import cell.
with open('./for_cursur/xgb_model_config.json', 'w', encoding='utf-8') as f:
    json.dump(model_config, f, indent=2)

print("Model and results saved successfully!")
print(f"\nFinal Test F1 Score: {test_f1:.4f}")
print(f"Target achieved: {'✅ YES' if test_f1 >= 0.90 else '❌ NO'}")
