# XGBoost Model Training

This notebook trains an XGBoost gradient boosting model for mobile price classification.

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load data
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

X_train = train_df.drop('price_range', axis=1)
y_train = train_df['price_range']
X_test = test_df.drop('price_range', axis=1)
y_test = test_df['price_range']

In [3]:
# Preprocessing
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
preprocessor = ColumnTransformer(
    transformers=[('num', Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]), numeric_cols)],
    remainder='drop'
)

In [4]:
# Hyperparameter tuning
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
])

param_grid = {
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [3, 5, 7],
    'clf__learning_rate': [0.01, 0.1, 0.3],
    'clf__subsample': [0.8, 1.0],
    'clf__colsample_bytree': [0.8, 1.0]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

Fitting 5 folds for each of 108 candidates, totalling 540 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Best parameters: {'clf__colsample_bytree': 1.0, 'clf__learning_rate': 0.3, 'clf__max_depth': 3, 'clf__n_estimators': 200, 'clf__subsample': 0.8}
Best CV score: 0.9081


In [5]:
# Evaluate
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Test Accuracy: 0.9400

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       100
           1       0.91      0.95      0.93       100
           2       0.92      0.87      0.89       100
           3       0.95      0.95      0.95       100

    accuracy                           0.94       400
   macro avg       0.94      0.94      0.94       400
weighted avg       0.94      0.94      0.94       400



In [6]:
# Feature importance
importance = best_model.named_steps['clf'].feature_importances_
importance_df = pd.DataFrame({'feature': X_train.columns, 'importance': importance}).sort_values('importance', ascending=False)
print("\nTop 10 Features:")
print(importance_df.head(10))


Top 10 Features:
          feature  importance
13            ram    0.408740
0   battery_power    0.109070
11      px_height    0.085563
12       px_width    0.076874
8       mobile_wt    0.035411
7           m_dep    0.027553
14           sc_h    0.025724
19           wifi    0.025117
6      int_memory    0.024302
4              fc    0.023830


In [7]:
# Save model
joblib.dump(best_model, 'saved_models/XGBoost_model.joblib')
print("Model saved!")

Model saved!
