# Car Price Prediction Notebook

## Data Loading

In [None]:
import pandas as pd
import numpy as np
import warnings
# Silence library warnings so the cell outputs stay readable.
# NOTE(review): a blanket 'ignore' also hides deprecation and convergence
# warnings; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

# Input CSV, resolved relative to the notebook's working directory.
path = 'new_car_data.csv'
df = pd.read_csv(path)

# Preview up to 20 random rows. The fixed seed makes the preview identical on
# every "Restart & Run All", and the min() guard avoids the ValueError that
# DataFrame.sample raises when asked for more rows than the frame contains.
df.sample(n=min(20, len(df)), random_state=42)

## Feature Engineering

In [None]:
# Year against which vehicle age is measured (kept in one place so it is easy
# to update when the data is refreshed).
REFERENCE_YEAR = 2024

# Vehicle age in years, derived from the model year.
df['car_age'] = REFERENCE_YEAR - df['model_year']

# Remove the 'ext_col' column. Re-binding (instead of inplace=True) together
# with errors='ignore' keeps this cell idempotent: re-running it no longer
# raises KeyError because the column is already gone.
df = df.drop(columns=['ext_col'], errors='ignore')

# Interaction term: age multiplied by mileage.
df['age_mileage_interaction'] = df['car_age'] * df['milage']

# Mean horsepower of all cars sharing the same model year, broadcast to rows.
# NOTE(review): the column is spelled 'engin_hp' here -- confirm it matches
# the CSV header.
df['avg_hp_by_year'] = df.groupby('model_year')['engin_hp'].transform('mean')

### Mileage Features

In [None]:
# Work from a single handle on the mileage column.
mileage = df['milage']

# Mean mileage of every car that shares the row's model, aligned to df's index.
per_model_mean = mileage.groupby(df['model']).transform('mean')
df['model_milage_avg'] = per_model_mean

# Signed distance of each car's mileage from its model's average.
df['milage_minus_model_avg'] = mileage - per_model_mean

# Position of each car's mileage within the whole dataset, as a fraction.
df['milage_percentile'] = mileage.rank(pct=True)

## Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Inspect dtypes and per-column missing-value counts before modelling.
df.info()
print(df.isna().sum())

# Features are every column except the target; the target is the price.
X = df.drop(columns=['price'])
y = df['price']

# Hold out 20% of the rows for testing. The previous value, test_size=42, is
# an integer, which scikit-learn interprets as an absolute number of test
# samples -- i.e. a test set of exactly 42 rows regardless of dataset size.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Sanity check: the split must partition the data without losing rows.
assert len(x_train) + len(x_test) == len(X)

df.head(1)

## Baseline Model Comparison

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def check_all_models(x_train, x_test, y_train, y_test, random_state=42, overfit_gap=0.1):
    """Fit a set of baseline regressors and display a comparison table.

    Each model is trained on ``(x_train, y_train)`` with default
    hyperparameters and evaluated on ``(x_test, y_test)``. One row per model
    is collected and the resulting DataFrame is rendered with ``display``.

    Parameters
    ----------
    x_train, x_test : array-like
        Feature matrices for fitting and evaluation.
    y_train, y_test : array-like
        Target values matching ``x_train`` / ``x_test``.
    random_state : int, default 42
        Seed passed to every estimator that accepts one, so the table is
        reproducible across runs.
    overfit_gap : float, default 0.1
        A model is flagged as overfitting when its train R² exceeds its test
        R² by more than this amount. (Flagging any positive gap would mark
        almost every model, since train scores are rarely below test scores.)

    Returns
    -------
    None
        The comparison table is displayed, not returned.
    """
    models = {
        'Linear Regression': LinearRegression(),
        'K-Nearest Neighbors': KNeighborsRegressor(),
        'Decision Tree': DecisionTreeRegressor(random_state=random_state),
        'Random Forest': RandomForestRegressor(random_state=random_state),
        'Gradient Boosting': GradientBoostingRegressor(random_state=random_state),
        'AdaBoost': AdaBoostRegressor(random_state=random_state),
        'XGBoost': XGBRegressor(random_state=random_state),
        'XGB Random Forest': XGBRFRegressor(random_state=random_state),
    }
    results = []
    for name, model in models.items():
        model.fit(x_train, y_train)
        preds = model.predict(x_test)
        train_r2 = model.score(x_train, y_train)
        # Reuse the predictions already computed instead of predicting the
        # test set a second time through model.score().
        test_r2 = r2_score(y_test, preds)
        mae = mean_absolute_error(y_test, preds)
        mse = mean_squared_error(y_test, preds)
        rmse = np.sqrt(mse)
        results.append({
            'Model': name,
            'Train R²': train_r2,
            'Test R²': test_r2,
            'MAE': mae,
            'MSE': mse,
            'RMSE': rmse,
            # Only a train/test gap larger than the tolerance counts.
            'Overfitting': bool((train_r2 - test_r2) > overfit_gap),
            # Poor fit on both splits suggests the model is too simple.
            'Underfitting': bool(train_r2 < 0.5 and test_r2 < 0.5)
        })
    display(pd.DataFrame(results))

# Fit and score the baseline models on the notebook's train/test split.
check_all_models(x_train, x_test, y_train, y_test)

## XGBoost Quick Tuning

In [None]:
import xgboost as xgb
from xgboost import XGBRegressor

# Carve a validation set out of the training data for early stopping.
# Monitoring the test set would pick the boosting round that happens to suit
# the test rows, which makes the test score reported below optimistic.
x_fit, x_val, y_fit, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42
)

model = XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    early_stopping_rounds=50,
    n_estimators=1000,
    max_depth=6,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1,
    learning_rate=0.05,
    random_state=42
)
model.fit(x_fit, y_fit, eval_set=[(x_val, y_val)], verbose=False)

# Best boosting round and its validation RMSE, as chosen by early stopping.
print(model.best_iteration)
print(model.best_score)
# R² on the untouched test set.
print(model.score(x_test, y_test))

# 5-fold cross-validation with the same booster parameters. Only training
# rows are used so the held-out test set never influences the round count.
dtrain = xgb.DMatrix(x_train, label=y_train)
params = model.get_xgb_params()
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=1000,
    nfold=5,
    metrics=('rmse',),
    early_stopping_rounds=50,
    as_pandas=True,
    seed=42,
)
# Number of boosting rounds kept by cross-validated early stopping.
len(cv_results)

## Optuna Hyperparameter Tuning

In [None]:
import optuna
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Validation split used only for the hyperparameter search. Tuning against
# x_test would leak the test set into model selection, so the score reported
# on it afterwards would no longer estimate performance on unseen data.
x_fit, x_val, y_fit, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42
)


def objective(trial):
    """Optuna objective: validation RMSE of one XGBoost configuration.

    Samples a set of hyperparameters from ``trial``, fits an ``XGBRegressor``
    on the fitting split with early stopping monitored on the validation
    split, and returns the RMSE on that validation split (lower is better).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample hyperparameters.

    Returns
    -------
    float
        Root mean squared error on the validation split.
    """
    params = {
        'objective': 'reg:squarederror',
        'n_estimators': trial.suggest_int("n_estimators", 100, 2000),
        'learning_rate': trial.suggest_float("learning_rate", 0.01, 0.3),
        'max_depth': trial.suggest_int("max_depth", 3, 10),
        'min_child_weight': trial.suggest_int("min_child_weight", 1, 10),
        'gamma': trial.suggest_float("gamma", 0, 0.5),
        'subsample': trial.suggest_float("subsample", 0.5, 1.0),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.5, 1.0),
        # Regularisation strengths span many orders of magnitude, hence log scale.
        'reg_alpha': trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        'random_state': 42,
        'eval_metric': 'rmse',
        'early_stopping_rounds': 50
    }
    m = XGBRegressor(**params)
    m.fit(x_fit, y_fit, eval_set=[(x_val, y_val)], verbose=False)
    preds = m.predict(x_val)
    return np.sqrt(mean_squared_error(y_val, preds))

# Seed the sampler (TPE is Optuna's default) so the search is repeatable.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)

# Run trials sequentially: parallel trials finish in a nondeterministic order,
# which defeats the seed, and each XGBoost fit already uses multiple threads.
study.optimize(objective, n_trials=500, n_jobs=1)

# Best objective value found and the hyperparameters that produced it.
print(study.best_value)
print(study.best_params)

## Final Model and Evaluation

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import shap
import matplotlib.pyplot as plt
import joblib

# Early stopping needs a monitoring set. Use a slice of the training data so
# the test set stays untouched and the test metrics below remain unbiased.
x_fit, x_val, y_fit, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42
)

# Rebuild the model from the best hyperparameters found by the Optuna study.
best_model = XGBRegressor(
    **study.best_params,
    objective='reg:squarederror',
    eval_metric='rmse',
    early_stopping_rounds=50,
    random_state=42
)
best_model.fit(x_fit, y_fit, eval_set=[(x_val, y_val)], verbose=False)

# Predictions on the full training split (this includes the validation rows,
# which were used for early stopping only) and on the held-out test split.
train_preds = best_model.predict(x_train)
test_preds = best_model.predict(x_test)

train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))
train_r2 = r2_score(y_train, train_preds)
test_r2 = r2_score(y_test, test_preds)

# Printed in order: train RMSE, test RMSE, train R², test R².
print(train_rmse)
print(test_rmse)
print(train_r2)
print(test_r2)

# Feature attributions for the test-set predictions.
explainer = shap.Explainer(best_model)
shap_values = explainer(x_test)
shap.summary_plot(shap_values, x_test)

# Predicted vs actual prices; the dashed red diagonal marks perfect predictions.
plt.scatter(y_test, test_preds, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Predicted vs Actual")
plt.show()

# Persist the fitted model for later use.
# NOTE(review): joblib files are pickle-based -- only load them from trusted
# sources, and expect them to be tied to the library versions used here.
joblib.dump(best_model, 'best_model.pkl')

## Next Steps
- Load the saved model
- Integrate into a web API
- Deploy to production

## Acknowledgements

## End of Notebook

In [None]:
# Marker printed when this final cell runs.
print("Notebook execution finished")