In [2]:
import pandas as pd
# Load the transformed dataset from CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs where that exact location exists — consider a project-relative path.
transformed_csv = r"D:\Car_price_prediction\notebooks\df_transformed.csv"
df_clean = pd.read_csv(transformed_csv)

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
df_clean.head(n=5)

Unnamed: 0,model_year,milage,accident,clean_title,fuel_type_E85 Flex Fuel,fuel_type_Gasoline,fuel_type_Hybrid,fuel_type_Plug-In Hybrid,fuel_type_not supported,fuel_type_–,...,transmission_Automatic CVT,transmission_CVT Transmission,transmission_M/T,transmission_Transmission Overdrive Switch,transmission_Transmission w/Dual Shift Mode,transmission_–,brand,model,engine,price
0,-0.260339,-0.39579,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,14.0,1481.0,492.0,10300
1,1.040212,-0.700365,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,19.0,1011.0,478.0,38005
2,0.064799,0.314224,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,20.0,1055.0,630.0,15500
3,0.389936,0.222428,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1235.0,442.0,31000
4,-2.211165,3.18238,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,119.0,415.0,7300


In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np


In [5]:
# Regression target: the `price` column of the loaded frame.
y = df_clean.loc[:, "price"]

In [7]:
# Feature matrix: every column except the target.
# The preview of `df_clean` shows an `Unnamed: 0` column holding 0, 1, 2, ... —
# it looks like a row index that was written into the CSV (TODO confirm). A row
# id carries no information about the car, so it must not be used as a feature.
# `errors="ignore"` on the second drop keeps this cell working if the CSV is
# later re-exported without that column; a missing `price` still raises.
X = df_clean.drop(columns="price").drop(columns="Unnamed: 0", errors="ignore")

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [15]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import xgboost as xgb

# --- 1️⃣ Set MLflow experiment ---
# Runs started after this call are recorded under this experiment name.
mlflow.set_experiment("Car Price Prediction")

# --- 2️⃣ Split dataset ---
# 80/20 train/test partition with a fixed seed so reruns give the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# --- 3️⃣ Define helper for logging runs ---
def mlflow_experiment(model, params, model_name):
    """Fit ``model``, score it on the hold-out split, and record one MLflow run.

    The function reads the module-level ``X_train``, ``y_train``, ``X_test``
    and ``y_test`` variables, so those must be defined before it is called.
    The estimator is always (re)fitted on ``X_train``/``y_train`` here, even if
    it was already fitted by the caller.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is also passed to
        ``mlflow.sklearn.log_model``.
    params : dict
        Hyperparameters to record with the run (may be empty).
    model_name : str
        Used as the MLflow run name, the logged-model name, and the label in
        the printed summary line.

    Returns
    -------
    dict
        ``{"MSE": float, "RMSE": float, "R2": float}`` computed on the test
        split, so callers can compare models programmatically instead of
        reading the printed output.
    """
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        # Plain Python floats keep the returned dict easy to compare/serialize.
        metrics = {"MSE": float(mse), "RMSE": float(rmse), "R2": float(r2)}

        # Log everything
        mlflow.log_params(params)
        mlflow.log_metrics(metrics)
        mlflow.sklearn.log_model(model, model_name)

        print(f"{model_name} => RMSE: {rmse:.3f}, R2: {r2:.3f}")

    return metrics

# --- 4️⃣ Linear Regression ---
# Baseline model: nothing is tuned, so an empty parameter dict is logged.
params_lr = dict()
lin_reg = LinearRegression()
mlflow_experiment(model=lin_reg, params=params_lr, model_name="LinearRegression")

# --- 5️⃣ Random Forest (with Grid Search) ---
# Search space: 2 x 3 x 2 = 12 combinations, each scored by R2 with 3-fold CV.
param_grid_rf = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)
rf = RandomForestRegressor(random_state=42)
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_rf,
    scoring='r2',
    cv=3,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

# Record the winning configuration as its own MLflow run; the helper fits the
# estimator on X_train again before scoring it on X_test.
best_rf = grid_rf.best_estimator_
mlflow_experiment(best_rf, grid_rf.best_params_, "RandomForestRegressor")

# --- 6️⃣ Decision Tree (with Grid Search) ---
# Search space: 4 x 3 x 3 = 36 combinations, each scored by R2 with 3-fold CV.
param_grid_dt = dict(
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
dt = DecisionTreeRegressor(random_state=42)
grid_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_grid_dt,
    scoring='r2',
    cv=3,
    n_jobs=-1,
)
grid_dt.fit(X_train, y_train)

# Record the winning configuration as its own MLflow run; the helper fits the
# estimator on X_train again before scoring it on X_test.
best_dt = grid_dt.best_estimator_
mlflow_experiment(best_dt, grid_dt.best_params_, "DecisionTreeRegressor")

# --- 7️⃣ XGBoost Regressor (with Grid Search) ---
# Search space: 2 x 3 x 3 x 2 = 36 combinations, each scored by R2 with 3-fold CV.
param_grid_xgb = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
)
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
grid_xgb = GridSearchCV(
    estimator=xgb_reg,
    param_grid=param_grid_xgb,
    scoring='r2',
    cv=3,
    n_jobs=-1,
)
grid_xgb.fit(X_train, y_train)

# Record the winning configuration as its own MLflow run; the helper fits the
# estimator on X_train again before scoring it on X_test.
best_xgb = grid_xgb.best_estimator_
mlflow_experiment(best_xgb, grid_xgb.best_params_, "XGBoostRegressor")

# Final confirmation once every model above has been logged.
print("\n✅ All experiments logged successfully. Run 'mlflow ui' to view the results.")




LinearRegression => RMSE: 117221.089, R2: 0.062




RandomForestRegressor => RMSE: 116888.870, R2: 0.067




DecisionTreeRegressor => RMSE: 119433.696, R2: 0.026




XGBoostRegressor => RMSE: 116669.670, R2: 0.070

✅ All experiments logged successfully. Run 'mlflow ui' to view the results.


In [17]:
import pickle

# Serialize the tuned XGBoost estimator to a pickle file; the relative filename
# resolves against the current working directory.
with open("best_xgb_model.pkl", "wb") as model_file:
    pickle.dump(best_xgb, model_file)

print("✅ Best XGBoost model saved as 'best_xgb_model.pkl'")


✅ Best XGBoost model saved as 'best_xgb_model.pkl'
