In [1]:
!pip install xgboost



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error

file_path = r'C:/Users/Darkk/OneDrive/Desktop/cars/final_car.xlsx'

# Load the Excel file
df_cars_final = pd.read_excel(file_path)

categorical_features = df_cars_final.select_dtypes(include=['object']).columns
numerical_features = df_cars_final.select_dtypes(include=['number']).columns

# Separate the target variable
X = df_cars_final.drop('price', axis=1)
y = df_cars_final['price']

# Apply one-hot encoding to categorical variables
X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

print(f"Training feature set shape: {X_train.shape}")
print(f"Testing feature set shape: {X_test.shape}")

Training feature set shape: (6263, 316)
Testing feature set shape: (1566, 316)


In [3]:

# Define the models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'XGBoost': XGBRegressor()
}

# Store the results
results = {}
# Evaluate each model
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Calculate metrics
    mse_train = mean_squared_error(y_train, y_train_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)
    r2_train = r2_score(y_train, y_train_pred)
    mape_train = mean_absolute_percentage_error(y_train, y_train_pred) * 100
    
    mse_test = mean_squared_error(y_test, y_test_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)
    r2_test = r2_score(y_test, y_test_pred)
    mape_test = mean_absolute_percentage_error(y_test, y_test_pred) * 100
    
    # Store the results
    results[model_name] = {
        'MSE_train': mse_train,
        'MAE_train': mae_train,
        'R2_train': r2_train,
        'MAPE_train': mape_train,
        'MSE_test': mse_test,
        'MAE_test':mae_test,
        'R2_test': r2_test,
        'MAPE_test': mape_test
    }

# Convert results to DataFrame
results_df = pd.DataFrame(results).T
print(results_df)

                      MSE_train      MAE_train  R2_train  MAPE_train  \
Linear Regression  7.178467e+10  158572.249671  0.865367   28.463700   
Decision Tree      1.812904e+08     704.667625  0.999660    0.079229   
Random Forest      6.362064e+09   41600.658961  0.988068    6.022444   
XGBoost            9.945143e+09   65721.555442  0.981348   10.836095   

                       MSE_test       MAE_test   R2_test  MAPE_test  
Linear Regression  7.988207e+10  161289.733168  0.811879  29.684028  
Decision Tree      6.707526e+10  130917.766922  0.842039  18.239362  
Random Forest      3.416245e+10   99923.812095  0.919548  14.953844  
XGBoost            3.375828e+10   97552.215921  0.920500  14.406064  


In [4]:
#XGBoost
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

param_distributions_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
    'alpha': [0, 0.1, 0.5, 1],
    'lambda': [0, 0.1, 0.5, 1]
}

xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_distributions_xgb,
    n_iter=50,
    scoring='neg_mean_absolute_percentage_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42
)

xgb_random_search.fit(X_train, y_train)
best_xgb_params = xgb_random_search.best_params_
print(f"Best XGBoost Parameters: {best_xgb_params}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best XGBoost Parameters: {'alpha': 0.5, 'colsample_bytree': np.float64(0.6705331755251293), 'lambda': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300, 'subsample': np.float64(0.9386696766904905)}


In [11]:
# Initialize and train the Random Forest model with best parameters
rf_model = RandomForestRegressor(n_estimators=300, 
                                 max_depth=5, 
                                 min_samples_split=2, 
                                 min_samples_leaf=1,
                                 random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_train_pred_rf = rf_model.predict(X_train)

# Calculate metrics for the training set
mse_train_rf = mean_squared_error(y_train, y_train_pred_rf)
r2_train_rf = r2_score(y_train, y_train_pred_rf)
mape_train_rf = mean_absolute_percentage_error(y_train, y_train_pred_rf) * 100

# Print metrics for the training set
print(f"Random Forest Train MSE: {mse_train_rf}")
print(f"Random Forest Train R^2: {r2_train_rf}")
print(f"Random Forest Train MAPE: {mape_train_rf}%")

# Make predictions on the test set
y_test_pred_rf = rf_model.predict(X_test)

# Calculate metrics for the test set
mse_test_rf = mean_squared_error(y_test, y_test_pred_rf)
r2_test_rf = r2_score(y_test, y_test_pred_rf)
mape_test_rf = mean_absolute_percentage_error(y_test, y_test_pred_rf) * 100

# Print metrics for the test set
print(f"Random Forest Test MSE: {mse_test_rf}")
print(f"Random Forest Test R^2: {r2_test_rf}")
print(f"Random Forest Test MAPE: {mape_test_rf}%")

Random Forest Train MSE: 81416820414.9345
Random Forest Train R^2: 0.8473017180366543
Random Forest Train MAPE: 25.834345559759996%
Random Forest Test MSE: 76097144102.37793
Random Forest Test R^2: 0.820792336257405
Random Forest Test MAPE: 25.033820002890728%


In [13]:
#Best XGBoost Parameters: 

xgb_model = XGBRegressor(alpha=0.5, 
                         colsample_bytree=0.6705331755251293, 
                         lambda_=0, 
                         learning_rate=0.1, 
                         max_depth=5, 
                         n_estimators=300, 
                         subsample=0.9386696766904905, 
                         objective='reg:squarederror',
                         random_state=42)
xgb_model.fit(X_train, y_train)


# Make predictions
y_train_pred_xgb = xgb_model.predict(X_train)

# Calculate metrics
mse_train_xgb = mean_squared_error(y_train, y_train_pred_xgb)
r2_train_xgb = r2_score(y_train, y_train_pred_xgb)
mape_train_xgb = mean_absolute_percentage_error(y_train, y_train_pred_xgb) * 100

print(f"XGBoost Train MSE: {mse_train_xgb}")
print(f"XGBoost Train R^2: {r2_train_xgb}")
print(f"XGBoost Train MAPE: {mape_train_xgb}%")

# Make predictions
y_test_pred_xgb = xgb_model.predict(X_test)

# Calculate metrics
mse_test_xgb = mean_squared_error(y_test, y_test_pred_xgb)
r2_test_xgb = r2_score(y_test, y_test_pred_xgb)
mape_test_xgb = mean_absolute_percentage_error(y_test, y_test_pred_xgb) * 100

print(f"XGBoost Test MSE: {mse_test_xgb}")
print(f"XGBoost Test R^2: {r2_test_xgb}")
print(f"XGBoost Test MAPE: {mape_test_xgb}%")

Parameters: { "lambda_" } are not used.



XGBoost Train MSE: 12361176071.35584
XGBoost Train R^2: 0.9768164570966686
XGBoost Train MAPE: 12.428734735746211%
XGBoost Test MSE: 31687653737.574192
XGBoost Test R^2: 0.9253760379212769
XGBoost Test MAPE: 14.645574731494088%


In [14]:
import joblib

# Save the model
joblib.dump(xgb_model, 'xgboost_model.pkl')

['xgboost_model.pkl']

In [15]:

# Assuming you have a test dataset 'X_test'
predictions = xgb_model.predict(X_test)

# Convert predictions to a DataFrame
df_predictions = pd.DataFrame(predictions, columns=['Predictions'])

# Save as CSV
df_predictions.to_csv('predictions.csv', index=False)

# Save as Excel
df_predictions.to_excel('predictions.xlsx', index=False)


In [16]:
# Save column names for encoding
encoded_columns = list(X_encoded.columns)
joblib.dump(encoded_columns, 'encoded_columns.pkl')

['encoded_columns.pkl']

In [17]:
# Get feature importance as a dictionary
feature_importance = xgb_model.get_booster().get_score(importance_type='weight')

# Convert to a DataFrame
df_importance = pd.DataFrame(list(feature_importance.items()), columns=['Feature', 'Importance'])

# Save as CSV
df_importance.to_csv('feature_importance.csv', index=False)

# Save as Excel
df_importance.to_excel('feature_importance.xlsx', index=False)
