In [1]:
import importlib
import subprocess
import sys
import gc

def check_and_install_package(package_name, version=None):
    try:
        importlib.import_module(package_name)
        print(f"\n{package_name} is already installed.")
    except ImportError:
        print(f"\n{package_name} is NOT installed. Installing now...")
        if version:
            subprocess.check_call([sys.executable, "-m", "pip", "install", f"{package_name}=={version}"])
        else:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
        print(f"{package_name} installation completed.")

# List of packages to check along with specific versions if necessary
packages = [
    {"name": "tqdm", "version": None},
    {"name": "gdown", "version": None},
    {"name": "numpy", "version": "1.23.5"},
    {"name": "pandas", "version": None}  # Added pandas
]

# Check and install packages
for package in packages:
    check_and_install_package(package["name"], package["version"])



tqdm is already installed.

gdown is already installed.

numpy is already installed.

pandas is already installed.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

!cp '/content/drive/MyDrive/Big Data Analytics - Project/Datasets/Feature_Engineered_DF.parquet' /content/

# Load the parquet file into a pandas DataFrame
output_path = '/content/Feature_Engineered_DF.parquet'
df = pd.read_parquet(output_path)

print("The Feature Engineered DataFrame has been loaded successfully.")


The Feature Engineered DataFrame has been loaded successfully.


In [None]:
df.head()

Unnamed: 0,fuel_type,body_type,city,city_fuel_economy,days_in_market,dealer_zip,engine_displacement,engine_type,exterior_color,franchise_dealer,...,major_options_count,hp_x_engine_disp,hp_x_torque,listed_day,listed_month,listed_year,age,resale_value_score,maintenance_cost,luxury_score
0,Gasoline,SUV / Crossover,Ontario,20.0,23,91761,3500.0,V6,Blue,True,...,5,0.18,-0.02369,20,8,2020,0,32,41,33
1,Gasoline,Sedan,Elizabeth,22.0,22,7202,2000.0,I4,Black,False,...,8,-0.04,0.0,18,8,2020,3,26,38,36
2,Biodiesel,Pickup Truck,Omaha,22.690001,93,68134,6700.0,V8,White,True,...,5,7.4,3e-05,9,6,2020,0,30,46,34
3,Gasoline,SUV / Crossover,Clearwater,22.690001,163,33763,1500.0,I4,Black,True,...,5,0.97,0.50512,31,3,2020,0,30,37,32
4,Gasoline,SUV / Crossover,Chillicothe,18.0,25,64601,3000.0,V6,Blue,True,...,15,0.08,2e-05,16,8,2020,0,28,45,39




---



## **XGB**

In [None]:
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import xgboost as xgb
import time

# Ignore warnings
warnings.filterwarnings('ignore')

print("Processing the data...")
with tqdm(total=5, desc="Progress") as pbar:
    # Sample 10% of the data
    df_sample = df.sample(frac=0.4, random_state=42)  # Randomly sample 300k records
    pbar.update(1)

    # Handle categorical columns
    cat_columns = df_sample.select_dtypes(include=['object']).columns.tolist()
    num_columns = df_sample.select_dtypes(exclude=['object']).columns.tolist()
    num_columns.remove('price')  # Exclude the target column 'price'
    pbar.update(1)

    # Fill missing numeric values with mean
    df_sample[num_columns] = df_sample[num_columns].fillna(df_sample[num_columns].mean())
    pbar.update(1)

    # Preprocessing pipeline (scaling numeric features and encoding categorical features)
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), num_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_columns)
        ]
    )

    # Split the data
    X = df_sample.drop(columns='price')
    y = df_sample['price']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    pbar.update(1)

    # Apply the preprocessor
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)
    pbar.update(1)

print("\n\nData preprocessing and splitting completed!")
print(f"Train_DF has {len(X_train):,} rows and {X_train_transformed.shape[1]} columns")
print("-------------------------------------------------------------------------------------------------------------------------------")
# Model training
print("Training XGBoost model...")

# Initialize XGBoostRegressor
xgb_regressor = xgb.XGBRegressor(
    max_depth=6,
    n_estimators=100,
    objective='reg:squarederror',
    tree_method='hist',
    random_state=42
)

# Track training time
start_time = time.time()

# Train the model
xgb_regressor.fit(X_train_transformed, y_train)

# Make predictions
y_pred = xgb_regressor.predict(X_test_transformed)

# Evaluate the model
r2 = r2_score(y_test, y_pred)

print(f"\nTrain size: {len(X_train):,} samples")
print(f"Test size: {len(X_test):,} samples")
print(f"\n\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Calculate total runtime
end_time = time.time()
total_runtime = (end_time - start_time) / 60  # Convert seconds to minutes

print(f"\n\nOverall runtime: {round(total_runtime, 2)} minutes.")
print("-------------------------------------------------------------------------------------------------------------------------------")

# Calculate additional metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("\nAdditional Metrics:")
print(f"Mean Absolute Error: {round(mae)}")
print(f"Mean Squared Error: {round(mse)}")
print(f"Root Mean Squared Error: {round(rmse)}")


Processing the data...


Progress: 100%|██████████| 5/5 [00:22<00:00,  4.60s/it]




Data preprocessing and splitting completed!
Train_DF has 960,012 rows and 39396 columns
-------------------------------------------------------------------------------------------------------------------------------
Training XGBoost model...

Train size: 960,012 samples
Test size: 240,004 samples


R-Squared Score (Accuracy): 87.34%


Overall runtime: 0.65 minutes.
-------------------------------------------------------------------------------------------------------------------------------

Additional Metrics:
Mean Absolute Error: 3021
Mean Squared Error: 45804700
Root Mean Squared Error: 6768


In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import ParameterGrid
import numpy as np
import time
import xgboost as xgb

# Define the parameter grid
param_grid = {
    'max_depth': [3, 6, 9],
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'min_child_weight': [1, 3, 5]
}

# Track training time
start_time = time.time()

best_score = -np.inf
best_params = None

# Manual grid search using ParameterGrid
print("Performing manual grid search...")

for params in ParameterGrid(param_grid):
    # Initialize XGBoostRegressor with the current set of parameters
    xgb_regressor = xgb.XGBRegressor(
        objective='reg:squarederror',
        tree_method='hist',
        random_state=42,
        **params  # Pass the parameters dynamically
    )

    # Fit the model
    xgb_regressor.fit(X_train_transformed, y_train)

    # Make predictions
    y_pred = xgb_regressor.predict(X_test_transformed)

    # Calculate R2 score for this parameter set
    r2 = r2_score(y_test, y_pred)

    # Calculate Mean Squared Error (MSE)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    # Print the parameters and the metrics for this iteration
    print(f"\nParameters: {params}")
    print(f"Mean Test MSE: {mse:.2f}")
    print(f"Mean Test RMSE: {rmse:.2f}")
    print(f"R-Squared Score (Accuracy): {r2 * 100:.2f}%")

    # Update the best score and parameters if this iteration is better
    if r2 > best_score:
        best_score = r2
        best_params = params

# Print best parameters
print(f"\nBest Parameters: {best_params}")
print(f"Best R-Squared Score: {best_score * 100:.2f}%")

# Train final model with best parameters
best_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',
    random_state=42,
    **best_params
)
best_model.fit(X_train_transformed, y_train)

# Final evaluation
y_pred = best_model.predict(X_test_transformed)
final_r2 = r2_score(y_test, y_pred)
final_mae = mean_absolute_error(y_test, y_pred)
final_mse = mean_squared_error(y_test, y_pred)
final_rmse = np.sqrt(final_mse)

# Display final metrics
print("\nFinal Model Metrics:")
print(f"R-Squared Score (Accuracy): {final_r2 * 100:.2f}%")
print(f"Mean Absolute Error: {final_mae:.2f}")
print(f"Mean Squared Error: {final_mse:.2f}")
print(f"Root Mean Squared Error: {final_rmse:.2f}")

# Calculate total runtime
end_time = time.time()
total_runtime = (end_time - start_time) / 60  # Convert seconds to minutes

print(f"\nOverall runtime: {round(total_runtime, 2)} minutes.")
print("-------------------------------------------------------------------------------------------------------------------------------")


Performing manual grid search...

Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.8}
Mean Test MSE: 169233008.00
Mean Test RMSE: 13008.96
R-Squared Score (Accuracy): 53.24%

Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 1.0}
Mean Test MSE: 168126816.00
Mean Test RMSE: 12966.37
R-Squared Score (Accuracy): 53.55%

Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}
Mean Test MSE: 122739560.00
Mean Test RMSE: 11078.79
R-Squared Score (Accuracy): 66.09%

Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1.0}
Mean Test MSE: 121052592.00
Mean Test RMSE: 11002.39
R-Squared Score (Accuracy): 66.55%

Parameters: {'colsample_bytree': 0.8, 'learning_rate':

In [None]:
# Print best parameters
print(f"Best Parameters: {best_params}")
print(f"Best R-Squared Score: {best_score * 100:.2f}%")

print("\nFinal Model Metrics:")
print(f"R-Squared Score (Accuracy): {final_r2 * 100:.2f}%")
print(f"Mean Absolute Error: {final_mae:.2f}")
print(f"Mean Squared Error: {final_mse:.2f}")
print(f"Root Mean Squared Error: {final_rmse:.2f}")

# Calculate total runtime
end_time = time.time()
total_runtime = (end_time - start_time) / 60  # Convert seconds to minutes
print(f"\nOverall runtime: {round(total_runtime)} minutes.")

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.3, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 300, 'subsample': 1.0}
Best R-Squared Score: 88.90%

Final Model Metrics:
R-Squared Score (Accuracy): 88.90%
Mean Absolute Error: 2642.86
Mean Squared Error: 40191072.00
Root Mean Squared Error: 6339.64
Overall runtime: 456.62 minutes
