In [1]:
import importlib
import subprocess
import sys
import gc

def check_and_install_package(package_name, version=None):
    """Ensure ``package_name`` is importable, installing it with pip if it is not.

    Args:
        package_name: Name used both for the import check and as the pip
            requirement, so it must be valid for both purposes.
        version: Optional exact version. When the package has to be installed
            it is pinned as ``package_name==version``. When the package is
            already importable, the installed version is compared with this
            value and a mismatch is reported; nothing is reinstalled.

    Raises:
        subprocess.CalledProcessError: If the pip command exits with a
            non-zero status.
    """
    try:
        importlib.import_module(package_name)
    except ImportError:
        print(f"\n{package_name} is NOT installed. Installing now...")
        requirement = f"{package_name}=={version}" if version else package_name
        subprocess.check_call([sys.executable, "-m", "pip", "install", requirement])
        # Make the freshly installed files visible to the import system.
        importlib.invalidate_caches()
        print(f"{package_name} installation completed.")
        return

    print(f"\n{package_name} is already installed.")
    if version:
        # Imported locally: `import importlib` alone does not load this submodule.
        from importlib import metadata

        try:
            installed = metadata.version(package_name)
        except metadata.PackageNotFoundError:
            installed = None
        # The module was just imported into this process, so a mismatch is
        # only reported rather than fixed by reinstalling underneath it.
        if installed is not None and installed != version:
            print(f"Note: {package_name} {installed} is installed, but version {version} was requested.")

# Packages this notebook needs. "version" pins an exact release; None accepts any.
packages = [
    dict(name="tqdm", version=None),
    dict(name="gdown", version=None),
    dict(name="numpy", version="1.23.5"),
    dict(name="pandas", version=None),
]

# Verify each package in turn, installing the ones that cannot be imported
for package in packages:
    check_and_install_package(package_name=package["name"], version=package["version"])



tqdm is already installed.

gdown is already installed.

numpy is already installed.

pandas is already installed.


In [2]:
# Mount Google Drive at /content/drive (uses the google.colab helper, so this
# cell only works inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

# Copy the parquet file from the mounted Drive folder into /content/ so that
# it is read from the runtime's own filesystem below.
!cp '/content/drive/MyDrive/Big Data Analytics - Project/Datasets/Feature_Engineered_DF.parquet' /content/

# Load the copied parquet file into a pandas DataFrame; `df` is the frame
# every modelling cell below samples from.
output_path = '/content/Feature_Engineered_DF.parquet'
df = pd.read_parquet(output_path)

print("The Feature Engineered DataFrame has been loaded successfully.")


The Feature Engineered DataFrame has been loaded successfully.


In [None]:
# Preview the first rows of the loaded DataFrame
df.head()

Unnamed: 0,fuel_type,body_type,city,city_fuel_economy,days_in_market,dealer_zip,engine_displacement,engine_type,exterior_color,franchise_dealer,...,major_options_count,hp_x_engine_disp,hp_x_torque,listed_day,listed_month,listed_year,age,resale_value_score,maintenance_cost,luxury_score
0,Gasoline,SUV / Crossover,Ontario,20.0,23,91761,3500.0,V6,Blue,True,...,5,0.18,-0.02369,20,8,2020,0,32,41,33
1,Gasoline,Sedan,Elizabeth,22.0,22,7202,2000.0,I4,Black,False,...,8,-0.04,0.0,18,8,2020,3,26,38,36
2,Biodiesel,Pickup Truck,Omaha,22.690001,93,68134,6700.0,V8,White,True,...,5,7.4,3e-05,9,6,2020,0,30,46,34
3,Gasoline,SUV / Crossover,Clearwater,22.690001,163,33763,1500.0,I4,Black,True,...,5,0.97,0.50512,31,3,2020,0,30,37,32
4,Gasoline,SUV / Crossover,Chillicothe,18.0,25,64601,3000.0,V6,Blue,True,...,15,0.08,2e-05,16,8,2020,0,28,45,39




---



# **Models**

## **Decision Trees**

In [None]:
import pandas as pd
import numpy as np
import time
import warnings
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Silence library warnings so the cell output stays readable
warnings.filterwarnings('ignore')

# Reference point for the wall-clock runtime reported below
start_time = time.time()

# One six-step progress bar spans sampling, preprocessing and model fitting
with tqdm(total=6, desc="Processing and Training") as pbar:

    # Step 1: reproducible 40% random sample of df
    df_sample = df.sample(frac=0.4, random_state=42)
    pbar.update(1)

    # Step 2: object-dtype columns are categorical, all remaining columns numeric
    cat_columns = list(df_sample.select_dtypes(include=['object']).columns)
    num_columns = list(df_sample.select_dtypes(exclude=['object']).columns)
    num_columns.remove('price')  # the target is not a feature
    pbar.update(1)

    # Store 'franchise_dealer' as integers
    df_sample['franchise_dealer'] = df_sample['franchise_dealer'].astype(int)

    # Step 3: scale numeric columns and one-hot encode categorical ones;
    # handle_unknown='ignore' lets transform() accept categories unseen in fit()
    numeric_step = ('num', StandardScaler(), num_columns)
    categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_columns)
    preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])
    pbar.update(1)

    # Step 4: separate target from features and hold out 20% of rows for testing
    y = df_sample['price']
    X = df_sample.drop(columns='price')
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    pbar.update(1)

    # Step 5: fit the preprocessor on the training rows only, then reuse it on the test rows
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)
    pbar.update(1)

    # Step 6: fit a depth- and leaf-size-limited regression tree
    tree_settings = {
        'max_depth': 20,
        'min_samples_split': 10,
        'min_samples_leaf': 5,
        'random_state': 42,
    }
    dt = DecisionTreeRegressor(**tree_settings)
    model = dt.fit(X_train_transformed, y_train)
    pbar.update(1)

# Predict prices for the held-out rows
y_pred = model.predict(X_test_transformed)

# R-squared on the test set
r2 = r2_score(y_test, y_pred)

# Report the split sizes, then R-squared expressed as a percentage
print(f"\nTrain size: {len(X_train):,} samples")
print(f"Test size: {len(X_test):,} samples")
print(f"\n\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Elapsed wall-clock time since start_time, in minutes
end_time = time.time()
total_runtime = (end_time - start_time) / 60

print(f"\n\nOverall runtime: {round(total_runtime)} minutes.")

# Remaining error metrics on the test set
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Additional Metrics:")
print(f"Mean Absolute Error: {round(mae)}")
print(f"Mean Squared Error: {round(mse)}")
print(f"Root Mean Squared Error: {round(rmse)}")


Processing and Training: 100%|██████████| 6/6 [08:42<00:00, 87.04s/it] 


Train size: 960,012 samples
Test size: 240,004 samples


R-Squared Score (Accuracy): 83.21%


Overall runtime: 9 minutes.
Additional Metrics:
Mean Absolute Error: 2666
Mean Squared Error: 60784033
Root Mean Squared Error: 7796





In [None]:
import pandas as pd
import numpy as np
import time
import warnings
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Silence library warnings so the cell output stays readable
warnings.filterwarnings('ignore')

# Reference point for the wall-clock runtime reported below
start_time = time.time()

# One six-step progress bar spans sampling, preprocessing and model fitting
with tqdm(total=6, desc="Processing and Training") as pbar:

    # Step 1: reproducible 20% random sample of df
    df_sample = df.sample(frac=0.2, random_state=42)
    pbar.update(1)

    # Step 2: object-dtype columns are categorical, all remaining columns numeric
    cat_columns = list(df_sample.select_dtypes(include=['object']).columns)
    num_columns = list(df_sample.select_dtypes(exclude=['object']).columns)
    num_columns.remove('price')  # the target is not a feature
    pbar.update(1)

    # Store 'franchise_dealer' as integers
    df_sample['franchise_dealer'] = df_sample['franchise_dealer'].astype(int)

    # Step 3: scale numeric columns and one-hot encode categorical ones;
    # handle_unknown='ignore' lets transform() accept categories unseen in fit()
    numeric_step = ('num', StandardScaler(), num_columns)
    categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_columns)
    preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])
    pbar.update(1)

    # Step 4: separate target from features and hold out 20% of rows for testing
    y = df_sample['price']
    X = df_sample.drop(columns='price')
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    pbar.update(1)

    # Step 5: fit the preprocessor on the training rows only, then reuse it on the test rows
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)
    pbar.update(1)

    # Step 6: fit a depth- and leaf-size-limited regression tree
    tree_settings = {
        'max_depth': 20,
        'min_samples_split': 10,
        'min_samples_leaf': 5,
        'random_state': 42,
    }
    dt = DecisionTreeRegressor(**tree_settings)
    model = dt.fit(X_train_transformed, y_train)
    pbar.update(1)

# Predict prices for the held-out rows
y_pred = model.predict(X_test_transformed)

# R-squared on the test set
r2 = r2_score(y_test, y_pred)

# Report the split sizes, then R-squared expressed as a percentage
print(f"\nTrain size: {len(X_train):,} samples")
print(f"Test size: {len(X_test):,} samples")
print(f"\n\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Elapsed wall-clock time since start_time, in minutes
end_time = time.time()
total_runtime = (end_time - start_time) / 60

print(f"\n\nOverall runtime: {round(total_runtime)} minutes.")

# Remaining error metrics on the test set
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Additional Metrics:")
print(f"Mean Absolute Error: {round(mae)}")
print(f"Mean Squared Error: {round(mse)}")
print(f"Root Mean Squared Error: {round(rmse)}")


Processing and Training: 100%|██████████| 6/6 [04:53<00:00, 48.97s/it]


Train size: 480,006 samples
Test size: 120,002 samples


R-Squared Score (Accuracy): 71.45%


Overall runtime: 5 minutes.
Additional Metrics:
Mean Absolute Error: 2806
Mean Squared Error: 127337218
Root Mean Squared Error: 11284





In [None]:
import pandas as pd
import numpy as np
import time
import warnings
from tqdm import tqdm
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Ignore warnings
warnings.filterwarnings('ignore')

# Start tracking overall runtime
start_time = time.time()

# Progress bar for the preprocessing stage. It has five steps, matching the
# five update(1) calls below, so the bar finishes at 100%.
with tqdm(total=5, desc="Processing and Training") as pbar:

    # Sample 40% of the data from the original df
    df_sample = df.sample(frac=0.4, random_state=42)
    pbar.update(1)

    # Object-dtype columns are treated as categorical, everything else as numeric
    cat_columns = df_sample.select_dtypes(include=['object']).columns.tolist()
    num_columns = df_sample.select_dtypes(exclude=['object']).columns.tolist()
    num_columns.remove('price')  # Exclude the target column 'price'
    pbar.update(1)

    # Store 'franchise_dealer' as integers
    df_sample['franchise_dealer'] = df_sample['franchise_dealer'].astype(int)

    # Scale numeric columns and one-hot encode categorical ones;
    # handle_unknown='ignore' lets transform() accept categories unseen in fit()
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), num_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_columns)
        ]
    )
    pbar.update(1)

    # Separate target from features and hold out 20% of rows for testing
    X = df_sample.drop(columns='price')
    y = df_sample['price']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    pbar.update(1)

    # Fit the preprocessor on the training rows only, then reuse it on the test rows
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)
    pbar.update(1)

# Define the parameter grid (3 * 3 * 2 * 2 = 36 combinations).
# NOTE(review): 'absolute_error' is typically much slower to fit than
# 'squared_error' - confirm this grid is feasible at this sample size.
param_grid = {
    'max_depth': [10, 20, None],
    'min_samples_split': [5, 10, 20],
    'min_samples_leaf': [5, 10],
    'criterion': ['squared_error', 'absolute_error']
}

# Scoring helper: reports each candidate's metrics as soon as they are computed
def custom_scorer(y_true, y_pred, params):
    """Print MAE, RMSE and R-squared for one parameter set and return R-squared.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values for the same rows.
        params: The parameter combination that produced ``y_pred``; it is only
            echoed in the printed report.

    Returns:
        The R-squared score of ``y_pred`` against ``y_true``.
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    # One report line per metric, preceded by the parameters being scored
    report_lines = (
        f"\nParameters: {params}",
        f"Mean Absolute Error: {mae:.2f}",
        f"Root Mean Squared Error: {rmse:.2f}",
        f"R-Squared Score (Accuracy): {r2 * 100:.2f}%",
    )
    print(*report_lines, sep="\n")

    return r2

# Perform manual grid search.
# Candidates are compared on a validation split carved out of the training
# data, so the held-out test set is used only once, for the final report.
# Selecting hyperparameters on the test set would make that final score
# optimistically biased.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_transformed, y_train, test_size=0.2, random_state=42
)

best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid):
    dt = DecisionTreeRegressor(random_state=42, **params)
    dt.fit(X_fit, y_fit)

    # Score this candidate on the validation rows (also prints its metrics)
    y_val_pred = dt.predict(X_val)
    score = custom_scorer(y_val, y_val_pred, params)

    # Keep the combination with the highest validation R-squared
    if score > best_score:
        best_score = score
        best_params = params

# Print best parameters
print(f"\nBest Parameters: {best_params}")
print(f"Best Validation R-Squared Score: {best_score * 100:.2f}%")

# Refit the winning configuration on the full training set
best_model = DecisionTreeRegressor(random_state=42, **best_params)
best_model.fit(X_train_transformed, y_train)

# Final evaluation on the test set, which played no part in the selection
y_pred = best_model.predict(X_test_transformed)
final_r2 = r2_score(y_test, y_pred)
final_mae = mean_absolute_error(y_test, y_pred)
final_mse = mean_squared_error(y_test, y_pred)
final_rmse = np.sqrt(final_mse)

print("\nFinal Model Metrics:")
print(f"R-Squared Score (Accuracy): {final_r2 * 100:.2f}%")
print(f"Mean Absolute Error: {final_mae:.2f}")
print(f"Mean Squared Error: {final_mse:.2f}")
print(f"Root Mean Squared Error: {final_rmse:.2f}")

# Calculate total runtime
end_time = time.time()
total_runtime = (end_time - start_time) / 60  # Convert seconds to minutes
print(f"\nOverall runtime: {round(total_runtime)} minutes.")


## **XGBoost**

In [None]:
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import xgboost as xgb
import time

# Ignore warnings
warnings.filterwarnings('ignore')

print("Processing the data...")
with tqdm(total=5, desc="Progress") as pbar:

    # Draw a reproducible 40% random sample of df
    df_sample = df.sample(frac=0.4, random_state=42)
    pbar.update(1)

    # Object-dtype columns are treated as categorical, everything else as numeric
    cat_columns = df_sample.select_dtypes(include=['object']).columns.tolist()
    num_columns = df_sample.select_dtypes(exclude=['object']).columns.tolist()
    num_columns.remove('price')  # Exclude the target column 'price'
    pbar.update(1)

    # Preprocessing pipeline (scaling numeric features and encoding categorical features)
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), num_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_columns)
        ]
    )

    # Split the data, holding out 20% of rows for testing
    X = df_sample.drop(columns='price')
    y = df_sample['price']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    pbar.update(1)

    # Fill missing numeric values with column means computed from the training
    # rows only, so no statistic derived from the test rows reaches the model.
    # fillna() with a Series fills each column named in it and leaves the rest.
    train_means = X_train[num_columns].mean()
    X_train = X_train.fillna(train_means)
    X_test = X_test.fillna(train_means)
    pbar.update(1)

    # Fit the preprocessor on the training rows only, then reuse it on the test rows
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)
    pbar.update(1)

print("\n\nData preprocessing and splitting completed!")

print(f"Train_DF has {len(X_train):,} rows and {X_train_transformed.shape[1]} columns")
print("-------------------------------------------------------------------------------------------------------------------------------")

# Model training
print("Training XGBoost model...")

# Initialize XGBoostRegressor
xgb_regressor = xgb.XGBRegressor(
    max_depth=6,
    n_estimators=100,
    objective='reg:squarederror',
    tree_method='hist',
    random_state=42
)

# Timing starts here, so the runtime printed below covers fitting and
# prediction but not the preprocessing above
start_time = time.time()

# Train the model
xgb_regressor.fit(X_train_transformed, y_train)

# Make predictions
y_pred = xgb_regressor.predict(X_test_transformed)

# Evaluate the model
r2 = r2_score(y_test, y_pred)

print(f"\nTrain size: {len(X_train):,} samples")
print(f"Test size: {len(X_test):,} samples")
print(f"\n\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Calculate total runtime
end_time = time.time()
total_runtime = (end_time - start_time) / 60  # Convert seconds to minutes

print(f"\n\nOverall runtime: {round(total_runtime, 2)} minutes.")
print("-------------------------------------------------------------------------------------------------------------------------------")

# Calculate additional metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("\nAdditional Metrics:")
print(f"Mean Absolute Error: {round(mae)}")
print(f"Mean Squared Error: {round(mse)}")
print(f"Root Mean Squared Error: {round(rmse)}")


Processing the data...


Progress: 100%|██████████| 5/5 [00:14<00:00,  2.90s/it]




Data preprocessing and splitting completed!
Train_DF has 960,012 rows and 39396 columns
-------------------------------------------------------------------------------------------------------------------------------
Training XGBoost model...

Train size: 960,012 samples
Test size: 240,004 samples


R-Squared Score (Accuracy): 87.34%


Overall runtime: 0.58 minutes.
-------------------------------------------------------------------------------------------------------------------------------

Additional Metrics:
Mean Absolute Error: 3021
Mean Squared Error: 45804700
Root Mean Squared Error: 6768


In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import ParameterGrid
import numpy as np
import time
import xgboost as xgb

# Imported here because this cell's own import list does not include it
from sklearn.model_selection import train_test_split

# NOTE: X_train_transformed, X_test_transformed, y_train and y_test are not
# defined in this cell; they must already exist in the kernel from an earlier
# preprocessing cell.

# Define the parameter grid (3 * 3 * 3 * 2 * 2 * 3 = 324 combinations)
param_grid = {
    'max_depth': [3, 6, 9],
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'min_child_weight': [1, 3, 5]
}

# Track training time
start_time = time.time()

# Candidates are compared on a validation split carved out of the training
# data, so the held-out test set is used only once, for the final report.
# Selecting hyperparameters on the test set would make that final score
# optimistically biased.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_transformed, y_train, test_size=0.2, random_state=42
)

best_score = -np.inf
best_params = None

# Manual grid search using ParameterGrid
print("Performing manual grid search...")

for params in ParameterGrid(param_grid):
    # Initialize XGBoostRegressor with the current set of parameters
    xgb_regressor = xgb.XGBRegressor(
        objective='reg:squarederror',
        tree_method='hist',
        random_state=42,
        **params  # Pass the parameters dynamically
    )

    # Fit on the reduced training portion
    xgb_regressor.fit(X_fit, y_fit)

    # Predict the validation rows
    y_val_pred = xgb_regressor.predict(X_val)

    # Validation metrics for this parameter set
    r2 = r2_score(y_val, y_val_pred)
    mse = mean_squared_error(y_val, y_val_pred)
    rmse = np.sqrt(mse)

    # Print the parameters and the validation metrics for this iteration
    print(f"\nParameters: {params}")
    print(f"Mean Validation MSE: {mse:.2f}")
    print(f"Mean Validation RMSE: {rmse:.2f}")
    print(f"R-Squared Score (Accuracy): {r2 * 100:.2f}%")

    # Keep the combination with the highest validation R-squared
    if r2 > best_score:
        best_score = r2
        best_params = params

# Print best parameters
print(f"\nBest Parameters: {best_params}")
print(f"Best Validation R-Squared Score: {best_score * 100:.2f}%")

# Refit the winning configuration on the full training set
best_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',
    random_state=42,
    **best_params
)
best_model.fit(X_train_transformed, y_train)

# Final evaluation on the test set, which played no part in the selection
y_pred = best_model.predict(X_test_transformed)
final_r2 = r2_score(y_test, y_pred)
final_mae = mean_absolute_error(y_test, y_pred)
final_mse = mean_squared_error(y_test, y_pred)
final_rmse = np.sqrt(final_mse)

# Display final metrics
print("\nFinal Model Metrics:")
print(f"R-Squared Score (Accuracy): {final_r2 * 100:.2f}%")
print(f"Mean Absolute Error: {final_mae:.2f}")
print(f"Mean Squared Error: {final_mse:.2f}")
print(f"Root Mean Squared Error: {final_rmse:.2f}")

# Calculate total runtime
end_time = time.time()
total_runtime = (end_time - start_time) / 60  # Convert seconds to minutes

print(f"\nOverall runtime: {round(total_runtime, 2)} minutes.")
print("-------------------------------------------------------------------------------------------------------------------------------")


## **GBT Regressor**

In [None]:
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import time

# Ignore warnings
warnings.filterwarnings('ignore')

print("Processing the data...")
with tqdm(total=5, desc="Progress") as pbar:

    # Draw a reproducible 40% random sample of df
    df_sample = df.sample(frac=0.4, random_state=42)
    pbar.update(1)

    # Object-dtype columns are treated as categorical, everything else as numeric
    cat_columns = df_sample.select_dtypes(include=['object']).columns.tolist()
    num_columns = df_sample.select_dtypes(exclude=['object']).columns.tolist()
    num_columns.remove('price')  # Exclude the target column 'price'
    pbar.update(1)

    # Preprocessing: scale numerical features and one-hot encode categorical features
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), num_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_columns)
        ]
    )

    # Split the data, holding out 20% of rows for testing
    X = df_sample.drop(columns='price')
    y = df_sample['price']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    pbar.update(1)

    # Fill missing numeric values with column means computed from the training
    # rows only, so no statistic derived from the test rows reaches the model.
    # fillna() with a Series fills each column named in it and leaves the rest.
    train_means = X_train[num_columns].mean()
    X_train = X_train.fillna(train_means)
    X_test = X_test.fillna(train_means)
    pbar.update(1)

    # Fit the preprocessor on the training rows only, then reuse it on the
    # test rows. The last tick comes after this step so the bar does not
    # report completion while work is still pending.
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)
    pbar.update(1)

print("\n\nData preprocessing and splitting completed!")

# Model training
print("Training Gradient Boosting Regressor model...")

# Timing starts here, so the runtime printed below covers fitting and
# prediction but not the preprocessing above
start_time = time.time()

# Initialize GradientBoostingRegressor
gbt_regressor = GradientBoostingRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=42
)

# Train the model
gbt_regressor.fit(X_train_transformed, y_train)

# Make predictions
print("Making predictions...")
y_pred = gbt_regressor.predict(X_test_transformed)

# Evaluate the model
r2 = r2_score(y_test, y_pred)

print(f"\nTrain size: {len(X_train):,} samples")
print(f"Test size: {len(X_test):,} samples")
print(f"\n\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Calculate total runtime
end_time = time.time()
total_runtime = (end_time - start_time) / 60  # Convert seconds to minutes

print(f"\n\nOverall runtime: {round(total_runtime)} minutes.")

# Calculate additional metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("\nAdditional Metrics:")
print(f"Mean Absolute Error: {round(mae)}")
print(f"Mean Squared Error: {round(mse)}")
print(f"Root Mean Squared Error: {round(rmse)}")


Processing the data...


Progress: 100%|██████████| 5/5 [00:14<00:00,  2.99s/it]




Data preprocessing and splitting completed!
Training Gradient Boosting Regressor model...
Making predictions...

Train size: 960,012 samples
Test size: 240,004 samples


R-Squared Score (Accuracy): 85.40%


Overall runtime: 26 minutes.

Additional Metrics:
Mean Absolute Error: 3588
Mean Squared Error: 52827087
Root Mean Squared Error: 7268


## **Random Forest**

In [None]:
import pandas as pd
import numpy as np
import time
import warnings
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Ignore warnings
warnings.filterwarnings('ignore')

# Start tracking overall runtime
start_time = time.time()

# Combine processing data and model training in the same progress bar
with tqdm(total=7, desc="Processing and Training") as pbar:

    # Sample 20% of the data from the original df
    df_sample = df.sample(frac=0.2, random_state=42)
    pbar.update(1)

    # Keep only rows with a positive price (the logarithm is undefined
    # otherwise). .copy() makes the filtered frame independent, so adding a
    # column below is not an assignment on a slice of another frame.
    df_sample = df_sample[df_sample['price'] > 0].copy()

    # Log transform the target variable
    df_sample['log_price'] = np.log(df_sample['price'])
    pbar.update(1)

    # Object-dtype columns are treated as categorical, everything else as numeric
    cat_columns = df_sample.select_dtypes(include=['object']).columns.tolist()
    num_columns = df_sample.select_dtypes(exclude=['object']).columns.tolist()
    num_columns.remove('price')
    num_columns.remove('log_price')  # Exclude the target columns
    pbar.update(1)

    # Scale numeric columns and one-hot encode categorical ones;
    # handle_unknown='ignore' lets transform() accept categories unseen in fit()
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), num_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_columns)
        ]
    )
    pbar.update(1)

    # Split the data into training and test sets; the model is trained on log(price)
    X = df_sample.drop(columns=['price', 'log_price'])
    y = df_sample['log_price']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    pbar.update(1)

    # Fit the preprocessor on the training rows only, then reuse it on the test rows
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)
    pbar.update(1)

    # Define the RandomForestRegressor model.
    # n_jobs=-1 builds the trees on all available cores; with a fixed
    # random_state the fitted forest does not depend on the number of jobs.
    rf = RandomForestRegressor(
        n_estimators=50,
        max_depth=10,
        min_samples_split=10,
        random_state=42,
        n_jobs=-1
    )

    # Fit the model to the training data
    model = rf.fit(X_train_transformed, y_train)
    pbar.update(1)

# Make predictions
print("Making predictions...")
y_pred_log = model.predict(X_test_transformed)

# Undo the log transform once for predictions and once for the test targets,
# so every metric below is expressed in price units
y_pred = np.exp(y_pred_log)
y_test_price = np.exp(y_test)

# Evaluate the model
print("Evaluating the model...")
r2 = r2_score(y_test_price, y_pred)

print(f"\n\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Display results
print(f"\nTrain size: {len(X_train):,} samples")
print(f"Test size: {len(X_test):,} samples")

# Calculate total runtime
end_time = time.time()
total_runtime = (end_time - start_time) / 60  # Convert seconds to minutes

print(f"\nOverall runtime: {round(total_runtime)} minutes.")

# Calculate additional metrics
mae = mean_absolute_error(y_test_price, y_pred)
mse = mean_squared_error(y_test_price, y_pred)
rmse = np.sqrt(mse)

# Output additional metrics
print("Additional Metrics:")
print(f"Mean Absolute Error: {round(mae)}")
print(f"Mean Squared Error: {round(mse)}")
print(f"Root Mean Squared Error: {round(rmse)}")


Processing and Training: 100%|██████████| 7/7 [39:04<00:00, 334.87s/it]


Making predictions...
Evaluating the model...


R-Squared Score (Accuracy): 68.44%

Train size: 480,006 samples
Test size: 120,002 samples

Overall runtime: 39 minutes.
Additional Metrics:
Mean Absolute Error: 3569
Mean Squared Error: 140793917
Root Mean Squared Error: 11866


## **Linear Regression**

In [None]:
import pandas as pd
import numpy as np
import time
import warnings
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Silence library warnings so the cell output stays readable
warnings.filterwarnings('ignore')

# Reference point for the wall-clock runtime reported below
start_time = time.time()

# One six-step progress bar spans sampling, preprocessing and model fitting
with tqdm(total=6, desc="Processing and Training") as pbar:

    # Step 1: reproducible 40% random sample of df
    df_sample = df.sample(frac=0.4, random_state=42)
    pbar.update(1)

    # Step 2: object-dtype columns are categorical, all remaining columns numeric
    cat_columns = list(df_sample.select_dtypes(include=['object']).columns)
    num_columns = list(df_sample.select_dtypes(exclude=['object']).columns)
    num_columns.remove('price')  # the target is not a feature
    pbar.update(1)

    # Store 'franchise_dealer' as integers
    df_sample['franchise_dealer'] = df_sample['franchise_dealer'].astype(int)

    # Step 3: scale numeric columns and one-hot encode categorical ones;
    # handle_unknown='ignore' lets transform() accept categories unseen in fit()
    numeric_step = ('num', StandardScaler(), num_columns)
    categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_columns)
    preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])
    pbar.update(1)

    # Step 4: separate target from features and hold out 20% of rows for testing
    y = df_sample['price']
    X = df_sample.drop(columns='price')
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    pbar.update(1)

    # Step 5: fit the preprocessor on the training rows only, then reuse it on the test rows
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)
    pbar.update(1)

    # Step 6: ordinary least-squares fit with default settings
    lr = LinearRegression()
    model = lr.fit(X_train_transformed, y_train)
    pbar.update(1)

# Predict prices for the held-out rows
print("Making predictions...")
y_pred = model.predict(X_test_transformed)

# R-squared on the test set
r2 = r2_score(y_test, y_pred)

# Report the split sizes, then R-squared expressed as a percentage
print(f"\nTrain size: {len(X_train):,} samples")
print(f"Test size: {len(X_test):,} samples")
print(f"\n\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Elapsed wall-clock time since start_time, in minutes
end_time = time.time()
total_runtime = (end_time - start_time) / 60

print(f"\nOverall runtime: {round(total_runtime)} minutes.")

# Remaining error metrics on the test set
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Additional Metrics:")
print(f"Mean Absolute Error: {round(mae)}")
print(f"Mean Squared Error: {round(mse)}")
print(f"Root Mean Squared Error: {round(rmse)}")


Processing and Training: 100%|██████████| 6/6 [13:25<00:00, 134.32s/it]

Making predictions...

Train size: 960,012 samples
Test size: 240,004 samples


R-Squared Score (Accuracy): 81.86%

Overall runtime: 13 minutes.
Additional Metrics:
Mean Absolute Error: 4059
Mean Squared Error: 65651380
Root Mean Squared Error: 8103



