In [1]:
import importlib
import subprocess
import sys
import gc

def check_and_install_package(package_name, version=None):
    try:
        importlib.import_module(package_name)
        print(f"\n{package_name} is already installed.")
    except ImportError:
        print(f"\n{package_name} is NOT installed. Installing now...")
        if version:
            subprocess.check_call([sys.executable, "-m", "pip", "install", f"{package_name}=={version}"])
        else:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
        print(f"{package_name} installation completed.")

# List of packages to check along with specific versions if necessary
packages = [
    {"name": "tqdm", "version": None},
    {"name": "gdown", "version": None},
    {"name": "numpy", "version": "1.23.5"},
    {"name": "pandas", "version": None}  # Added pandas
]

# Check and install packages
for package in packages:
    check_and_install_package(package["name"], package["version"])



tqdm is already installed.

gdown is already installed.

numpy is already installed.

pandas is already installed.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

!cp '/content/drive/MyDrive/Big Data Analytics - Project/Datasets/Feature_Engineered_DF.parquet' /content/

# Load the parquet file into a pandas DataFrame
output_path = '/content/Feature_Engineered_DF.parquet'
df = pd.read_parquet(output_path)

print("The Feature Engineered DataFrame has been loaded successfully.")


The Feature Engineered DataFrame has been loaded successfully.


In [None]:
df.head()

Unnamed: 0,fuel_type,body_type,city,city_fuel_economy,days_in_market,dealer_zip,engine_displacement,engine_type,exterior_color,franchise_dealer,...,major_options_count,hp_x_engine_disp,hp_x_torque,listed_day,listed_month,listed_year,age,resale_value_score,maintenance_cost,luxury_score
0,Gasoline,SUV / Crossover,Ontario,20.0,23,91761,3500.0,V6,Blue,True,...,5,0.18,-0.02369,20,8,2020,0,32,41,33
1,Gasoline,Sedan,Elizabeth,22.0,22,7202,2000.0,I4,Black,False,...,8,-0.04,0.0,18,8,2020,3,26,38,36
2,Biodiesel,Pickup Truck,Omaha,22.690001,93,68134,6700.0,V8,White,True,...,5,7.4,3e-05,9,6,2020,0,30,46,34
3,Gasoline,SUV / Crossover,Clearwater,22.690001,163,33763,1500.0,I4,Black,True,...,5,0.97,0.50512,31,3,2020,0,30,37,32
4,Gasoline,SUV / Crossover,Chillicothe,18.0,25,64601,3000.0,V6,Blue,True,...,15,0.08,2e-05,16,8,2020,0,28,45,39




---



## **Random Forest**

In [None]:
import pandas as pd
import numpy as np
import time
import warnings
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Ignore warnings
warnings.filterwarnings('ignore')

# Start tracking overall runtime
start_time = time.time()

# Combine processing data and model training in the same progress bar
with tqdm(total=7, desc="Processing and Training") as pbar:

    # Sample 20% of the data from the original df
    df_sample = df.sample(frac=0.2, random_state=42)
    pbar.update(1)

    # Remove rows where 'price' is <= 0 (to avoid issues with log transformation)
    df_sample = df_sample[df_sample['price'] > 0]

    # Log transform the target variable
    df_sample['log_price'] = np.log(df_sample['price'])
    pbar.update(1)

    # Handle categorical columns
    cat_columns = df_sample.select_dtypes(include=['object']).columns.tolist()
    num_columns = df_sample.select_dtypes(exclude=['object']).columns.tolist()
    num_columns.remove('price')
    num_columns.remove('log_price')  # Exclude the target columns
    pbar.update(1)

    # Prepare the preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), num_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_columns)
        ]
    )
    pbar.update(1)

    # Split the data into training and test sets
    X = df_sample.drop(columns=['price', 'log_price'])
    y = df_sample['log_price']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    pbar.update(1)

    # Apply the preprocessing to the features
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)
    pbar.update(1)

    # Define the RandomForestRegressor model
    rf = RandomForestRegressor(
        n_estimators=50,
        max_depth=10,
        min_samples_split=10,
        random_state=42
    )

    # Fit the model to the training data
    model = rf.fit(X_train_transformed, y_train)
    pbar.update(1)

# Make predictions
print("Making predictions...")
y_pred_log = model.predict(X_test_transformed)

# Apply exponential to reverse log transformation
y_pred = np.exp(y_pred_log)

# Evaluate the model
print("Evaluating the model...")
r2 = r2_score(np.exp(y_test), y_pred)

print(f"\n\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Display results
print(f"\nTrain size: {len(X_train):,} samples")
print(f"Test size: {len(X_test):,} samples")

# Calculate total runtime
end_time = time.time()
total_runtime = (end_time - start_time) / 60  # Convert seconds to minutes

print(f"\nOverall runtime: {round(total_runtime)} minutes.")

# Calculate additional metrics
mae = mean_absolute_error(np.exp(y_test), y_pred)
mse = mean_squared_error(np.exp(y_test), y_pred)
rmse = np.sqrt(mse)

# Output additional metrics
print("Additional Metrics:")
print(f"Mean Absolute Error: {round(mae)}")
print(f"Mean Squared Error: {round(mse)}")
print(f"Root Mean Squared Error: {round(rmse)}")


Processing and Training: 100%|██████████| 7/7 [39:04<00:00, 334.87s/it]


Making predictions...
Evaluating the model...


R-Squared Score (Accuracy): 68.44%

Train size: 480,006 samples
Test size: 120,002 samples

Overall runtime: 39 minutes.
Additional Metrics:
Mean Absolute Error: 3569
Mean Squared Error: 140793917
Root Mean Squared Error: 11866


In [None]:
import pandas as pd
import numpy as np
import time
import warnings
from tqdm import tqdm
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Ignore warnings
warnings.filterwarnings('ignore')

# Start tracking overall runtime
start_time = time.time()

# Combine processing data and model training in the same progress bar
with tqdm(total=7, desc="Processing and Training") as pbar:

    # Sample 20% of the data from the original df
    df_sample = df.sample(frac=0.2, random_state=42)
    pbar.update(1)

    # Remove rows where 'price' is <= 0 (to avoid issues with log transformation)
    df_sample = df_sample[df_sample['price'] > 0]

    # Log transform the target variable
    df_sample['log_price'] = np.log(df_sample['price'])
    pbar.update(1)

    # Handle categorical columns
    cat_columns = df_sample.select_dtypes(include=['object']).columns.tolist()
    num_columns = df_sample.select_dtypes(exclude=['object']).columns.tolist()
    num_columns.remove('price')
    num_columns.remove('log_price')  # Exclude the target columns
    pbar.update(1)

    # Prepare the preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), num_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_columns)
        ]
    )
    pbar.update(1)

    # Split the data into training and test sets
    X = df_sample.drop(columns=['price', 'log_price'])
    y = df_sample['log_price']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    pbar.update(1)

    # Apply the preprocessing to the features
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)
    pbar.update(1)

# Define the parameter grid for RandomForestRegressor
param_grid = {
    'n_estimators': [50, 100,200],
    'max_depth': [10, 20],
    'min_samples_split': [5, 10]
}

# Custom scorer that prints results immediately along with parameters
def custom_scorer(y_true, y_pred_log, params):
    y_pred = np.exp(y_pred_log)  # Apply exponential to reverse log transformation
    mse = mean_squared_error(np.exp(y_true), y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(np.exp(y_true), y_pred)
    r2 = r2_score(np.exp(y_true), y_pred)

    # Print the current parameters and their corresponding metrics
    print(f"\nParameters: {params}")
    print(f"Mean Absolute Error: {mae:.2f}")
    print(f"Root Mean Squared Error: {rmse:.2f}")
    print(f"R-Squared Score (Accuracy): {r2 * 100:.2f}%")

    return r2  # Return R-squared as the score

# Perform manual grid search
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid):
    rf = RandomForestRegressor(random_state=42, **params)

    # Fit the model to the training data
    model = rf.fit(X_train_transformed, y_train)

    # Make predictions
    y_pred_log = model.predict(X_test_transformed)

    # Score and print parameters and metrics
    score = custom_scorer(y_test, y_pred_log, params)

    # Update best score and parameters if the current score is better
    if score > best_score:
        best_score = score
        best_params = params

# Print best parameters
print(f"\nBest Parameters: {best_params}")
print(f"Best R-Squared Score: {best_score * 100:.2f}%")

# Train final model with best parameters
best_model = RandomForestRegressor(random_state=42, **best_params)
best_model.fit(X_train_transformed, y_train)

# Final evaluation
y_pred_log = best_model.predict(X_test_transformed)
y_pred = np.exp(y_pred_log)  # Reverse the log transformation
final_r2 = r2_score(np.exp(y_test), y_pred)
final_mae = mean_absolute_error(np.exp(y_test), y_pred)
final_mse = mean_squared_error(np.exp(y_test), y_pred)
final_rmse = np.sqrt(final_mse)

print("\nFinal Model Metrics:")
print(f"R-Squared Score (Accuracy): {final_r2 * 100:.2f}%")
print(f"Mean Absolute Error: {final_mae:.2f}")
print(f"Mean Squared Error: {final_mse:.2f}")
print(f"Root Mean Squared Error: {final_rmse:.2f}")

# Calculate total runtime
end_time = time.time()
total_runtime = (end_time - start_time) / 60  # Convert seconds to minutes
print(f"\nOverall runtime: {round(total_runtime)} minutes.")


Processing and Training:  86%|████████▌ | 6/7 [00:09<00:01,  1.51s/it]



Parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 50}
Mean Absolute Error: 3568.30
Root Mean Squared Error: 11835.69
R-Squared Score (Accuracy): 68.59%

Parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
Mean Absolute Error: 3553.77
Root Mean Squared Error: 11836.49
R-Squared Score (Accuracy): 68.59%

Parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}
Mean Absolute Error: 3548.72
Root Mean Squared Error: 11844.00
R-Squared Score (Accuracy): 68.55%

Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 50}
Mean Absolute Error: 3568.97
Root Mean Squared Error: 11865.66
R-Squared Score (Accuracy): 68.44%

Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 100}
Mean Absolute Error: 3554.23
Root Mean Squared Error: 11858.21
R-Squared Score (Accuracy): 68.48%

Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
Mean Absolute Error: 3548.93
Root Mean Squared Error:

In [None]:
# Print best parameters
print(f"\nBest Parameters: {best_params}")
print(f"Best R-Squared Score: {best_score * 100:.2f}%")

# Display final metrics
print("\nFinal Model Metrics:")
print(f"R-Squared Score (Accuracy): {final_r2 * 100:.2f}%")
print(f"Mean Absolute Error: {final_mae:.2f}")
print(f"Mean Squared Error: {final_mse:.2f}")
print(f"Root Mean Squared Error: {final_rmse:.2f}")

# Calculate total runtime
end_time = time.time()
total_runtime = (end_time - start_time) / 60  # Convert seconds to minutes

print(f"\nOverall runtime: {round(total_runtime, 2)} minutes.")