# Random Forest Rangers - Predicting Car Sales Prices
## Preprocessing

In [None]:
# Install dependencies
!pip install -q -r requirements.txt

In [None]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [None]:
# Load the three CSV splits in a fixed order: original training data,
# training data extended with generated rows, and the test data.
train_original, train_generated_and_original, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/0_Data_Split/train.csv',
        'data/0_Data_Split/train_generated_and_original.csv',
        'data/0_Data_Split/test.csv',
    )
)

# Every later cell iterates over this list, so the order matters:
# index 0 = original train, 1 = generated + original train, 2 = test.
datasets = [train_original, train_generated_and_original, test]

### Ensure consistent data formatting

In [None]:
def reformat_non_numeric(text):
    """Convert a numeric value that may carry non-numeric decoration to a number.

    Values that are already numeric are returned unchanged. Anything else is
    converted to a string, stripped of every character that is not a digit
    or a dot (currency symbols, thousands separators, unit suffixes), and
    parsed as a float.

    Returns NaN when the value is None or when nothing parseable remains
    after stripping (for example "" or "N/A"), instead of raising.
    """
    if text is None:
        return float('nan')
    if isinstance(text, (float, int, np.number)):
        return text
    try:
        return float(re.sub(r'[^\d.]', '', str(text)))
    except ValueError:
        # Nothing numeric left (empty string) or a malformed number such as
        # "1.2.3": report it as missing rather than aborting the whole apply.
        return float('nan')

def reformat_clean_title(text):
    """Encode the clean-title flag: 1 for exactly 'Yes', 0 for anything else."""
    if text == 'Yes':
        return 1
    return 0

def reformat_accident(text):
    """Encode the accident column: 0 for no reported accident, 1 otherwise.

    A missing value (None / NaN), an empty string and the literal
    'None reported' all count as "no accident". Empty CSV cells are read by
    pandas as NaN rather than '', so NaN has to be handled explicitly;
    otherwise missing entries would be encoded as an accident.
    """
    if pd.isna(text):
        return 0
    return 0 if text in ('', 'None reported') else 1

# Column -> cleaning function, applied in this order to every dataset.
column_cleaners = {
    'clean_title': reformat_clean_title,
    'price': reformat_non_numeric,
    'milage': reformat_non_numeric,
    'accident': reformat_accident,
}

for dataset in datasets:
    # Overwrite each raw column with its cleaned, numeric representation
    for column, cleaner in column_cleaners.items():
        dataset[column] = dataset[column].apply(cleaner)

### Extract additional features

In [None]:
def extract_cylinders(text):
    """Return the cylinder count from an engine description, or None.

    Matches an integer followed (optionally after whitespace) by the word
    "Cylinder", ignoring case.
    """
    found = re.search(r'(\d+)\s*Cylinder', text, re.IGNORECASE)
    if found is None:
        return None
    return int(found.group(1))

def extract_hp(text):
    """Return the horsepower figure from an engine description, or None.

    Matches an integer or decimal number followed (optionally after
    whitespace) by "HP", ignoring case.
    """
    found = re.search(r'(\d+(?:\.\d+)?)\s*HP', text, re.IGNORECASE)
    if found is None:
        return None
    return float(found.group(1))

def extract_capacity(text):
    """Return the engine displacement in litres from a description, or None.

    Matches the first integer or decimal number followed (optionally after
    whitespace) by the letter "L" in either case.
    """
    found = re.search(r'(\d+(?:\.\d+)?)\s*L', text, re.IGNORECASE)
    if found is None:
        return None
    return float(found.group(1))

def extract_turbo(text):
    """Return 1 if the description mentions "Turbo" (any case), else 0."""
    return int(re.search(r'Turbo', text, re.IGNORECASE) is not None)

def extract_gear_type(text):
    """Classify a transmission description.

    Returns 1 for automatic ("Automatic" or "A/T"), 0 for manual ("Manual"
    or "M/T"), and None when neither appears. Matching ignores case, and
    automatic wins when both kinds are mentioned.
    """
    if re.search(r'Automatic|A/T', text, re.IGNORECASE):
        return 1
    if re.search(r'Manual|M/T', text, re.IGNORECASE):
        return 0
    return None

def extract_gears(text):
    """Return the gear count from an "N-Speed" transmission description, or None."""
    found = re.search(r'(\d+)-Speed\b', text, re.IGNORECASE)
    if found is None:
        return None
    return int(found.group(1))

def extract_dual_shift(text):
    """Return 1 if the description contains "Dual Shift" (any case), else 0."""
    return int(re.search(r'Dual Shift', text, re.IGNORECASE) is not None)

# Colour names treated as monochrome by extract_monochrome_color
monochrome = ['Black', 'White', 'Silver', 'Gray']


def extract_monochrome_color(text):
    """Return 1 if any monochrome colour name occurs in text (any case), else 0."""
    return int(any(re.search(color, text, re.IGNORECASE) for color in monochrome))

def extract_color_match(row):
    """Return 1 if the row's exterior and interior colours are equal, else 0.

    The comparison ignores case. `row` must support lookup by the keys
    'ext_col' and 'int_col'. If either value is not a string (for example a
    missing cell read as NaN) the colours are treated as not matching,
    rather than raising AttributeError on `.lower()`.
    """
    exterior = row['ext_col']
    interior = row['int_col']
    if not (isinstance(exterior, str) and isinstance(interior, str)):
        return 0
    return 1 if exterior.lower() == interior.lower() else 0

def extract_hybrid(text):
    """Return 1 if the text contains "Hybrid" (any case), else 0."""
    return int(re.search(r'Hybrid', text, re.IGNORECASE) is not None)

def extract_diesel(text):
    """Return 1 if the text contains "Diesel" (any case), else 0."""
    return int(re.search(r'Diesel', text, re.IGNORECASE) is not None)

def extract_gasoline(text):
    """Return 1 if the text contains "Gasoline" (any case), else 0."""
    return int(re.search(r'Gasoline', text, re.IGNORECASE) is not None)

def extract_electric(text):
    """Return 1 if the text contains "Electric Motor" (any case), else 0."""
    return int(re.search(r'Electric Motor', text, re.IGNORECASE) is not None)

def extract_luxury(price):
    """Return 1 if price is strictly greater than 100000, else 0."""
    if price > 100000:
        return 1
    return 0
    
def extract_mileage_per_year(row, reference_year=2024):
    """Return the row's mileage divided by the vehicle's age in years.

    Parameters:
        row: mapping-like object with the keys 'model_year' and 'milage'.
        reference_year: year against which the vehicle age is measured.
            Defaults to 2024, the value that was previously hard-coded.

    The age is clamped to a minimum of 1 so that vehicles from the reference
    year (or later) do not cause a division by zero or a negative result.
    """
    vehicle_age = reference_year - row['model_year']
    return row['milage'] / max(vehicle_age, 1)

for dataset in datasets:
    # Stringify each free-text source column once; missing cells become the
    # literal text 'nan', which none of the extractor patterns match.
    engine_text = dataset['engine'].astype(str)
    transmission_text = dataset['transmission'].astype(str)
    fuel_text = dataset['fuel_type'].astype(str)

    # Engine-derived features
    dataset['cylinders'] = engine_text.apply(extract_cylinders)
    dataset['horsepower'] = engine_text.apply(extract_hp)
    dataset['cubic_capacity'] = engine_text.apply(extract_capacity)
    dataset['turbo'] = engine_text.apply(extract_turbo)

    # Transmission-derived features
    dataset['is_automatic'] = transmission_text.apply(extract_gear_type)
    dataset['gears'] = transmission_text.apply(extract_gears)
    dataset['dual_shift'] = transmission_text.apply(extract_dual_shift)

    # Colour-derived features
    dataset['ext_col_mon'] = dataset['ext_col'].astype(str).apply(extract_monochrome_color)
    dataset['int_col_mon'] = dataset['int_col'].astype(str).apply(extract_monochrome_color)
    dataset['color_match'] = dataset.apply(extract_color_match, axis=1)

    # Fuel / drivetrain flags (the electric flag is read from the engine text)
    dataset['is_hybrid'] = fuel_text.apply(extract_hybrid)
    dataset['is_diesel'] = fuel_text.apply(extract_diesel)
    dataset['is_gasoline'] = fuel_text.apply(extract_gasoline)
    dataset['is_electric'] = engine_text.apply(extract_electric)

    # NOTE(review): is_luxury is computed from the price column, which the
    # notebook title names as the value being predicted - confirm this does
    # not leak the target into the features.
    dataset['is_luxury'] = dataset['price'].apply(extract_luxury)
    #dataset['mileage_per_year'] = dataset.apply(lambda row: extract_mileage_per_year(row), axis=1)

### Remove outliers

In [None]:
for i, dataset in enumerate(datasets):
    # Z-scores of the price column. NaN prices are ignored when computing the
    # mean and standard deviation; with the default policy a single NaN would
    # turn every z-score into NaN and the filter below would drop all rows.
    z_scores = np.asarray(stats.zscore(dataset['price'], nan_policy='omit'))

    # Only high-side outliers (z-score of 3 or more) are removed. The mask is
    # written as a negation so rows with a missing price (NaN z-score) are
    # kept; without NaNs it is identical to `z_scores < 3`.
    keep = ~(z_scores >= 3)

    print(f"Remove {len(dataset) - int(keep.sum())} outliers from dataset")

    # Store an independent copy so later cells can assign columns without
    # operating on a slice of the unfiltered frame.
    # NOTE(review): this filter also runs on the test split (datasets[2]) -
    # confirm that dropping test rows is intended.
    datasets[i] = dataset[keep].copy()

### Encode categorical features

In [None]:
for i, dataset in enumerate(datasets):
    # Median price per brand within this dataset
    median_prices = dataset.groupby('brand')['price'].median()

    # Split the brands into 4 quantile bins by median price (bin numbers 0-3);
    # qcut does not need sorted input, the result is indexed by brand name
    brand_bins = pd.qcut(median_prices, 4, labels=False)

    # Replace the brand names with their bin numbers. The column is replaced
    # through assign() instead of `dataset.loc[:, 'brand'] = ...`: the .loc
    # form writes into the existing object-dtype column on pandas >= 2.0, so
    # the encoded brand would stay non-numeric and be removed by the later
    # "drop non-numeric columns" step. Brands without a bin (e.g. a missing
    # brand) map to NaN instead of raising KeyError.
    # NOTE(review): bins are computed separately for every dataset, including
    # the test split and its prices, so one brand can receive different codes
    # in train and test - confirm this is intended.
    datasets[i] = dataset.assign(brand=dataset['brand'].map(brand_bins))

### Compute pairwise correlations

In [None]:
# Correlations are inspected on the original training dataset only
training_frame = datasets[0]

# Keep the numeric columns by dropping every column with a non-numeric dtype
dataset = training_frame.drop(
    columns=training_frame.select_dtypes(exclude=[np.number]).columns
)

# Pairwise Spearman (rank) correlation of the remaining columns
spearman_corr = dataset.corr(method='spearman')

# Draw the matrix as an annotated heatmap
heatmap_options = {'annot': True, 'fmt': ".2f", 'cmap': 'coolwarm'}
plt.figure(figsize=(16, 10))
sns.heatmap(spearman_corr, **heatmap_options)

High correlations between:
- `model_year` and `milage`
- `cylinders`, `cubic_capacity` and `horsepower`

### Handle highly correlated features

`cubic_capacity`, `horsepower` and `cylinders` are highly correlated with one another, so they are combined into a single `engine_score` feature below.

In [None]:
dataset = datasets[0]  # Use the original training dataset

# For every row, count how many of the three engine features are present
engine_columns = ['horsepower', 'cubic_capacity', 'cylinders']
available_count = dataset[engine_columns].notnull().sum(axis=1)
row_total = len(dataset)

# Number of rows with exactly three, two, one or zero available values
sum_all_available = (available_count == 3).sum()
sum_two_available = (available_count == 2).sum()
sum_one_available = (available_count == 1).sum()
sum_none_available = (available_count == 0).sum()

# The same figures as a percentage of all rows
percentage_all_available = sum_all_available / row_total * 100
percentage_two_available = sum_two_available / row_total * 100
percentage_one_available = sum_one_available / row_total * 100
percentage_none_available = sum_none_available / row_total * 100

print(f"Rows with all available values: {sum_all_available} ({percentage_all_available:.2f}%)")
print(f"Rows with two available values: {sum_two_available} ({percentage_two_available:.2f}%)")
print(f"Rows with one available value: {sum_one_available} ({percentage_one_available:.2f}%)")
print(f"Rows with no available values: {sum_none_available} ({percentage_none_available:.2f}%)")
print(f"Rows with at least one available value: {sum_all_available + sum_two_available + sum_one_available} ({percentage_all_available + percentage_two_available + percentage_one_available:.2f}%)")

In [None]:
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler, RobustScaler

# The three correlated engine features that are merged into engine_score
engine_columns = ['horsepower', 'cubic_capacity', 'cylinders']

for i, dataset in enumerate(datasets):
    # Work on a copy to avoid SettingWithCopyWarning
    dataset = dataset.copy()

    # Record which rows carry no engine information at all. This has to
    # happen BEFORE imputation: afterwards no NaN is left, so the same check
    # placed after the imputer could never be true.
    no_engine_info = dataset[engine_columns].isnull().all(axis=1)

    # Step 1: Impute missing values from the 3 nearest neighbours.
    # Columns are replaced rather than written in place via .loc[:, cols],
    # so an integer-typed column cannot block the float result.
    imputer = KNNImputer(n_neighbors=3)
    dataset[engine_columns] = imputer.fit_transform(dataset[engine_columns])

    # Step 2: Normalize the features
    # NOTE(review): imputer and scaler are fitted separately on every dataset,
    # including the test split - confirm this is intended.
    scaler = RobustScaler()
    dataset[engine_columns] = scaler.fit_transform(dataset[engine_columns])

    # Step 3: Synthetic variable = mean of the three scaled features
    dataset['engine_score'] = dataset[engine_columns].mean(axis=1).round(3)

    # Rows that had no engine data get a neutral score of 0 instead of a
    # value built purely from imputed numbers
    dataset.loc[no_engine_info, 'engine_score'] = 0

    datasets[i] = dataset
    

### Feature selection

In [None]:
for i, dataset in enumerate(datasets):
    # Keep numeric columns only, then remove the three engine features that
    # are highly correlated with one another
    non_numeric = dataset.select_dtypes(exclude=[np.number]).columns
    co_correlated = ['cylinders', 'horsepower', 'cubic_capacity']
    dataset = dataset.drop(columns=non_numeric).drop(columns=co_correlated)

    # Absolute Spearman correlation of every remaining column with the price
    spearman_corr = dataset.corr(method='spearman')
    price_corr = spearman_corr['price'].abs()

    # Discard columns whose correlation with the price is below 0.1.
    # NOTE(review): the selection is made independently per dataset, so the
    # train and test frames may end up with different columns - confirm.
    weak_columns = price_corr[price_corr < 0.1].index
    datasets[i] = dataset.drop(columns=weak_columns)

### Correlation matrix heatmap

In [None]:
# Visualise the features that survived selection, original training data only
dataset = datasets[0]

# Pairwise Spearman (rank) correlation matrix
spearman_corr = dataset.corr(method='spearman')

# Annotated heatmap of the matrix
heatmap_options = {'annot': True, 'fmt': ".2f", 'cmap': 'coolwarm'}
plt.figure(figsize=(16, 10))
sns.heatmap(spearman_corr, **heatmap_options)

### Persist preprocessed data

In [None]:
# Write each preprocessed dataset to its CSV file, without the index column.
# The paths are listed in the same order as the entries of `datasets`.
output_paths = [
    './data/1_Preprocessing/train.csv',
    './data/1_Preprocessing/train_generated_and_original.csv',
    './data/1_Preprocessing/test.csv',
]
for position, output_path in enumerate(output_paths):
    datasets[position].to_csv(output_path, index=False)