# Kaggle Playground Series S4E9 Competition

Walter Reade, Ashley Chow. (2024). Regression of Used Car Prices. Kaggle. https://kaggle.com/competitions/playground-series-s4e9

## IMPORTS AND SETUP

In [1]:
# Imports:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Ignore harmless warnings
warnings.filterwarnings('ignore')

# Plot rcParams:
plt.rcParams['figure.figsize'] = (15, 7)
plt.rcParams['figure.dpi'] = 120
plt.rcParams['figure.titlesize'] = 22
plt.rcParams['figure.titleweight'] = 'bold'
plt.rcParams['axes.titlesize'] = 22
plt.rcParams['axes.titleweight'] = 'bold'
plt.style.use('seaborn-v0_8-darkgrid')

## DATA READ

In [2]:
# Read train df:
train = pd.read_csv(filepath_or_buffer='train.csv')
train.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [3]:
# Train columns:
train.columns

Index(['id', 'brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
       'transmission', 'ext_col', 'int_col', 'accident', 'clean_title',
       'price'],
      dtype='object')

In [4]:
# Read test data:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
1,188534,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes
2,188535,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,
3,188536,Audi,A6 2.0T Sport,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes


## PREPROCESSING

In [5]:
# Train data infomation:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            188533 non-null  int64 
 1   brand         188533 non-null  object
 2   model         188533 non-null  object
 3   model_year    188533 non-null  int64 
 4   milage        188533 non-null  int64 
 5   fuel_type     183450 non-null  object
 6   engine        188533 non-null  object
 7   transmission  188533 non-null  object
 8   ext_col       188533 non-null  object
 9   int_col       188533 non-null  object
 10  accident      186081 non-null  object
 11  clean_title   167114 non-null  object
 12  price         188533 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 18.7+ MB


In [6]:
# Descriptive statistics of target variable:
np.round(train['price'].describe().T, 3)

count     188533.000
mean       43878.016
std        78819.522
min         2000.000
25%        17000.000
50%        30825.000
75%        49900.000
max      2954083.000
Name: price, dtype: float64

In [5]:
# Null values as percentage of total data:
(train.isnull().sum().sort_values(ascending=False) / len(train)) * 100

clean_title     11.360876
fuel_type        2.696080
accident         1.300568
id               0.000000
brand            0.000000
model            0.000000
model_year       0.000000
milage           0.000000
engine           0.000000
transmission     0.000000
ext_col          0.000000
int_col          0.000000
price            0.000000
dtype: float64

### NULL VALUE PROCESSING

In [6]:
# We will define functions for each preprocessing and feature engineering.
# This approach will allow us to apply same modifications to the test data.
# "clean_title" processor function:
def clean_title_processor(df: pd.DataFrame) -> None:
	"""
	This function deals with the clean_title
	null values.
	"""
	# Fill null values with "No". We assume that null values indicate "No".
	df['clean_title'] = df['clean_title'].fillna('No')

	return None

import pandas as pd

def fuel_type_processor(df: pd.DataFrame) -> None:
    """
    This function deals with fuel_type column's null values and wrong data
    entries. For example, certain observations show that fuel_type is gasoline
    yet the car engine indicates an electric motor. Therefore, we generate the
    fuel_type column from the engine column.
 
    Please note that while we assign the values, we must not overwrite the 
    values that are already present. To clarify this logic, consider the 
    following example: If the value at index n contains "hybrid" in the engine 
    column, we can overwrite the fuel_type value when we search for "gasoline," 
    since the same index can contain the "Electric/Gasoline Hybrid" substring. 
    To avoid this problem, we will remove the indices that have already been 
    assigned. Please also note that this logic emphasizes the order of querying 
    as it will define the indices that will be removed.
    """
    
    # Define target keywords:
    target_keywords = {'Plug-In': 'Plug-In Hybrid',
                       'Hybrid': 'Hybrid',
                       'Flex Fuel': 'E85 Flex Fuel',
                       'Gasoline': 'Gasoline',
                       'Diesel': 'Diesel',
                       'Electric': 'Electric',
                       'DOHC': 'Gasoline',
                       'OHV': 'Gasoline'}
    
    # Track the taken indices:
    taken_indices = []
    
    for keyword in target_keywords:
        
        # Obtain the mask for filtering the data:
        if len(taken_indices) == 0:
            mask = df['engine'].str.contains(keyword, case=False)
        else:
            mask = (df['engine'].str.contains(keyword, case=False) 
                    & ~df.index.isin(taken_indices))

        # Append taken indices:
        taken_indices.extend(df[mask].index.to_list())
    
    # Fill others:
    df.loc[~df.index.isin(taken_indices), 'engine'] = 'None'
    
    return taken_indices

In [7]:
train['fuel_type'].value_counts()
temp = fuel_type_processor(train)
train.reset_index().query('index not in @temp')

Unnamed: 0,index,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
51,51,51,Tesla,Model 3 Long Range,2020,38360,,,Automatic,White,Black,None reported,,59598
212,212,212,Land,Rover Range Rover Evoque S,2022,12425,Gasoline,,Automatic,Firenze Red,Ebony,None reported,Yes,24991
237,237,237,Mercedes-Benz,GLE 350 GLE 350,2022,18655,Gasoline,,Automatic,designo Diamond White Metallic,Macchiato/Magmagrey,None reported,,39998
256,256,256,BMW,840 Gran Coupe i xDrive,2020,29336,Gasoline,,Automatic,Black Sapphire Metallic,Black,At least 1 accident or damage reported,,50658
271,271,271,Audi,Q5 S line Premium Plus,2022,10071,Hybrid,,Automatic,Mythos Black,Black,None reported,,44798
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188444,188444,188444,Land,Rover Range Rover Sport 3.0L Supercharged HSE,2018,59355,Gasoline,,Automatic,Santorini Black Metallic,Ebony / Pimento,None reported,,23598
188448,188448,188448,Mercedes-Benz,S-Class S 560 4MATIC,2018,75965,Gasoline,,Automatic,White,Beige,None reported,,29998
188471,188471,188471,BMW,X7 xDrive40i,2022,38671,Hybrid,,Automatic,Gray,Blue,None reported,,65998
188520,188520,188520,Mercedes-Benz,GLC 300 GLC 300,2022,18031,Gasoline,,Automatic,White,Black,None reported,,39998


In [8]:
df = train  # aliasing:
df[df['engine'].str.contains('Electric', case=False)]

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
11,11,Tesla,Model S P100D,2015,81500,,Electric Motor Electric Fuel System,1-Speed A/T,Gray,White,None reported,Yes,19000
17,17,Land,Rover Defender SE,2021,46100,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Gray,Black,None reported,Yes,55000
23,23,Audi,A8 L 55,2018,51000,Hybrid,335.0HP 3.0L V6 Cylinder Engine Gasoline/Mild ...,Transmission w/Dual Shift Mode,White,Beige,None reported,Yes,76000
32,32,Tesla,Model S P100D,2018,30300,,518.0HP Electric Motor Electric Fuel System,A/T,White,White,None reported,Yes,64000
44,44,Rivian,R1S Adventure Package,2023,7000,Gasoline,835.0HP Electric Motor Electric Fuel System,8-Speed A/T,Green,Black,None reported,Yes,145000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
188462,188462,Ford,Mustang Mach-E GT,2023,9000,,480.0HP Electric Motor Electric Fuel System,A/T,Orange,Black,None reported,Yes,85000
188469,188469,Mercedes-Benz,AMG GLS 63 4MATIC,2023,4100,Hybrid,603.0HP 4.0L 8 Cylinder Engine Gasoline/Mild E...,9-Speed A/T,Black,Black,None reported,Yes,149900
188484,188484,Ford,Model X P100D,2018,52000,,534.0HP Electric Motor Electric Fuel System,1-Speed A/T,Gray,Black,None reported,Yes,45000
188489,188489,Tesla,Model X Long Range Plus,2022,4786,,557.0HP Electric Motor Electric Fuel System,A/T,Black,Black,None reported,Yes,92000


In [9]:
train['engine'].value_counts()

engine
None                                                   8865
355.0HP 5.3L 8 Cylinder Engine Gasoline Fuel           3462
240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel           2902
420.0HP 6.2L 8 Cylinder Engine Gasoline Fuel           2841
2.0L I4 16V GDI DOHC Turbo                             2680
                                                       ... 
151.0HP 1.5L 4 Cylinder Engine Gas/Electric Hybrid        1
184.0HP 2.4L 4 Cylinder Engine Flex Fuel Capability       1
78.0HP 1.2L 3 Cylinder Engine Gasoline Fuel               1
139.0HP 1.6L 4 Cylinder Engine Plug-In Electric/Gas       1
313.0HP 2.0L 4 Cylinder Engine Plug-In Electric/Gas       1
Name: count, Length: 1060, dtype: int64

In [10]:
# Investigation of "clean_title":
train['clean_title'].value_counts()

clean_title
Yes    167114
Name: count, dtype: int64

In [14]:
# Transmission values exhibit certain redundant data. We will define a
# function to tackle this issue:
def fix_transmission(transmission_value: str) -> str:
    """Fix transmission value and remove redundant data"""
    
    # Standardize Automatic Transmissions (A/T)
    automatic_keywords = ['A/T', 'Automatic', 'Auto', 'AT']
    if any(keyword in transmission_value for keyword in automatic_keywords):
        if '8-Speed' in transmission_value:
            return '8-Speed Automatic'
        elif '10-Speed' in transmission_value:
            return '10-Speed Automatic'
        elif '9-Speed' in transmission_value:
            return '9-Speed Automatic'
        elif '7-Speed' in transmission_value:
            return '7-Speed Automatic'
        elif '6-Speed' in transmission_value:
            return '6-Speed Automatic'
        elif '5-Speed' in transmission_value:
            return '5-Speed Automatic'
        elif '4-Speed' in transmission_value:
            return '4-Speed Automatic'
        elif '2-Speed' in transmission_value:
            return '2-Speed Automatic'
        elif '1-Speed' in transmission_value:
            return '1-Speed Automatic'
        else:
            return 'Automatic'
    
    # Standardize Manual Transmissions (M/T)
    manual_keywords = ['M/T', 'Manual', 'Mt']
    if any(keyword in transmission_value for keyword in manual_keywords):
        if '8-Speed' in transmission_value:
            return '8-Speed Manual'
        elif '7-Speed' in transmission_value:
            return '7-Speed Manual'
        elif '6-Speed' in transmission_value:
            return '6-Speed Manual'
        elif '5-Speed' in transmission_value:
            return '5-Speed Manual'
        else:
            return 'Manual'
    
    # Standardize CVT (Continuously Variable Transmission)
    if 'CVT' in transmission_value:
        return 'CVT'

    # Handle special cases
    if 'Single-Speed Fixed Gear' in transmission_value:
        return 'Single-Speed Fixed Gear'
    if 'Transmission w/Dual Shift Mode' in transmission_value:
        return 'Dual Shift Mode'
    if 'Transmission Overdrive Switch' in transmission_value:
        return 'Transmission Overdrive Switch'
    
    # Handle undefined or special values
    if transmission_value in ['None', 'Variable', 'F', '2']:
        return 'Other'
    
    # Default case: return the original value if no match
    return transmission_value

## FEATURE ENGINEERING