# Kaggle Playground Series S4E9 Competition

Walter Reade, Ashley Chow. (2024). Regression of Used Car Prices. Kaggle. https://kaggle.com/competitions/playground-series-s4e9

## IMPORTS AND SETUP

In [1]:
# Imports:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence warnings to keep the notebook output readable. Note that this
# hides every warning category, not only the harmless ones.
warnings.filterwarnings('ignore')

# Apply the base style sheet FIRST: plt.style.use() overwrites every rcParam
# that the style defines (ggplot sets its own axes.titlesize, for example),
# so calling it after the custom values below would silently reset them.
plt.style.use('ggplot')

# Plot rcParams (custom overrides on top of the ggplot style):
plt.rcParams['figure.figsize'] = (15, 7)
plt.rcParams['figure.dpi'] = 120
plt.rcParams['figure.titlesize'] = 22
plt.rcParams['figure.titleweight'] = 'bold'
plt.rcParams['axes.titlesize'] = 22
plt.rcParams['axes.titleweight'] = 'bold'

## DATA READ

In [2]:
# Load the training data from 'train.csv' and preview it:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [3]:
# Column labels of the training frame:
train.columns

Index(['id', 'brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
       'transmission', 'ext_col', 'int_col', 'accident', 'clean_title',
       'price'],
      dtype='object')

In [4]:
# Read the test data from 'test.csv' and preview it with head():
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
1,188534,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes
2,188535,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,
3,188536,Audi,A6 2.0T Sport,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes


## PREPROCESSING

In [5]:
# Train data information (per-column non-null counts and dtypes):
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            188533 non-null  int64 
 1   brand         188533 non-null  object
 2   model         188533 non-null  object
 3   model_year    188533 non-null  int64 
 4   milage        188533 non-null  int64 
 5   fuel_type     183450 non-null  object
 6   engine        188533 non-null  object
 7   transmission  188533 non-null  object
 8   ext_col       188533 non-null  object
 9   int_col       188533 non-null  object
 10  accident      186081 non-null  object
 11  clean_title   167114 non-null  object
 12  price         188533 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 18.7+ MB


In [6]:
# Summary statistics of the target variable "price", rounded to 3 decimals:
train['price'].describe().round(3)

count     188533.000
mean       43878.016
std        78819.522
min         2000.000
25%        17000.000
50%        30825.000
75%        49900.000
max      2954083.000
Name: price, dtype: float64

In [7]:
# Share of missing values per column, in percent of all rows
# (columns with the most missing values first):
train.isnull().sum().sort_values(ascending=False).div(len(train)).mul(100)

clean_title     11.360876
fuel_type        2.696080
accident         1.300568
id               0.000000
brand            0.000000
model            0.000000
model_year       0.000000
milage           0.000000
engine           0.000000
transmission     0.000000
ext_col          0.000000
int_col          0.000000
price            0.000000
dtype: float64

### NULL VALUE PROCESSING

In [11]:
# We will define a function for each preprocessing and feature engineering step.
# This approach will allow us to apply the same modifications to the test data.
# "clean_title" processor function:
def clean_title_processor(df: pd.DataFrame) -> None:
    """
    Replace missing "clean_title" entries with "No".

    The frame is modified in place (its "clean_title" column is
    reassigned); nothing is returned.
    """
    # Working assumption: a missing clean_title means the car has none.
    filled_titles = df['clean_title'].fillna(value='No')
    df['clean_title'] = filled_titles

# "fuel_type" processor function:
def fuel_type_processor(df: pd.DataFrame) -> None:
    """
    Placeholder for the "fuel_type" cleaning step.

    Intended to deal with the fuel_type column's null values and wrong data
    entries (for example, observations whose fuel_type is recorded as
    gasoline although it apparently should not be -- TODO: spell out the
    exact rule before implementing).

    NOTE: no processing is implemented yet; the function currently leaves
    df untouched and returns None.
    """
    return None

In [16]:
# Investigation of "clean_title": count how often each value occurs.
train['clean_title'].value_counts()

clean_title
Yes    167114
No      21419
Name: count, dtype: int64

In [10]:
# Transmission values exhibit certain redundant data. We will define a
# function to tackle this issue:
def fix_transmission(transmission_value: str) -> str:
    """Collapse a raw transmission description into a standardized label.

    The rules are tried in order and the first match wins:

    1. Automatic -- the value contains 'A/T', 'Auto' or 'AT'. Returns
       '<N>-Speed Automatic' when one of the gear counts 8, 10, 9, 7, 6, 5,
       4, 2, 1 is present, otherwise 'Automatic'.
    2. Manual -- the value contains 'M/T', 'Manual' or 'Mt'. Returns
       '<N>-Speed Manual' for the gear counts 8, 7, 6, 5, otherwise 'Manual'.
    3. The value contains 'CVT' -> 'CVT'.
    4. Special phrases: 'Single-Speed Fixed Gear', 'Transmission w/Dual
       Shift Mode' (-> 'Dual Shift Mode'), 'Transmission Overdrive Switch'.
    5. Exactly 'None', 'Variable', 'F' or '2' -> 'Other'.
    6. Anything else is returned unchanged.

    Because of the ordering, a value such as 'Automatic CVT' is labelled
    'Automatic', not 'CVT'.

    Keywords are matched case-sensitively. Gear counts ('8-Speed') are
    matched case-insensitively, so '8-SPEED A/T' yields '8-Speed Automatic'.

    Args:
        transmission_value: Raw transmission string.

    Returns:
        The standardized transmission label.
    """
    # Gear counts are searched in a lower-cased copy so that upper-case
    # spellings such as '8-SPEED A/T' keep their speed instead of collapsing
    # to the generic label.
    lowered = transmission_value.lower()

    def with_speed(kind: str, speeds: tuple) -> str:
        """Prefix kind with the first gear count from speeds found in the value."""
        for speed in speeds:
            if f'{speed}-speed' in lowered:
                return f'{speed}-Speed {kind}'
        return kind

    # Standardize Automatic Transmissions (A/T); 'Auto' also covers 'Automatic'.
    if any(keyword in transmission_value for keyword in ('A/T', 'Auto', 'AT')):
        return with_speed('Automatic', (8, 10, 9, 7, 6, 5, 4, 2, 1))

    # Standardize Manual Transmissions (M/T)
    if any(keyword in transmission_value for keyword in ('M/T', 'Manual', 'Mt')):
        return with_speed('Manual', (8, 7, 6, 5))

    # Standardize CVT (Continuously Variable Transmission)
    if 'CVT' in transmission_value:
        return 'CVT'

    # Handle special cases: phrase to look for -> label to return.
    special_cases = {
        'Single-Speed Fixed Gear': 'Single-Speed Fixed Gear',
        'Transmission w/Dual Shift Mode': 'Dual Shift Mode',
        'Transmission Overdrive Switch': 'Transmission Overdrive Switch',
    }
    for phrase, label in special_cases.items():
        if phrase in transmission_value:
            return label

    # Handle undefined or special values
    if transmission_value in ('None', 'Variable', 'F', '2'):
        return 'Other'

    # Default case: return the original value if no match
    return transmission_value

## FEATURE ENGINEERING