# KaggleX Skill Assessment Challenge

Notebook includes process of feature engineering and regression prediction, as part of program application.

Additional cells containing evaluation of alternate strategies, hyperparameter tuning, data quality checks & validation, visualizations, debugging, test set transformations, and file export have largely been removed for presentation.

In [1]:
#Imports
from sklearn.svm import SVR
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_predict, cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error
import re
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import seaborn.objects as so
import numpy as np

 #Configure pandas for high dimensionality dataframes
pd.set_option('display.max_columns', None)

In [2]:
#Train imports

 #Challenge train
df_tr_ch = pd.read_csv('kaggleX_train.csv')
 #no missing values

 #Extra train (not provided in challenge)
df_tr_ex = pd.read_csv('kaggleX_used_cars.csv')
 #missing 'fuel type': 83% 'gasoline', 5% 'hybrid', 8% other (7 unique values total), 4% (170) missing
          #'accident': 73% 'none', 24% '>=1', 3% (113) missing
       #'clean_title': 85% 'true', no 'false', 15% (596) missing

#Test import

 #Challenge test
df_te = pd.read_csv('kaggleX_test.csv')

In [3]:
#Preprocessing: challenge train
#Add 'id' column to extra set for concat + later cleaning
df_tr_ex['id'] = df_tr_ex.index

#Preview challenge train
print(df_tr_ch.shape, df_tr_ch.dtypes, sep='\n')
df_tr_ch.head(1)

(54273, 13)
id               int64
brand           object
model           object
model_year       int64
milage           int64
fuel_type       object
engine          object
transmission    object
ext_col         object
int_col         object
accident        object
clean_title     object
price            int64
dtype: object


Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000


In [4]:
#Preprocessing: extra train
 #Clean and convert milage and price columns
def clean_and_convert(col):
    return col.replace(r'[^0-9]', '', regex=True).astype(int)

df_tr_ex['milage'] = clean_and_convert(df_tr_ex['milage'])
df_tr_ex['price'] = clean_and_convert(df_tr_ex['price'])

#Preview extra train
print(df_tr_ex.shape, df_tr_ex.dtypes, sep='\n')
df_tr_ex.head(1)

(4009, 13)
brand           object
model           object
model_year       int64
milage           int32
fuel_type       object
engine          object
transmission    object
ext_col         object
int_col         object
accident        object
clean_title     object
price            int32
id               int64
dtype: object


Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price,id
0,Ford,Utility Police Interceptor Base,2013,51000,E85 Flex Fuel,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,6-Speed A/T,Black,Black,At least 1 accident or damage reported,Yes,10300,0


In [5]:
#Preprocessing: challenge test
df_te = df_te.drop(df_te[['id']], axis=1)
#Preview test set
print(df_te.shape, df_te.dtypes, sep='\n')
df_te.head(1)

(36183, 11)
brand           object
model           object
model_year       int64
milage           int64
fuel_type       object
engine          object
transmission    object
ext_col         object
int_col         object
accident        object
clean_title     object
dtype: object


Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,Mercedes-Benz,E-Class E 350,2014,73000,Gasoline,302.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,A/T,White,Beige,None reported,Yes


In [6]:
#Compare column similarity

#Extract column names from all DataFrames
columns_df_tr = set(df_tr_ch.columns)
columns_df_tr_ex = set(df_tr_ex.columns)
columns_df_te = set(df_te.columns)

#Get union of all columns
all_columns = columns_df_tr.union(columns_df_tr_ex).union(columns_df_te)

#Create comparison DataFrame directly using dictionary comprehension
comparison_data = {
    'columns': list(all_columns),
    'df_tr_ch': [col in columns_df_tr for col in all_columns],
    'df_tr_ex': [col in columns_df_tr_ex for col in all_columns],
    'df_te': [col in columns_df_te for col in all_columns]
}

#To df
comparison_df = pd.DataFrame(comparison_data)

#Name index
comparison_df.set_index('columns', inplace=True)

comparison_df.T

columns,model_year,model,id,accident,brand,int_col,clean_title,transmission,ext_col,fuel_type,milage,price,engine
df_tr_ch,True,True,True,True,True,True,True,True,True,True,True,True,True
df_tr_ex,True,True,True,True,True,True,True,True,True,True,True,True,True
df_te,True,True,False,True,True,True,True,True,True,True,True,False,True


In [7]:
#Output whether column values between dataframes are the same, a contained set of the other, or different

def compare_columns(col1, col2, col3=None):
    set_col1 = set(col1.dropna().unique())
    set_col2 = set(col2.dropna().unique())
    if col3 is not None:
        set_col3 = set(col3.dropna().unique())

    if set_col1 == set_col2 == (set_col3 if col3 is not None else set()):
        return 'same'
    elif col3 is not None and set_col1.issubset(set_col2) and set_col1.issubset(set_col3) and set_col2.issubset(set_col3):
        return 'same or contained'
    elif col3 is not None and set_col2.issubset(set_col1) and set_col2.issubset(set_col3) and set_col1.issubset(set_col3):
        return 'same or contained'
    elif col3 is not None and set_col3.issubset(set_col1) and set_col3.issubset(set_col2) and set_col1.issubset(set_col2):
        return 'same or contained'
    elif set_col1.issubset(set_col2):
        if col3 is not None and set_col1.issubset(set_col3):
            return 'df_te contained in df_tr_ex'
        else:
            return 'df_tr_ch contained in df_tr_ex'
    elif set_col2.issubset(set_col1):
        if col3 is not None and set_col2.issubset(set_col3):
            return 'df_te contained in df_tr_ch'
        else:
            return 'df_tr_ex contained in df_tr_ch'
    elif col3 is not None and set_col3.issubset(set_col1) and set_col3.issubset(set_col2):
        return 'df_tr_ex contained in df_te'
    elif col3 is not None and set_col3.issubset(set_col1):
        return 'df_tr_ex contained in df_te'
    elif col3 is not None and set_col3.issubset(set_col2):
        return 'df_te contained in df_tr_ex'
    else:
        return 'different'

# Initialize the result dictionary
comparison_result = {}

# Compare columns
for col in all_columns:
    try:
        comparison_result[col] = {
            'df_tr_ch': compare_columns(df_tr_ch[col], df_tr_ex[col], df_te[col]),
            'df_tr_ex': compare_columns(df_tr_ex[col], df_te[col], df_tr_ch[col]),
            'df_te': compare_columns(df_tr_ch[col], df_te[col], df_tr_ex[col])
        }
    except KeyError:
        print(f"'{col}' column is missing in one of the DataFrames.")

# Create the DataFrame directly from comparison_result
df_range = pd.DataFrame(comparison_result)
df_range

'id' column is missing in one of the DataFrames.
'price' column is missing in one of the DataFrames.


Unnamed: 0,model_year,model,accident,brand,int_col,clean_title,transmission,ext_col,fuel_type,milage,engine
df_tr_ch,same,df_tr_ch contained in df_tr_ex,same,df_tr_ch contained in df_tr_ex,df_tr_ch contained in df_tr_ex,same,df_tr_ch contained in df_tr_ex,df_tr_ch contained in df_tr_ex,same,different,df_tr_ch contained in df_tr_ex
df_tr_ex,same,df_tr_ex contained in df_tr_ch,same,df_tr_ex contained in df_tr_ch,df_tr_ex contained in df_tr_ch,same,df_tr_ex contained in df_tr_ch,df_tr_ex contained in df_tr_ch,same,different,df_tr_ex contained in df_tr_ch
df_te,same,different,same,different,different,same,different,different,same,different,different


In [8]:
#Compare attribute types
pd.DataFrame({'Challenge':df_tr_ch.dtypes, 'Extra':df_tr_ex.dtypes, 'Test':df_te.dtypes}).T

Unnamed: 0,accident,brand,clean_title,engine,ext_col,fuel_type,id,int_col,milage,model,model_year,price,transmission
Challenge,object,object,object,object,object,object,int64,object,int64,object,int64,int64,object
Extra,object,object,object,object,object,object,int64,object,int32,object,int64,int32,object
Test,object,object,object,object,object,object,,object,int64,object,int64,,object


In [9]:
#Combine df_tr_ch and df_tr_ex
df_tr = pd.concat([df_tr_ch, df_tr_ex], ignore_index=True)

print(df_tr.isna().sum())
df_tr.head(2)

id                0
brand             0
model             0
model_year        0
milage            0
fuel_type       170
engine            0
transmission      0
ext_col           0
int_col           0
accident        113
clean_title     596
price             0
dtype: int64


Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000
1,1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250


In [10]:
#Clean 'accident' + 'clean_title'
print(df_tr.accident.value_counts())
df_tr.clean_title.value_counts()

accident
None reported                             42806
At least 1 accident or damage reported    15363
Name: count, dtype: int64


clean_title
Yes    57686
Name: count, dtype: int64

In [11]:
 #Fill accident NaN
df_tr['accident'] = df_tr['accident'].replace({np.nan: 'None reported'})
print(df_tr.accident.isna().sum())
 #Fill clean_title NaN
df_tr['clean_title'] = df_tr['clean_title'].replace({np.nan: 'No'})
print(df_tr.clean_title.isna().sum())

0
0


In [12]:
#Fuel type: no 'electric' values
print(len(df_tr.fuel_type.value_counts()))
pd.DataFrame(df_tr.fuel_type.value_counts())

7


Unnamed: 0_level_0,count
fuel_type,Unnamed: 1_level_1
Gasoline,52748
Hybrid,1960
E85 Flex Fuel,1618
Diesel,1225
–,339
Plug-In Hybrid,216
not supported,6


In [13]:
#Examine 'not supported' records
 #Following function will convert Armada to Gasoline and Mirais (Hydrogen fuel cell) to Electric
df_tr[df_tr['fuel_type']=='not supported'].head(6)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
1069,1069,Toyota,Mirai Base,2016,40000,not supported,151.0HP Electric Motor Hydrogen Fuel,A/T,Silver,Black,None reported,Yes,14000
9621,9621,Toyota,Mirai Limited,2023,29553,not supported,182.0HP Electric Motor Hydrogen Fuel,A/T,Silver,Gray,None reported,Yes,9995
11441,11441,Toyota,Mirai Limited,2018,40000,not supported,182.0HP Electric Motor Hydrogen Fuel,A/T,Silver,Gray,None reported,Yes,7500
21771,21771,Nissan,Armada Platinum,2017,92000,not supported,390.0HP 5.6L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Gray,Black,None reported,Yes,20900
57167,2894,Toyota,Mirai Limited,2021,44000,not supported,182.0HP Electric Motor Hydrogen Fuel,A/T,Gray,Black,None reported,Yes,18500
57973,3700,Toyota,Mirai Base,2016,40000,not supported,151.0HP Electric Motor Hydrogen Fuel,A/T,Silver,Black,None reported,Yes,9500


In [14]:
#Fuel type: Extract relevant info from 'engine' to minimize isna() drops

def extract_fuel_type(engine):
    if "Gasoline" in engine:
        return "Gasoline"
    elif "Hybrid" in engine:
        return "Hybrid"
    elif "Electric" in engine:
        return "Electric"
 #edge cases
    elif "Standard Range Battery" in engine:
        return "Electric"
    elif "111.2Ah / FR 70kW / RR 160kW (697V)" in engine:
        return "Electric"
    elif "151.0HP Electric Motor Hydrogen Fuel" in engine:
        return "Electric"
    elif "182.0HP Electric Motor Hydrogen Fuel" in engine:
        return "Electric"
    elif "Dual Motor - Standard" in engine:
        return "Hybrid"
    elif engine == "–":
        return "Gasoline"
    else:
        return 'Exception'

#Apply function to 'engine' where 'fuel_type' is missing or 'not supported'
df_tr['fuel_type'] = df_tr.apply(
    lambda row: extract_fuel_type(row['engine']) if pd.isna(row['fuel_type']) or row['fuel_type'] == 'not supported' else row['fuel_type'], axis=1
)

#Replace '-' fuel types with Gasoline (cars with this value are mostly gas)
df_tr['fuel_type'] = df_tr['fuel_type'].replace({'–': 'Gasoline'})

#Check missing values
print(df_tr.fuel_type.isna().sum())

0


In [15]:
df_tr.loc[df_tr.index==21771]

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
21771,21771,Nissan,Armada Platinum,2017,92000,Gasoline,390.0HP 5.6L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Gray,Black,None reported,Yes,20900


In [16]:
df_tr.loc[df_tr.index==1069]

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
1069,1069,Toyota,Mirai Base,2016,40000,Electric,151.0HP Electric Motor Hydrogen Fuel,A/T,Silver,Black,None reported,Yes,14000


In [17]:
#Review Fuel type values post-cleaning/with 'electric' extracted
print(len(df_tr.fuel_type.value_counts()))
pd.DataFrame(df_tr.fuel_type.value_counts())

6


Unnamed: 0_level_0,count
fuel_type,Unnamed: 1_level_1
Gasoline,53088
Hybrid,1963
E85 Flex Fuel,1618
Diesel,1225
Plug-In Hybrid,216
Electric,172


In [18]:
#Clean 'engine'
 #Review values: informative but highly specific
print('Total engine values:', len(df_tr.engine.value_counts()))
pd.concat([df_tr.engine.value_counts().head(2), df_tr.engine.value_counts().tail(2)])

Total engine values: 1146


engine
300.0HP 3.0L Straight 6 Cylinder Engine Gasoline Fuel    1496
355.0HP 5.3L 8 Cylinder Engine Gasoline Fuel             1173
Dual AC Electric Motors                                     1
417.0HP Electric Motor Electric Fuel System                 1
Name: count, dtype: int64

In [19]:
 #Extract 'engine' info to various columns and drop to avoid one-hot encoding 1146 'engine' features 
  #Function to extract key engine info
def extract_engine_info(engine_str):

    # Extract HP
    hp_match = re.search(r'(\d+\.?\d*)[-]?HP', engine_str, re.IGNORECASE)
    hp = int(round(float(hp_match.group(1)))) if hp_match else 0

    # Extract L preceding value
    liters_match = re.search(r'(\d+\.?\d*)[-]?L', engine_str, re.IGNORECASE)
    liters = float(liters_match.group(1)) if liters_match else 0
    
    # Extract Liter preceding value
    liters_value_match = re.search(r'(\d+\.?\d*)\s+Liter', engine_str, re.IGNORECASE)
    liters_value = float(liters_value_match.group(1)) if liters_value_match else 0

    # Extract Cylinder
    cylinder_match = re.search(r'(\d+) Cylinder', engine_str, re.IGNORECASE)
    cylinder = int(cylinder_match.group(1)) if cylinder_match else 0
    
    # Extract V_number (handling V8, V-8, etc.)
    v_number_match = re.search(r'V[-]?(\d+)', engine_str, re.IGNORECASE)
    v_number = int(v_number_match.group(1)) if v_number_match else 0

    # Extract I_number
    i_number_match = re.search(r'I[-]?(\d+)', engine_str, re.IGNORECASE)
    i_number = int(i_number_match.group(1)) if i_number_match else 0

    # Extract H_number
    h_number_match = re.search(r'H[-]?(\d+)', engine_str, re.IGNORECASE)
    h_number = int(h_number_match.group(1)) if h_number_match else 0

    # Extract W_number
    w_number_match = re.search(r'W[-]?(\d+)', engine_str, re.IGNORECASE)
    w_number = int(w_number_match.group(1)) if w_number_match else 0

 #Extract fuel type if 'Electric' is before 'Fuel'
    elec_fuel_match = re.search(r'(\w+)\s+Fuel', engine_str, re.IGNORECASE)
    elec_fuel = None
    if elec_fuel_match:
        elec_fuel = elec_fuel_match.group(1)
        if elec_fuel.lower() != 'electric':
            elec_fuel = 0

    return hp, liters, liters_value, cylinder, v_number, i_number, h_number, w_number, elec_fuel
    
 #Apply function to the 'engine' column
df_tr[['hp', 'liters', 'liters_value', 'cylinder', 'V_num', 'I_num', 'H_num', 'W_num', 'e_fuel']] = df_tr['engine'].apply(lambda x: pd.Series(extract_engine_info(x)))

 #Compare liters and keep larger
df_tr['liters'] = df_tr[['liters', 'liters_value']].max(axis=1)

 #Compare cylinder extraction columns element-wise and keep largest
df_tr['cylinders'] = df_tr[['cylinder', 'V_num', 'I_num', 'H_num', 'W_num']].max(axis=1)

 #Fill 'fuel_type' with electric where 'e_fuel' is Electric
df_tr.loc[df_tr['e_fuel'] == 'Electric', 'fuel_type'] = 'Electric'

 #Fill 'cylinders' with 0 where 'fuel_type' is 'Electric'
df_tr.loc[df_tr['fuel_type'] == 'Electric', 'cylinders'] = 0

 #Consolidate Hybrid values
df_tr['fuel_type'] = df_tr['fuel_type'].replace(['Plug-In Hybrid'], 'Hybrid')

 #Drop model, engine + intermediate columns
df_tr.drop(['model','engine', 'liters_value', 'cylinder', 'V_num', 'I_num', 'H_num', 'W_num', 'e_fuel'], axis=1, inplace=True)

#Review transformed records
df_tr.head(2)

Unnamed: 0,id,brand,model_year,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,price,hp,liters,cylinders
0,0,Ford,2018,74349,Gasoline,10-Speed A/T,Blue,Gray,None reported,Yes,11000,375.0,3.5,6.0
1,1,BMW,2007,80000,Gasoline,6-Speed M/T,Black,Black,None reported,Yes,8250,300.0,3.0,6.0


In [20]:
#fuel_type down from 7 to 5 categories, with greater validity
pd.DataFrame(df_tr.fuel_type.value_counts())

Unnamed: 0_level_0,count
fuel_type,Unnamed: 1_level_1
Gasoline,52920
Hybrid,2171
E85 Flex Fuel,1618
Diesel,1225
Electric,348


In [21]:
#Verify electric cars have no cylinders
df_tr[df_tr['fuel_type']=='Electric'].cylinders.describe()

count    348.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: cylinders, dtype: float64

In [22]:
#Clean transmission column
print('Total transmission values:', len(df_tr.transmission.value_counts()))
 #Values + counts
pd.concat([df_tr.transmission.value_counts().head(2), df_tr.transmission.value_counts().tail(2)])

Total transmission values: 62


transmission
A/T                                      17794
8-Speed A/T                               7693
Automatic, 7-Spd S tronic Dual-Clutch        1
8-SPEED A/T                                  1
Name: count, dtype: int64

In [23]:
#Extract transmission info (speeds; types)

 #Functions to extract speeds
def extract_speeds1(transmission_str):
    speeds_match = re.search(r'(\d+)[-]?Speed', transmission_str, re.IGNORECASE)
    return int(speeds_match.group(1)) if speeds_match else 6

def extract_speeds2(transmission_str):
    speeds_match = re.search(r'(\d+)[-]?Spd', transmission_str, re.IGNORECASE)
    return int(speeds_match.group(1)) if speeds_match else None

 #Apply
df_tr.loc[:, 'speeds1'] = df_tr['transmission'].apply(extract_speeds1)
df_tr.loc[:, 'speeds2'] = df_tr['transmission'].apply(extract_speeds2)

 #Compare speeds and keep larger
df_tr['speeds'] = df_tr[['speeds1', 'speeds2']].max(axis=1)

In [24]:
 #Extract transmission types
transmission_mapping = {
    'Both': ['Dual', 'At/Mt'],
    'Manual': ['Mt', 'M/T', 'Manual'],
    'Automatic': ['A/T', 'Auto', 'Overdrive', 'CVT']
}

def extract_trx(transmission_value):
    transmission_value = str(transmission_value).lower()
    for trx_type, keywords in transmission_mapping.items():
        for keyword in keywords:
            if re.search(keyword, transmission_value, re.IGNORECASE):
                return trx_type
    return 'Unknown'

df_tr['trx'] = df_tr['transmission'].apply(extract_trx)

 #Edge cases
  #Fill 'trx' with 'Electric' where 'fuel_type' is 'Electric'
df_tr.loc[df_tr['fuel_type'] == 'Electric', 'trx'] = 'Electric'
  #Replace 3 1974 cars with 'Manual'
df_tr.loc[df_tr['id'] == 12252, 'trx'] = 'Manual'
df_tr.loc[df_tr['id'] == 36639, 'trx'] = 'Manual'
df_tr.loc[df_tr['id'] == 855, 'trx'] = 'Manual'
  #Replace remainder with 'Automatic'
df_tr.loc[df_tr['trx'] == 'Unknown', 'trx'] = 'Automatic'

 #Final steps
  #Drop id col
df_tr.drop('id', axis=1, inplace=True)
  #Drop intermediate + transmission columns
df_tr.drop(['speeds1', 'speeds2', 'transmission'], axis=1, inplace=True)
df_tr['price'] = df_tr.pop('price')

df_tr.head(1)

Unnamed: 0,brand,model_year,milage,fuel_type,ext_col,int_col,accident,clean_title,hp,liters,cylinders,speeds,trx,price
0,Ford,2018,74349,Gasoline,Blue,Gray,None reported,Yes,375.0,3.5,6.0,10.0,Automatic,11000


In [25]:
#Condense exterior colours
df_tr['ext_col'].value_counts().tail(3)

ext_col
Eiger Grey Metallic          1
Billet Clearcoat Metallic    1
Reflex Silver                1
Name: count, dtype: int64

In [26]:
 #Using key words
df_tr.loc[df_tr['ext_col'].str.contains('Red', case=False), 'ext_col'] = 'Red'
df_tr.loc[df_tr['ext_col'].str.contains('Rosso', case=False), 'ext_col'] = 'Red'
df_tr.loc[df_tr['ext_col'].str.contains('Ruby', case=False), 'ext_col'] = 'Red'
df_tr.loc[df_tr['ext_col'].str.contains('Purple', case=False), 'ext_col'] = 'Purple'
df_tr.loc[df_tr['ext_col'].str.contains('Yellow', case=False), 'ext_col'] = 'Yellow'
df_tr.loc[df_tr['ext_col'].str.contains('Hellayella', case=False), 'ext_col'] = 'Yellow'
df_tr.loc[df_tr['ext_col'].str.contains('Orange', case=False), 'ext_col'] = 'Orange'
df_tr.loc[df_tr['ext_col'].str.contains('Mango', case=False), 'ext_col'] = 'Orange'
df_tr.loc[df_tr['ext_col'].str.contains('Arancio', case=False), 'ext_col'] = 'Orange'
df_tr.loc[df_tr['ext_col'].str.contains('Bronze', case=False), 'ext_col'] = 'Bronze'
df_tr.loc[df_tr['ext_col'].str.contains('Brown', case=False), 'ext_col'] = 'Brown'
df_tr.loc[df_tr['ext_col'].str.contains('Sandstone', case=False), 'ext_col'] = 'Brown'
df_tr.loc[df_tr['ext_col'].str.contains('Beige', case=False), 'ext_col'] = 'Beige'
df_tr.loc[df_tr['ext_col'].str.contains('Tan', case=False), 'ext_col'] = 'Beige'
df_tr.loc[df_tr['ext_col'].str.contains('Gold', case=False), 'ext_col'] = 'Gold'
df_tr.loc[df_tr['ext_col'].str.contains('Green', case=False), 'ext_col'] = 'Green'
df_tr.loc[df_tr['ext_col'].str.contains('Moss', case=False), 'ext_col'] = 'Green'
df_tr.loc[df_tr['ext_col'].str.contains('Gecko', case=False), 'ext_col'] = 'Green'
df_tr.loc[df_tr['ext_col'].str.contains('Verde', case=False), 'ext_col'] = 'Green'
df_tr.loc[df_tr['ext_col'].str.contains('Silver', case=False), 'ext_col'] = 'Silver'
df_tr.loc[df_tr['ext_col'].str.contains('Platinum', case=False), 'ext_col'] = 'Silver'
df_tr.loc[df_tr['ext_col'].str.contains('Gray', case=False), 'ext_col'] = 'Gray'
df_tr.loc[df_tr['ext_col'].str.contains('Grey', case=False), 'ext_col'] = 'Gray'
df_tr.loc[df_tr['ext_col'].str.contains('Granite', case=False), 'ext_col'] = 'Gray'
df_tr.loc[df_tr['ext_col'].str.contains('Grigio', case=False), 'ext_col'] = 'Gray'
df_tr.loc[df_tr['ext_col'].str.contains('Charcoal', case=False), 'ext_col'] = 'Gray'
df_tr.loc[df_tr['ext_col'].str.contains('Slate', case=False), 'ext_col'] = 'Gray'
df_tr.loc[df_tr['ext_col'].str.contains('Graphite', case=False), 'ext_col'] = 'Gray'
df_tr.loc[df_tr['ext_col'].str.contains('Blue', case=False), 'ext_col'] = 'Blue'
df_tr.loc[df_tr['ext_col'].str.contains('Blu', case=False), 'ext_col'] = 'Blue'
df_tr.loc[df_tr['ext_col'].str.contains('Ember', case=False), 'ext_col'] = 'Blue'
df_tr.loc[df_tr['ext_col'].str.contains('Tempest', case=False), 'ext_col'] = 'Blue'
df_tr.loc[df_tr['ext_col'].str.contains('Sapphire', case=False), 'ext_col'] = 'Blue'
df_tr.loc[df_tr['ext_col'].str.contains('Navy', case=False), 'ext_col'] = 'Blue'
df_tr.loc[df_tr['ext_col'].str.contains('White', case=False), 'ext_col'] = 'White'
df_tr.loc[df_tr['ext_col'].str.contains('Chalk', case=False), 'ext_col'] = 'White'
df_tr.loc[df_tr['ext_col'].str.contains('Bianco', case=False), 'ext_col'] = 'White'
df_tr.loc[df_tr['ext_col'].str.contains('Black', case=False), 'ext_col'] = 'Black'
df_tr.loc[df_tr['ext_col'].str.contains('Ebony', case=False), 'ext_col'] = 'Black'
df_tr.loc[df_tr['ext_col'].str.contains('Onyx', case=False), 'ext_col'] = 'Black'
df_tr.loc[df_tr['ext_col'].str.contains('Nero', case=False), 'ext_col'] = 'Black'
df_tr.loc[df_tr['ext_col'].str.contains('Noir', case=False), 'ext_col'] = 'Black'
df_tr.loc[df_tr['ext_col'].str.contains('Obsidian', case=False), 'ext_col'] = 'Black'

In [27]:
 #Using specific values
df_tr['ext_col'] = df_tr['ext_col'].replace(['Iridescent Pearl Tricoat', 'Wind Chill Pearl', 'Rift Metallic', 'Glacier', 'Yulong', 'Bianco Monocerus'], 'White')
df_tr['ext_col'] = df_tr['ext_col'].replace(['Manhattan Noir Metallic', 'Crystal Black'], 'Black')
df_tr['ext_col'] = df_tr['ext_col'].replace(['Sunset Drift Chromaflair'], 'Orange')
df_tr['ext_col'] = df_tr['ext_col'].replace(['Satin Steel Metallic'], 'Silver')
df_tr['ext_col'] = df_tr['ext_col'].replace(['Dark Ash Metallic', 'Majestic Plum Metallic'], 'Purple')
df_tr['ext_col'] = df_tr['ext_col'].replace(['Medium Stone', 'Tan', 'Sandstone Metallic'], 'Beige')
df_tr['ext_col'] = df_tr['ext_col'].replace(['Maroon', 'Rich Garnet Metallic', 'Ametrin Metallic', 'Caviar', 'Cayenne Red Tintcoat'], 'Red')
df_tr['ext_col'] = df_tr['ext_col'].replace(['Dark Matter Metallic', 'Billet Clearcoat Metallic','Magnetic Metallic', 'Tungsten Metallic', 'Iridium Metallic', 'Gun Metallic', 'Designo Magno Matte', 'Carpathian Grey'], 'Gray')
df_tr['ext_col'] = df_tr['ext_col'].replace(['Maximum Steel Metallic', 'Ice', 'Moonlight Cloud', 'Nightfall Mica', 'Mountain Air Metallic', 'Lunar Rock', 'Stormy Sea', 'Caspian Blue'], 'Blue')
df_tr['ext_col'] = df_tr['ext_col'].replace(['C / C', '–', 'Metallic', 'Custom Color'], 'Other')

In [28]:
#Condense interior colours
df_tr['int_col'].value_counts().tail(3)

int_col
Pimento / Ebony         1
Ebony.                  1
Gray w/Blue Bolsters    1
Name: count, dtype: int64

In [29]:
 #Using key words
df_tr.loc[df_tr['int_col'].str.contains('Red', case=False), 'int_col'] = 'Red'
df_tr.loc[df_tr['int_col'].str.contains('Rosso', case=False), 'int_col'] = 'Red'
df_tr.loc[df_tr['int_col'].str.contains('Ruby', case=False), 'int_col'] = 'Red'
df_tr.loc[df_tr['int_col'].str.contains('Hotspur', case=False), 'int_col'] = 'Red'
df_tr.loc[df_tr['int_col'].str.contains('Garnet', case=False), 'int_col'] = 'Red'
df_tr.loc[df_tr['int_col'].str.contains('Purple', case=False), 'int_col'] = 'Purple'
df_tr.loc[df_tr['int_col'].str.contains('Yellow', case=False), 'int_col'] = 'Yellow'
df_tr.loc[df_tr['int_col'].str.contains('Hellayella', case=False), 'int_col'] = 'Yellow'
df_tr.loc[df_tr['int_col'].str.contains('Orange', case=False), 'int_col'] = 'Orange'
df_tr.loc[df_tr['int_col'].str.contains('Mango', case=False), 'int_col'] = 'Orange'
df_tr.loc[df_tr['int_col'].str.contains('Arancio', case=False), 'int_col'] = 'Orange'
df_tr.loc[df_tr['int_col'].str.contains('Amber', case=False), 'int_col'] = 'Orange'
df_tr.loc[df_tr['int_col'].str.contains('Bronze', case=False), 'int_col'] = 'Bronze'
df_tr.loc[df_tr['int_col'].str.contains('Brown', case=False), 'int_col'] = 'Brown'
df_tr.loc[df_tr['int_col'].str.contains('Sandstone', case=False), 'int_col'] = 'Brown'
df_tr.loc[df_tr['int_col'].str.contains('Espresso', case=False), 'int_col'] = 'Brown'
df_tr.loc[df_tr['int_col'].str.contains('Caramel', case=False), 'int_col'] = 'Brown'
df_tr.loc[df_tr['int_col'].str.contains('Mocha', case=False), 'int_col'] = 'Brown'
df_tr.loc[df_tr['int_col'].str.contains('Camel', case=False), 'int_col'] = 'Brown'
df_tr.loc[df_tr['int_col'].str.contains('Cappuccino', case=False), 'int_col'] = 'Brown'
df_tr.loc[df_tr['int_col'].str.contains('Walnut', case=False), 'int_col'] = 'Brown'
df_tr.loc[df_tr['int_col'].str.contains('Brandy', case=False), 'int_col'] = 'Brown'
df_tr.loc[df_tr['int_col'].str.contains('Auburn', case=False), 'int_col'] = 'Brown'
df_tr.loc[df_tr['int_col'].str.contains('Chestnut', case=False), 'int_col'] = 'Brown'
df_tr.loc[df_tr['int_col'].str.contains('Cocoa', case=False), 'int_col'] = 'Brown'
df_tr.loc[df_tr['int_col'].str.contains('Portland', case=False), 'int_col'] = 'Brown'
df_tr.loc[df_tr['int_col'].str.contains('Ceramic', case=False), 'int_col'] = 'Brown'
df_tr.loc[df_tr['int_col'].str.contains('Mesa', case=False), 'int_col'] = 'Brown'
df_tr.loc[df_tr['int_col'].str.contains('Tupelo', case=False), 'int_col'] = 'Brown'
df_tr.loc[df_tr['int_col'].str.contains('Roast', case=False), 'int_col'] = 'Brown'
df_tr.loc[df_tr['int_col'].str.contains('Chateau', case=False), 'int_col'] = 'Brown'
df_tr.loc[df_tr['int_col'].str.contains('Beige', case=False), 'int_col'] = 'Beige'
df_tr.loc[df_tr['int_col'].str.contains('Tan', case=False), 'int_col'] = 'Beige'
df_tr.loc[df_tr['int_col'].str.contains('Macchiato', case=False), 'int_col'] = 'Beige'
df_tr.loc[df_tr['int_col'].str.contains('Linen', case=False), 'int_col'] = 'Beige'
df_tr.loc[df_tr['int_col'].str.contains('Oyster', case=False), 'int_col'] = 'Beige'
df_tr.loc[df_tr['int_col'].str.contains('Gold', case=False), 'int_col'] = 'Gold'
df_tr.loc[df_tr['int_col'].str.contains('Green', case=False), 'int_col'] = 'Green'
df_tr.loc[df_tr['int_col'].str.contains('Moss', case=False), 'int_col'] = 'Green'
df_tr.loc[df_tr['int_col'].str.contains('Gecko', case=False), 'int_col'] = 'Green'
df_tr.loc[df_tr['int_col'].str.contains('Verde', case=False), 'int_col'] = 'Green'
df_tr.loc[df_tr['int_col'].str.contains('Cypress', case=False), 'int_col'] = 'Green'
df_tr.loc[df_tr['int_col'].str.contains('Silver', case=False), 'int_col'] = 'Silver'
df_tr.loc[df_tr['int_col'].str.contains('Platinum', case=False), 'int_col'] = 'Silver'
df_tr.loc[df_tr['int_col'].str.contains('Gray', case=False), 'int_col'] = 'Gray'
df_tr.loc[df_tr['int_col'].str.contains('Grey', case=False), 'int_col'] = 'Gray'
df_tr.loc[df_tr['int_col'].str.contains('Granite', case=False), 'int_col'] = 'Gray'
df_tr.loc[df_tr['int_col'].str.contains('Grigio', case=False), 'int_col'] = 'Gray'
df_tr.loc[df_tr['int_col'].str.contains('Charcoal', case=False), 'int_col'] = 'Gray'
df_tr.loc[df_tr['int_col'].str.contains('Slate', case=False), 'int_col'] = 'Gray'
df_tr.loc[df_tr['int_col'].str.contains('Graphite', case=False), 'int_col'] = 'Gray'
df_tr.loc[df_tr['int_col'].str.contains('Shale', case=False), 'int_col'] = 'Gray'
df_tr.loc[df_tr['int_col'].str.contains('Ash', case=False), 'int_col'] = 'Gray'
df_tr.loc[df_tr['int_col'].str.contains('Pewter', case=False), 'int_col'] = 'Gray'
df_tr.loc[df_tr['int_col'].str.contains('Anthracite', case=False), 'int_col'] = 'Gray'
df_tr.loc[df_tr['int_col'].str.contains('Porpoise', case=False), 'int_col'] = 'Gray'
df_tr.loc[df_tr['int_col'].str.contains('Gideon', case=False), 'int_col'] = 'Gray'
df_tr.loc[df_tr['int_col'].str.contains('Blue', case=False), 'int_col'] = 'Blue'
df_tr.loc[df_tr['int_col'].str.contains('Blu', case=False), 'int_col'] = 'Blue'
df_tr.loc[df_tr['int_col'].str.contains('Navy', case=False), 'int_col'] = 'Blue'
df_tr.loc[df_tr['int_col'].str.contains('Ember', case=False), 'int_col'] = 'Blue'
df_tr.loc[df_tr['int_col'].str.contains('Tempest', case=False), 'int_col'] = 'Blue'
df_tr.loc[df_tr['int_col'].str.contains('Sapphire', case=False), 'int_col'] = 'Blue'
df_tr.loc[df_tr['int_col'].str.contains('White', case=False), 'int_col'] = 'White'
df_tr.loc[df_tr['int_col'].str.contains('Chalk', case=False), 'int_col'] = 'White'
df_tr.loc[df_tr['int_col'].str.contains('Bianco', case=False), 'int_col'] = 'White'
df_tr.loc[df_tr['int_col'].str.contains('Parchment', case=False), 'int_col'] = 'White'
df_tr.loc[df_tr['int_col'].str.contains('Cloud', case=False), 'int_col'] = 'White'
df_tr.loc[df_tr['int_col'].str.contains('Beluga', case=False), 'int_col'] = 'White'
df_tr.loc[df_tr['int_col'].str.contains('Orchid', case=False), 'int_col'] = 'White'
df_tr.loc[df_tr['int_col'].str.contains('Black', case=False), 'int_col'] = 'Black'
df_tr.loc[df_tr['int_col'].str.contains('Ebony', case=False), 'int_col'] = 'Black'
df_tr.loc[df_tr['int_col'].str.contains('Onyx', case=False), 'int_col'] = 'Black'
df_tr.loc[df_tr['int_col'].str.contains('Nero', case=False), 'int_col'] = 'Black'
df_tr.loc[df_tr['int_col'].str.contains('Noir', case=False), 'int_col'] = 'Black'
df_tr.loc[df_tr['int_col'].str.contains('Obsidian', case=False), 'int_col'] = 'Black'

In [30]:
 #Using specific values
df_tr['int_col'] = df_tr['int_col'].replace(['Iridescent Pearl Tricoat', 'Wind Chill Pearl', 'Rift Metallic', 'Glacier', 'Yulong'], 'White')
df_tr['int_col'] = df_tr['int_col'].replace(['Manhattan Noir Metallic', 'Blk'], 'Black')
df_tr['int_col'] = df_tr['int_col'].replace(['Sunset Drift Chromaflair'], 'Orange')
df_tr['int_col'] = df_tr['int_col'].replace(['Satin Steel Metallic'], 'Silver')
df_tr['int_col'] = df_tr['int_col'].replace(['Dark Ash Metallic', 'Majestic Plum Metallic'], 'Purple')
df_tr['int_col'] = df_tr['int_col'].replace(['Medium Stone', 'Sandstone Metallic'], 'Beige')
df_tr['int_col'] = df_tr['int_col'].replace(['Maroon', 'Rich Garnet Metallic', 'Ametrin Metallic', 'Caviar'], 'Red')
df_tr['int_col'] = df_tr['int_col'].replace(['Boulder', 'Dark Matter Metallic', 'Billet Clearcoat Metallic','Magnetic Metallic', 'Tungsten Metallic', 'Iridium Metallic', 'Gun Metallic', 'Designo Magno Matte', 'Dark Galvanized'], 'Gray')
df_tr['int_col'] = df_tr['int_col'].replace(['Maximum Steel Metallic', 'Ice', 'Moonlight Cloud', 'Nightfall Mica', 'Mountain Air Metallic', 'Lunar Rock', 'Stormy Sea', 'Tension'], 'Blue')
df_tr['int_col'] = df_tr['int_col'].replace(['C / C', '–', 'Metallic', 'Custom Color', 'Sport'], 'Other')

In [31]:
#Identify categorical columns
obj_y_n = (df_tr.dtypes == 'object')
obj_cols = list(obj_y_n[obj_y_n].index)
obj_cols

['brand', 'fuel_type', 'ext_col', 'int_col', 'accident', 'clean_title', 'trx']

In [32]:
encoder = OneHotEncoder(sparse_output=False, dtype=int)

cats_enc = pd.DataFrame(encoder.fit_transform(df_tr[obj_cols]), columns=encoder.get_feature_names_out())

In [33]:
#Combine original and one hot feature dataframes
df_tr_enc = pd.concat([df_tr, cats_enc], axis=1)
 #Drop pre-transformation features
df_tr_enc = df_tr_enc.drop(df_tr_enc[obj_cols], axis=1)
 #Move Label to last column
df_tr_enc['price'] = df_tr_enc.pop('price')

In [34]:
print(df_tr_enc.shape)
df_tr_enc.head(1)

(58282, 106)


Unnamed: 0,model_year,milage,hp,liters,cylinders,speeds,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,brand_Cadillac,brand_Chevrolet,brand_Chrysler,brand_Dodge,brand_FIAT,brand_Ferrari,brand_Ford,brand_GMC,brand_Genesis,brand_Honda,brand_Hummer,brand_Hyundai,brand_INFINITI,brand_Jaguar,brand_Jeep,brand_Karma,brand_Kia,brand_Lamborghini,brand_Land,brand_Lexus,brand_Lincoln,brand_Lotus,brand_Lucid,brand_MINI,brand_Maserati,brand_Maybach,brand_Mazda,brand_McLaren,brand_Mercedes-Benz,brand_Mercury,brand_Mitsubishi,brand_Nissan,brand_Plymouth,brand_Polestar,brand_Pontiac,brand_Porsche,brand_RAM,brand_Rivian,brand_Rolls-Royce,brand_Saab,brand_Saturn,brand_Scion,brand_Subaru,brand_Suzuki,brand_Tesla,brand_Toyota,brand_Volkswagen,brand_Volvo,brand_smart,fuel_type_Diesel,fuel_type_E85 Flex Fuel,fuel_type_Electric,fuel_type_Gasoline,fuel_type_Hybrid,ext_col_Beige,ext_col_Black,ext_col_Blue,ext_col_Bronze,ext_col_Brown,ext_col_Gold,ext_col_Gray,ext_col_Green,ext_col_Orange,ext_col_Other,ext_col_Pink,ext_col_Purple,ext_col_Red,ext_col_Silver,ext_col_White,ext_col_Yellow,int_col_Beige,int_col_Black,int_col_Blue,int_col_Brown,int_col_Gold,int_col_Gray,int_col_Green,int_col_Orange,int_col_Other,int_col_Red,int_col_Silver,int_col_White,int_col_Yellow,accident_At least 1 accident or damage reported,accident_None reported,clean_title_No,clean_title_Yes,trx_Automatic,trx_Both,trx_Electric,trx_Manual,price
0,2018,74349,375.0,3.5,6.0,10.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,11000


In [35]:
#Set random seed
r_state = 42
#Set tree number for all models
num_trees = 400

scaler = StandardScaler()

#Split into features and target
y = df_tr_enc['price']
X = df_tr_enc.drop(columns=['price'])

#Scale (prior to split)
X_sc = scaler.fit_transform(X)

#Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_sc, y, test_size=0.1, random_state=r_state)

In [36]:
#Evaluate RandomForest model using tuned hyperparameter values

rf_model = RandomForestRegressor(n_estimators=num_trees, random_state=r_state, max_depth=7, max_features=0.4, n_jobs=-1)

rf_model.fit(X_train, y_train)

y_pred_tr = rf_model.predict(X_test)

print(rf_model.score(X_test, y_test))

#Evaluate model MSE
mse = root_mean_squared_error(y_test, y_pred_tr)
print("Root Mean Squared Error:", mse)

0.2809318490593957
Root Mean Squared Error: 56722.58813949618
