In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import  OneHotEncoder
from sklearn.metrics import r2_score, root_mean_squared_error, mean_squared_error,mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import re

from scipy.stats import boxcox
from scipy.special import inv_boxcox

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the training and test sets from CSV files (paths are relative to the working directory)
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
# (rows, columns) of the training frame
train_data.shape

(188533, 13)

In [4]:
# (rows, columns) of the test frame
test_data.shape

(125690, 12)

In [5]:
# Column dtypes and non-null counts of the training frame
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            188533 non-null  int64 
 1   brand         188533 non-null  object
 2   model         188533 non-null  object
 3   model_year    188533 non-null  int64 
 4   milage        188533 non-null  int64 
 5   fuel_type     183450 non-null  object
 6   engine        188533 non-null  object
 7   transmission  188533 non-null  object
 8   ext_col       188533 non-null  object
 9   int_col       188533 non-null  object
 10  accident      186081 non-null  object
 11  clean_title   167114 non-null  object
 12  price         188533 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 18.7+ MB


In [6]:
# Number of missing values in each training column
train_data.isnull().sum()

id                  0
brand               0
model               0
model_year          0
milage              0
fuel_type        5083
engine              0
transmission        0
ext_col             0
int_col             0
accident         2452
clean_title     21419
price               0
dtype: int64

In [7]:
# Percentage of missing values in each training column
train_data.isnull().sum()/len(train_data)*100

id               0.000000
brand            0.000000
model            0.000000
model_year       0.000000
milage           0.000000
fuel_type        2.696080
engine           0.000000
transmission     0.000000
ext_col          0.000000
int_col          0.000000
accident         1.300568
clean_title     11.360876
price            0.000000
dtype: float64

In [8]:
# Preview the first rows of the training data
train_data.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [9]:
# Distinct raw labels in the transmission column of the training data
train_data['transmission'].unique()

array(['A/T', 'Transmission w/Dual Shift Mode', '7-Speed A/T',
       '8-Speed A/T', '10-Speed Automatic', '1-Speed A/T', '6-Speed A/T',
       '10-Speed A/T', '9-Speed A/T', '8-Speed Automatic',
       '9-Speed Automatic', '5-Speed A/T', 'Automatic',
       '7-Speed Automatic with Auto-Shift', 'CVT Transmission',
       '5-Speed M/T', 'M/T', '6-Speed M/T', '6-Speed Automatic',
       '4-Speed Automatic', '7-Speed M/T', '2-Speed A/T',
       '1-Speed Automatic', 'Automatic CVT', '4-Speed A/T',
       '6-Speed Manual', 'Transmission Overdrive Switch',
       '8-Speed Automatic with Auto-Shift', '7-Speed Manual',
       '7-Speed Automatic', '9-Speed Automatic with Auto-Shift',
       '6-Speed Automatic with Auto-Shift',
       '6-Speed Electronically Controlled Automatic with O', 'F', 'CVT-F',
       '8-Speed Manual', 'Manual', '–', '2', '6 Speed At/Mt',
       '5-Speed Automatic', '2-Speed Automatic', '8-SPEED A/T', '7-Speed',
       'Variable', 'Single-Speed Fixed Gear', '8-SPEED AT',


In [10]:
# Distinct raw labels in the fuel_type column of the training data (NaN included)
train_data['fuel_type'].unique()

array(['Gasoline', 'E85 Flex Fuel', nan, 'Hybrid', 'Diesel',
       'Plug-In Hybrid', '–', 'not supported'], dtype=object)

In [11]:
# Preview the first two rows of the test data
test_data.head(2)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
1,188534,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes


In [12]:
# Distinct raw labels in int_col (presumably the interior colour - confirm against the data dictionary)
train_data['int_col'].unique()

array(['Gray', 'Beige', 'Black', '–', 'Blue', 'White', 'Red', 'Brown',
       'Dark Galvanized', 'Parchment.', 'Boulder', 'Orange',
       'Medium Earth Gray', 'Ebony', 'Canberra Beige', 'Jet Black',
       'Silver', 'Light Platinum / Jet Black', 'Macchiato/Magmagrey',
       'Gold', 'Cloud', 'Rioja Red', 'Global Black', 'Green',
       'Medium Stone', 'Navy Pier', 'Dark Ash', 'BLACK', 'Portland',
       'Sandstone', 'Canberra Beige/Black', 'Diesel Gray / Black',
       'Sarder Brown', 'Black Onyx', 'White / Brown', 'Black/Gun Metal',
       'Slate', 'Satin Black', 'Macchiato Beige/Black', 'Charcoal',
       'Black / Express Red', 'Cappuccino', 'Aragon Brown', 'Parchment',
       'Oyster W/Contrast', 'Adrenaline Red', 'Ebony.', 'Shara Beige',
       'Graystone', 'Pearl Beige', 'Nero Ade', 'Graphite',
       'Tan/Ebony/Ebony', 'Charcoal Black', 'Medium Ash Gray',
       'Ebony Black', 'Light Titanium', 'Sakhir Orange', 'Tan',
       'Rock Gray', 'Brandy', 'Carbon Black', 'Amber',
      

In [13]:
# Feature engineering shared by the train and test frames

# A number is one or more digits with an optional fractional part, so that
# '172.0HP' is captured as 172.0 rather than as its trailing '2.0'.
_HP_PATTERN = re.compile(r'(\d+(?:\.\d+)?)HP')
# Displacement written as '1.6L', '5L' or spelled out as '3.0 Liter'.
_LITERS_PATTERN = re.compile(r'(\d+(?:\.\d+)?)(?:L|\s+Liter)')


def _first_number(pattern, text):
  """Return the first number captured by `pattern` in `text` as a float, or NaN."""
  if not isinstance(text, str):
    return np.nan
  found = pattern.search(text)
  return float(found.group(1)) if found else np.nan


def preprocess(df, reference_year=None):
  """Return a cleaned copy of `df` with engineered features; `df` is not modified.

  Steps:
    * `engine` is replaced by two float columns, `engine_HP` and
      `engine_Liters`, parsed from the free-text description (NaN when the
      value is not a string or the quantity is not present).
    * `model_year` is replaced by `car_age = reference_year - model_year`.
    * `milage` is coerced to numeric; unparseable/missing entries are filled
      with the mean of this frame's own `milage` column.
    * Dash placeholders in `int_col` (en dash or ASCII hyphen) become NaN.
    * Missing `clean_title` values become the string 'Unknown' (only when the
      column exists).

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns `engine`, `model_year`, `milage` and `int_col`.
  reference_year : int, optional
      Year that car ages are measured from. Defaults to the current calendar
      year; pass a fixed value to make results reproducible across run dates.

  Returns
  -------
  pandas.DataFrame
      A new frame without `engine` and `model_year`, with `engine_HP`,
      `engine_Liters` and `car_age` added.

  Raises
  ------
  KeyError
      If one of the required columns is missing.
  """
  df = df.copy()

  # engine -> horsepower and displacement. Building the columns from plain
  # lists avoids creating one pandas Series per row and also works for an
  # empty frame.
  engine_text = df['engine']
  df['engine_HP'] = pd.Series(
      [_first_number(_HP_PATTERN, text) for text in engine_text],
      index=df.index, dtype=float)
  df['engine_Liters'] = pd.Series(
      [_first_number(_LITERS_PATTERN, text) for text in engine_text],
      index=df.index, dtype=float)
  df = df.drop(columns=['engine'])

  # model_year -> car age
  if reference_year is None:
    reference_year = pd.to_datetime('today').year
  df['car_age'] = reference_year - df['model_year']
  df = df.drop(columns=['model_year'])

  # milage -> numeric, with gaps filled by this frame's own mean
  df['milage'] = pd.to_numeric(df['milage'], errors='coerce')
  df['milage'] = df['milage'].fillna(df['milage'].mean())

  # The missing-value placeholder in int_col is an en dash (U+2013), not an
  # ASCII hyphen; treat both as missing.
  df['int_col'] = df['int_col'].replace(['\u2013', '-'], np.nan)

  # A missing clean_title is kept as its own explicit category
  if 'clean_title' in df.columns:
    df['clean_title'] = df['clean_title'].fillna('Unknown')

  return df




In [14]:
print('Starting data Preprocessing')
train_df_processed, test_df_processed = (
    preprocess(frame) for frame in (train_data, test_data)
)
print('preprocessing complete')

# Target is the price column; the features are everything else
y_train = train_df_processed['price']
x_train = train_df_processed.drop(columns='price')

# Keep the test ids before the id column is removed from the features
test_ids = test_data['id']
# y_test = x_test['price']

# The id column is dropped from both feature frames when present
x_train = x_train.drop(columns='id', errors='ignore')
x_test = test_df_processed.drop(columns='id', errors='ignore')


Starting data Preprocessing
preprocessing complete


In [15]:
# Split the feature names by dtype: object columns are treated as categorical,
# numeric columns as numerical
cat_cols = list(x_train.select_dtypes(include='object').columns)
num_cols = list(x_train.select_dtypes(include='number').columns)

print(cat_cols, num_cols, sep='\n')

['brand', 'model', 'fuel_type', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']
['milage', 'engine_HP', 'engine_Liters', 'car_age']


In [16]:
# Numeric features: fill missing values with the column mean
numerical_transformer = Pipeline([('imputer', SimpleImputer(strategy='mean'))])

# Categorical features: fill missing values with the most frequent label, then
# one-hot encode with handle_unknown='ignore'
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer; any other column is passed through
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
    ],
    remainder='passthrough',
)

In [19]:
# End-to-end model: the column preprocessor followed by a random forest regressor
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=20, random_state=42, n_jobs=-1)),
])

In [20]:
# Fit the full pipeline (preprocessing + regressor) on the training features
print('Starting model Training')
model.fit(x_train,y_train)
print('Model Training finished')

# Predictions on the same rows the model was fitted on (in-sample)
print('prediction train data')
train_predictions = model.predict(x_train)
print('prediction done')

# Predictions for the test features
print('prediction test data')
test_predictions = model.predict(x_test)
print('prediction done')

Starting model Training


KeyboardInterrupt: 

In [None]:
# R^2, MSE and MAE on the training data only. These score the model on the same
# rows it was fitted on, so they do not measure generalisation.
# The test-set variants stay commented out because no y_test is defined.
r2_train = r2_score(y_train,train_predictions)
# r2_test = r2_score(y_test, test_predictions)
print(f'Training data - r2 score:{r2_train:.4f}')
# print(f'Testing data - r2 score:{r2_test:.4f}')

mse_train = mean_squared_error(y_train, train_predictions)
# mse_test = mean_squared_error(y_test, test_predictions)
print(f'Training mse:{mse_train:.4f}')
# print(f'Testing mse:{mse_test:.4f}')

mae_train = mean_absolute_error(y_train,train_predictions)
# mae_test = mean_absolute_error(y_test, test_predictions)
print(f'Training mae:{mae_train:.4f}')
# print(f'Test mae:{mae_test:.4f}')