# Regression of Used Car Prices

The goal of this project is to predict the price of used cars based on various attributes.

In [None]:
!pip install category_encoders
import category_encoders as ce
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import re

In [48]:
#Reading the data
train= pd.read_csv('train.csv')
test= pd.read_csv('test.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            188533 non-null  int64 
 1   brand         188533 non-null  object
 2   model         188533 non-null  object
 3   model_year    188533 non-null  int64 
 4   milage        188533 non-null  int64 
 5   fuel_type     183450 non-null  object
 6   engine        188533 non-null  object
 7   transmission  188533 non-null  object
 8   ext_col       188533 non-null  object
 9   int_col       188533 non-null  object
 10  accident      186081 non-null  object
 11  clean_title   167114 non-null  object
 12  price         188533 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 18.7+ MB


In [49]:
#The number of unique values in each column
train.nunique()

id              188533
brand               57
model             1897
model_year          34
milage            6651
fuel_type            7
engine            1117
transmission        52
ext_col            319
int_col            156
accident             2
clean_title          1
price             1569
dtype: int64

In [50]:
#Due to high number of categories, one-hot encoding and label-encoding are not ideal for brand, model, and transmission. 
#Target encoding for brand, model, and transmission
target_encoder = ce.TargetEncoder(cols=['brand', 'model', 'transmission'], smoothing=0.2)

#Fit the encoder on the train DataFrame
encoded_train = target_encoder.fit_transform(train[['brand', 'model', 'transmission']], train['price'])
train['encoded_brand'] = encoded_train['brand']
train['encoded_model'] = encoded_train['model']
train['encoded_transmission'] = encoded_train['transmission']

# Assign the encoded values back to the test DataFrame
encoded_test = target_encoder.transform(test[['brand','model', 'transmission']])
test['encoded_brand'] = encoded_test['brand']
test['encoded_model'] = encoded_test['model']
test['encoded_transmission'] = encoded_test['transmission']

In [51]:
#Description of engine had two components: engine power in horsepower and engine size in liters.
#They can be extracted from the description using regular expression.   

def extract_horsepower(description):
    # Define a regex pattern to match horsepower values
    pattern = r'(\d+(?:\.\d{1,2})?)\s*(hp|horsepower|HP)'
    match = re.search(pattern, description, re.IGNORECASE)
    if match:
        return float(match.group(1))
    else:
        return None
train['engine_horsepower'] = train['engine'].apply(extract_horsepower)
test['engine_horsepower'] = test['engine'].apply(extract_horsepower)

#Fill null values with mean
means=train['engine_horsepower'].mean()
train['engine_horsepower']=train['engine_horsepower'].fillna(means)
test['engine_horsepower']=test['engine_horsepower'].fillna(means)

In [52]:
def extract_engine_size(description):
    # Regex pattern for engine size in liters
    pattern = r'(\d+(?:\.\d{1,2})?)\s*L'
    match = re.search(pattern, description, re.IGNORECASE)
    if match:
        return float(match.group(1))  # Return engine size as float
    return None
train['engine_liters'] = train['engine'].apply(extract_engine_size)
test['engine_liters'] = test['engine'].apply(extract_engine_size)

#Fill null values with mean
means=train['engine_liters'].mean()
train['engine_liters']=train['engine_liters'].fillna(means)
test['engine_liters']=test['engine_liters'].fillna(means)

In [53]:
#Label encoding for accident
categorical_columns = [ 'accident'  ]
unknown_category = -1
# Encode categorical columns
for col in categorical_columns:
    le = LabelEncoder()    
    train[col] = le.fit_transform(train[col])    
    test[col] = test[col].apply(lambda x: le.transform([x])[0] if x in le.classes_ else unknown_category)

In [54]:
#Defining inputs and output
features= ['model_year','milage','accident','encoded_brand', 'encoded_model','encoded_transmission','engine_horsepower', 'engine_liters']
X_train=train[features]
X_test= test[features]
y_train= train['price']
X1, X2, y1, y2 = train_test_split(X_train, y_train, test_size=0.2,stratify=y_train)


In [55]:
X_train.head()

Unnamed: 0,model_year,milage,accident,encoded_brand,encoded_model,encoded_transmission,engine_horsepower,engine_liters
0,2007,213000,1,17162.015977,12832.716263,31711.97914,172.0,1.6
1,2002,143250,0,30712.59513,12663.37931,31711.97914,252.0,3.9
2,2002,136731,1,41836.755127,23779.677419,31711.97914,320.0,5.3
3,2017,19500,1,53556.327141,48915.842105,50201.81558,420.0,5.0
4,2021,7388,1,51244.399541,64896.257198,47123.507371,208.0,2.0


In [56]:
X_test.head()

Unnamed: 0,model_year,milage,accident,encoded_brand,encoded_model,encoded_transmission,engine_horsepower,engine_liters
0,2015,98000,1,53204.88,19582.38,25864.659998,240.0,2.0
1,2020,9142,1,53204.88,74242.182266,56442.748656,395.0,3.0
2,2022,28121,1,40511.969508,50420.818345,63990.058786,343.263856,3.5
3,2016,61258,1,41004.856985,28007.0625,59588.575344,343.263856,2.0
4,2018,59000,1,41004.856985,24723.991489,31711.97914,252.0,2.0


In [57]:
#Training the model
model= RandomForestRegressor(n_estimators=80, max_depth=8,  min_samples_leaf=7,  min_samples_split=10,  max_features='sqrt', n_jobs=-1)
model.fit(X1, y1)
train_score=model.score(X1, y1)
val_score=model.score(X2, y2)
print(train_score)
print(val_score)

0.18709448903708503
0.15114262964373792


In [58]:
#Calculate RMSE
y_pred1 = model.predict(X1)
y_pred2 = model.predict(X2)
train_mse = mean_squared_error(y1, y_pred1)
val_mse = mean_squared_error(y2, y_pred2)
train_rmse = np.sqrt(train_mse)
val_rmse = np.sqrt(val_mse)
print(train_rmse)
print(val_rmse)

71134.60028883479
72331.68966660161


In [59]:
#Analyzing feature importances of the model
importances = model.feature_importances_
features_df = pd.DataFrame({
    'Feature': X1.columns,
    'Importance': importances
})
features_df = features_df.sort_values(by='Importance', ascending=False)
print(features_df)

                Feature  Importance
1                milage    0.294298
4         encoded_model    0.267601
0            model_year    0.135951
6     engine_horsepower    0.094187
5  encoded_transmission    0.082760
3         encoded_brand    0.075692
7         engine_liters    0.039113
2              accident    0.010397


In [60]:
#Calculating cross-validation
model= RandomForestRegressor(n_estimators=80, max_depth=8,  min_samples_leaf=7,  min_samples_split=10,  max_features='sqrt', n_jobs=-1)
scores = cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
# Print the RMSE scores for each fold
print("Cross-validation RMSE scores:", rmse_scores)
# Print the mean and standard deviation of the RMSE scores
print(f'Mean RMSE: {np.mean(rmse_scores):.2f}')
print(f'Standard Deviation: {np.std(rmse_scores):.2f}')

Cross-validation RMSE scores: [73172.92520226 73705.87647782 71063.80516201]
Mean RMSE: 72647.54
Standard Deviation: 1140.81


In [61]:
#Hyperparameter tuning using Grid Search CV
param_grid = {
    'min_samples_split':[5,10,15],
    'max_depth': [10],
    'min_samples_leaf':[5,10,15]
}
model=  RandomForestRegressor()
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 15, 'min_samples_split': 10}
Best Score: -5290097067.274835


In [62]:
# Generating the model output
model=grid_search.best_estimator_
predictions= model.predict(X_test)
submissions= test[['id']]
submissions['price']= predictions
submissions.to_csv('submission.csv', index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submissions['price']= predictions
