In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse

# Display preferences
%matplotlib inline
pd.set_option('display.max_columns', 100)

In [2]:
# Create dataframe
cars = pd.read_csv('../data/cars.csv')

In [3]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38531 entries, 0 to 38530
Data columns (total 30 columns):
manufacturer_name    38531 non-null object
model_name           38531 non-null object
transmission         38531 non-null object
color                38531 non-null object
odometer_value       38531 non-null int64
year_produced        38531 non-null int64
engine_fuel          38531 non-null object
engine_has_gas       38531 non-null bool
engine_type          38531 non-null object
engine_capacity      38521 non-null float64
body_type            38531 non-null object
has_warranty         38531 non-null bool
state                38531 non-null object
drivetrain           38531 non-null object
price_usd            38531 non-null float64
is_exchangeable      38531 non-null bool
location_region      38531 non-null object
number_of_photos     38531 non-null int64
up_counter           38531 non-null int64
feature_0            38531 non-null bool
feature_1            38531 non-null bool


In [4]:
cars.head()

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,body_type,has_warranty,state,drivetrain,price_usd,is_exchangeable,location_region,number_of_photos,up_counter,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,duration_listed
0,Subaru,Outback,automatic,silver,190000,2010,gasoline,False,gasoline,2.5,universal,False,owned,all,10900.0,False,Минская обл.,9,13,False,True,True,True,False,True,False,True,True,True,16
1,Subaru,Outback,automatic,blue,290000,2002,gasoline,False,gasoline,3.0,universal,False,owned,all,5000.0,True,Минская обл.,12,54,False,True,False,False,True,True,False,False,False,True,83
2,Subaru,Forester,automatic,red,402000,2001,gasoline,False,gasoline,2.5,suv,False,owned,all,2800.0,True,Минская обл.,4,72,False,True,False,False,False,False,False,False,True,True,151
3,Subaru,Impreza,mechanical,blue,10000,1999,gasoline,False,gasoline,3.0,sedan,False,owned,all,9999.0,True,Минская обл.,9,42,True,False,False,False,False,False,False,False,False,False,86
4,Subaru,Legacy,automatic,black,280000,2001,gasoline,False,gasoline,2.5,universal,False,owned,all,2134.11,True,Гомельская обл.,14,7,False,True,False,True,True,False,False,False,False,True,7


In [5]:
cars.describe()

Unnamed: 0,odometer_value,year_produced,engine_capacity,price_usd,number_of_photos,up_counter,duration_listed
count,38531.0,38531.0,38521.0,38531.0,38531.0,38531.0,38531.0
mean,248864.638447,2002.943734,2.055161,6639.971021,9.649062,16.306091,80.577249
std,136072.37653,8.065731,0.671178,6428.152018,6.093217,43.286933,112.826569
min,0.0,1942.0,0.2,1.0,1.0,1.0,0.0
25%,158000.0,1998.0,1.6,2100.0,5.0,2.0,23.0
50%,250000.0,2003.0,2.0,4800.0,8.0,5.0,59.0
75%,325000.0,2009.0,2.3,8990.0,12.0,16.0,91.0
max,1000000.0,2019.0,8.0,50000.0,86.0,1861.0,2232.0


In [6]:
# Identify columns with missing values
missing_values = []

for column in cars.columns:
    if cars[column].isnull().sum() > 0:
        missing_values.append(column)
        
print('Independent variable(s) with missing values: {}'.format(missing_values))

Independent variable(s) with missing values: ['engine_capacity']


In [7]:
# Imputation
cars['engine_capacity'].fillna(cars['engine_capacity'].mean(), inplace=True)

In [8]:
# Identify categorical variables and unique values
categorical_variables = []

for column in cars.columns:
    if cars[column].dtype == 'object':
        categorical_variables.append(column)
        
for variable in categorical_variables:
    print('Variable name is {}'.format(variable))
    print('Number of unique values {}'.format(cars[variable].nunique()))
    print(cars[variable].unique())
    print('--------------------------')

Variable name is manufacturer_name
Number of unique values 55
['Subaru' 'LADA' 'Dodge' 'УАЗ' 'Kia' 'Opel' 'Москвич' 'Alfa Romeo' 'Acura'
 'Dacia' 'Lexus' 'Mitsubishi' 'Lancia' 'Citroen' 'Mini' 'Jaguar' 'Porsche'
 'SsangYong' 'Daewoo' 'Geely' 'ВАЗ' 'Fiat' 'Ford' 'Renault' 'Seat' 'Rover'
 'Volkswagen' 'Lifan' 'Jeep' 'Cadillac' 'Audi' 'ЗАЗ' 'Toyota' 'ГАЗ'
 'Volvo' 'Chevrolet' 'Great Wall' 'Buick' 'Pontiac' 'Lincoln' 'Hyundai'
 'Nissan' 'Suzuki' 'BMW' 'Mazda' 'Land Rover' 'Iveco' 'Skoda' 'Saab'
 'Infiniti' 'Chery' 'Honda' 'Mercedes-Benz' 'Peugeot' 'Chrysler']
--------------------------
Variable name is model_name
Number of unique values 1118
['Outback' 'Forester' 'Impreza' ... '180' 'Vision' 'Aspen']
--------------------------
Variable name is transmission
Number of unique values 2
['automatic' 'mechanical']
--------------------------
Variable name is color
Number of unique values 12
['silver' 'blue' 'red' 'black' 'grey' 'other' 'brown' 'white' 'green'
 'violet' 'orange' 'yellow']
--------

In [9]:
# Categorical variables one-hot encoding
cars = pd.concat([cars, pd.get_dummies(cars['manufacturer_name'])], axis=1)
cars = pd.concat([cars, pd.get_dummies(cars['transmission'])], axis=1)
cars = pd.concat([cars, pd.get_dummies(cars['color'])], axis=1)
cars = pd.concat([cars, pd.get_dummies(cars['engine_fuel'])], axis=1)
cars = pd.concat([cars, pd.get_dummies(cars['engine_type'])], axis=1)
cars = pd.concat([cars, pd.get_dummies(cars['body_type'])], axis=1)
cars = pd.concat([cars, pd.get_dummies(cars['state'])], axis=1)
cars = pd.concat([cars, pd.get_dummies(cars['drivetrain'])], axis=1)

In [10]:
# Remove categorical variables
cars = cars.drop(categorical_variables, axis=1)

In [11]:
# Create independent and dependent variables
X = cars.drop('price_usd', axis=1)
Y = cars['price_usd']

In [12]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=465)

In [16]:
# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [18]:
# Linear Regression cross validation
scores = cross_val_score(lr, X_train, y_train, cv=5)
scores.mean()

0.7526070773216063

In [30]:
# KNN Regression
knn = KNeighborsRegressor(n_neighbors=5, weights='distance')
knn.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='distance')

In [31]:
# KNN Regression cross validation
scores = cross_val_score(knn, X_train, y_train, cv=5)
scores.mean()

0.3459824115094798

In [32]:
# Predictions
lr_preds = lr.predict(X_test)
knn_preds = knn.predict(X_test)

In [33]:
# Performance / Evaluation metrics
print('Linear Regression')
print('R-squared: {}'.format(lr.score(X_test, y_test)))
print('Mean absolute error: {}'.format(mean_absolute_error(y_test, lr_preds)))
print('Mean squared error: {}'.format(mse(y_test, lr_preds)))
print('Root mean squared error: {}'.format(rmse(y_test, lr_preds)))

print('KNN Regression')
print('R-squared: {}'.format(knn.score(X_test, y_test)))
print('Mean absolute error: {}'.format(mean_absolute_error(y_test, lr_preds)))
print('Mean squared error: {}'.format(mse(y_test, knn_preds)))
print('Root mean squared error: {}'.format(rmse(y_test, knn_preds)))

Linear Regression
R-squared: 0.7614739988287282
Mean absolute error: 1973.0605528110823
Mean squared error: 10006142.687009063
Root mean squared error: 3163.2487551580675
KNN Regression
R-squared: 0.382045283752668
Mean absolute error: 1973.0605528110823
Mean squared error: 25923140.599003702
Root mean squared error: 5091.477251152528


### Conclusion

On the training set, the R-squared value for Linear Regression was roughly 0.7526 while the value for KNN Regression was 0.3459.

On the test set, the R-squared value for Linear Regression was roughly 0.7614 while the value for KNN Regression was 0.3820. The mean absolute error was the same value for both regression models, but the mean squared error and root mean squared error where lower for the Linear Regression model.

Based on the performance / evaluation metrics, I prefer the Linear Regression model over the KNN Regression model.