# Used car sales machine learning

## Load in data and split into train/val/test sets

In [3]:
##Load in cleaned data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

cars = pd.read_csv('cleaned_vehicles.csv')

print("Number of features: " +str(len(cars.columns)))
print("Number of rows: "+str(len(cars)))
print("\n")

##Shuffle dataset (fixed seed so the same rows land in the same split on every run)
cars = cars.sample(frac=1, random_state=42)

##Split into train, validation and test (60%, 20%, 20%)
# Positional .iloc slices are used instead of np.split(cars, ...): np.split on a
# DataFrame is routed through DataFrame.swapaxes, which is deprecated in recent
# pandas releases. The cut points are the same two indices the np.split call used.
trainEnd = int(.6*len(cars))
validEnd = int(.8*len(cars))

train = cars.iloc[:trainEnd]
validation = cars.iloc[trainEnd:validEnd]
test = cars.iloc[validEnd:]

# Sanity check: the three parts are consecutive slices and must cover every row.
assert len(train) + len(validation) + len(test) == len(cars)

print("Train size = "+str(len(train)))
print("Validation size = "+str(len(validation)))
print("Test size = "+str(len(test)))

Number of features: 544
Number of rows: 316984


Train size = 190190
Validation size = 63397
Test size = 63397


In [15]:
##Separate the target column ('price') from the input columns of each split.
# drop() returns a new frame, so train / validation / test themselves are left untouched.
trainX, trainY = train.drop(columns=['price']), train['price']
validX, validY = validation.drop(columns=['price']), validation['price']
testX, testY = test.drop(columns=['price']), test['price']


## Try many quick and dirty models

### Linear Regression (with different regularizers)

In [17]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error


##Fit three linear models and report the MSE of each one on every split:
##  - normal (unregularised) linear regression
##  - linear regression with L1 (LASSO) regulariser
##  - linear regression with L2 (RIDGE) regulariser
reg = linear_model.LinearRegression()
lasso = linear_model.Lasso(alpha=0.1)
ridge = linear_model.Ridge(alpha=.5)

# NOTE(review): in the recorded run the unregularised model has a validation MSE of
# ~2.7e16 while its train/test MSE are ~0.005, and the ridge model does not show this.
# Presumably a numerically unstable fit (e.g. collinear or near-constant input
# columns) -- TODO confirm before relying on the plain LinearRegression numbers.

# One loop instead of three copy-pasted stanzas. The models are fitted and reported
# in the same order as before, and trainPredict / validPredict / testPredict end up
# holding the predictions of the last model (ridge), exactly as they did previously.
for label, model in (("Normal", reg), ("LASSO", lasso), ("RIDGE", ridge)):
    model.fit(trainX, trainY)
    trainPredict = model.predict(trainX)
    validPredict = model.predict(validX)
    testPredict = model.predict(testX)

    print("#####"+label+" Linear Regression######")
    print("Train MSE: "+str(mean_squared_error(trainY, trainPredict)))
    print("Validation MSE: "+str(mean_squared_error(validY, validPredict)))
    print("Test MSE: "+str(mean_squared_error(testY, testPredict)))
    print('\n')



#####Normal Linear Regression######
Train MSE: 0.004796721613369497
Validation MSE: 2.7195634889375144e+16
Test MSE: 0.00468055822057678


#####LASSO Linear Regression######
Train MSE: 0.01141967740157115
Validation MSE: 0.01138012881093592
Test MSE: 0.011502384297039293


#####RIDGE Linear Regression######
Train MSE: 0.004796191360930326
Validation MSE: 0.004755812989821801
Test MSE: 0.004680225184993724




In [5]:
##TODO: Naive Bayes (placeholder cell, not implemented yet)

In [6]:
##TODO: SVM / support vector regression (placeholder cell, not implemented yet)

In [20]:
##TODO: Neural net (placeholder cell, not implemented yet)

### Bagging (Random Forests)

In [18]:
from sklearn.ensemble import RandomForestRegressor

##Shallow random forest (trees limited to depth 2, seeded); report its MSE on every split.
randForestReg = RandomForestRegressor(max_depth=2, random_state=0)
randForestReg.fit(trainX, trainY)

trainPredict = randForestReg.predict(trainX)
validPredict = randForestReg.predict(validX)
testPredict = randForestReg.predict(testX)

print("#####Bagging (Random Forests) ######")
# Same three report lines as the other model cells, emitted from one loop.
for prefix, actual, predicted in (
    ("Train MSE: ", trainY, trainPredict),
    ("Validation MSE: ", validY, validPredict),
    ("Test MSE: ", testY, testPredict),
):
    print(prefix+str(mean_squared_error(actual, predicted)))
print('\n')


#####Bagging (Random Forests) ######
Train MSE: 0.006740794348262127
Validation MSE: 0.00675701825125138
Test MSE: 0.0067154156759958936




### Boosting (Adaboost and XGBoost)

In [23]:
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor

##Fit two boosted ensembles (AdaBoost, XGBoost) and report the MSE of each on every split.

# random_state is pinned for AdaBoost: the estimator uses randomness while boosting,
# so an unseeded instance gives different MSEs on every Restart & Run All.
# The seed value 0 matches the one used for the random forest cell.
ada = AdaBoostRegressor(random_state=0)
# XGBoost is left on its library defaults, as in the original cell.
xgboost = XGBRegressor()

# One loop instead of two copy-pasted stanzas. The models are fitted and reported in
# the same order as before, and trainPredict / validPredict / testPredict end up
# holding the XGBoost predictions, exactly as they did previously.
for label, model in (("Adaboost", ada), ("XGBoost", xgboost)):
    model.fit(trainX, trainY)
    trainPredict = model.predict(trainX)
    validPredict = model.predict(validX)
    testPredict = model.predict(testX)

    print("#####Boosting ("+label+") ######")
    print("Train MSE: "+str(mean_squared_error(trainY, trainPredict)))
    print("Validation MSE: "+str(mean_squared_error(validY, validPredict)))
    print("Test MSE: "+str(mean_squared_error(testY, testPredict)))
    print('\n')


#####Boosting (Adaboost) ######
Train MSE: 0.008977221213093786
Validation MSE: 0.008998580402742458
Test MSE: 0.008964786268583625






#####Boosting (XGBoost) ######
Train MSE: 0.0024938097377222683
Validation MSE: 0.0028280594495580795
Test MSE: 0.0027646131095732675


