In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [3]:
data = pd.read_csv('autos.csv')

In [4]:
def outlier(value):
    Q1 = np.percentile(data[value], 25)
    Q3 = np.percentile(data[value], 75)
    IQR = Q3 - Q1

    lower = Q1 - 1.5*IQR
    upper = Q3 + 1.5*IQR

    data[data[value] > upper] = np.nan
    data[data[value] < lower] = np.nan

In [5]:
for i in data.select_dtypes(exclude=object).columns:
    outlier(i)

In [6]:
data.select_dtypes(exclude=object).fillna(data.select_dtypes(exclude=object).mean(), inplace=True)

In [7]:
data.dropna(inplace=True)

In [8]:
data['notRepairedDamage'] = pd.factorize(data['notRepairedDamage'])[0]
data['seller'] = pd.factorize(data['seller'])[0]
data['offerType'] = pd.factorize(data['offerType'])[0]
data['abtest'] = pd.factorize(data['abtest'])[0]
data['gearbox'] = pd.factorize(data['gearbox'])[0]

In [9]:
# assigning dummy variables to categorical variables
data_dummy = pd.get_dummies(data, prefix='vehicleType', prefix_sep='.', 
                            columns=['vehicleType'])
data_dummy = pd.get_dummies(data_dummy, prefix='fuelType', prefix_sep='.', 
                            columns=['fuelType'], drop_first=True)

# dropping unnecessary columns
data_dummy.drop(columns=['index', 'name','dateCrawled', 'dateCreated', 'nrOfPictures', 'lastSeen', 'monthOfRegistration', 'postalCode'], inplace=True)

In [10]:
# label encoding
le = dict()
cols = data_dummy.select_dtypes(object)
for i in cols:
    temp = LabelEncoder()
    data_dummy[i] = temp.fit_transform(data_dummy[i])
    le.update({i:temp})

In [11]:
temp = abs(data_dummy.corr()['price']).sort_values(ascending=False)

In [12]:
data_dummy

Unnamed: 0,seller,offerType,price,abtest,yearOfRegistration,gearbox,powerPS,model,kilometer,brand,...,vehicleType.kleinwagen,vehicleType.kombi,vehicleType.limousine,vehicleType.suv,fuelType.benzin,fuelType.cng,fuelType.diesel,fuelType.elektro,fuelType.hybrid,fuelType.lpg
3,0,0,1500.0,0,2001.0,0,75.0,117,150000.0,37,...,1,0,0,0,1,0,0,0,0,0
4,0,0,3600.0,0,2008.0,0,69.0,102,90000.0,31,...,1,0,0,0,0,0,1,0,0,0
5,0,0,650.0,0,1995.0,0,102.0,11,150000.0,2,...,0,0,1,0,1,0,0,0,0,0
6,0,0,2200.0,0,2004.0,0,109.0,8,150000.0,25,...,0,0,0,0,1,0,0,0,0,0
7,0,0,0.0,0,1980.0,0,50.0,40,40000.0,37,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371518,0,0,3999.0,0,2005.0,0,3.0,11,150000.0,2,...,0,1,0,0,0,0,1,0,0,0
371520,0,0,3200.0,1,2004.0,0,225.0,141,150000.0,30,...,0,0,1,0,1,0,0,0,0,0
371521,0,0,1150.0,1,2000.0,0,0.0,248,150000.0,24,...,0,0,0,0,1,0,0,0,0,0
371524,0,0,1199.0,0,2000.0,1,101.0,107,125000.0,32,...,0,0,0,0,1,0,0,0,0,0


In [13]:
data_dummy.to_csv('cleaned_used_cars_v3.csv', index=False)

In [14]:
data_dummy.columns

Index(['seller', 'offerType', 'price', 'abtest', 'yearOfRegistration',
       'gearbox', 'powerPS', 'model', 'kilometer', 'brand',
       'notRepairedDamage', 'vehicleType.andere', 'vehicleType.bus',
       'vehicleType.cabrio', 'vehicleType.coupe', 'vehicleType.kleinwagen',
       'vehicleType.kombi', 'vehicleType.limousine', 'vehicleType.suv',
       'fuelType.benzin', 'fuelType.cng', 'fuelType.diesel',
       'fuelType.elektro', 'fuelType.hybrid', 'fuelType.lpg'],
      dtype='object')

In [15]:
temp

price                     1.000000
yearOfRegistration        0.553657
kilometer                 0.357906
fuelType.diesel           0.283121
fuelType.benzin           0.275701
vehicleType.kleinwagen    0.235073
notRepairedDamage         0.234081
gearbox                   0.219876
powerPS                   0.180794
vehicleType.suv           0.166247
vehicleType.cabrio        0.116403
brand                     0.103085
vehicleType.bus           0.057991
vehicleType.coupe         0.046505
vehicleType.kombi         0.037942
fuelType.hybrid           0.028711
vehicleType.andere        0.021853
model                     0.020548
fuelType.lpg              0.016585
fuelType.elektro          0.007490
vehicleType.limousine     0.004486
offerType                 0.004333
abtest                    0.002464
fuelType.cng              0.000990
seller                    0.000435
Name: price, dtype: float64