In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler

In [2]:
data = pd.read_csv('autos.csv')

In [3]:
def outlier(value):
    Q1 = np.percentile(data[value], 25)
    Q3 = np.percentile(data[value], 75)
    IQR = Q3 - Q1

    lower = Q1 - 1.5*IQR
    upper = Q3 + 1.5*IQR

    data[data[value] > upper] = np.nan
    data[data[value] < lower] = np.nan

In [4]:
for i in ['yearOfRegistration', 'powerPS']:
    outlier(i)

In [5]:
from scipy.stats.mstats import winsorize

# # apply Winsorization with 10% trimming on both ends
data['price'] = winsorize(data['price'], limits=[0.1, 0.1])

In [6]:
data.select_dtypes(exclude=object).fillna(data.select_dtypes(exclude=object).mean(), inplace=True)

In [7]:
data.dropna(inplace=True)

In [8]:
data['notRepairedDamage'] = pd.factorize(data['notRepairedDamage'])[0]
data['seller'] = pd.factorize(data['seller'])[0]
data['offerType'] = pd.factorize(data['offerType'])[0]
data['abtest'] = pd.factorize(data['abtest'])[0]
data['gearbox'] = pd.factorize(data['gearbox'])[0]

In [9]:
# assigning dummy variables to categorical variables
data_dummy = pd.get_dummies(data, prefix='vehicleType', prefix_sep='.', 
                            columns=['vehicleType'])
data_dummy = pd.get_dummies(data_dummy, prefix='fuelType', prefix_sep='.', 
                            columns=['fuelType'], drop_first=True)
data_dummy = pd.get_dummies(data_dummy, prefix='brand', prefix_sep='.', 
                            columns=['brand'], drop_first=True)
data_dummy = pd.get_dummies(data_dummy, prefix='model', prefix_sep='.', 
                            columns=['model'], drop_first=True)

data_dummy['yearOfRegistration'] = data_dummy['yearOfRegistration'].astype(int)

data_dummy['year_range'] = pd.cut(data_dummy['yearOfRegistration'], bins=range(data_dummy['yearOfRegistration'].min(), data_dummy['yearOfRegistration'].max()+6, 5), right=False)
data_dummy = pd.get_dummies(data_dummy, prefix='year_range', prefix_sep='.', columns=['year_range'], drop_first=True)

# dropping unnecessary columns
data_dummy.drop(columns=['index', 'name','dateCrawled', 'dateCreated', 'nrOfPictures', 'lastSeen', 'monthOfRegistration', 'postalCode','yearOfRegistration'], inplace=True)

In [10]:
# label encoding
# le = dict()
# cols = data_dummy.select_dtypes(object)
# for i in cols:
#     temp = LabelEncoder()
#     data_dummy[i] = temp.fit_transform(data_dummy[i])
#     le.update({i:temp})

In [11]:
data_dummy

Unnamed: 0,seller,offerType,price,abtest,gearbox,powerPS,kilometer,notRepairedDamage,vehicleType.andere,vehicleType.bus,...,model.yeti,model.ypsilon,model.z_reihe,model.zafira,"year_range.[1991, 1996)","year_range.[1996, 2001)","year_range.[2001, 2006)","year_range.[2006, 2011)","year_range.[2011, 2016)","year_range.[2016, 2021)"
3,0,0,1500.0,0,0,75.0,150000.0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,3600.0,0,0,69.0,90000.0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,0,0,650.0,0,0,102.0,150000.0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
6,0,0,2200.0,0,0,109.0,150000.0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
10,0,0,2000.0,1,0,105.0,150000.0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371520,0,0,3200.0,1,0,225.0,150000.0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
371521,0,0,1150.0,1,0,0.0,150000.0,0,0,1,...,0,0,0,1,0,1,0,0,0,0
371524,0,0,1199.0,0,1,101.0,125000.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
371525,0,0,9200.0,0,0,102.0,150000.0,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [12]:
data_dummy.columns

Index(['seller', 'offerType', 'price', 'abtest', 'gearbox', 'powerPS',
       'kilometer', 'notRepairedDamage', 'vehicleType.andere',
       'vehicleType.bus',
       ...
       'model.yeti', 'model.ypsilon', 'model.z_reihe', 'model.zafira',
       'year_range.[1991, 1996)', 'year_range.[1996, 2001)',
       'year_range.[2001, 2006)', 'year_range.[2006, 2011)',
       'year_range.[2011, 2016)', 'year_range.[2016, 2021)'],
      dtype='object', length=313)

In [13]:
scaler = StandardScaler()
data_dummy['powerPS'] = scaler.fit_transform(data_dummy[['powerPS']])
data_dummy['kilometer'] = scaler.fit_transform(data_dummy[['kilometer']])

In [14]:
data_dummy

Unnamed: 0,seller,offerType,price,abtest,gearbox,powerPS,kilometer,notRepairedDamage,vehicleType.andere,vehicleType.bus,...,model.yeti,model.ypsilon,model.z_reihe,model.zafira,"year_range.[1991, 1996)","year_range.[1996, 2001)","year_range.[2001, 2006)","year_range.[2006, 2011)","year_range.[2011, 2016)","year_range.[2016, 2021)"
3,0,0,1500.0,0,0,-0.353911,0.647340,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,3600.0,0,0,-0.395023,-0.867131,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,0,0,650.0,0,0,-0.168908,0.647340,1,0,0,...,0,0,0,0,1,0,0,0,0,0
6,0,0,2200.0,0,0,-0.120944,0.647340,0,0,0,...,0,0,0,0,0,0,1,0,0,0
10,0,0,2000.0,1,0,-0.148352,0.647340,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371520,0,0,3200.0,1,0,0.673887,0.647340,1,0,0,...,0,0,0,0,0,0,1,0,0,0
371521,0,0,1150.0,1,0,-0.867810,0.647340,0,0,1,...,0,0,0,1,0,1,0,0,0,0
371524,0,0,1199.0,0,1,-0.175760,0.016310,0,0,0,...,0,0,0,0,0,1,0,0,0,0
371525,0,0,9200.0,0,0,-0.168908,0.647340,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [15]:
data_dummy.to_csv('cleaned_used_cars_v3.csv', index=False)

In [16]:
data_dummy.columns

Index(['seller', 'offerType', 'price', 'abtest', 'gearbox', 'powerPS',
       'kilometer', 'notRepairedDamage', 'vehicleType.andere',
       'vehicleType.bus',
       ...
       'model.yeti', 'model.ypsilon', 'model.z_reihe', 'model.zafira',
       'year_range.[1991, 1996)', 'year_range.[1996, 2001)',
       'year_range.[2001, 2006)', 'year_range.[2006, 2011)',
       'year_range.[2011, 2016)', 'year_range.[2016, 2021)'],
      dtype='object', length=313)

In [17]:
temp = abs(data_dummy.corr()['price']).sort_values(ascending=False)
temp

price                      1.000000
year_range.[2011, 2016)    0.544789
kilometer                  0.478344
year_range.[1996, 2001)    0.426831
year_range.[2006, 2011)    0.344143
                             ...   
model.forester             0.000609
model.mx_reihe             0.000594
model.jetta                0.000472
model.i3                   0.000382
model.lancer               0.000067
Name: price, Length: 313, dtype: float64