In [40]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler

In [41]:
data = pd.read_csv('autos.csv')

In [42]:
def outlier(value):
    Q1 = np.percentile(data[value], 25)
    Q3 = np.percentile(data[value], 75)
    IQR = Q3 - Q1

    lower = Q1 - 1.5*IQR
    upper = Q3 + 1.5*IQR

    data[data[value] > upper] = np.nan
    data[data[value] < lower] = np.nan

In [43]:
for i in data.select_dtypes(exclude=object).columns:
    outlier(i)

In [44]:
data.select_dtypes(exclude=object).fillna(data.select_dtypes(exclude=object).mean(), inplace=True)

In [45]:
data.dropna(inplace=True)

In [46]:
data['notRepairedDamage'] = pd.factorize(data['notRepairedDamage'])[0]
data['seller'] = pd.factorize(data['seller'])[0]
data['offerType'] = pd.factorize(data['offerType'])[0]
data['abtest'] = pd.factorize(data['abtest'])[0]
data['gearbox'] = pd.factorize(data['gearbox'])[0]

In [47]:
# assigning dummy variables to categorical variables
data_dummy = pd.get_dummies(data, prefix='vehicleType', prefix_sep='.', 
                            columns=['vehicleType'])
data_dummy = pd.get_dummies(data_dummy, prefix='fuelType', prefix_sep='.', 
                            columns=['fuelType'], drop_first=True)
data_dummy = pd.get_dummies(data_dummy, prefix='brand', prefix_sep='.', 
                            columns=['brand'], drop_first=True)
data_dummy = pd.get_dummies(data_dummy, prefix='model', prefix_sep='.', 
                            columns=['model'], drop_first=True)

data_dummy['yearOfRegistration'] = data_dummy['yearOfRegistration'].astype(int)

data_dummy['year_range'] = pd.cut(data_dummy['yearOfRegistration'], bins=range(data_dummy['yearOfRegistration'].min(), data_dummy['yearOfRegistration'].max()+6, 5), right=False)
data_dummy = pd.get_dummies(data_dummy, prefix='year_range', prefix_sep='.', columns=['year_range'], drop_first=True)

# dropping unnecessary columns
data_dummy.drop(columns=['index', 'name','dateCrawled', 'dateCreated', 'nrOfPictures', 'lastSeen', 'monthOfRegistration', 'postalCode','yearOfRegistration'], inplace=True)

In [48]:
# label encoding
# le = dict()
# cols = data_dummy.select_dtypes(object)
# for i in cols:
#     temp = LabelEncoder()
#     data_dummy[i] = temp.fit_transform(data_dummy[i])
#     le.update({i:temp})

In [49]:
data_dummy

Unnamed: 0,seller,offerType,price,abtest,gearbox,powerPS,kilometer,notRepairedDamage,vehicleType.andere,vehicleType.bus,...,"year_range.[1970, 1975)","year_range.[1975, 1980)","year_range.[1980, 1985)","year_range.[1985, 1990)","year_range.[1990, 1995)","year_range.[1995, 2000)","year_range.[2000, 2005)","year_range.[2005, 2010)","year_range.[2010, 2015)","year_range.[2015, 2020)"
3,0,0,1500.0,0,0,75.0,150000.0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,3600.0,0,0,69.0,90000.0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,0,0,650.0,0,0,102.0,150000.0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
6,0,0,2200.0,0,0,109.0,150000.0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,0,0,0.0,0,0,50.0,40000.0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371518,0,0,3999.0,0,0,3.0,150000.0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
371520,0,0,3200.0,1,0,225.0,150000.0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
371521,0,0,1150.0,1,0,0.0,150000.0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
371524,0,0,1199.0,0,1,101.0,125000.0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [50]:
data_dummy.columns

Index(['seller', 'offerType', 'price', 'abtest', 'gearbox', 'powerPS',
       'kilometer', 'notRepairedDamage', 'vehicleType.andere',
       'vehicleType.bus',
       ...
       'year_range.[1970, 1975)', 'year_range.[1975, 1980)',
       'year_range.[1980, 1985)', 'year_range.[1985, 1990)',
       'year_range.[1990, 1995)', 'year_range.[1995, 2000)',
       'year_range.[2000, 2005)', 'year_range.[2005, 2010)',
       'year_range.[2010, 2015)', 'year_range.[2015, 2020)'],
      dtype='object', length=329)

In [51]:
scaler = StandardScaler()
data_dummy['powerPS'] = scaler.fit_transform(data_dummy[['powerPS']])
data_dummy['kilometer'] = scaler.fit_transform(data_dummy[['kilometer']])

In [52]:
data_dummy

Unnamed: 0,seller,offerType,price,abtest,gearbox,powerPS,kilometer,notRepairedDamage,vehicleType.andere,vehicleType.bus,...,"year_range.[1970, 1975)","year_range.[1975, 1980)","year_range.[1980, 1985)","year_range.[1985, 1990)","year_range.[1990, 1995)","year_range.[1995, 2000)","year_range.[2000, 2005)","year_range.[2005, 2010)","year_range.[2010, 2015)","year_range.[2015, 2020)"
3,0,0,1500.0,0,0,-0.300689,0.590703,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,3600.0,0,0,-0.343274,-1.085768,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,0,0,650.0,0,0,-0.109056,0.590703,1,0,0,...,0,0,0,0,0,1,0,0,0,0
6,0,0,2200.0,0,0,-0.059373,0.590703,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,0,0,0.0,0,0,-0.478126,-2.482827,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371518,0,0,3999.0,0,0,-0.811710,0.590703,0,0,0,...,0,0,0,0,0,0,0,1,0,0
371520,0,0,3200.0,1,0,0.763938,0.590703,1,0,0,...,0,0,0,0,0,0,1,0,0,0
371521,0,0,1150.0,1,0,-0.833002,0.590703,0,0,1,...,0,0,0,0,0,0,1,0,0,0
371524,0,0,1199.0,0,1,-0.116153,-0.107827,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [53]:
data_dummy.to_csv('cleaned_used_cars_v3.csv', index=False)

In [54]:
data_dummy.columns

Index(['seller', 'offerType', 'price', 'abtest', 'gearbox', 'powerPS',
       'kilometer', 'notRepairedDamage', 'vehicleType.andere',
       'vehicleType.bus',
       ...
       'year_range.[1970, 1975)', 'year_range.[1975, 1980)',
       'year_range.[1980, 1985)', 'year_range.[1985, 1990)',
       'year_range.[1990, 1995)', 'year_range.[1995, 2000)',
       'year_range.[2000, 2005)', 'year_range.[2005, 2010)',
       'year_range.[2010, 2015)', 'year_range.[2015, 2020)'],
      dtype='object', length=329)

In [55]:
temp = abs(data_dummy.corr()['price']).sort_values(ascending=False)
temp

price                      1.000000
year_range.[2010, 2015)    0.472405
year_range.[1995, 2000)    0.397450
kilometer                  0.357906
year_range.[2005, 2010)    0.352283
                             ...   
model.s_type               0.000518
model.200                  0.000472
seller                     0.000435
model.cherokee             0.000410
model.forester             0.000178
Name: price, Length: 329, dtype: float64