In [1]:
import pandas as pd
import numpy as np

from scipy import stats

import plotly.graph_objects as go

In [2]:
data_audi = pd.read_csv('D:/Data/Car Price/audi.csv')
data_bmw = pd.read_csv('D:/Data/Car Price/bmw.csv')
data_ford = pd.read_csv('D:/Data/Car Price/ford.csv')
data_hyundi = pd.read_csv('D:/Data/Car Price/hyundi.csv').rename(columns= {'tax(£)': 'tax'})
data_merc = pd.read_csv('D:/Data/Car Price/merc.csv')
data_skoda = pd.read_csv('D:/Data/Car Price/skoda.csv')
data_toyota = pd.read_csv('D:/Data/Car Price/toyota.csv')

In [3]:
all_data = {'audi': data_audi, 'bmw': data_bmw, 'ford': data_ford, 'hyundi': data_hyundi, 'merc': data_merc, 'skoda': data_skoda, 'toyota': data_toyota}

In [4]:
# Add new feature "brand"
for key, dataset in all_data.items():
    dataset['brand'] = key

# Merge all dataframes into one
data_temp = pd.DataFrame(columns=data_audi.columns)

for key, df in all_data.items():
    data_temp = data_temp.append(df, ignore_index=True)

# Shuffle final dataframe
data = data_temp.sample(frac=1).reset_index(drop=True)

In [5]:
data

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,brand
0,A3,2013,8299,Manual,51535,Diesel,20,68.9,1.6,audi
1,Focus,2016,11995,Automatic,12320,Petrol,125,51.4,1.0,ford
2,Kodiaq,2019,26394,Automatic,3409,Petrol,145,32.5,1.5,skoda
3,Fiesta,2014,5999,Manual,59526,Petrol,0,65.7,1.0,ford
4,X5,2020,44875,Semi-Auto,2000,Diesel,145,36.7,3.0,bmw
...,...,...,...,...,...,...,...,...,...,...
70393,Aygo,2017,6798,Manual,13036,Petrol,0,69.0,1.0,toyota
70394,Fiesta,2007,1400,Manual,63000,Petrol,145,47.1,1.2,ford
70395,GLC Class,2019,44499,Semi-Auto,10000,Petrol,145,34.0,3.0,merc
70396,Kuga,2017,15995,Manual,15658,Petrol,160,45.6,1.5,ford


In [6]:
data.describe()

Unnamed: 0,mpg,engineSize
count,70398.0,70398.0
mean,56.174424,1.724401
std,17.421733,0.605706
min,1.1,0.0
25%,47.9,1.2
50%,56.5,1.6
75%,64.2,2.0
max,470.8,6.6


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70398 entries, 0 to 70397
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         70398 non-null  object 
 1   year          70398 non-null  object 
 2   price         70398 non-null  object 
 3   transmission  70398 non-null  object 
 4   mileage       70398 non-null  object 
 5   fuelType      70398 non-null  object 
 6   tax           70398 non-null  object 
 7   mpg           70398 non-null  float64
 8   engineSize    70398 non-null  float64
 9   brand         70398 non-null  object 
dtypes: float64(2), object(8)
memory usage: 5.4+ MB


# Data cleaning

## Outliers

In [8]:
fig = go.Figure()

fig.add_trace(go.Box(
    y = data['price'],
    marker_color='rgb(9,56,125)',
    name = 'Price'
))

fig.update_layout(
    title_text = 'Price box plot',
    template = 'plotly_dark'
)

There are many points that looks like outliers but for me it is completely normal that some cars are very expensive. There is no need to delete these records.

In [9]:
fig = go.Figure()

fig.add_trace(go.Box(
    y = data['mileage'],
    marker_color='rgb(128, 0, 0)',
    name = 'Mileage'
))

fig.update_layout(
    title_text = 'Mileage box plot',
    template = 'plotly_dark'
)

In [10]:
fig = go.Figure()

fig.add_trace(go.Box(
    y = data['tax'],
    marker_color='rgb(0, 153, 51)',
    name = 'Tax'
))

fig.update_layout(
    title_text = 'Tax box plot',
    template = 'plotly_dark'
)

In [11]:
fig = go.Figure()

fig.add_trace(go.Box(
    y = data['mpg'],
    marker_color='rgb(153, 92, 0)',
    name = 'Miles per Gallon'
))

fig.update_layout(
    title_text = 'Miles per Gallon box plot',
    template = 'plotly_dark'
)

Tax and mileage looks strange but it is still something I can believe in but outliers from mpg are completely unrealistic. I am going to delete car with 470 mpg from our dataset. 

In [12]:
data = data[data['mpg'] < 470]

In [13]:
fig = go.Figure()

fig.add_trace(go.Box(
    y = data['mpg'],
    marker_color='rgb(153, 92, 0)',
    name = 'Miles per Gallon'
))

fig.update_layout(
    title_text = 'Miles per Gallon box plot',
    template = 'plotly_dark'
)

## Columns usefulnes
We are going to replace very rare models with 'Rare' value to reduce number of new dimensions during dummy variables creation and to make dataset a little cleaner.

In [14]:
to_replace = data['model'].value_counts()[data['model'].value_counts().values < 10]
to_replace

 IQ                 8
 M6                 8
 SQ7                8
 Z3                 7
 CLK                7
 Getz               6
 S8                 4
 Urban Cruiser      4
 Verso-S            3
 S5                 3
 Veloster           3
 CLC Class          3
 Terracan           2
 R Class            2
 Streetka           2
200                 1
 Accent             1
 Amica              1
230                 1
 RS7                1
 Transit Tourneo    1
180                 1
 Ranger             1
 Escort             1
 A2                 1
220                 1
Name: model, dtype: int64

In [15]:
indexes = data.index[data['model'].isin(to_replace.index)]
data.loc[indexes, 'model'] = ' Rare'



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



# Single variable analysis

In [16]:
fig = go.Figure()

to_plot = data['year'].value_counts()

fig.add_trace(go.Bar(
    x = to_plot.index,
    y = to_plot.values
))

fig.update_layout(
    title_text = 'Number of cars from every year',
    template = 'plotly_dark',
    xaxis_title = 'Year',
    yaxis_title = 'Number of cars'
)

In [17]:
fig = go.Figure()

fig.add_trace(go.Histogram(
    x = data['mileage'],
    nbinsx = 170
))

fig.update_layout(
    title_text = 'Distribution of mileage feature',
    template = 'plotly_dark',
    xaxis_title = 'Mileage'
)

In [18]:
fig = go.Figure()

fig.add_trace(go.Histogram(
    x = data['price'],
    nbinsx = 170
))

fig.update_layout(
    title_text = 'Distribution of price feature',
    template = 'plotly_dark',
    xaxis_title = 'Price'
)

In [19]:
fig = go.Figure()

night_colors = ['rgb(56, 75, 126)', 'rgb(18, 36, 37)', 'rgb(34, 53, 101)',
                'rgb(36, 55, 57)', 'rgb(6, 4, 4)']

to_plot = data['transmission'].value_counts()

fig.add_trace(go.Pie(
    values = to_plot.values,
    labels = to_plot.index,
    textinfo = 'label+percent',
    marker_colors = night_colors
))

fig.update_layout(
    title_text = 'Transmission Pie Plot',
    template = 'plotly_dark'
)

In [20]:
fig = go.Figure()

to_plot = data['brand'].value_counts()

fig.add_trace(go.Pie(
    values = to_plot.values,
    labels = to_plot.index,
    textinfo = 'label+percent'
))

fig.update_layout(
    title_text = 'Brand Pie Plot',
    template = 'plotly_dark'
)

In [21]:
fig = go.Figure()

to_plot = data['fuelType'].value_counts()

fig.add_trace(go.Bar(
    y = to_plot.index,
    x = to_plot.values,
    orientation = 'h',
    text = to_plot.values,
    textposition = 'outside',
))

fig.update_layout(
    autosize = False,
    width = 850,
    template = 'plotly_dark',
    title_text = 'Fuel Type Bar Plot'
)

# Multiple variables analysis

In [22]:
data

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,brand
0,A3,2013,8299,Manual,51535,Diesel,20,68.9,1.6,audi
1,Focus,2016,11995,Automatic,12320,Petrol,125,51.4,1.0,ford
2,Kodiaq,2019,26394,Automatic,3409,Petrol,145,32.5,1.5,skoda
3,Fiesta,2014,5999,Manual,59526,Petrol,0,65.7,1.0,ford
4,X5,2020,44875,Semi-Auto,2000,Diesel,145,36.7,3.0,bmw
...,...,...,...,...,...,...,...,...,...,...
70393,Aygo,2017,6798,Manual,13036,Petrol,0,69.0,1.0,toyota
70394,Fiesta,2007,1400,Manual,63000,Petrol,145,47.1,1.2,ford
70395,GLC Class,2019,44499,Semi-Auto,10000,Petrol,145,34.0,3.0,merc
70396,Kuga,2017,15995,Manual,15658,Petrol,160,45.6,1.5,ford


### Pearson
<b>The Pearson correlation coefficient<b> is a measure of the linear relationship between two features. It's the ratio of the covariance of x and y to the product of their standard deviations. It's often denoted with the letter r and called Pearson's r.

### Spearman
The Spearman correlation coefficient between two features is the Pearson correlation coefficient between their rank values. It's calculated the same ways as the Pearson correlation coefficient but takes into account their ranks instead of their values.

### Kendall
The Kendall correlation coefficient compares the number of concordant and discordant pairs of data. This coefficient is basen on the differce in the counts of concordant and discordant pairs relative to the number of x-y paris.

concordant if either (xᵢ > xⱼ and yᵢ > yⱼ) or (xᵢ < xⱼ and yᵢ < yⱼ) <br>
discordant if either (xᵢ < xⱼ and yᵢ > yⱼ) or (xᵢ > xⱼ and yᵢ < yⱼ) <br>
neither if there’s a tie in x (xᵢ = xⱼ) or a tie in y (yᵢ = yⱼ)

In [23]:
def correlation(x, y):
    """
    x, y - two lists of features
    This function prints Pearson, Spearman
    """

    # Pearson's r
    pearson_r = np.corrcoef(x, y)[0, 1]

    # Spearman's rho
    spearman_rho = stats.spearmanr(x, y).correlation

    # Kendall's tau
    kendall_tau = stats.kendalltau(x, y).correlation

    print('Pearson correlation coefficient: {}'.format(pearson_r))
    print('Spearman correlation coefficient: {}'.format(spearman_rho))
    print('Kendall correlation coefficient: {}'.format(kendall_tau))

In [24]:
fig = go.Figure()

to_plot = data.sample(frac=0.10, random_state=42)

fig.add_trace(go.Scatter(
    x = to_plot['price'],
    y = to_plot['mileage'],
    mode = 'markers'
))

fig.update_layout(
    title_text = 'Mileage x Price Scatter Plot',
    template = 'plotly_dark',
    xaxis_title = 'Price',
    yaxis_title = 'Mileage'
)

In [25]:
correlation(data['price'].tolist(), data['mileage'].tolist())

Pearson correlation coefficient: -0.4234689402174707
Spearman correlation coefficient: -0.5195038477197083
Kendall correlation coefficient: -0.35982127907941397


In [26]:
fig = go.Figure()

to_plot = data.sample(frac=0.10, random_state=42)

fig.add_trace(go.Scatter(
    x = to_plot['price'],
    y = to_plot['mpg'],
    mode = 'markers'
))

fig.update_layout(
    title_text = 'Miles per gallon',
    template = 'plotly_dark',
    xaxis_title = 'Price',
    yaxis_title = 'Miles per gallon'
)

In [27]:
correlation(data['price'].tolist(), data['mpg'].tolist())

Pearson correlation coefficient: -0.39987651646836303
Spearman correlation coefficient: -0.45925542348452225
Kendall correlation coefficient: -0.32218310875801026


In [28]:
data_price_float = data.copy()
data_price_float['price'] = data_price_float['price'].map(lambda x: float(x))

In [29]:
fig = go.Figure()

to_plot = data_price_float.groupby(by='transmission', as_index=False).mean()

fig.add_trace(go.Bar(
    y = to_plot['transmission'],
    x = to_plot['price'],
    orientation='h'
))

fig.update_layout(
    title_text = 'Price by transmission type',
    template = 'plotly_dark'
)

In [30]:
fig = go.Figure()

to_plot = data_price_float.groupby(by='fuelType', as_index=False).mean()

fig.add_trace(go.Bar(
    y = to_plot['fuelType'],
    x = to_plot['price'],
    orientation='h'
))

fig.update_layout(
    title_text = 'Price by fuel type',
    template = 'plotly_dark'
)

In [31]:
fig = go.Figure()

to_plot = data_price_float.groupby(by='brand', as_index=False).mean()

fig.add_trace(go.Bar(
    y = to_plot['brand'],
    x = to_plot['price'],
    orientation='h'
))

fig.update_layout(
    title_text = 'Price by brand',
    template = 'plotly_dark'
)

# Machine Learning

In [39]:
X = data.drop(['price'], axis=1)
y = data['price']

## Dummy variables

In [40]:
# Create dummy variables
columns = ['model', 'transmission', 'fuelType', 'brand']

dummy = pd.get_dummies(X[columns], dtype=np.int32)
X_dummy = pd.concat([X, dummy], axis=1)
X_dummy.drop(columns, axis=1, inplace=True)

In [41]:
X_dummy

Unnamed: 0,year,mileage,tax,mpg,engineSize,model_ 1 Series,model_ 2 Series,model_ 3 Series,model_ 4 Series,model_ 5 Series,...,fuelType_Hybrid,fuelType_Other,fuelType_Petrol,brand_audi,brand_bmw,brand_ford,brand_hyundi,brand_merc,brand_skoda,brand_toyota
0,2013,51535,20,68.9,1.6,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,2016,12320,125,51.4,1.0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
2,2019,3409,145,32.5,1.5,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
3,2014,59526,0,65.7,1.0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
4,2020,2000,145,36.7,3.0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70393,2017,13036,0,69.0,1.0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
70394,2007,63000,145,47.1,1.2,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
70395,2019,10000,145,34.0,3.0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
70396,2017,15658,160,45.6,1.5,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0


## Normalization