In [1]:
import pandas as pd
import numpy as np

from scipy import stats

import plotly.graph_objects as go

In [2]:
data_audi = pd.read_csv('D:/Data/Car Price/audi.csv')
data_bmw = pd.read_csv('D:/Data/Car Price/bmw.csv')
data_ford = pd.read_csv('D:/Data/Car Price/ford.csv')
data_hyundi = pd.read_csv('D:/Data/Car Price/hyundi.csv').rename(columns= {'tax(£)': 'tax'})
data_merc = pd.read_csv('D:/Data/Car Price/merc.csv')
data_skoda = pd.read_csv('D:/Data/Car Price/skoda.csv')
data_toyota = pd.read_csv('D:/Data/Car Price/toyota.csv')

In [3]:
all_data = {'audi': data_audi, 'bmw': data_bmw, 'ford': data_ford, 'hyundi': data_hyundi, 'merc': data_merc, 'skoda': data_skoda, 'toyota': data_toyota}

In [4]:
# Add new feature "brand"
for key, dataset in all_data.items():
    dataset['brand'] = key

# Merge all dataframes into one
data_temp = pd.DataFrame(columns=data_audi.columns)

for key, df in all_data.items():
    data_temp = data_temp.append(df, ignore_index=True)

# Shuffle final dataframe
data = data_temp.sample(frac=1).reset_index(drop=True)

In [5]:
data

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,brand
0,GLA Class,2016,18400,Automatic,49000,Petrol,200,43.5,2.0,merc
1,A6,2015,18495,Semi-Auto,46482,Diesel,200,44.8,3.0,audi
2,3 Series,2018,20498,Semi-Auto,19620,Petrol,145,48.7,2.0,bmw
3,Yaris,2013,5791,Manual,45000,Petrol,125,51.4,1.3,toyota
4,Octavia,2017,11900,Manual,47939,Petrol,30,55.4,1.4,skoda
...,...,...,...,...,...,...,...,...,...,...
70393,C Class,2017,37199,Semi-Auto,5376,Petrol,145,35.3,3.0,merc
70394,1 Series,2017,15490,Automatic,26000,Diesel,125,61.4,2.0,bmw
70395,Fiesta,2013,6140,Manual,35110,Petrol,30,54.3,1.2,ford
70396,Q5,2014,23995,Semi-Auto,35025,Diesel,260,41.5,3.0,audi


In [29]:
data.describe()

Unnamed: 0,mpg,engineSize
count,70398.0,70398.0
mean,56.174424,1.724401
std,17.421733,0.605706
min,1.1,0.0
25%,47.9,1.2
50%,56.5,1.6
75%,64.2,2.0
max,470.8,6.6


In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70398 entries, 0 to 70397
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         70398 non-null  object 
 1   year          70398 non-null  object 
 2   price         70398 non-null  object 
 3   transmission  70398 non-null  object 
 4   mileage       70398 non-null  object 
 5   fuelType      70398 non-null  object 
 6   tax           70398 non-null  object 
 7   mpg           70398 non-null  float64
 8   engineSize    70398 non-null  float64
 9   brand         70398 non-null  object 
dtypes: float64(2), object(8)
memory usage: 5.4+ MB


# Data cleaning

## Outliers

In [51]:
fig = go.Figure()

fig.add_trace(go.Box(
    y = data['price'],
    marker_color='rgb(9,56,125)',
    name = 'Price'
))

fig.update_layout(
    title_text = 'Price box plot',
    template = 'plotly_dark'
)

There are many points that looks like outliers but for me it is completely normal that some cars are very expensive. There is no need to delete these records.

In [54]:
fig = go.Figure()

fig.add_trace(go.Box(
    y = data['mileage'],
    marker_color='rgb(128, 0, 0)',
    name = 'Mileage'
))

fig.update_layout(
    title_text = 'Mileage box plot',
    template = 'plotly_dark'
)

In [56]:
fig = go.Figure()

fig.add_trace(go.Box(
    y = data['tax'],
    marker_color='rgb(0, 153, 51)',
    name = 'Tax'
))

fig.update_layout(
    title_text = 'Tax box plot',
    template = 'plotly_dark'
)

Tax and mileage looks strange but it is still something I can believe in

In [58]:
fig = go.Figure()

fig.add_trace(go.Box(
    y = data['mpg'],
    marker_color='rgb(153, 92, 0)',
    name = 'Miles per Gallon'
))

fig.update_layout(
    title_text = 'Miles per Gallon box plot',
    template = 'plotly_dark'
)

Tax and mileage looks strange but it is still something I can believe in but these outliers are completely unrealistic. I am going to delete this car with 470 mpg value. 

In [59]:
data = data[data['mpg'] < 470]

In [60]:
fig = go.Figure()

fig.add_trace(go.Box(
    y = data['mpg'],
    marker_color='rgb(153, 92, 0)',
    name = 'Miles per Gallon'
))

fig.update_layout(
    title_text = 'Miles per Gallon box plot',
    template = 'plotly_dark'
)

# Single variable analysis

In [6]:
fig = go.Figure()

to_plot = data['year'].value_counts()

fig.add_trace(go.Bar(
    x = to_plot.index,
    y = to_plot.values
))

fig.update_layout(
    title_text = 'Number of cars from every year',
    template = 'plotly_dark',
    xaxis_title = 'Year',
    yaxis_title = 'Number of cars'
)

In [7]:
fig = go.Figure()

fig.add_trace(go.Histogram(
    x = data['mileage'],
    nbinsx = 170
))

fig.update_layout(
    title_text = 'Distribution of mileage feature',
    template = 'plotly_dark',
    xaxis_title = 'Mileage'
)

In [8]:
fig = go.Figure()

fig.add_trace(go.Histogram(
    x = data['price'],
    nbinsx = 170
))

fig.update_layout(
    title_text = 'Distribution of price feature',
    template = 'plotly_dark',
    xaxis_title = 'Price'
)

In [9]:
fig = go.Figure()

night_colors = ['rgb(56, 75, 126)', 'rgb(18, 36, 37)', 'rgb(34, 53, 101)',
                'rgb(36, 55, 57)', 'rgb(6, 4, 4)']

to_plot = data['transmission'].value_counts()

fig.add_trace(go.Pie(
    values = to_plot.values,
    labels = to_plot.index,
    textinfo = 'label+percent',
    marker_colors = night_colors
))

fig.update_layout(
    title_text = 'Transmission Pie Plot',
    template = 'plotly_dark'
)

In [10]:
fig = go.Figure()

to_plot = data['brand'].value_counts()

fig.add_trace(go.Pie(
    values = to_plot.values,
    labels = to_plot.index,
    textinfo = 'label+percent'
))

fig.update_layout(
    title_text = 'Brand Pie Plot',
    template = 'plotly_dark'
)

In [11]:
fig = go.Figure()

to_plot = data['fuelType'].value_counts()

fig.add_trace(go.Bar(
    y = to_plot.index,
    x = to_plot.values,
    orientation = 'h',
    text = to_plot.values,
    textposition = 'outside',
))

fig.update_layout(
    autosize = False,
    width = 850,
    template = 'plotly_dark',
    title_text = 'Fuel Type Bar Plot'
)

# Multiple variables analysis

In [12]:
data

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,brand
0,GLA Class,2016,18400,Automatic,49000,Petrol,200,43.5,2.0,merc
1,A6,2015,18495,Semi-Auto,46482,Diesel,200,44.8,3.0,audi
2,3 Series,2018,20498,Semi-Auto,19620,Petrol,145,48.7,2.0,bmw
3,Yaris,2013,5791,Manual,45000,Petrol,125,51.4,1.3,toyota
4,Octavia,2017,11900,Manual,47939,Petrol,30,55.4,1.4,skoda
...,...,...,...,...,...,...,...,...,...,...
70393,C Class,2017,37199,Semi-Auto,5376,Petrol,145,35.3,3.0,merc
70394,1 Series,2017,15490,Automatic,26000,Diesel,125,61.4,2.0,bmw
70395,Fiesta,2013,6140,Manual,35110,Petrol,30,54.3,1.2,ford
70396,Q5,2014,23995,Semi-Auto,35025,Diesel,260,41.5,3.0,audi


### Pearson
<b>The Pearson correlation coefficient<b> is a measure of the linear relationship between two features. It's the ratio of the covariance of x and y to the product of their standard deviations. It's often denoted with the letter r and called Pearson's r.

### Spearman
The Spearman correlation coefficient between two features is the Pearson correlation coefficient between their rank values. It's calculated the same ways as the Pearson correlation coefficient but takes into account their ranks instead of their values.

### Kendall
The Kendall correlation coefficient compares the number of concordant and discordant pairs of data. This coefficient is basen on the differce in the counts of concordant and discordant pairs relative to the number of x-y paris.

concordant if either (xᵢ > xⱼ and yᵢ > yⱼ) or (xᵢ < xⱼ and yᵢ < yⱼ) <br>
discordant if either (xᵢ < xⱼ and yᵢ > yⱼ) or (xᵢ > xⱼ and yᵢ < yⱼ) <br>
neither if there’s a tie in x (xᵢ = xⱼ) or a tie in y (yᵢ = yⱼ)

In [13]:
def correlation(x, y):
    """
    x, y - two lists of features
    This function prints Pearson, Spearman
    """

    # Pearson's r
    pearson_r = np.corrcoef(x, y)[0, 1]

    # Spearman's rho
    spearman_rho = stats.spearmanr(x, y).correlation

    # Kendall's tau
    kendall_tau = stats.kendalltau(x, y).correlation

    print('Pearson correlation coefficient: {}'.format(pearson_r))
    print('Spearman correlation coefficient: {}'.format(spearman_rho))
    print('Kendall correlation coefficient: {}'.format(kendall_tau))

In [14]:
fig = go.Figure()

to_plot = data.sample(frac=0.10, random_state=42)

fig.add_trace(go.Scatter(
    x = to_plot['price'],
    y = to_plot['mileage'],
    mode = 'markers'
))

fig.update_layout(
    title_text = 'Mileage x Price Scatter Plot',
    template = 'plotly_dark',
    xaxis_title = 'Price',
    yaxis_title = 'Mileage'
)

In [15]:
correlation(data['price'].tolist(), data['mileage'].tolist())

Pearson correlation coefficient: -0.4234373700626375
Spearman correlation coefficient: -0.5192982902355282
Kendall correlation coefficient: -0.359678124497522


In [16]:
fig = go.Figure()

to_plot = data.sample(frac=0.10, random_state=42)

fig.add_trace(go.Scatter(
    x = to_plot['price'],
    y = to_plot['mpg'],
    mode = 'markers'
))

fig.update_layout(
    title_text = 'Miles per gallon',
    template = 'plotly_dark',
    xaxis_title = 'Price',
    yaxis_title = 'Miles per gallon'
)

In [17]:
correlation(data['price'].tolist(), data['mpg'].tolist())

Pearson correlation coefficient: -0.32246423993037066
Spearman correlation coefficient: -0.458394846335759
Kendall correlation coefficient: -0.3214906843604652


In [18]:
data_price_float = data.copy()
data_price_float['price'] = data_price_float['price'].map(lambda x: float(x))

In [31]:
fig = go.Figure()

to_plot = data_price_float.groupby(by='transmission', as_index=False).mean()

fig.add_trace(go.Bar(
    y = to_plot['transmission'],
    x = to_plot['price'],
    orientation='h'
))

fig.update_layout(
    title_text = 'Price by transmission type',
    template = 'plotly_dark'
)

In [33]:
fig = go.Figure()

to_plot = data_price_float.groupby(by='fuelType', as_index=False).mean()

fig.add_trace(go.Bar(
    y = to_plot['fuelType'],
    x = to_plot['price'],
    orientation='h'
))

fig.update_layout(
    title_text = 'Price by fuel type',
    template = 'plotly_dark'
)

In [34]:
fig = go.Figure()

to_plot = data_price_float.groupby(by='brand', as_index=False).mean()

fig.add_trace(go.Bar(
    y = to_plot['brand'],
    x = to_plot['price'],
    orientation='h'
))

fig.update_layout(
    title_text = 'Price by brand',
    template = 'plotly_dark'
)

# Machine Learning