In [2]:
import pandas as pd
import numpy as np

from scipy import stats

import plotly.graph_objects as go

In [3]:
# Directory that holds the per-brand CSV files. Keeping it in one constant
# means the notebook can be pointed at another copy of the dataset by editing
# a single line instead of seven.
DATA_DIR = 'D:/Data/Car Price'

# One DataFrame per brand file.
data_audi = pd.read_csv(DATA_DIR + '/audi.csv')
data_bmw = pd.read_csv(DATA_DIR + '/bmw.csv')
data_ford = pd.read_csv(DATA_DIR + '/ford.csv')
# This file names its tax column 'tax(£)'; rename it to plain 'tax' right
# after loading.
data_hyundi = pd.read_csv(DATA_DIR + '/hyundi.csv').rename(columns={'tax(£)': 'tax'})
data_merc = pd.read_csv(DATA_DIR + '/merc.csv')
data_skoda = pd.read_csv(DATA_DIR + '/skoda.csv')
data_toyota = pd.read_csv(DATA_DIR + '/toyota.csv')

In [4]:
# Map each brand label to the DataFrame loaded for that brand.
all_data = {
    'audi': data_audi,
    'bmw': data_bmw,
    'ford': data_ford,
    'hyundi': data_hyundi,
    'merc': data_merc,
    'skoda': data_skoda,
    'toyota': data_toyota,
}

In [5]:
# Add new feature "brand"
# The column is written onto each per-brand frame itself, so data_audi,
# data_bmw, ... also carry it afterwards. Re-running the cell is harmless
# because the same value is simply assigned again.
for key, dataset in all_data.items():
    dataset['brand'] = key

# Merge all dataframes into one
# A single pd.concat replaces the former append-in-a-loop: DataFrame.append
# was removed in pandas 2.0 and re-copied every accumulated row on each
# iteration. Frames are stacked in the dict's insertion order and the row
# index is renumbered from 0.
data_temp = pd.concat(list(all_data.values()), ignore_index=True)

# Shuffle final dataframe
# A fixed random_state makes the shuffle repeatable across kernel restarts;
# 42 matches the seed used by the sampling cells of this notebook.
data = data_temp.sample(frac=1, random_state=42).reset_index(drop=True)

In [6]:
# Show the merged, shuffled frame as the cell output.
data

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,brand
0,Yaris,2017,8495,Manual,28000,Diesel,0,80.7,1.4,toyota
1,C Class,2017,20000,Manual,28831,Diesel,145,65.7,2.1,merc
2,Fiesta,2014,7350,Semi-Auto,37500,Petrol,0,57.7,1.0,ford
3,Focus,2019,18499,Manual,4945,Petrol,145,50.4,1.5,ford
4,Avensis,2017,11645,Manual,22640,Diesel,20,67.3,1.6,toyota
...,...,...,...,...,...,...,...,...,...,...
70393,3 Series,2015,13798,Semi-Auto,59936,Diesel,125,58.9,2.0,bmw
70394,X3,2018,30000,Semi-Auto,71587,Diesel,145,54.3,2.0,bmw
70395,Fiesta,2017,8000,Manual,19501,Petrol,150,65.7,1.0,ford
70396,Mondeo,2018,14399,Automatic,22645,Diesel,145,45.6,2.0,ford


# Single variable analysis

In [7]:
# Tally the rows of `data` by their 'year' value.
to_plot = data['year'].value_counts()

# One bar per distinct year; bar height is the number of rows with that year.
fig = go.Figure(data=[go.Bar(x=to_plot.index, y=to_plot.values)])

# update_layout returns the figure, so leaving it as the last expression
# renders the chart as the cell output.
fig.update_layout(
    template='plotly_dark',
    title_text='Number of cars from every year',
    xaxis_title='Year',
    yaxis_title='Number of cars',
)

In [8]:
# Histogram of the 'mileage' column, asking plotly for at most 170 bins.
fig = go.Figure(data=[go.Histogram(x=data['mileage'], nbinsx=170)])

# Last expression of the cell: the returned figure is what gets displayed.
fig.update_layout(
    template='plotly_dark',
    title_text='Distribution of mileage feature',
    xaxis_title='Mileage',
)

In [9]:
# Histogram of the 'price' column, asking plotly for at most 170 bins.
fig = go.Figure(data=[go.Histogram(x=data['price'], nbinsx=170)])

# Last expression of the cell: the returned figure is what gets displayed.
fig.update_layout(
    template='plotly_dark',
    title_text='Distribution of price feature',
    xaxis_title='Price',
)

In [10]:
# Dark slice colours chosen to sit well on the 'plotly_dark' template.
night_colors = [
    'rgb(56, 75, 126)',
    'rgb(18, 36, 37)',
    'rgb(34, 53, 101)',
    'rgb(36, 55, 57)',
    'rgb(6, 4, 4)',
]

# Number of rows per distinct 'transmission' value.
to_plot = data['transmission'].value_counts()

# One slice per transmission value, labelled with its name and share.
transmission_pie = dict(
    labels=to_plot.index,
    values=to_plot.values,
    textinfo='label+percent',
    marker_colors=night_colors,
)
fig = go.Figure(data=[go.Pie(**transmission_pie)])
del transmission_pie  # keep the notebook namespace as it was before

fig.update_layout(
    template='plotly_dark',
    title_text='Transmission Pie Plot',
)

In [11]:
# Number of rows per distinct 'brand' value.
to_plot = data['brand'].value_counts()

# One slice per brand, labelled with the brand name and its share.
fig = go.Figure(data=[
    go.Pie(
        labels=to_plot.index,
        values=to_plot.values,
        textinfo='label+percent',
    )
])

fig.update_layout(
    template='plotly_dark',
    title_text='Brand Pie Plot',
)

In [12]:
# Number of rows per distinct 'fuelType' value.
to_plot = data['fuelType'].value_counts()

# Horizontal bars: categories on the y axis, counts on the x axis, with the
# count also written just outside the end of each bar.
fig = go.Figure(data=[
    go.Bar(
        orientation='h',
        x=to_plot.values,
        y=to_plot.index,
        text=to_plot.values,
        textposition='outside',
    )
])

# Fixed width (autosize off) instead of stretching to the output area.
fig.update_layout(
    title_text='Fuel Type Bar Plot',
    template='plotly_dark',
    autosize=False,
    width=850,
)

# Multiple variables analysis

In [13]:
# Show the merged frame again as a reminder of the available columns.
data

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,brand
0,Yaris,2017,8495,Manual,28000,Diesel,0,80.7,1.4,toyota
1,C Class,2017,20000,Manual,28831,Diesel,145,65.7,2.1,merc
2,Fiesta,2014,7350,Semi-Auto,37500,Petrol,0,57.7,1.0,ford
3,Focus,2019,18499,Manual,4945,Petrol,145,50.4,1.5,ford
4,Avensis,2017,11645,Manual,22640,Diesel,20,67.3,1.6,toyota
...,...,...,...,...,...,...,...,...,...,...
70393,3 Series,2015,13798,Semi-Auto,59936,Diesel,125,58.9,2.0,bmw
70394,X3,2018,30000,Semi-Auto,71587,Diesel,145,54.3,2.0,bmw
70395,Fiesta,2017,8000,Manual,19501,Petrol,150,65.7,1.0,ford
70396,Mondeo,2018,14399,Automatic,22645,Diesel,145,45.6,2.0,ford


### Pearson
<b>The Pearson correlation coefficient</b> is a measure of the linear relationship between two features. It's the ratio of the covariance of x and y to the product of their standard deviations. It's often denoted with the letter r and called Pearson's r.

### Spearman
The Spearman correlation coefficient between two features is the Pearson correlation coefficient between their rank values. It's calculated the same way as the Pearson correlation coefficient but takes into account their ranks instead of their values.

### Kendall
The Kendall correlation coefficient compares the number of concordant and discordant pairs of data. This coefficient is based on the difference in the counts of concordant and discordant pairs relative to the number of x-y pairs. Two observations (xᵢ, yᵢ) and (xⱼ, yⱼ) form a pair that is:

concordant if either (xᵢ > xⱼ and yᵢ > yⱼ) or (xᵢ < xⱼ and yᵢ < yⱼ) <br>
discordant if either (xᵢ < xⱼ and yᵢ > yⱼ) or (xᵢ > xⱼ and yᵢ < yⱼ) <br>
neither if there’s a tie in x (xᵢ = xⱼ) or a tie in y (yᵢ = yⱼ)

In [32]:
def correlation(x, y):
    """
    Print three correlation coefficients for a pair of features.

    x, y - two equally long sequences of numeric values

    Pearson's r is read from the off-diagonal of numpy's correlation
    matrix; Spearman's rho and Kendall's tau come from scipy.stats.
    All three are computed first and then printed, one per line.
    Nothing is returned.
    """
    report = (
        ('Pearson correlation coefficient: {}',
         np.corrcoef(x, y)[0, 1]),
        ('Spearman correlation coefficient: {}',
         stats.spearmanr(x, y).correlation),
        ('Kendall correlation coefficient: {}',
         stats.kendalltau(x, y).correlation),
    )

    for template, coefficient in report:
        print(template.format(coefficient))

In [33]:
# Report how price and mileage move together across all rows.
correlation(
    data['price'].tolist(),
    data['mileage'].tolist(),
)

Pearson correlation coefficient: -0.4234373700626372
Spearman correlation coefficient: -0.5192982902355282
Kendall correlation coefficient: -0.359678124497522


In [15]:
# Plot a fixed 10% sample of the rows; the seed keeps the sample repeatable.
to_plot = data.sample(frac=0.10, random_state=42)

# Price on the x axis against mileage on the y axis, one marker per row.
fig = go.Figure(data=[
    go.Scatter(
        mode='markers',
        x=to_plot['price'],
        y=to_plot['mileage'],
    )
])

fig.update_layout(
    template='plotly_dark',
    title_text='Mileage x Price Scatter Plot',
    xaxis_title='Price',
    yaxis_title='Mileage',
)

In [16]:
# Plot a fixed 10% sample of the rows; the seed keeps the sample repeatable.
to_plot = data.sample(frac=0.10, random_state=42)

# Price on the x axis against the 'mpg' column on the y axis.
fig = go.Figure(data=[
    go.Scatter(
        mode='markers',
        x=to_plot['price'],
        y=to_plot['mpg'],
    )
])

fig.update_layout(
    template='plotly_dark',
    title_text='Miles per gallon',
    xaxis_title='Price',
    yaxis_title='Miles per gallon',
)