In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# Any results you write to the current directory are saved as output.

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
# Read the eBay used-car listings CSV into `cars` and show it as the cell output.
listings_csv = "/kaggle/input/used-cars-listing-from-ebay/autos_new.csv"
cars = pd.read_csv(filepath_or_buffer=listings_csv)
cars

The granularity of this data is quite fine: each row in this table represents a single used car listing, together with the various attributes of that car. Let's explore the distribution of used car prices in this dataset.

In [None]:
# Distribution of listing prices: bin edges every 5,000 from 0 through 245,000,
# plus a rug mark for each individual listing.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases — confirm the
# installed version before relying on it.
price_bins = np.arange(0, 250000, 5000)
sns.distplot(cars['dollar_price'], bins=price_bins, rug=True)

Based on this used cars dataset, we can observe that the majority of used cars cost less than 50000. This is indicated by the high density of the rug plot at lower values of "dollar_price" and by the tallest bar sitting around the 20000 value.

In [None]:
# Count how many listings carry each distinct vehicle_type value.
cars['vehicle_type'].value_counts()

There are six vehicle type categories, and they have no inherent order, so vehicle_type is a qualitative nominal variable.

In [None]:
# Translate the German gearbox labels to English; any other value is left untouched.
gearbox_translation = {'manuell': 'manual', 'automatik': 'automatic'}
cars['gearbox'] = cars['gearbox'].replace(gearbox_translation)
cars['gearbox'].value_counts()

In [None]:
# Count how many listings carry each distinct fuel_type value.
cars['fuel_type'].value_counts()

In [None]:
# Distribution of registration years, with a rug mark for each listing.
registration_years = cars['registration_year']
sns.distplot(registration_years, rug=True)

This histogram is trimodal and skewed to the left (toward lower registration years); it indicates that most of the used cars were registered between 1995 and 2018.

In [None]:
# Translate the German yes/no flags to English; any other value is left untouched.
damage_translation = {'ja': 'yes', 'nein': 'no'}
cars['unrepaired_damage'] = cars['unrepaired_damage'].replace(damage_translation)
cars['unrepaired_damage'].value_counts()

In [None]:
# Count how many listings belong to each brand (most frequent first).
cars['brand'].value_counts()

Volkswagen, BMW, and Mercedes are the most common used car brands in this dataset, which suggests that this is a German dataset.
This conclusion is supported by the German words "ja" and "nein" (meaning "yes" and "no" respectively) that record whether the vehicle has unrepaired damage.

Let's explore the association between used car mileage and age of the car.

In [None]:
def extract_year(year):
    """Return the first four characters of ``year``.

    Presumably the crawl timestamp string starts with a four-digit year — confirm
    against the date_crawled format.
    """
    return year[:4]


# Store the leading four characters of each date_crawled value in a new 'year' column.
cars['year'] = cars['date_crawled'].apply(extract_year)
cars['year']  # All of these listings were added in the year 2016

In [None]:
# Age = crawl year (converted to int) minus the registration year.
crawl_year = cars['year'].astype(int)
cars['age'] = crawl_year - cars['registration_year']
cars['age']

In [None]:
# Distribution of car ages; label the axes through the Axes object that seaborn returns.
age_ax = sns.distplot(cars['age'], rug=True)
age_ax.set_xlabel('age(years)')
age_ax.set_ylabel('percent per age(years)')

The average age of a used car in this dataset (which we take to be representative of the population of cars) is around 10 years. The distribution is trimodal and skewed to the right (toward higher ages).

In [None]:
# Report the mean car age and the spread between the oldest and newest car.
ages = cars['age']
mean_age = ages.mean()
age_span = ages.max() - ages.min()
print("The average age of cars is", mean_age, "years old")
print("The range of car ages is", age_span, 'years')

Can age and mileage be useful features for predicting the price of used cars on average? Let's first explore the association between mileage and age.

In [None]:
# Scatter of car age (x) against odometer reading (y).
# x and y are passed by keyword: seaborn >= 0.12 only accepts `data` positionally,
# so the positional form raises a TypeError there.
sns.scatterplot(x='age', y='kilometer', data=cars)

There appears to be no association whatsoever between age and mileage.

In [None]:
# Preview the first rows of the frame, including the derived 'year' and 'age' columns.
cars.head()

Is age of the car associated with dollar price?

In [None]:
# Scatter of car age (x) against listing price (y).
# x and y are passed by keyword: seaborn >= 0.12 only accepts `data` positionally,
# so the positional form raises a TypeError there.
sns.scatterplot(x='age', y='dollar_price', data=cars)  # No association here either.

What is the average price per type of car?

In [None]:
# Mean dollar_price per vehicle_type, with vehicle_type kept as a regular column.
# Uses the built-in GroupBy.mean() rather than agg(np.mean): passing the NumPy
# callable emits a FutureWarning on pandas >= 2.1, while the result is the same.
vehicle_type_avg_dollar_price = (
    cars[['vehicle_type', 'dollar_price']]
    .groupby('vehicle_type', as_index=False)
    .mean()
)
vehicle_type_avg_dollar_price

On average, used coupes and convertibles are more expensive than the other vehicle types.

In [None]:
# Split the listings into one sub-frame per vehicle_type value.
listing_type = cars['vehicle_type']
bus = cars.loc[listing_type.eq('bus')]
convertible = cars.loc[listing_type.eq('convertible')]
coupe = cars.loc[listing_type.eq('coupé')]
limousine = cars.loc[listing_type.eq('limousine')]
small_car = cars.loc[listing_type.eq('small car')]
station_wagon = cars.loc[listing_type.eq('station wagon')]

In [None]:
# Preview the first rows of the bus subset.
bus.head()

In [None]:
# Preview the first rows of the convertible subset.
convertible.head()

In [None]:
# Preview the first rows of the coupé subset.
coupe.head()

In [None]:
# Preview the first rows of the limousine subset.
limousine.head()

In [None]:
# Preview the first rows of the small car subset.
small_car.head()

In [None]:
# Preview the first rows of the station wagon subset.
station_wagon.head()

In [None]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt


# 3-D scatter of the bus listings: mileage on x, age on y, price on z.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')

mileage_km = bus['kilometer']
age_years = bus['age']
price = bus['dollar_price']
ax.scatter(mileage_km, age_years, price)

ax.set_xlabel('mileage (kilometers)')
ax.set_ylabel('age (years)')
ax.set_zlabel('dollar_price')

plt.show()

In [None]:
# Keep only bus listings with 20 < power_ps < 1000 (presumably to drop implausible
# power values — confirm the thresholds), then plot price against power.
bus_rm = bus[(bus['power_ps'] < 1000) & (bus['power_ps'] > 20)]
# x and y are passed by keyword: seaborn >= 0.12 rejects positional x/y with a TypeError.
sns.scatterplot(x='power_ps', y='dollar_price', data=bus_rm)

There appears to be a moderate linear association between power_ps(the explanatory variable) and dollar_price(the response variable). 

In [None]:
# Keep only convertible listings with 20 < power_ps < 1000 (presumably to drop
# implausible power values — confirm the thresholds), then plot price against power.
convertible_rm = convertible[(convertible['power_ps'] < 1000) & (convertible['power_ps'] > 20)]
# x and y are passed by keyword: seaborn >= 0.12 rejects positional x/y with a TypeError.
sns.scatterplot(x='power_ps', y='dollar_price', data=convertible_rm)

There appears to be a moderate linear association between power_ps(the explanatory variable) and dollar_price(the response variable).

In [None]:
# Keep only coupé listings with 20 < power_ps < 1000 (presumably to drop implausible
# power values — confirm the thresholds), then plot price against power.
coupe_rm = coupe[(coupe['power_ps'] < 1000) & (coupe['power_ps'] > 20)]
# x and y are passed by keyword: seaborn >= 0.12 rejects positional x/y with a TypeError.
sns.scatterplot(x='power_ps', y='dollar_price', data=coupe_rm)

There appears to be a moderate linear association between power_ps(the explanatory variable) and dollar_price(the response variable).

In [None]:
# Keep only limousine listings with 20 < power_ps < 1000 (presumably to drop
# implausible power values — confirm the thresholds), then plot price against power.
limousine_rm = limousine[(limousine['power_ps'] < 1000) & (limousine['power_ps'] > 20)]
# x and y are passed by keyword: seaborn >= 0.12 rejects positional x/y with a TypeError.
sns.scatterplot(x='power_ps', y='dollar_price', data=limousine_rm)

There appears to be a moderate linear association between power_ps(the explanatory variable) and dollar_price(the response variable).

In [None]:
# Keep only small car listings with 20 < power_ps < 1000 (presumably to drop
# implausible power values — confirm the thresholds), then plot price against power.
small_car_rm = small_car[(small_car['power_ps'] < 1000) & (small_car['power_ps'] > 20)]
# x and y are passed by keyword: seaborn >= 0.12 rejects positional x/y with a TypeError.
sns.scatterplot(x='power_ps', y='dollar_price', data=small_car_rm)

There appears to be a weak linear association between power_ps(the explanatory variable) and dollar_price(the response variable).

In [None]:
# Keep only station wagon listings with 20 < power_ps < 1000 (presumably to drop
# implausible power values — confirm the thresholds), then plot price against power.
station_wagon_rm = station_wagon[(station_wagon['power_ps'] < 1000) & (station_wagon['power_ps'] > 20)]
# x and y are passed by keyword: seaborn >= 0.12 rejects positional x/y with a TypeError.
sns.scatterplot(x='power_ps', y='dollar_price', data=station_wagon_rm)

There appears to be a moderate linear association between power_ps(the explanatory variable) and dollar_price(the response variable).

Predicting Prices of bus

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# Fit a one-feature linear regression (power_ps -> dollar_price) on the bus listings
# and score it on the same rows it was trained on.
# NOTE(review): this uses all of `bus`, not the power-filtered `bus_rm` shown in the
# scatter plot — confirm that fitting on the unfiltered rows is intended.
X = bus.loc[:, ['power_ps']]
y = bus.loc[:, 'dollar_price']
bus_price_predictions = LinearRegression()
bus_price_predictions.fit(X, y)
y_pred = bus_price_predictions.predict(X)
# Square root of the in-sample R^2, followed by the in-sample mean squared error.
r_squared = r2_score(y, y_pred)
print(np.sqrt(r_squared))
mean_squared_error(y, y_pred)

Predicting Prices of convertible

In [None]:
# Fit a one-feature linear regression (power_ps -> dollar_price) on the convertible
# listings and score it on the same rows it was trained on.
# NOTE(review): this uses all of `convertible`, not the power-filtered `convertible_rm`
# shown in the scatter plot — confirm that fitting on the unfiltered rows is intended.
X = convertible.loc[:, ['power_ps']]
y = convertible.loc[:, 'dollar_price']
convertible_price_predictions = LinearRegression()
convertible_price_predictions.fit(X, y)
y_pred = convertible_price_predictions.predict(X)
# Square root of the in-sample R^2, followed by the in-sample mean squared error.
r_squared = r2_score(y, y_pred)
print(np.sqrt(r_squared))
mean_squared_error(y, y_pred)

Predicting Prices of coupe

In [None]:
# Fit a one-feature linear regression (power_ps -> dollar_price) on the coupé listings
# and score it on the same rows it was trained on.
# NOTE(review): this uses all of `coupe`, not the power-filtered `coupe_rm` shown in
# the scatter plot — confirm that fitting on the unfiltered rows is intended.
X = coupe.loc[:, ['power_ps']]
y = coupe.loc[:, 'dollar_price']
coupe_price_predictions = LinearRegression()
coupe_price_predictions.fit(X, y)
y_pred = coupe_price_predictions.predict(X)
# Square root of the in-sample R^2, followed by the in-sample mean squared error.
r_squared = r2_score(y, y_pred)
print(np.sqrt(r_squared))
mean_squared_error(y, y_pred)

Predicting Prices of limousine

In [None]:
# Fit a one-feature linear regression (power_ps -> dollar_price) on the limousine
# listings and score it on the same rows it was trained on.
# NOTE(review): this uses all of `limousine`, not the power-filtered `limousine_rm`
# shown in the scatter plot — confirm that fitting on the unfiltered rows is intended.
X = limousine.loc[:, ['power_ps']]
y = limousine.loc[:, 'dollar_price']
limousine_price_predictions = LinearRegression()
limousine_price_predictions.fit(X, y)
y_pred = limousine_price_predictions.predict(X)
# Square root of the in-sample R^2, followed by the in-sample mean squared error.
r_squared = r2_score(y, y_pred)
print(np.sqrt(r_squared))
mean_squared_error(y, y_pred)

Predicting Prices of small_car

In [None]:
# Fit a one-feature linear regression (power_ps -> dollar_price) on the small car
# listings and score it on the same rows it was trained on.
# NOTE(review): this uses all of `small_car`, not the power-filtered `small_car_rm`
# shown in the scatter plot — confirm that fitting on the unfiltered rows is intended.
X = small_car.loc[:, ['power_ps']]
y = small_car.loc[:, 'dollar_price']
small_car_price_predictions = LinearRegression()
small_car_price_predictions.fit(X, y)
y_pred = small_car_price_predictions.predict(X)
# Square root of the in-sample R^2, followed by the in-sample mean squared error.
r_squared = r2_score(y, y_pred)
print(np.sqrt(r_squared))
mean_squared_error(y, y_pred)

Predicting Prices of station_wagon

In [None]:
# Fit a one-feature linear regression (power_ps -> dollar_price) on the station wagon
# listings and score it on the same rows it was trained on.
# NOTE(review): this uses all of `station_wagon`, not the power-filtered
# `station_wagon_rm` shown in the scatter plot — confirm that fitting on the
# unfiltered rows is intended.
X = station_wagon.loc[:, ['power_ps']]
y = station_wagon.loc[:, 'dollar_price']
station_wagon_price_predictions = LinearRegression()
station_wagon_price_predictions.fit(X, y)
y_pred = station_wagon_price_predictions.predict(X)
# Square root of the in-sample R^2, followed by the in-sample mean squared error.
r_squared = r2_score(y, y_pred)
print(np.sqrt(r_squared))
mean_squared_error(y, y_pred)