## Import libraries

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's 'darkgrid' style globally so the plots below share one look
sns.set(style='darkgrid')

## Load and read data

In [None]:
# Read the CSV that sits next to this notebook into the working DataFrame
df = pd.read_csv(filepath_or_buffer='./data.csv')

# Preview the first five rows as the cell output
df.head(n=5)

In [None]:
# (rows, columns) of the freshly loaded frame
df.shape

In [None]:
# Column dtypes and non-null counts before any cleaning is applied
df.info()

## Exploratory Data Analysis (EDA)

### Cleaning data

In [None]:
# Discard every row whose Price is missing
df = df.dropna(subset=['Price'])

# Missing values remaining in each column
df.isna().sum()

For the **Car/Suv** and **BodyType** columns, the number of null values is small, so we remove the rows that contain them.

In [None]:
# Columns for which a missing value means the whole row is discarded
dropna_cols = ['Car/Suv', 'BodyType']
df = df.dropna(subset=dropna_cols)

# Missing values left in each column after the drop
df.isna().sum()

#### Location

Since there are 439 null values in the **Location** column, removing these rows may result in a significant loss of data. Instead, we fill the null values with the placeholder `Unknown`.

In [None]:
# Keep rows with a missing Location by labelling them 'Unknown'.
# The result is assigned back rather than calling fillna(inplace=True) on the
# df['Location'] selection: that chained in-place call raises a FutureWarning
# on pandas 2.x and leaves df unchanged once Copy-on-Write is the default.
df['Location'] = df['Location'].fillna('Unknown')

#### Doors and Seats

Extract the integer value from the **Doors** and **Seats** columns, then fill the null values with each column's most frequent value (mode).

In [None]:
# Keep only the first run of digits from each text value and store it as float;
# values with no digits (or already missing) become NaN.
# Raw strings are used so that \d is not treated as an (invalid) string escape,
# and expand=False makes extract return a Series rather than a one-column frame.
df['Doors'] = df['Doors'].str.extract(r'(\d+)', expand=False).astype(float)
df['Seats'] = df['Seats'].str.extract(r'(\d+)', expand=False).astype(float)

# Fill the gaps with each column's most frequent value (mode). The extracted
# values are whole numbers, so the mode needs no rounding.
# Results are assigned back instead of using fillna(inplace=True) on a column
# selection, which does not update df under pandas Copy-on-Write.
df['Doors'] = df['Doors'].fillna(df['Doors'].mode().iloc[0])
df['Seats'] = df['Seats'].fillna(df['Seats'].mode().iloc[0])

In [None]:
# Every column is expected to be fully populated at this point. Compute the
# per-column null counts once, fail with the offending columns listed if any
# remain, and show the counts as the cell output.
null_counts = df.isnull().sum()
assert null_counts.sum() == 0, (
    f"columns still containing nulls:\n{null_counts[null_counts > 0]}"
)

null_counts

### Check duplicates

In [None]:
# Count rows that are exact duplicates of an earlier row, fail with that count
# if there are any, and show the count as the cell output.
duplicate_count = df.duplicated().sum()
assert duplicate_count == 0, f"{duplicate_count} fully duplicated rows found"

duplicate_count

### Convert column data type

In [None]:
# Inspect the current dtypes to see which columns still need converting
df.info()

Convert the **Kilometres**, **Price** columns to a numerical data type (float) and **Year** column to integer type.

In [None]:
# Parse Kilometres and Price as numbers; anything that cannot be parsed is
# coerced to NaN instead of raising.
kilometres_num = pd.to_numeric(df['Kilometres'], errors='coerce')
price_num = pd.to_numeric(df['Price'], errors='coerce')

# Coercion is silent and runs after the notebook's earlier no-null assertion,
# so count the values that were present as text but turned into NaN here.
coerced_to_nan = pd.Series(
    {
        'Kilometres': int((kilometres_num.isna() & df['Kilometres'].notna()).sum()),
        'Price': int((price_num.isna() & df['Price'].notna()).sum()),
    },
    name='coerced to NaN',
)

df['Kilometres'] = kilometres_num
df['Price'] = price_num
# astype(int) raises if Year still contains NaN
df['Year'] = df['Year'].astype(int)

# Show how many entries per column were lost to coercion
coerced_to_nan

In [None]:
# Confirm the dtypes after the numeric conversion above
df.info()

Extract the numerical values from the **FuelConsumption**, **CylindersinEngine**, and **Engine** columns and convert them to a numerical data type (float). This will allow us to perform calculations or comparisons on these features.

In [None]:
# Pull the numeric part out of three text columns and store it as float.
# Values that do not match the pattern become NaN.
# Raw strings keep \d from being read as an (invalid) string escape, and
# expand=False makes extract return a Series rather than a one-column frame.

# Number immediately preceding the literal ' L / 100 km'
df['FuelConsumption'] = (
    df['FuelConsumption'].str.extract(r'([\d.]+) L / 100 km', expand=False).astype(float)
)

# First run of digits in the text
df['CylindersinEngine'] = (
    df['CylindersinEngine'].str.extract(r'(\d+)', expand=False).astype(float)
)

# First run of digits/dots in the text.
# NOTE(review): only the FIRST number is captured; if Engine strings start with
# something other than the quantity wanted here, this picks up that instead.
# Confirm against the raw values.
df['Engine'] = df['Engine'].str.extract(r'([\d.]+)', expand=False).astype(float)

Replace the placeholder value `-` in **Transmission** and **FuelType** with `Other`.

In [None]:
# Cells that are exactly '-' in these two columns are relabelled 'Other'
df[['Transmission', 'FuelType']] = df[['Transmission', 'FuelType']].replace('-', 'Other')

In [None]:
# Preview the first rows after cleaning and type conversion
df.head()

### Insight plot

#### Histograms of Numerical Features

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
df.describe()

In [None]:
# One 20-bin histogram per numeric column, laid out on a single 20x10 figure
df.hist(figsize=(20, 10), bins=20)

# Title the figure that df.hist just created
plt.gcf().suptitle('Histograms of Numerical Features')
plt.show()

#### Brand Distribution

In [None]:
# Number of listings per brand, most frequent first
brand_counts = df['Brand'].value_counts()

# Wide figure so every brand gets its own bar
fig, ax = plt.subplots(figsize=(20, 6))
ax.bar(brand_counts.index, brand_counts)
ax.set(title='Brand Distribution', xlabel='Brand', ylabel='Count')

# Vertical tick labels keep the brand names from overlapping
plt.xticks(rotation=90, ha='right')
plt.show()

#### Transmission Type Distribution

In [None]:
# Number of listings per transmission type
transmission_counts = df['Transmission'].value_counts()

fig, ax = plt.subplots()
ax.bar(transmission_counts.index, transmission_counts)
ax.set_title('Transmission Type Distribution')
ax.set_xlabel('Transmission Type')
ax.set_ylabel('Count')
plt.show()

#### Fuel Type Distribution

In [None]:
# Number of listings per fuel type
fuel_counts = df['FuelType'].value_counts()

# Wide figure so the category labels do not collide
fig, ax = plt.subplots(figsize=(20, 6))
ax.bar(fuel_counts.index, fuel_counts)
ax.set(title='Fuel Type Distribution', xlabel='Fuel Type', ylabel='Count')
plt.show()

#### Used or New Car distribution

In [None]:
# Number of listings per UsedOrNew category
used_vs_new_counts = df['UsedOrNew'].value_counts()

fig, ax = plt.subplots()
ax.bar(used_vs_new_counts.index, used_vs_new_counts)
ax.set_title('Used vs. New Cars')
ax.set_xlabel('Used or New')
ax.set_ylabel('Count')
plt.show()

#### Fuel Consumption Distribution

In [None]:
# 20-bin histogram of the extracted fuel-consumption figures
fig, ax = plt.subplots()
ax.hist(df['FuelConsumption'], bins=20, edgecolor='black')
ax.set(
    title='Fuel Consumption Distribution',
    xlabel='Fuel Consumption (L/100km)',
    ylabel='Frequency',
)
plt.show()

#### Car price distribution

In [None]:
# 20-bin histogram of listing prices
fig, ax = plt.subplots()
ax.hist(df['Price'], bins=20, color='salmon', edgecolor='black')
ax.set(
    title='Car Price Distribution',
    xlabel='Price (AUD)',
    ylabel='Frequency',
)
plt.show()

#### BodyType distribution

In [None]:
# Number of listings per body type, most frequent first.
# Stored under its own name: the original reused `brand_counts`, which
# overwrote the per-brand counts computed in the Brand Distribution cell.
body_type_counts = df['BodyType'].value_counts()

plt.figure(figsize=(20, 6))
plt.bar(body_type_counts.index, body_type_counts)
plt.title('Body Type Distribution')
plt.xlabel('Body Type')
plt.ylabel('Count')
# Vertical tick labels keep the category names from overlapping
plt.xticks(rotation=90, ha='right')
plt.show()

## Question

What is the distribution of vehicle age (based on the year column) for new versus used vehicles?

In [None]:
# Vehicle age is measured against the calendar year in which the notebook is
# run, so the values shift if it is re-executed in a later year.
current_year = datetime.now().year
df['VehicleAge'] = current_year - df['Year']

# Split the listings by condition using a case-insensitive match on UsedOrNew
condition = df['UsedOrNew'].str.lower()
new_vehicles = df[condition == 'new']
used_vehicles = df[condition == 'used']

# Two stacked panels on one 12x6 figure: new vehicles on top, used below
fig, (ax_new, ax_used) = plt.subplots(2, 1, figsize=(12, 6))

# Top panel: listing count per age for new vehicles, ordered by age
new_age_counts = new_vehicles['VehicleAge'].value_counts().sort_index()
new_age_counts.plot(kind='bar', ax=ax_new)
ax_new.set_title('Distribution of Vehicle Age for New Vehicles')
ax_new.set_xlabel('Vehicle Age (years)')
ax_new.set_ylabel('Frequency')

# Bottom panel: the same view for used vehicles, in orange
used_age_counts = used_vehicles['VehicleAge'].value_counts().sort_index()
used_age_counts.plot(kind='bar', color='orange', ax=ax_used)
ax_used.set_title('Distribution of Vehicle Age for Used Vehicles')
ax_used.set_xlabel('Vehicle Age (years)')
ax_used.set_ylabel('Frequency')

# Avoid overlap between the panels' titles and axis labels, then render
fig.tight_layout()
plt.show()