In [1]:
import pandas as pd
import numpy as np
from datetime import date

In [2]:
# constants
FILE_PATH = "cars.csv"

## Data

This dataset was retrieved from [kaggle](https://www.kaggle.com/datasets/juanmerinobermejo/us-sales-cars-dataset). It provides comprehensive information about used cars available for sale in the United States. The dataset contains the following key attributes:

- **Brand:** The brand or manufacturer of the car.
- **Model:** The specific model of the car.
- **Mileage:** The number of miles the car has been driven.
- **Year:** The manufacturing year of the car.
- **Status:** Indicates whether the car is new, used, or certified pre-owned.
- **Dealer:** Information about the dealer or seller offering the car.
- **Price:** The listed price of the car in USD.

In [3]:
df = pd.read_csv(FILE_PATH, encoding="utf-16")
df.head()

Unnamed: 0,Brand,Model,Year,Status,Mileage,Dealer,Price
0,Mazda,CX-5,2023,New,,,36703.0
1,Kia,Sportage,2023,New,,Classic Kia,28990.0
2,Chevrolet,Camaro,2024,New,,Classic Chevrolet Beaumont,41425.0
3,Ford,Bronco,2023,Used,1551.0,Mike Smith Chrysler Dodge Jeep RAM,58900.0
4,Acura,TLX,2021,Used,30384.0,Mike Smith Nissan,34499.0


## Preprocessing the Data

To clean the data we will:
1. data cleaning
2. data transformation
3. feature reduction 

In [4]:
row_count = len(df)
print(f"Total Rows: {len(df)}")
df.isnull().sum().map(lambda sum: f"{sum / row_count:.2%}")

Total Rows: 51793


Brand       0.00%
Model       0.00%
Year        0.00%
Status      0.00%
Mileage    55.63%
Dealer      0.20%
Price       2.22%
dtype: object

### Missing Values
The statistics above indicate the percentage of missing data under each column. Many of the rows are missing entries for mileage. We can drop the rows with missing values for mileage and price.

In [5]:
df = df.dropna()
print(f"Total Rows: {len(df)}")

Total Rows: 22974


### Remove duplicates

In [6]:
df = df.drop_duplicates()
print(f"Total Rows: {len(df)}")

Total Rows: 22935


### Data Transformation / Normalization
- we should scale certain features (like year to age)
- we should check for any typos or inconsistencies in text

In [7]:
df["Age"] = date.today().year - df.Year

In [None]:
for brand in df.Brand.unique():
    print(brand, end=",\t")

Ford,	Acura,	Volkswagen,	GMC,	Infiniti,	Lexus,	Toyota,	Mazda,	Honda,	Lincoln,	Mercury,	Cadillac,	BMW,	Chevrolet,	Hyundai,	Land Rover,	Audi,	Mercedes,	Jeep,	Porsche,	Kia,	MINI,	Nissan,	Subaru,	RAM,	Maserati,	Tesla,	Bentley,	Dodge,	Mitsubishi,	FIAT,	Rivian,	Genesis,	Aston Martin,	Buick,	Jaguar,	Lamborghini,	Hummer,	Saturn,	Rolls-Royce,	Volvo,	Ferrari,	Polestar,	Lucid,	Alfa Romeo,	Scion,	Chrysler,	Pontiac,	Karma,	Smart,	McLaren,	Lotus,	Saab,	Maybach,	Suzuki,	International Scout,	Geo,	Oldsmobile,	Isuzu,	

In [None]:
for brand in df.Model.unique():
    print(brand, end=",\t")

Bronco,	TLX,	Golf,	Yukon,	QX30,	ES,	Sierra,	Tundra,	CX-5,	Accord,	Atlas,	Sequoia,	Continental,	Marquis,	IS,	Mustang,	LS,	Nautilus,	CT5,	Escalade,	MDX,	435 i,	ILX,	Palisade,	X7,	Escape,	Sonata,	Land Cruiser,	Navigator,	Range Rover,	RS 5,	E-Class,	Grand Cherokee,	GLS 600,	Highlander,	Wagoneer,	GLS 450,	Cayenne,	Q7,	TT,	Wrangler,	F-150,	F-250,	Defender,	QX80,	QX60,	Essentia,	GLE,	Panamera,	Telluride,	Forte,	Cooper,	A7,	Juke,	CT6,	WRX,	Camaro,	Crosstrek,	Town Car,	Fusion,	2500,	Q5,	Discovery,	G 550,	e-tron,	Ghibli,	Tahoe,	6,	CR-V,	Elantra,	GX,	Passport,	Odyssey,	Explorer,	Corvette,	C-Class,	F-350,	X4,	Civic,	3,	Suburban,	Fiesta,	GT,	Model X,	Flying Spur,	Roadster,	xD,	Tucson,	RX,	Macan,	Taycan,	RDX,	Outback,	Challenger,	Ascent,	Tacoma,	1,	LX,	X5,	SQ5,	540 i,	A8,	QX50,	S3,	1500,	Sportage,	Prius,	Camry,	Aviator,	Corolla,	Colorado,	Mirage,	Terrain,	3500,	Renegade,	Silverado,	Model S,	86,	500,	Model Y,	Altima,	Viper,	R1S,	Ioniq,	Tiguan,	Model 3,	Durango,	Expedition,	CX-30,	Mazda3,	CX-9,	CT4-V,

In [None]:
df.describe()

Unnamed: 0,Year,Mileage,Price,Age
count,22935.0,22935.0,22935.0,22935.0
mean,2018.095487,53462.804404,39230.89,4.904513
std,4.985611,44560.944849,34215.93,4.985611
min,1959.0,0.0,1990.0,-1.0
25%,2016.0,20619.0,21995.0,2.0
50%,2020.0,42140.0,32959.0,3.0
75%,2021.0,73971.0,46984.5,7.0
max,2024.0,400396.0,1299995.0,64.0
