## Prediction with CART – data preparation
Case studies:                     
  - CH15A Predicting used car value with regression trees 
                                   
Dataset:

    used-cars

In [None]:
import os
import warnings

import numpy as np
import pandas as pd
from skimpy import skim

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas/numpy deprecation warnings — consider
# narrowing the filter to a specific category once the notebook is stable.
warnings.filterwarnings("ignore")

Import data

In [None]:
# Read the used-cars CSV directly from its download URL.
data = pd.read_csv(filepath_or_buffer="https://osf.io/7gvz9/download")

In [None]:
# Preview the first five rows of the raw data.
data.head(n=5)

In [None]:
# Display the (rows, columns) dimensions of the raw data.
data.shape

### Sample design

Manage missing

In [None]:
# Give missing values in these columns their own explicit "Missing" label.
for column in ("fuel", "drive", "cylinders", "transmission", "type"):
    data[column] = data[column].fillna("Missing")

For condition, recode missing values as "good" instead of a separate "Missing" category

In [None]:
# Tabulate condition before recoding. dropna=False keeps the NaN row in the
# table, so the number of missing values about to be recoded is visible
# (value_counts drops NaN by default).
data["condition"].value_counts(dropna=False)


In [None]:
# Recode missing condition values as "good"; other columns are left untouched.
data = data.fillna({"condition": "good"})


In [None]:
# Tabulate condition again after the missing values were recoded.
data.condition.value_counts()


Drop hybrid models then drop column

In [None]:
# Keep rows where Hybrid equals 0, then remove the Hybrid column itself.
data = data[data["Hybrid"].eq(0)].drop(columns="Hybrid")

Keep gas-fuelled vehicles

In [None]:
# Keep only rows whose fuel is exactly "gas".
data = data[data["fuel"].eq("gas")]

Drop vehicles in fair and new condition (trucks are dropped in a later step)

In [None]:
# Remove rows whose condition is "new" or "fair".
data = data[~data["condition"].isin(["new", "fair"])]

Drop unrealistic values for price and odometer reading


In [None]:
# Keep rows whose price is one of the integers 500..25000 and whose odometer
# reading is at most 100.
# NOTE(review): the threshold of 100 suggests odometer is stored in a rescaled
# unit — confirm against the data dictionary.
data = data.loc[
    lambda df: df["price"].isin(range(500, 25001)) & df["odometer"].le(100)
]

Drop if price is below 1000 and either condition is like new or age is less than 8


In [None]:
# Remove rows priced below 1000 that are also either "like new" or younger
# than 8 (in the units of the age column).
data = data[
    ~(
        data["price"].lt(1000)
        & (data["condition"].eq("like new") | data["age"].lt(8))
    )
]

In [None]:
# Remove rows with a manual transmission.
data = data[data["transmission"].ne("manual")]

Drop if truck

In [None]:
# Remove rows whose type is "truck" or "pickup".
data = data[~data["type"].isin(["truck", "pickup"])]

Drop price string

In [None]:
# Remove the pricestr column.
data = data.drop(columns="pricestr")

To be on the safe side, drop NA prices

In [None]:
# Discard any row whose price is missing.
data = data.dropna(subset=["price"])

### Data generation & descriptives

Variables we are interested in:
   
       price age odometer + condition cylinder dealer city LE

Condition

In [None]:
# 0/1 indicator columns for three condition levels.
data["cond_excellent"] = data["condition"].eq("excellent").astype(int)
data["cond_good"] = data["condition"].eq("good").astype(int)
data["cond_likenew"] = data["condition"].eq("like new").astype(int)

Cylinders

In [None]:
# Frequency table of the cylinders categories.
data["cylinders"].value_counts()

In [None]:
# 0/1 indicator for the "6 cylinders" category, followed by its frequency table.
data["cylind6"] = data["cylinders"].eq("6 cylinders").astype(int)
data["cylind6"].value_counts()

Chicago


In [None]:
# 0/1 indicator for rows whose area is "chicago".
data["chicago"] = data["area"].eq("chicago").astype(int)

age: quadratic, cubic

In [None]:
# Squared and cubed age terms.
data["agesq"] = data["age"].pow(2)
data["agecu"] = data["age"].pow(3)

odometer quadratic

In [None]:
# Quadratic odometer term. The exponent must be 2 so the column matches its
# "sq" name and the agesq (age ** 2) convention; it was previously cubed.
data["odometersq"] = data["odometer"] ** 2

Take a look at descriptives

In [None]:
# Print skimpy's summary of the prepared frame.
skim(data)

In [None]:
# Summary statistics of price in the prepared sample.
data.price.describe()

In [None]:
# Histogram of price (10 bins, the pandas default).
data["price"].hist(bins=10)

In [None]:
# Histogram of the natural log of price.
np.log(data["price"]).hist()

Save data for prediction

In [None]:
# Make sure the output folder exists, then write the prepared sample without
# the index column.
os.makedirs(name="data", exist_ok=True)
data.to_csv(path_or_buf="data/usedcars_cart_work.csv", index=False)