# Creating fake data for car_sales (to make it a bit bigger)

This notebook manufactures extra rows for the car_sales dataframe so it is large enough to demonstrate techniques for handling missing data and for converting categorical values to numbers.

In [1]:
import pandas as pd
import numpy as np

# Load the small original dataset that the manufactured data is modelled on
car_sales = pd.read_csv(filepath_or_buffer='../data/car-sales.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../data/car-sales.csv'

In [None]:
# Display the car_sales frame loaded from CSV
car_sales

In [None]:
# Distinct values present in the "Make" column
car_sales["Make"].unique()

In [None]:
# Number of rows per "Make" value
car_sales["Make"].value_counts()

## Create fake "Make" data

In [None]:
# Create fake "Make" data
#
# The four make lists must total 1000 entries, because the colour, odometer,
# doors and price columns each hold 1000 values and are assigned to the same
# dataframe: 398 Toyota + 304 Honda + 198 Nissan + 100 BMW = 1000.
toyota = ["Toyota"] * 398
len(toyota), toyota[:10]

In [None]:
# 304 repeated "Honda" labels
honda = ["Honda"] * 304
len(honda), honda[:10]

In [None]:
# 198 repeated "Nissan" labels
nissan = ["Nissan"] * 198
len(nissan), nissan[:10]

In [None]:
# 100 repeated "BMW" labels
bmw = ["BMW"] * 100
len(bmw), bmw[:10]

In [None]:
# Concatenate the per-make lists in the order BMW, Nissan, Toyota, Honda
makes = [*bmw, *nissan, *toyota, *honda]
len(makes)

## Create fake "Colour" data

In [None]:
# Distinct values present in the "Colour" column
car_sales["Colour"].unique()

In [None]:
# Number of rows per "Colour" value
car_sales["Colour"].value_counts()

In [None]:
# 407 repeated "White" labels
white = ["White"] * 407
len(white), white[:3]

In [None]:
# 321 repeated "Blue" labels
blue = ["Blue"] * 321
len(blue), blue[:3]

In [None]:
# 79 repeated "Green" labels
green = ["Green"] * 79
len(green), green[:3]

In [None]:
# 99 repeated "Black" labels
black = ["Black"] * 99
len(black), black[:3]

In [None]:
# 94 repeated "Red" labels
red = ["Red"] * 94
len(red), red[:3]

In [None]:
# Concatenate the per-colour lists in the order White, Blue, Green, Black, Red
colours = [*white, *blue, *green, *black, *red]
len(colours)

In [None]:
import random

# Seed the stdlib RNG before its first use so that, under Restart & Run All,
# this shuffle and the later random.randint / random.sample draws in the
# notebook produce the same values on every run.
random.seed(42)

# random.sample with k == len(colours) returns a shuffled copy and leaves
# `colours` itself untouched.
colours_shuffled = random.sample(colours, len(colours))
len(colours_shuffled), colours_shuffled[:10]

## Create fake Odometer (KM) data

In [None]:
# Display car_sales again as a reference before generating odometer values
car_sales

In [None]:
# 1000 random integer odometer readings, each drawn from [9789, 250000] inclusive
odometer = [random.randint(9789, 250000) for _ in range(1000)]
len(odometer), odometer[:10]

## Create fake "Doors" data

In [None]:
# Door counts: 79 five-door, 65 three-door and 856 four-door cars (1000 total)
five_doors = [5] * 79
three_doors = [3] * 65
four_doors = [4] * 856
doors = [*five_doors, *three_doors, *four_doors]

# Shuffled copy; `doors` itself keeps its grouped order
doors_shuffled = random.sample(doors, k=len(doors))

In [None]:
# Show the length and a short sample rather than dumping all 1000 values,
# matching how the other generated lists are previewed in this notebook.
len(doors_shuffled), doors_shuffled[:10]

## Create fake "Price" data

In [None]:
# Wrap the make labels in a Series to count how many rows each make has
makes_series = pd.Series(data=makes)
makes_series.value_counts()

In [None]:
# Display car_sales again as a reference before generating prices
car_sales

In [None]:
# Rows of car_sales whose Make is Toyota
car_sales.loc[car_sales["Make"] == "Toyota"]

In [None]:
# Rows of car_sales whose Make is Honda
car_sales.loc[car_sales["Make"] == "Honda"]

In [None]:
# Rows of car_sales whose Make is Nissan
car_sales.loc[car_sales["Make"] == "Nissan"]

In [None]:
# 1000 random integer base prices, each drawn from [5000, 30000] inclusive
prices = [random.randint(5000, 30000) for _ in range(1000)]
len(prices), prices[:30]

## Create base dataframe with manufactured data

In [None]:
# Empty frame carrying the five column names that are filled in the next cell
fake_sales = pd.DataFrame(
    columns=[
        "Make",
        "Colour",
        "Odometer (KM)",
        "Doors",
        "Price",
    ]
)
fake_sales

In [None]:
# Fill each column from the lists generated above.
# `makes` is grouped by make (BMW, Nissan, Toyota, Honda), so every other
# column must come from a randomised list; otherwise its values line up with
# the make blocks. Doors therefore uses `doors_shuffled`, not the grouped
# `doors` list (which would give the first 79 rows, all BMW, five doors).
fake_sales["Make"] = makes
fake_sales["Colour"] = colours_shuffled
fake_sales["Odometer (KM)"] = odometer
fake_sales["Doors"] = doors_shuffled
fake_sales["Price"] = prices

In [None]:
# First five rows of the populated frame
fake_sales.head(n=5)

## Adjust the price column

For the price column:
* Generate random base prices between $5,000 and $30,000
* If the Odometer reading is between 100K and 150K, multiply price by 0.75
* If the Odometer reading is above 150K and up to 200K, multiply price by 0.6
* If the Odometer reading is above 200K, multiply price by 0.5
* If the Make is BMW, multiply price by 1.5 and add a random amount between $3,000 and $10,000
* If the Make is Toyota, multiply price by 1.2
* If the Make is Nissan, multiply price by 1.1
* If the Make is Honda, add $1000 to price

In [None]:
# Summary statistics of Price before any adjustment
fake_sales.loc[:, "Price"].describe()

In [None]:
def price_od(price, odometer):
    """
    Changes price according to Odometer values.

    Exactly one band applies (discounts do not stack):
      * odometer > 200000             -> round(price * 0.5)
      * 150000 < odometer <= 200000   -> round(price * 0.6)
      * 100000 <= odometer <= 150000  -> round(price * 0.75)
      * odometer < 100000             -> price returned unchanged (not rounded)

    The bands are contiguous, so non-integer readings such as 150000.5 are
    discounted too, rather than falling between two integer-bounded ranges.

    Parameters
    ----------
    price : int or float
        Price before the odometer adjustment.
    odometer : int or float
        Odometer reading in kilometres.

    Returns
    -------
    int or float
        The rounded discounted price, or `price` itself when no band applies.
    """
    # Check the highest threshold first so each reading matches one band only.
    if odometer > 200000:
        return round(price * 0.5)
    if odometer > 150000:
        return round(price * 0.6)
    if odometer >= 100000:
        return round(price * 0.75)
    return price

def _odometer_adjusted_price(row):
    """Return the row's Price after the odometer-based adjustment."""
    return price_od(row["Price"], row["Odometer (KM)"])


# Overwrites Price in place, so re-running this cell applies the adjustment again.
fake_sales["Price"] = fake_sales.apply(_odometer_adjusted_price, axis=1)

fake_sales["Price"].describe()

In [None]:
def price_make(price, make):
    """
    Adjust a price according to the car's make.

    * "BMW"    -> round(price * 1.5 + a random integer in [3000, 10000])
    * "Toyota" -> round(price * 1.2)
    * "Nissan" -> round(price * 1.1)
    * "Honda"  -> round(price + 1000)
    * any other make -> price returned unchanged (not rounded)
    """
    if make == "BMW":
        scaled = price * 1.5
        return round(scaled + random.randint(3000, 10000))
    if make == "Toyota":
        return round(price * 1.2)
    if make == "Nissan":
        return round(price * 1.1)
    if make == "Honda":
        return round(price + 1000)
    return price

def _make_adjusted_price(row):
    """Return the row's Price after the make-based adjustment."""
    return price_make(row["Price"], row["Make"])


# Overwrites Price in place, so re-running this cell applies the adjustment again.
fake_sales["Price"] = fake_sales.apply(_make_adjusted_price, axis=1)

fake_sales["Price"].describe()

In [None]:
# Shuffle all rows so the makes are no longer grouped in blocks.
# A fixed random_state keeps the row order the same on every run.
fake_sales = fake_sales.sample(frac=1, random_state=42)

In [None]:
# Replace the shuffled index with a fresh 0..n-1 index, discarding the old one.
# Rebinding the name instead of using inplace=True follows the pandas idiom.
fake_sales = fake_sales.reset_index(drop=True)
fake_sales.head(10)

# NEXT:
* Drop some values at random (to manufacture missing data)
* Build a random forest model to predict (this will involve changing categories to numerical data)

In [None]:
# Export the data
# index=False keeps the row index out of the file; otherwise reading it back
# with pd.read_csv adds an extra "Unnamed: 0" column that would carry through
# to the missing-data export at the end of the notebook.
fake_sales.to_csv("../data/car-sales-extended.csv", index=False)

## Make missing data in car_sales_extended

In [None]:
# Reload the extended dataset that was exported above
sales_ext = pd.read_csv(filepath_or_buffer="../data/car-sales-extended.csv")

In [None]:
# Number of rows in the reloaded frame
sales_ext.shape[0]

In [None]:
# Display the reloaded extended dataset
sales_ext

### What we want to do
* Remove some row values, or replace them, at random
    * E.g. replace strings with empty strings ("")
    * And numbers with NaN or something similar...
* Want to keep the number of samples the same, order the same, just put some holes in it

One way to do it would be to generate 50 random integers for each column and then drop/replace the values at those indices.

In [None]:
# Replicate the df
# .copy() creates an independent frame; plain assignment would only bind a
# second name to the same object, so blanking values below would also
# modify `sales_ext`.
sales_ext_dropped = sales_ext.copy()

In [None]:
# Make column: 50 seeded random row labels in [0, 1000), drawn with
# replacement, so the array can contain repeats
np.random.seed(10)
make_idx = np.random.randint(low=0, high=1000, size=50)

In [None]:
# Display the row labels chosen for blanking the Make column
make_idx

In [None]:
# Replace Make with an empty string at every selected row label in one assignment
sales_ext_dropped.loc[make_idx, "Make"] = ""

In [None]:
# Inspect the Make value at row label 266
sales_ext_dropped.loc[266, "Make"]

In [None]:
# Colour column: blank 50 seeded random row labels (drawn with replacement)
np.random.seed(42)
colour_idx = np.random.randint(low=0, high=1000, size=50)
sales_ext_dropped.loc[colour_idx, "Colour"] = ""

In [None]:
# Odometer (KM) column: set 50 seeded random row labels (drawn with replacement) to None
np.random.seed(1)
odom_idx = np.random.randint(low=0, high=1000, size=50)
sales_ext_dropped.loc[odom_idx, "Odometer (KM)"] = None

In [None]:
# Doors column: set 50 seeded random row labels (drawn with replacement) to None
np.random.seed(2)
door_idx = np.random.randint(low=0, high=1000, size=50)
sales_ext_dropped.loc[door_idx, "Doors"] = None

In [None]:
# Price column: set 50 seeded random row labels (drawn with replacement) to None
np.random.seed(3)
price_idx = np.random.randint(low=0, high=1000, size=50)
sales_ext_dropped.loc[price_idx, "Price"] = None

In [None]:
# First 50 rows, to inspect the holes that were introduced
sales_ext_dropped.head(n=50)

In [None]:
# Count the missing (NaN/None) entries in each column
sales_ext_dropped.isnull().sum()

In [None]:
# Write the frame with its randomly blanked values to CSV, leaving out the row index
sales_ext_dropped.to_csv(
    path_or_buf="../data/car-sales-extended-missing-data.csv",
    index=False,
)