# Step 1: Import Libraries

In [1]:
# Import Libraries
from pathlib import Path
import pandas as pd

# Step 2: Import the csv's into DataFrames

In [2]:
# identify path to raw data
csvpath_v5 = Path('./raw_data/vehicles_v5.csv')
csvpath_v6 = Path('./raw_data/vehicles_v6.csv')
csvpath_v7 = Path('./raw_data/vehicles_v7.csv')
csvpath_v9 = Path('./raw_data/vehicles_v9.csv')
csvpath_v10 = Path('./raw_data/vehicles_v10.csv')

In [3]:
# load datasets into DataFrames
vehicles_v5_df = pd.read_csv(csvpath_v5)
vehicles_v6_df = pd.read_csv(csvpath_v6)
vehicles_v7_df = pd.read_csv(csvpath_v7)
vehicles_v9_df = pd.read_csv(csvpath_v9)
vehicles_v10_df = pd.read_csv(csvpath_v10)

# Step 3: Prepare Data for Analysis

## Step 3a: Create "**uid**" column for index by parsing "**url**" column

In [4]:
# create 'uid' column based on parse of 'url' column
vehicles_v5_df['uid'] = vehicles_v5_df['url'].apply(lambda x: x.split('.html')[0][-10:])
vehicles_v6_df['uid'] = vehicles_v6_df['url'].apply(lambda x: x.split('.html')[0][-10:])
vehicles_v7_df['uid'] = vehicles_v7_df['url'].apply(lambda x: x.split('.html')[0][-10:])
vehicles_v9_df['uid'] = vehicles_v9_df['url'].apply(lambda x: x.split('.html')[0][-10:])
vehicles_v10_df['uid'] = vehicles_v10_df['url'].apply(lambda x: x.split('.html')[0][-10:])

## Step 3b: Drop columns irrelevant to analysis

In [5]:
# drop irrelevant columns from each version of the dataset

# (oldest) VERSION 5: url/image_url
vehicles_v5_df.drop(columns='url',inplace=True)
vehicles_v5_df.drop(columns='image_url',inplace=True)

# VERSION 6: url/city_url/image_url
vehicles_v6_df.drop(columns='url',inplace=True)
vehicles_v6_df.drop(columns='city_url',inplace=True)
vehicles_v6_df.drop(columns='image_url',inplace=True)

# VERSION 7: url/city_url/image_url
vehicles_v7_df.drop(columns='url',inplace=True)
vehicles_v7_df.drop(columns='city_url',inplace=True)
vehicles_v7_df.drop(columns='image_url',inplace=True)

# VERSION 9: id/url/region_url/image_url/county/state
vehicles_v9_df.drop(columns='id',inplace=True)
vehicles_v9_df.drop(columns='url',inplace=True)
vehicles_v9_df.drop(columns='region_url',inplace=True)
vehicles_v9_df.drop(columns='image_url',inplace=True)
vehicles_v9_df.drop(columns='county',inplace=True)
vehicles_v9_df.drop(columns='state',inplace=True)

# VERSION 10: id/url/region_url/image_url/county/state
vehicles_v10_df.drop(columns='id',inplace=True)
vehicles_v10_df.drop(columns='url',inplace=True)
vehicles_v10_df.drop(columns='region_url',inplace=True)
vehicles_v10_df.drop(columns='image_url',inplace=True)
vehicles_v10_df.drop(columns='county',inplace=True)
vehicles_v10_df.drop(columns='state',inplace=True)

## Step 3c: Rename '**description**' column to '**desc**' in versions 9/10

In [6]:
# rename 'description' to 'desc'
vehicles_v9_df = vehicles_v9_df.rename(columns={'description':'desc'})
vehicles_v10_df = vehicles_v10_df.rename(columns={'description':'desc'})

## Step 3d: Set index to '**uid**' column

In [7]:
# set 'uid' as index
vehicles_v5_df.set_index('uid')
vehicles_v6_df.set_index('uid')
vehicles_v7_df.set_index('uid')
vehicles_v9_df.set_index('uid')
vehicles_v10_df.set_index('uid')

Unnamed: 0_level_0,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,VIN,drive,size,type,paint_color,desc,lat,long,posting_date
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7222695916,prescott,6000,,,,,,,,,,,,,,,,,,
7218891961,fayetteville,11900,,,,,,,,,,,,,,,,,,
7221797935,florida keys,21000,,,,,,,,,,,,,,,,,,
7222270760,worcester / central MA,1500,,,,,,,,,,,,,,,,,,
7210384030,greensboro,4900,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7301591192,wyoming,23590,2019.0,nissan,maxima s sedan 4d,good,6 cylinders,gas,32226.0,clean,other,1N4AA6AV6KC367801,fwd,,sedan,,Carvana is the safer way to buy a car During t...,33.786500,-84.445400,2021-04-04T03:21:31-0600
7301591187,wyoming,30590,2020.0,volvo,s60 t5 momentum sedan 4d,good,,gas,12029.0,clean,other,7JR102FKXLG042696,fwd,,sedan,red,Carvana is the safer way to buy a car During t...,33.786500,-84.445400,2021-04-04T03:21:29-0600
7301591147,wyoming,34990,2020.0,cadillac,xt4 sport suv 4d,good,,diesel,4174.0,clean,other,1GYFZFR46LF088296,,,hatchback,white,Carvana is the safer way to buy a car During t...,33.779214,-84.411811,2021-04-04T03:21:17-0600
7301591140,wyoming,28990,2018.0,lexus,es 350 sedan 4d,good,6 cylinders,gas,30112.0,clean,other,58ABK1GG4JU103853,fwd,,sedan,silver,Carvana is the safer way to buy a car During t...,33.786500,-84.445400,2021-04-04T03:21:11-0600


# Step 4: Normalize columns across versions of the dataset

## Step 4a: take stock of the current sizes of the versions of the dataset

In [13]:
print(f'VERSION 5:\t{len(vehicles_v5_df.columns.values)} columns\n\n{vehicles_v5_df.columns.values}\n')
print(f'VERSION 6:\t{len(vehicles_v6_df.columns.values)} columns\n\n{vehicles_v6_df.columns.values}\n')
print(f'VERSION 7:\t{len(vehicles_v7_df.columns.values)} columns\n\n{vehicles_v7_df.columns.values}\n')
print(f'VERSION 9:\t{len(vehicles_v9_df.columns.values)} columns\n\n{vehicles_v9_df.columns.values}\n')
print(f'VERSION 10:\t{len(vehicles_v10_df.columns.values)} columns\n\n{vehicles_v10_df.columns.values}\n')

VERSION 5:	19 columns

['city' 'price' 'year' 'manufacturer' 'make' 'condition' 'cylinders'
 'fuel' 'odometer' 'title_status' 'transmission' 'VIN' 'drive' 'size'
 'type' 'paint_color' 'lat' 'long' 'uid']

VERSION 6:	20 columns

['city' 'price' 'year' 'manufacturer' 'make' 'condition' 'cylinders'
 'fuel' 'odometer' 'title_status' 'transmission' 'VIN' 'drive' 'size'
 'type' 'paint_color' 'desc' 'lat' 'long' 'uid']

VERSION 7:	20 columns

['city' 'price' 'year' 'manufacturer' 'make' 'condition' 'cylinders'
 'fuel' 'odometer' 'title_status' 'transmission' 'VIN' 'drive' 'size'
 'type' 'paint_color' 'desc' 'lat' 'long' 'uid']

VERSION 9:	20 columns

['region' 'price' 'year' 'manufacturer' 'model' 'condition' 'cylinders'
 'fuel' 'odometer' 'title_status' 'transmission' 'VIN' 'drive' 'size'
 'type' 'paint_color' 'desc' 'lat' 'long' 'uid']

VERSION 10:	21 columns

['region' 'price' 'year' 'manufacturer' 'model' 'condition' 'cylinders'
 'fuel' 'odometer' 'title_status' 'transmission' 'VIN' 'driv

## Step 4b: Add missing columns to versions of the dataset

In [16]:
# add timestamp from data versions were scraped

# data was scraped in October of 2018, unlike other dates was posted online November 21st 2018;
# assumption made to be last day of the month in October to cover entire month,
# but still listed to be October since only post stating date in content description
vehicles_v5_df['posting_date'] = '2018-10-31'
vehicles_v5_df['desc'] = None
vehicles_v6_df['posting_date'] = '2019-06-09'
vehicles_v7_df['posting_date'] = '2019-07-14'
vehicles_v9_df['posting_date'] = '2021-04-19'

<span style='color:cyan'>generate timestamp string for each of the datasets that do not already contain one</span>

- date derived from content description at [link](https://www.kaggle.com/austinreese/craigslist-carstrucks-data/version/5)

- date taken from version timestamp at [link](https://www.kaggle.com/austinreese/craigslist-carstrucks-data/version/6)

- date taken from version timestamp at [link](https://www.kaggle.com/austinreese/craigslist-carstrucks-data/version/7)

<span style='color:cyan'>reminder that v8 didn't have any data associated with it's page</span>

- date taken from version timestamp at [link](https://www.kaggle.com/austinreese/craigslist-carstrucks-data/version/9)

<span style='color:cyan'>reminder that v10 already has a 'posting_date' value for nearly all entries but the time needs to be stripped to include only the date</span>

In [17]:
print(f'VERSION 5:\t{len(vehicles_v5_df.columns.values)} columns\n\n{vehicles_v5_df.columns.values}\n')
print(f'VERSION 6:\t{len(vehicles_v6_df.columns.values)} columns\n\n{vehicles_v6_df.columns.values}\n')
print(f'VERSION 7:\t{len(vehicles_v7_df.columns.values)} columns\n\n{vehicles_v7_df.columns.values}\n')
print(f'VERSION 9:\t{len(vehicles_v9_df.columns.values)} columns\n\n{vehicles_v9_df.columns.values}\n')
print(f'VERSION 10:\t{len(vehicles_v10_df.columns.values)} columns\n\n{vehicles_v10_df.columns.values}\n')

VERSION 5:	21 columns

['city' 'price' 'year' 'manufacturer' 'make' 'condition' 'cylinders'
 'fuel' 'odometer' 'title_status' 'transmission' 'VIN' 'drive' 'size'
 'type' 'paint_color' 'lat' 'long' 'uid' 'posting_date' 'desc']

VERSION 6:	21 columns

['city' 'price' 'year' 'manufacturer' 'make' 'condition' 'cylinders'
 'fuel' 'odometer' 'title_status' 'transmission' 'VIN' 'drive' 'size'
 'type' 'paint_color' 'desc' 'lat' 'long' 'uid' 'posting_date']

VERSION 7:	21 columns

['city' 'price' 'year' 'manufacturer' 'make' 'condition' 'cylinders'
 'fuel' 'odometer' 'title_status' 'transmission' 'VIN' 'drive' 'size'
 'type' 'paint_color' 'desc' 'lat' 'long' 'uid' 'posting_date']

VERSION 9:	21 columns

['region' 'price' 'year' 'manufacturer' 'model' 'condition' 'cylinders'
 'fuel' 'odometer' 'title_status' 'transmission' 'VIN' 'drive' 'size'
 'type' 'paint_color' 'desc' 'lat' 'long' 'uid' 'posting_date']

VERSION 10:	21 columns

['region' 'price' 'year' 'manufacturer' 'model' 'condition' 'cyli