# Step 1: Import Libraries

In [5]:
# Import Libraries
from pathlib import Path
import pandas as pd

# Step 2: Import the csv's into DataFrames

In [6]:
# identify path to raw data
csvpath_v5 = Path('./raw_data/vehicles_v5.csv')
csvpath_v6 = Path('./raw_data/vehicles_v6.csv')
csvpath_v7 = Path('./raw_data/vehicles_v7.csv')
csvpath_v9 = Path('./raw_data/vehicles_v9.csv')
csvpath_v10 = Path('./raw_data/vehicles_v10.csv')

In [7]:
# load datasets into DataFrames
vehicles_v5_df = pd.read_csv(csvpath_v5)
vehicles_v6_df = pd.read_csv(csvpath_v6)
vehicles_v7_df = pd.read_csv(csvpath_v7)
vehicles_v9_df = pd.read_csv(csvpath_v9)
vehicles_v10_df = pd.read_csv(csvpath_v10)

# Step 3: Prepare Data for Analysis

## Step 3a: Create "**uid**" column for index by parsing "**url**" column

In [None]:
# create 'uid' column based on parse of 'url' column
vehicles_v5_df['uid'] = vehicles_v5_df['url'].apply(lambda x: x.split('.html')[0][-10:])
vehicles_v6_df['uid'] = vehicles_v6_df['url'].apply(lambda x: x.split('.html')[0][-10:])
vehicles_v7_df['uid'] = vehicles_v7_df['url'].apply(lambda x: x.split('.html')[0][-10:])
vehicles_v9_df['uid'] = vehicles_v9_df['url'].apply(lambda x: x.split('.html')[0][-10:])
vehicles_v10_df['uid'] = vehicles_v10_df['url'].apply(lambda x: x.split('.html')[0][-10:])

## Step 3b: Drop columns irrelevant to analysis

In [None]:
# drop irrelevant columns from each version of the dataset

# (oldest) VERSION 5: url/image_url
vehicles_v5_df.drop(columns='url',inplace=True)
vehicles_v5_df.drop(columns='image_url',inplace=True)

# VERSION 6: url/city_url/image_url
vehicles_v6_df.drop(columns='url',inplace=True)
vehicles_v6_df.drop(columns='city_url',inplace=True)
vehicles_v6_df.drop(columns='image_url',inplace=True)

# VERSION 7: url/city_url/image_url
vehicles_v7_df.drop(columns='url',inplace=True)
vehicles_v7_df.drop(columns='city_url',inplace=True)
vehicles_v7_df.drop(columns='image_url',inplace=True)

# VERSION 9: id/url/region_url/image_url/county/state
vehicles_v9_df.drop(columns='id',inplace=True)
vehicles_v9_df.drop(columns='url',inplace=True)
vehicles_v9_df.drop(columns='region_url',inplace=True)
vehicles_v9_df.drop(columns='image_url',inplace=True)
vehicles_v9_df.drop(columns='county',inplace=True)
vehicles_v9_df.drop(columns='state',inplace=True)

# VERSION 10: id/url/region_url/image_url/county/state
vehicles_v10_df.drop(columns='id',inplace=True)
vehicles_v10_df.drop(columns='url',inplace=True)
vehicles_v10_df.drop(columns='region_url',inplace=True)
vehicles_v10_df.drop(columns='image_url',inplace=True)
vehicles_v10_df.drop(columns='county',inplace=True)
vehicles_v10_df.drop(columns='state',inplace=True)

## Step 3c: Rename '**description**' column to '**desc**' in versions 9/10

In [None]:
# rename 'description' to 'desc'
vehicles_v9_df = vehicles_v9_df.rename(columns={'description':'desc'})
vehicles_v10_df = vehicles_v10_df.rename(columns={'description':'desc'})

## Step 3d: Set index to '**uid**' column

In [None]:
# set 'uid' as index
vehicles_v5_df.set_index('uid')
vehicles_v6_df.set_index('uid')
vehicles_v7_df.set_index('uid')
vehicles_v9_df.set_index('uid')
vehicles_v10_df.set_index('uid')

## Step 3e: Verify column names for each version of the dataset

In [79]:
print(f'VERSION 5:\n{vehicles_v5_df.columns.values}')
print(f'VERSION 6:\n{vehicles_v6_df.columns.values}')
print(f'VERSION 7:\n{vehicles_v7_df.columns.values}')
print(f'VERSION 9:\n{vehicles_v9_df.columns.values}')
print(f'VERSION 10:\n{vehicles_v10_df.columns.values}')

VERSION 5:
['city' 'price' 'year' 'manufacturer' 'make' 'condition' 'cylinders'
 'fuel' 'odometer' 'title_status' 'transmission' 'VIN' 'drive' 'size'
 'type' 'paint_color' 'lat' 'long' 'uid']
VERSION 6:
['city' 'price' 'year' 'manufacturer' 'make' 'condition' 'cylinders'
 'fuel' 'odometer' 'title_status' 'transmission' 'VIN' 'drive' 'size'
 'type' 'paint_color' 'desc' 'lat' 'long' 'uid']
VERSION 7:
['city' 'price' 'year' 'manufacturer' 'make' 'condition' 'cylinders'
 'fuel' 'odometer' 'title_status' 'transmission' 'VIN' 'drive' 'size'
 'type' 'paint_color' 'desc' 'lat' 'long' 'uid']
VERSION 9:
['region' 'price' 'year' 'manufacturer' 'model' 'condition' 'cylinders'
 'fuel' 'odometer' 'title_status' 'transmission' 'VIN' 'drive' 'size'
 'type' 'paint_color' 'description' 'lat' 'long' 'uid']
VERSION 10:
['region' 'price' 'year' 'manufacturer' 'model' 'condition' 'cylinders'
 'fuel' 'odometer' 'title_status' 'transmission' 'VIN' 'drive' 'size'
 'type' 'paint_color' 'description' 'lat' 'long

<span style='color:cyan'>generate timestamp string for each of the datasets that do not already contain one</span>

- ```vehicles_v5_df['posting_date'] = '2018-10-31'```
    - date derived from content description at [link](https://www.kaggle.com/austinreese/craigslist-carstrucks-data/version/5)

- ```vehicles_v6_df['posting_date'] = '2019-06-09'```
    - date taken from version timestamp at [link](https://www.kaggle.com/austinreese/craigslist-carstrucks-data/version/6)

- ```vehicles_v7_df['posting_date'] = '2019-07-14'```
    - date taken from version timestamp at [link](https://www.kaggle.com/austinreese/craigslist-carstrucks-data/version/7)

<span style='color:cyan'>reminder that v8 didn't have any data associated with it's page</span>

- ```vehicles_v9_df['posting_date'] = '2021-04-19'```
    - date taken from version timestamp at [link](https://www.kaggle.com/austinreese/craigslist-carstrucks-data/version/9)

<span style='color:cyan'>reminder that v10 already has a 'posting_date' value for nearly all entries but the time needs to be stripped to include only the date</span>