# Data Cleaning

1. Import Data

Data Cleaning Steps:
- export raw_combined_dataset
- upload for team access
- create filtered_combined_dataset
    - identify unique id column(s?) ['id','url']
    - add new from v6 to v5 with sql-join
    - remove from v5 what is not in v6 ; for every row in v5 ...
- export filtered_combined_dataset to be the "work_data" (newfile)

In [1]:
# Import Libraries
from pathlib import Path
import pandas as pd

In [2]:
# identify path to raw data
csvpath_v10 = Path('./raw_data/vehicles_v10.csv')
csvpath_v9 = Path('./raw_data/vehicles_v9.csv')
csvpath_v7 = Path('./raw_data/vehicles_v7.csv')
csvpath_v6 = Path('./raw_data/vehicles_v6.csv')
csvpath_v5 = Path('./raw_data/vehicles_v5.csv')

In [3]:
# load datasets into DataFrames
vehicles_v10_df = pd.read_csv(csvpath_v10)
vehicles_v9_df = pd.read_csv(csvpath_v9)
vehicles_v7_df = pd.read_csv(csvpath_v7)
vehicles_v6_df = pd.read_csv(csvpath_v6)
vehicles_v5_df = pd.read_csv(csvpath_v5)

In [4]:
added_names_list = []       # list of the added column names
combined_names_list = []    # list of all of the total column names

In [5]:
original_column_names = vehicles_v5_df.columns.values
combined_names_list.extend(vehicles_v5_df.columns.values)

In [6]:
# list of lists containing all subsequent versions column names
subsequent_columns_lists = [
    vehicles_v6_df.columns.values,
    vehicles_v7_df.columns.values,
    vehicles_v9_df.columns.values,
    vehicles_v10_df.columns.values
]

In [7]:
# generate list of column name changes
for list in subsequent_columns_lists:
    for name in list:
        if name not in combined_names_list:
            added_names_list.append(name)
            combined_names_list.append(name)

In [8]:
# sort lists of column name values
original_column_names.sort()
added_names_list.sort()
combined_names_list.sort()

In [9]:
# print results
print(f'ORIGINAL LIST:\n{original_column_names}')
print('----------------------------------------------------------------------------------------------------')
print(f'ADDED COLUMNS LIST:\n{added_names_list}')
print('----------------------------------------------------------------------------------------------------')
print(f'FINAL LIST:\n{combined_names_list}')


ORIGINAL LIST:
['VIN' 'city' 'condition' 'cylinders' 'drive' 'fuel' 'image_url' 'lat'
 'long' 'make' 'manufacturer' 'odometer' 'paint_color' 'price' 'size'
 'title_status' 'transmission' 'type' 'url' 'year']
----------------------------------------------------------------------------------------------------
ADDED COLUMNS LIST:
['city_url', 'county', 'desc', 'description', 'id', 'model', 'posting_date', 'region', 'region_url', 'state']
----------------------------------------------------------------------------------------------------
FINAL LIST:
['VIN', 'city', 'city_url', 'condition', 'county', 'cylinders', 'desc', 'description', 'drive', 'fuel', 'id', 'image_url', 'lat', 'long', 'make', 'manufacturer', 'model', 'odometer', 'paint_color', 'posting_date', 'price', 'region', 'region_url', 'size', 'state', 'title_status', 'transmission', 'type', 'url', 'year']


In [10]:
# TODO: START HERE --- DOUBLE CHECK METHOD FOR CHECKING FOR DUPLICATES
# check for duplicated values in the "url" column
display(vehicles_v5_df.loc[:,'url'].duplicated().sum())
display(vehicles_v6_df.loc[:,'url'].duplicated().sum())
display(vehicles_v7_df.loc[:,'url'].duplicated().sum())
display(vehicles_v9_df.loc[:,'url'].duplicated().sum())
display(vehicles_v10_df.loc[:,'url'].duplicated().sum())

print(vehicles_v5_df.loc[:,'url'])

542860

0

0

0

0

0         37.132840
1         35.777999
2         36.333900
3         36.000092
4         36.272932
            ...    
677807    61.288605
677808    61.592300
677809    61.190000
677810    61.630300
677811    61.546300
Name: url, Length: 677812, dtype: float64


In [11]:
# concatenate all url's into a single list to verify appropriate unique identifier
combined_url_df = pd.DataFrame()
print(combined_url_df.dtypes)
combined_url_df.append(vehicles_v5_df.loc[:,'url'], ignore_index= True)
print(f'length: {len(combined_url_df.index)}')
combined_url_df.append(vehicles_v6_df.loc[:,'url'], ignore_index= True)
print(f'length: {len(combined_url_df.index)}')
combined_url_df.append(vehicles_v7_df.loc[:,'url'], ignore_index= True)
print(f'length: {len(combined_url_df.index)}')
combined_url_df.append(vehicles_v9_df.loc[:,'url'], ignore_index= True)
print(f'length: {len(combined_url_df.index)}')
combined_url_df.append(vehicles_v10_df.loc[:,'url'], ignore_index= True)
print(f'length: {len(combined_url_df.index)}')
print(combined_url_df.dtypes)

Series([], dtype: object)
length: 0
length: 0
length: 0
length: 0
length: 0
Series([], dtype: object)


In [12]:
# list column names of the DataFrame
'''
display('VERSION 5')
display(vehicles_v5_df.columns)

display('VERSION 6')
display(vehicles_v6_df.columns)

display('VERSION 7')
display(vehicles_v7_df.columns)

display('VERSION 9')
display(vehicles_v9_df.columns)

display('VERSION 10:')
display(vehicles_v10_df.columns)
'''

original_columns_list = list(vehicles_v5_df.columns)

subsequent_columns_dict = {
    'v6': vehicles_v6_df.columns.values,
    'v7': vehicles_v7_df.columns.values,
    'v9': vehicles_v9_df.columns.values,
    'v10': vehicles_v10_df.columns.values
    }



TypeError: 'numpy.ndarray' object is not callable

In [None]:
combined_columns_list = []  # create list to hold final list of column names for iterative comparison
version_additions = []      # list of names to transfer to log in the changes list
changes_list = []           # list of lists to hold changes associated with each progressive version

In [None]:
# add original column names to combined list
combined_columns_list.extend(original_columns_list)

In [None]:
"""


'''
for each column name in each version, determine if it is an addition over the previous version
and make note if it is a new column name
'''

# add each column name not already appearing in a previous version
# and associate with respective version entry
for version, column_names in subsequent_columns_dict.items():
    for name in column_names:
        # print(f'{version}: {name}')
        if name not in combined_columns_list:
            # print(f'current iteration:\n\tlist: {version}\n\tvalue: {name}')
            version_additions.append(name)
            combined_columns_list.append(name)
    changes_list.append([version,version_additions])

# print(f'ORIGINAL LIST:\n{original_columns_list}')
for entry in changes_list:
    print(f'Changes in VERSION: {entry[0]}\n{entry[1]}')
"""

Changes in VERSION: v6
[]
Changes in VERSION: v7
[]
Changes in VERSION: v9
[]
Changes in VERSION: v10
[]
Changes in VERSION: v6
[]
Changes in VERSION: v7
[]
Changes in VERSION: v9
[]
Changes in VERSION: v10
[]
Changes in VERSION: v6
[]
Changes in VERSION: v7
[]
Changes in VERSION: v9
[]
Changes in VERSION: v10
[]
Changes in VERSION: v6
[]
Changes in VERSION: v7
[]
Changes in VERSION: v9
[]
Changes in VERSION: v10
[]


In [None]:
# drop listings not containing posting_date/lat/long values
# data sourcing, data review, and set up final working files

# User Stories:
- What should the project be about?
    - first requirement (first 5 or so)
- Data Analysis:
    - what do we want to show?
- Data Presentation:
    - what viz do we want to see?
        - together we decide--

- also still need to pull data on interest rates/lending rates (if possible)
    - need to know whether or not it's possible

Acceptance Criteria:
    - Used cars in West LA Example

Specifically have only 4 or 5 high lievel items to present: Where are the opportunities?
    should we be excited in the used car sales market in x place, yes or no??
    Something of value has to come out of it
