In [None]:
import os

import pandas as pd
import plotly.express as px
from google.cloud import bigquery
from google.oauth2 import service_account
from pandasql import sqldf

In [None]:
# Location of the service-account key file. GOOGLE_APPLICATION_CREDENTIALS (the
# standard Google Cloud variable) takes precedence so the notebook can run on
# machines other than the original author's; when it is unset the original
# local path is used, so existing behaviour is unchanged.
SERVICE_ACCOUNT_KEY_PATH = os.environ.get(
    'GOOGLE_APPLICATION_CREDENTIALS',
    r"C:\Users\Chase\Downloads\used-car-summer-2023-project-b4807c4731d7.json",
)
PROJECT_ID = 'used-car-summer-2023-project'

credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_KEY_PATH)

# Client shared by every BigQuery read in this notebook.
bigquery_client = bigquery.Client(credentials=credentials, project=PROJECT_ID)

### **EDA Results**
##### 1. Exclude All Listings Over $50,000
##### 2. Exclude All Listings with a Model Year Before 2010
##### 3. Exclude All Listings with Mileage Over 130,000
##### 4. Exclude the Following Makes: Rivian, Polestar, Saab, Freightliner, Saturn, Suzuki, smart, Mercury, Lamborghini, McLaren, Ferrari, Pontiac, Rolls-Royce, HUMMER, Bentley, Maserati, FIAT, Genesis, Scion, Jaguar
##### 5. Exclude All Listings with the Following Exterior Colors: Beige, Teal, Copper, Maroon, Yellow, Purple
##### 6. Exclude All Listings with the Following Interior Colors: Pink, Purple, Yellow, Green, Orange, Blue
##### 7. Exclude All Models with Less Than 50 Occurrences
##### 8. Exclude All Listings with More Than 3 Accidents
##### 9. Exclude All Listings with More Than 5 Owners

In [None]:
# Read the full wrangled table from BigQuery: submit the query, wait for it to
# finish, then materialise the rows as a DataFrame.
query = bigquery_client.query(
    'SELECT * FROM `training_data.wrangled_training_data`'
).result()
wrangled_data = query.to_dataframe()

# Show the loaded frame as the cell output.
wrangled_data

In [None]:
# Cast the two columns the later filters compare against integer literals.
for int_column in ('year', 'price'):
    wrangled_data[int_column] = wrangled_data[int_column].astype(int)

In [None]:
# Build the staged training set by applying the listing-level exclusion rules
# to wrangled_data. The filters are written as one chain so no column is ever
# assigned on a filtered slice (the original did that for interior_color,
# which triggers pandas' chained-assignment warning).
#
# NOTE(review): the thresholds below are tighter than the EDA summary
# (which states $50,000 / model year 2010 / 130,000 miles) - confirm intended.
EXCLUDED_EXTERIOR_COLORS = ['Beige', 'Teal', 'Copper', 'Maroon', 'Yellow', 'Purple']
EXCLUDED_INTERIOR_COLORS = ['Pink', 'Purple', 'Yellow', 'Green', 'Orange', 'Blue']

staged_data = (
    wrangled_data
    # Keep listings priced from 8,000 up to (but not including) 49,000
    .query('price >= 8000 and price < 49000')
    # Keep model years 2011 and later, except 2024
    .query('year >= 2011 and year != 2024')
    # Keep listings with mileage below 128,000
    .query('mileage < 128000')
    # Drop listings with an unqualified exterior_color
    .loc[lambda df: ~df['exterior_color'].isin(EXCLUDED_EXTERIOR_COLORS)]
    # Strip surrounding whitespace so the interior colour comparison is exact
    .assign(interior_color=lambda df: df['interior_color'].str.strip())
    # Drop listings with an unqualified interior_color
    .loc[lambda df: ~df['interior_color'].isin(EXCLUDED_INTERIOR_COLORS)]
    # Keep listings with at most 3 accidents
    .query('num_accidents <= 3')
    # Keep listings with at most 5 owners
    .query('num_owners <= 5')
)

# Zero-owner listings are kept only for model year 2021 or later; every listing
# with at least one owner is kept. The parentheses spell out the precedence the
# original unparenthesised AND/OR condition already had.
staged_data = sqldf(
    "SELECT * FROM staged_data "
    "WHERE (num_owners = 0 AND year >= 2021) OR num_owners > 0"
)
staged_data = staged_data.reset_index(drop=True)
staged_data

In [None]:
# Drop every make/model combination that has fewer than 50 listings in
# wrangled_data (counts are taken before the listing-level filters).
#
# The grouping is on (make, model). The original `GROUP BY model` resolved to
# the raw `model` column rather than the `make || ' ' || model` alias, so
# models sharing a name across makes were pooled into one count and labelled
# with an arbitrary make.
unqualified_models = sqldf(
    "SELECT make || ' ' || model AS model, COUNT(*) AS num_listings "
    "FROM wrangled_data GROUP BY make, model HAVING num_listings < 50"
)

# Same "<make> <model>" key on the staged rows, used to match against the
# rare-model list. The column is kept on the frame, as before.
staged_data = staged_data.assign(merged=staged_data['make'] + ' ' + staged_data['model'])

# The IS NOT NULL guard matters: a single NULL inside a NOT IN list makes the
# predicate NULL for every non-matching row, which would silently drop the
# whole table.
staged_data = sqldf(
    "SELECT * FROM staged_data WHERE merged NOT IN("
    "SELECT DISTINCT model FROM unqualified_models WHERE model IS NOT NULL)"
)
staged_data

## **Post-Processing EDA**

In [None]:
# Render floats with two decimals (instead of scientific notation) for the
# rest of the notebook, then summarise the numeric columns of the staged data.
def _two_decimals(value):
    """Format a number with exactly two digits after the decimal point."""
    return '%.2f' % value

pd.set_option('display.float_format', _two_decimals)
staged_data.describe()

In [None]:
# Histogram of listing prices after filtering (at most 50 bins), grid lines off.
fig = px.histogram(staged_data['price'], nbins=50)
fig.update_layout(
    title='Price Distribution',
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
)

In [None]:
# Histogram of model years after filtering (default binning), grid lines off.
fig = px.histogram(staged_data['year'])
fig.update_layout(
    title='Model Year Distribution',
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
)

In [None]:
# Histogram of mileage after filtering (at most 100 bins), grid lines off.
fig = px.histogram(staged_data['mileage'], nbins=100)
fig.update_layout(
    title='Mileage Distribution',
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
)

In [None]:
# Count the staged listings per make.
listings_by_make = sqldf("SELECT make, COUNT(*) num_listings FROM staged_data GROUP BY make")

# Bar chart of those counts, bars ordered from most to fewest listings and
# each bar labelled with its value.
fig = px.bar(
    listings_by_make,
    x=listings_by_make['make'],
    y=listings_by_make['num_listings'],
    title="Listings by Make",
    height=550,
    text_auto=True,
)
fig.update_traces(textposition="outside", textangle=0, textfont_size=12)
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.update_yaxes(showgrid=False)
fig.show()

In [None]:
# Count the staged listings per exterior colour.
listings_by_ext = sqldf("SELECT exterior_color, COUNT(*) num_listings FROM staged_data GROUP BY exterior_color")

# Bar chart of those counts, bars ordered from most to fewest listings and
# each bar labelled with its value.
fig = px.bar(
    listings_by_ext,
    x=listings_by_ext['exterior_color'],
    y=listings_by_ext['num_listings'],
    title="Listings by Exterior Color",
    height=550,
    text_auto=True,
)
fig.update_traces(textposition="outside", textangle=0, textfont_size=12)
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.update_yaxes(showgrid=False)
fig.show()

In [None]:
# Count the staged listings per interior colour.
listings_by_int = sqldf("SELECT interior_color, COUNT(*) num_listings FROM staged_data GROUP BY interior_color")

# Bar chart of those counts, bars ordered from most to fewest listings and
# each bar labelled with its value.
fig = px.bar(
    listings_by_int,
    x=listings_by_int['interior_color'],
    y=listings_by_int['num_listings'],
    title="Listings by Interior Color",
    height=550,
    text_auto=True,
)
fig.update_traces(textposition="outside", textangle=0, textfont_size=12)
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.update_yaxes(showgrid=False)
fig.show()

In [None]:
# Count the staged listings per reported accident count.
listings_by_accidents = sqldf("SELECT num_accidents, COUNT(*) num_listings FROM staged_data GROUP BY num_accidents")

# Bar chart of those counts, each bar labelled with its value.
# NOTE(review): 'categoryorder' only reorders categorical axes; if
# num_accidents is numeric the bars stay in numeric order - confirm intended.
fig = px.bar(
    listings_by_accidents,
    x=listings_by_accidents['num_accidents'],
    y=listings_by_accidents['num_listings'],
    title="Listings by # of Accidents",
    height=550,
    text_auto=True,
)
fig.update_traces(textposition="outside", textangle=0, textfont_size=12)
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.update_yaxes(showgrid=False)
fig.show()

In [None]:
# Count the staged listings per number of previous owners.
listings_by_owners = sqldf("SELECT num_owners, COUNT(*) num_listings FROM staged_data GROUP BY num_owners")

# Bar chart of those counts, each bar labelled with its value.
# NOTE(review): 'categoryorder' only reorders categorical axes; if
# num_owners is numeric the bars stay in numeric order - confirm intended.
fig = px.bar(
    listings_by_owners,
    x=listings_by_owners['num_owners'],
    y=listings_by_owners['num_listings'],
    title="Listings by # of Owners",
    height=550,
    text_auto=True,
)
fig.update_traces(textposition="outside", textangle=0, textfont_size=12)
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.update_yaxes(showgrid=False)
fig.show()

In [None]:
import pandas_gbq

# Publish the staged listings to BigQuery, overwriting any existing
# final_training_data table.
# NOTE(review): no credentials are passed here, so pandas_gbq resolves its own
# rather than reusing the service-account credentials loaded earlier - confirm.
wrangled_listings = staged_data
pandas_gbq.to_gbq(
    wrangled_listings,
    'used-car-summer-2023-project.training_data.final_training_data',
    if_exists='replace',
    project_id='used-car-summer-2023-project',
)

In [35]:

import numpy as np

# Load the de-duplicated encoded training set, leaving out the makes listed in
# the NOT IN clause.
query = bigquery_client.query("SELECT DISTINCT * FROM `training_data.encoded_training_data` WHERE make NOT IN('INFINITI', 'Genesis', 'MINI', 'Mercury', 'Pontiac', 'Tesla')")
training_data = query.to_dataframe()

# (make, short model token) -> full model name. The stored model values look
# like they hold only the first word of multi-word model names (presumably
# split on whitespace upstream - verify); each rule applies only to the given
# make, so e.g. 'Grand' expands differently for Dodge and Jeep.
MODEL_NAME_FIXES = {
    ('BMW', '2'): '2-Series',
    ('BMW', '3'): '3-Series',
    ('BMW', '4'): '4-Series',
    ('BMW', '5'): '5-Series',
    ('BMW', '7'): '7-Series',
    ('Chrysler', 'Town'): 'Town-and-Country',
    ('Dodge', 'Grand'): 'Grand-Caravan',
    ('Ford', 'Super'): 'Super-Duty',
    ('Hyundai', 'Santa'): 'Santa-Fe',
    ('Jeep', 'Grand'): 'Grand-Cherokee',
    ('Mitsubishi', 'Eclipse'): 'Eclipse-Cross',
}
for (make, short_model), full_model in MODEL_NAME_FIXES.items():
    needs_fix = (training_data['make'] == make) & (training_data['model'] == short_model)
    training_data['model'] = np.where(needs_fix, full_model, training_data['model'])

# Remove periods from city names. regex=False pins the literal interpretation:
# the default for `regex` differs between pandas versions, and as a regular
# expression '.' would match every character.
training_data['city'] = training_data['city'].str.replace('.', '', regex=False)

# Persist the cleaned frame next to the notebook.
training_data.to_parquet('training_data.parquet', compression='snappy')

Unnamed: 0,price,year,make,make_encoded,model,model_encoded,trim,trim_encoded,mileage,exterior_color,...,interior_color,interior_color_encoded,num_accidents,num_owners,usage_type,usage_type_encoded,city,city_encoded,state,state_encoded
0,10998,2012,Ford,9,Focus,75,S Sedan,1286,123021,White,...,Black,2,0,1.0,Personal,2,Augusta,25,GA,10
1,31993,2012,Ford,9,Super-Duty,180,"XL Crew Cab 156""",1841,96642,Green,...,Unknown,6,0,3.0,Personal,2,Indianapolis,247,IN,15
2,39998,2015,GMC,10,Yukon,214,XL Denali,1844,72079,Red,...,Brown,3,0,2.0,Personal,2,Puyallup,426,WA,46
3,12998,2015,Ford,9,Focus,75,SE Hatchback,1333,111934,Gray,...,Black,2,0,4.0,Personal,2,Richmond,435,TX,42
4,29299,2016,Porsche,23,Cayenne,41,,1,78889,Unknown,...,Beige,1,1,3.0,Personal,2,Boise,51,ID,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107861,36998,2019,Volvo,29,XC90,211,T6 R-Design,1644,56317,Silver,...,Black,2,1,1.0,Personal,2,South Jordan,488,UT,43
107862,42998,2022,Volvo,29,XC90,211,T5 Momentum 7-Passenger,1626,43334,White,...,Unknown,6,0,1.0,Personal,2,Nashville,362,TN,41
107863,24991,2016,Volvo,29,XC90,211,T6 Momentum,1640,60892,White,...,Brown,3,0,2.0,Personal,2,Fort Worth,174,TX,42
107864,36998,2020,Volvo,29,XC90,211,T6 Momentum 7 Passenger,1642,48651,Black,...,Black,2,1,1.0,Fleet,1,Ellicott City,153,MD,20
