In [1]:
# Import Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
from scipy.stats import linregress

In [2]:
# Used-car sales records: remember the CSV location, then read it into a DataFrame
data = "Resources/used_car_sales.csv"
df = pd.read_csv(data)

# Zip-code locale reference table: same pattern, path first and then the read
zipcode_data = "Resources/Zip_Locale_Detail.csv"
zip_df = pd.read_csv(zipcode_data)

# Preview the first rows of the sales data
df.head()

Unnamed: 0,ID,pricesold,yearsold,zipcode,Mileage,Make,Model,Year,Trim,Engine,BodyType,NumCylinders,DriveType
0,137178,7500,2020,786**,84430,Ford,Mustang,1988,LX,5.0L Gas V8,Sedan,0,RWD
1,96705,15000,2019,81006,0,Replica/Kit Makes,Jaguar Beck Lister,1958,,383 Fuel injected,Convertible,8,RWD
2,119660,8750,2020,33449,55000,Jaguar,XJS,1995,2+2 Cabriolet,4.0L In-Line 6 Cylinder,Convertible,6,RWD
3,80773,11600,2019,07852,97200,Ford,Mustang,1968,Stock,289 cu. in. V8,Coupe,8,RWD
4,64287,44000,2019,07728,40703,Porsche,911,2002,Turbo X-50,3.6L,Coupe,6,AWD


In [3]:
# Create Cleaned_df
# Parse the zip codes once; anything non-numeric (e.g. the masked "786**") becomes NaN.
zip_numeric = pd.to_numeric(df['zipcode'], errors='coerce')

# Remove rows that contain non-numeric characters in zip code column.
# Only whole-number values are kept, so entries such as "12.5" or "inf" (which
# pd.to_numeric accepts but cannot be cast to an integer zip code) are dropped
# instead of crashing the int64 conversion below.
valid_zip = zip_numeric.notna() & (zip_numeric % 1 == 0)

# .copy() yields an independent frame: `df` stays untouched and the column
# assignment below does not raise SettingWithCopyWarning.
cleaned_df = df.loc[valid_zip].copy()

# Change dtype to 'int64' to match zip_df (leading zeros are dropped: "07852" -> 7852).
# The already-parsed values are reused rather than re-parsing the raw strings;
# .to_numpy() assigns by position, which is safe because both sides use the same mask.
cleaned_df['zipcode'] = zip_numeric[valid_zip].astype('int64').to_numpy()

# Ensures only zip codes matching a value in zip_df remain in DataFrame
acceptable_zips = zip_df["DELIVERY ZIPCODE"].astype('int64').tolist()

# Finish on an explicit copy so that later cells can modify cleaned_df without
# triggering chained-assignment warnings.
cleaned_df = cleaned_df[cleaned_df['zipcode'].isin(acceptable_zips)].copy()

# Display DataFrame
cleaned_df.head()

Unnamed: 0,ID,pricesold,yearsold,zipcode,Mileage,Make,Model,Year,Trim,Engine,BodyType,NumCylinders,DriveType
1,96705,15000,2019,81006,0,Replica/Kit Makes,Jaguar Beck Lister,1958,,383 Fuel injected,Convertible,8,RWD
2,119660,8750,2020,33449,55000,Jaguar,XJS,1995,2+2 Cabriolet,4.0L In-Line 6 Cylinder,Convertible,6,RWD
3,80773,11600,2019,7852,97200,Ford,Mustang,1968,Stock,289 cu. in. V8,Coupe,8,RWD
4,64287,44000,2019,7728,40703,Porsche,911,2002,Turbo X-50,3.6L,Coupe,6,AWD
7,5250,70000,2019,7627,6500,Land Rover,Defender,1997,,4.0 Liter Fuel Injected V8,,0,4WD


In [4]:
# Filter out unusually low sales prices (Anything under $100)
# A boolean mask plus reassignment replaces drop(..., inplace=True): in-place
# mutation of a frame that came from a filtered slice can raise
# SettingWithCopyWarning.
# The mask is written as ~(price < 100) rather than (price >= 100) so that rows
# with a missing price are retained, exactly as the index-based drop retained them.
cleaned_df = cleaned_df.loc[~(cleaned_df['pricesold'] < 100)].copy()

# Display DataFrame
cleaned_df

Unnamed: 0,ID,pricesold,yearsold,zipcode,Mileage,Make,Model,Year,Trim,Engine,BodyType,NumCylinders,DriveType
1,96705,15000,2019,81006,0,Replica/Kit Makes,Jaguar Beck Lister,1958,,383 Fuel injected,Convertible,8,RWD
2,119660,8750,2020,33449,55000,Jaguar,XJS,1995,2+2 Cabriolet,4.0L In-Line 6 Cylinder,Convertible,6,RWD
3,80773,11600,2019,7852,97200,Ford,Mustang,1968,Stock,289 cu. in. V8,Coupe,8,RWD
4,64287,44000,2019,7728,40703,Porsche,911,2002,Turbo X-50,3.6L,Coupe,6,AWD
7,5250,70000,2019,7627,6500,Land Rover,Defender,1997,,4.0 Liter Fuel Injected V8,,0,4WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...
122139,14948,4200,2019,80233,102700,Ford,Mustang,1977,,302,Fastback,8,
122140,58814,6500,2019,53132,128000,Ford,E-Series Van,2012,,E-150,,0,
122141,2156,2000,2019,77536,50000,Ford,Bronco,1978,,351m,,8,4WD
122142,29096,2280,2019,92131,164337,BMW,3-Series,2000,328ci,M52TU 2.8L,Coupe,6,RWD


In [5]:
# Filter out unusually high mileage vehicles (Anything over 400,000)
# A boolean mask plus reassignment replaces drop(..., inplace=True): in-place
# mutation of a frame that came from a filtered slice can raise
# SettingWithCopyWarning.
# The mask is written as ~(mileage > 400000) rather than (mileage <= 400000) so
# that rows with a missing mileage are retained, exactly as the index-based drop
# retained them.
cleaned_df = cleaned_df.loc[~(cleaned_df['Mileage'] > 400000)].copy()

# Display DataFrame
cleaned_df

Unnamed: 0,ID,pricesold,yearsold,zipcode,Mileage,Make,Model,Year,Trim,Engine,BodyType,NumCylinders,DriveType
1,96705,15000,2019,81006,0,Replica/Kit Makes,Jaguar Beck Lister,1958,,383 Fuel injected,Convertible,8,RWD
2,119660,8750,2020,33449,55000,Jaguar,XJS,1995,2+2 Cabriolet,4.0L In-Line 6 Cylinder,Convertible,6,RWD
3,80773,11600,2019,7852,97200,Ford,Mustang,1968,Stock,289 cu. in. V8,Coupe,8,RWD
4,64287,44000,2019,7728,40703,Porsche,911,2002,Turbo X-50,3.6L,Coupe,6,AWD
7,5250,70000,2019,7627,6500,Land Rover,Defender,1997,,4.0 Liter Fuel Injected V8,,0,4WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...
122139,14948,4200,2019,80233,102700,Ford,Mustang,1977,,302,Fastback,8,
122140,58814,6500,2019,53132,128000,Ford,E-Series Van,2012,,E-150,,0,
122141,2156,2000,2019,77536,50000,Ford,Bronco,1978,,351m,,8,4WD
122142,29096,2280,2019,92131,164337,BMW,3-Series,2000,328ci,M52TU 2.8L,Coupe,6,RWD


In [6]:
# Filter out unusually high cylinder count vehicles (Anything over 16)
# A boolean mask plus reassignment replaces drop(..., inplace=True): in-place
# mutation of a frame that came from a filtered slice can raise
# SettingWithCopyWarning.
# The mask is written as ~(cylinders > 16) rather than (cylinders <= 16) so that
# rows with a missing cylinder count are retained, exactly as the index-based
# drop retained them.
cleaned_df = cleaned_df.loc[~(cleaned_df['NumCylinders'] > 16)].copy()

# Display DataFrame
cleaned_df

Unnamed: 0,ID,pricesold,yearsold,zipcode,Mileage,Make,Model,Year,Trim,Engine,BodyType,NumCylinders,DriveType
1,96705,15000,2019,81006,0,Replica/Kit Makes,Jaguar Beck Lister,1958,,383 Fuel injected,Convertible,8,RWD
2,119660,8750,2020,33449,55000,Jaguar,XJS,1995,2+2 Cabriolet,4.0L In-Line 6 Cylinder,Convertible,6,RWD
3,80773,11600,2019,7852,97200,Ford,Mustang,1968,Stock,289 cu. in. V8,Coupe,8,RWD
4,64287,44000,2019,7728,40703,Porsche,911,2002,Turbo X-50,3.6L,Coupe,6,AWD
7,5250,70000,2019,7627,6500,Land Rover,Defender,1997,,4.0 Liter Fuel Injected V8,,0,4WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...
122139,14948,4200,2019,80233,102700,Ford,Mustang,1977,,302,Fastback,8,
122140,58814,6500,2019,53132,128000,Ford,E-Series Van,2012,,E-150,,0,
122141,2156,2000,2019,77536,50000,Ford,Bronco,1978,,351m,,8,4WD
122142,29096,2280,2019,92131,164337,BMW,3-Series,2000,328ci,M52TU 2.8L,Coupe,6,RWD


In [7]:
# Restrict the data to the most frequently sold makes.
# The allow-list is spelled out by hand, one short row at a time for readability.
Make = [
    'Ford', 'Chevrolet', 'Toyota', 'Mercedes-Benz', 'Dodge', 'BMW',
    'Jeep', 'Cadillac', 'Volkswagen', 'Honda', 'Pontiac', 'GMC',
    'Nissan', 'Porsche', 'Lincoln', 'Buick', 'Audi', 'Chrysler',
    'Subaru', 'Lexus', 'Jaguar', 'Land Rover',
]

# Keep only the rows whose Make appears in the allow-list
cleaned_df = cleaned_df[cleaned_df['Make'].isin(Make)]

# Show the filtered DataFrame
cleaned_df

Unnamed: 0,ID,pricesold,yearsold,zipcode,Mileage,Make,Model,Year,Trim,Engine,BodyType,NumCylinders,DriveType
2,119660,8750,2020,33449,55000,Jaguar,XJS,1995,2+2 Cabriolet,4.0L In-Line 6 Cylinder,Convertible,6,RWD
3,80773,11600,2019,7852,97200,Ford,Mustang,1968,Stock,289 cu. in. V8,Coupe,8,RWD
4,64287,44000,2019,7728,40703,Porsche,911,2002,Turbo X-50,3.6L,Coupe,6,AWD
7,5250,70000,2019,7627,6500,Land Rover,Defender,1997,,4.0 Liter Fuel Injected V8,,0,4WD
8,29023,1330,2019,7043,167000,Honda,Civic,2001,EX,,Coupe,4,FWD
...,...,...,...,...,...,...,...,...,...,...,...,...,...
122139,14948,4200,2019,80233,102700,Ford,Mustang,1977,,302,Fastback,8,
122140,58814,6500,2019,53132,128000,Ford,E-Series Van,2012,,E-150,,0,
122141,2156,2000,2019,77536,50000,Ford,Bronco,1978,,351m,,8,4WD
122142,29096,2280,2019,92131,164337,BMW,3-Series,2000,328ci,M52TU 2.8L,Coupe,6,RWD


In [8]:
# Number of remaining rows per make, most frequent first
cleaned_df['Make'].value_counts()

Ford             14429
Chevrolet        13638
Toyota            4432
Mercedes-Benz     4172
Dodge             3898
BMW               3564
Jeep              3058
Cadillac          2462
Volkswagen        2390
Honda             2371
Pontiac           1738
GMC               1681
Nissan            1661
Porsche           1476
Lincoln           1269
Buick             1187
Audi              1029
Chrysler          1015
Subaru             917
Lexus              894
Jaguar             851
Land Rover         828
Name: Make, dtype: int64