# **Used Car Price Predictor Notebook**

### import required libraries

In [1]:
import keras 
import pandas as pd
import pandasql as sqldf
from zip_to_state import zip_to_state, state_to_abbrev
import plotly.express as px
import numpy as np
from scipy import stats


## **Phase 1: Data Collection**
#### **Step 1: Extract Training Data from CSV File**

In [2]:
# time to run: 4m 57s
# Load the raw listings CSV from the working directory into one DataFrame.
# NOTE(review): the line echoed in this cell's output looks like a pandas
# warning raised by this call (possibly mixed column dtypes) — confirm and,
# if so, pass explicit dtypes.
df = pd.read_csv('used_cars_data.csv')

  df = pd.read_csv('used_cars_data.csv')


# ---------------------------------------------------------------------------

#### **Step 2: Preview the Dataset's Schema**

In [None]:
# Reset every pandas option to its default, then remove the column and row
# display limits so previews are never truncated.
pd.reset_option('all')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Preview the first two rows of the raw data.
df.head(2)
# df.shape
# df.columns


# ---------------------------------------------------------------------------

## **MLOps Phase 2: Data Ingestion**
#### For the purposes of this tutorial, we will not be ingesting data. However, in an organizational setting, you'll use an ingestion framework, such as Apache Airflow, Apache Spark, or Apache Kafka, to collect data into a Data Lake to create a curated Training Dataset.

## **MLOps Phase 3: Data Storage**
#### Since this data was previously extracted from BigQuery, this phase has already been completed. Typically, you'll use a Cloud Storage Bucket (e.g., AWS S3, Google Cloud Storage) to store all data to be used in ML applications.

# ---------------------------------------------------------------------------

## **MLOps Phase 5: Exploratory Data Analysis (EDA)**

#### **Categorical Data Col Analysis**

In [None]:
# Categorical features profiled during EDA ('exterior_color' is left out).
categorical_columns = [
    'body_type',
    'listing_color',
    'has_accidents',
    'is_new',
    'salvage',
    'wheel_system',
    'make_name',
    'frame_damaged',
    'fleet',
]

# Frequency table per feature: {column name -> value_counts() Series}.
value_frequencies = {
    name: df[name].value_counts() for name in categorical_columns
}

# Number of distinct values found in each categorical column.
unique_value_counts = df[categorical_columns].nunique()
print("Count of Unique Values:")
print(unique_value_counts)


In [None]:
# Draw one bar chart per categorical column from its frequency table.
for column_name, counts in value_frequencies.items():
    axis_labels = {'x': column_name, 'y': 'Frequency'}
    bar_chart = px.bar(counts, x=counts.index, y=counts.values, labels=axis_labels)

    # Title the chart and order the bars from most to least frequent.
    bar_chart.update_layout(
        title=f'Histogram of {column_name}',
        xaxis_title=column_name,
        yaxis_title='Frequency',
        xaxis={'categoryorder': 'total descending'},
    )

    bar_chart.show()

#### **Numerical Data Col Analysis**

In [None]:

# Columns summarised with describe().
# NOTE(review): 'listed_date' looks like a date column; describe() reports
# only numeric columns when the selection mixes dtypes, so it may be missing
# from the printed table — confirm.
numerical_columns = ['daysonmarket', 'listed_date', 'mileage', 'owner_count', 'price', 'year']

# Descriptive statistics, rounded to two decimals for readability.
summary_statistics = df.loc[:, numerical_columns].describe()
rounded_summary_statistics = summary_statistics.round(decimals=2)
print(rounded_summary_statistics)


In [4]:
def zip2State(df, zip_ranges=None):
    """Add a ``state`` column derived from each row's ``dealer_zip``.

    ``dealer_zip`` is rewritten in place as its first five characters and then
    looked up in a table of inclusive ``(low, high)`` ZIP ranges. The first
    range that contains the ZIP wins.

    Args:
        df: DataFrame with a ``dealer_zip`` column. It is modified in place.
        zip_ranges: Optional mapping of ``(low, high)`` integer pairs to state
            names. Defaults to the ``zip_to_state`` table imported by this
            notebook.

    Returns:
        The same DataFrame with ``dealer_zip`` truncated and a ``state``
        column added. ZIPs that are not numeric, or that fall in no range,
        get ``'Unknown'``.
    """
    if zip_ranges is None:
        zip_ranges = zip_to_state

    df['dealer_zip'] = df['dealer_zip'].astype(str).str[:5]

    # Read the ZIP from the DataFrame (not from the pandas module) and coerce
    # unparsable values to NaN so one bad ZIP cannot abort the whole column.
    numeric_zip = pd.to_numeric(df['dealer_zip'], errors='coerce')

    def _state_for(zip_code):
        for bounds, state in zip_ranges.items():
            if bounds[0] <= zip_code <= bounds[1]:
                return state
        return 'Unknown'

    # Scan the range table once per distinct ZIP rather than once per row.
    state_by_zip = {
        zip_code: _state_for(zip_code)
        for zip_code in numeric_zip.dropna().unique()
    }
    df['state'] = numeric_zip.map(state_by_zip).fillna('Unknown')
    return df

In [None]:

# Axis windows that crop the long tails out of each histogram.
x_range_price = [0, 200000]
x_range_year = [1995, 2022]
x_range_mileage = [0, 100000]


def _state_from_zip(zip_code):
    """Return the first state whose inclusive ZIP range holds zip_code."""
    for bounds, state_name in zip_to_state.items():
        if bounds[0] <= zip_code <= bounds[1]:
            return state_name
    return 'Unknown'


# --- Listings per state (choropleth) ---------------------------------------
# Start from a one-column frame holding the first five characters of each ZIP.
state_summary = df['dealer_zip'].astype(str).str[:5].to_frame()
state_summary['state'] = state_summary['dealer_zip'].astype(int).apply(_state_from_zip)

# Collapse to one row per state with its listing count and two-letter code.
state_summary = state_summary.groupby('state').size().reset_index(name='total_listings')
state_summary['state_abbr'] = state_summary['state'].map(state_to_abbrev)
# NOTE(review): this filters on the abbreviation, not the state name; whether
# 'Unknown' rows are removed depends on what state_to_abbrev returns — confirm.
state_summary = state_summary.loc[state_summary['state_abbr'] != 'Unknown']
max_value = state_summary['total_listings'].max()

fig_map = px.choropleth(
    state_summary,
    locations='state_abbr',
    locationmode='USA-states',
    color='total_listings',
    color_continuous_scale='greens',
    range_color=(0, max_value),
    scope='usa',
)
fig_map.update_geos(fitbounds='locations', visible=False)

# --- Price / mileage / year histograms -------------------------------------
fig_price = px.histogram(
    df,
    x='price',
    title='Price Distribution - Outliers removed',
    range_x=x_range_price,
    nbins=500,
)
fig_mileage = px.histogram(
    df,
    x='mileage',
    title='Mileage Distribution - Outliers removed',
    range_x=x_range_mileage,
    nbins=32000,
)
fig_year = px.histogram(
    df,
    x='year',
    title='Year Distribution - Outliers removed',
    range_x=x_range_year,
)

# Render in the same order as before: map first, then the three histograms.
for figure in (fig_map, fig_price, fig_mileage, fig_year):
    figure.show()


#### **Find the total number of missing values per column**

In [None]:
# Report how many missing (NaN) entries each column of df contains.
for column_name in df.columns:
    missing_count = df[column_name].isna().sum()
    print(f"{column_name}: {missing_count}")
    

#### **Find total outliers per col and how many standard deviations out**

In [28]:
# List the column labels currently present in df.
df.columns

Index(['dealer_zip', 'exterior_color', 'fleet', 'frame_damaged',
       'has_accidents', 'is_new', 'listing_color', 'make_name', 'mileage',
       'model_name', 'owner_count', 'price', 'salvage', 'trim_name',
       'wheel_system', 'year'],
      dtype='object')

In [None]:
columns_to_be_removed = ["latitude","longitude","city","vin","back_legroom","bed","bed_height","bed_length","body_type","cabin","city_fuel_economy","combine_fuel_economy","daysonmarket","description","engine_cylinders","engine_displacement","engine_type","franchise_dealer","franchise_make","front_legroom","fuel_tank_volume","fuel_type","height","highway_fuel_economy","horsepower","interior_color","isCab","is_certified","is_cpo","is_oemcpo","length","listed_date","listing_id","main_picture_url","maximum_seating","power","savings_amount","seller_rating","sp_id","sp_name","theft_title","torque","transmission","transmission_display","vehicle_damage_category","wheel_system_display","wheelbase","width","trimId", "major_options"]

# Work on a copy of df without the flag / categorical columns listed here.
df_outliers = df.drop(columns=[
    'dealer_zip', 'exterior_color', 'fleet', 'frame_damaged', 'has_accidents',
    'is_new', 'listing_color', 'salvage', 'trim_name', 'wheel_system',
    'owner_count',
])

# Numeric columns checked for outliers.
columns_of_interest = ['mileage', 'price', 'year']

# Set the Z-score threshold for identifying outliers
z_score_threshold = 2  # You can adjust this threshold as needed

# Keep one boolean mask per column. A single shared `outliers` variable would
# only hold the mask of the last column processed.
outlier_masks = {}
outlier_counts = {}

for col in columns_of_interest:
    values = df_outliers[col].to_numpy(dtype=float, na_value=np.nan)
    # nan_policy='omit' keeps missing values from turning every z-score in the
    # column into NaN. Missing entries get a NaN score and are never flagged.
    z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
    outliers = z_scores > z_score_threshold
    outlier_masks[col] = outliers
    outlier_counts[col] = int(outliers.sum())

# Display the counts of outliers for each column
for col, count in outlier_counts.items():
    print(f"{col}: {count} outliers")

# Print the rows that are outliers for the "price" column
outliers_price = df_outliers[outlier_masks["price"]]
print("Rows with 'price' outliers:")

print(outliers_price)

In [5]:
# Truncate dealer_zip to five characters and add the derived 'state' column.
df = zip2State(df)

AttributeError: 'DataFrame' object has no attribute 'DataFrame'

In [3]:
# Listing metadata removed before the correlation analysis.
columns_to_be_removed = [
    "latitude", "longitude", "city", "vin", "back_legroom", "bed",
    "bed_height", "bed_length", "body_type", "cabin", "city_fuel_economy",
    "combine_fuel_economy", "daysonmarket", "description", "engine_cylinders",
    "engine_displacement", "engine_type", "franchise_dealer", "franchise_make",
    "front_legroom", "fuel_tank_volume", "fuel_type", "height",
    "highway_fuel_economy", "horsepower", "interior_color", "isCab",
    "is_certified", "is_cpo", "is_oemcpo", "length", "listed_date",
    "listing_id", "main_picture_url", "maximum_seating", "power",
    "savings_amount", "seller_rating", "sp_id", "sp_name", "theft_title",
    "torque", "transmission", "transmission_display",
    "vehicle_damage_category", "wheel_system_display", "wheelbase", "width",
    "trimId", "major_options",
]
# Second batch: flag / identifier columns also excluded from the matrix.
cols_to_beRemoved2 = [
    'dealer_zip', 'exterior_color', 'fleet', 'frame_damaged', 'has_accidents',
    'is_new', 'salvage', 'trim_name', 'wheel_system', 'owner_count',
]

# The drop mutates df in place, so re-running the cell would otherwise raise
# KeyError for columns that are already gone. errors="ignore" makes it
# repeatable, at the cost of not reporting a misspelled column name.
df.drop(
    columns=columns_to_be_removed + cols_to_beRemoved2,
    inplace=True,
    errors="ignore",
)

# One-hot encode the remaining categorical columns and correlate everything.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt,
# presumably because the one-hot frame for make_name/model_name is very wide —
# consider correlating against 'price' only, or capping category cardinality.
df_encoded = pd.get_dummies(df, columns=['listing_color', 'make_name', 'model_name'])
corr_matrix = df_encoded.corr()
print(corr_matrix)

# sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
# plt.title('Correlation Matrix')
# plt.show()

KeyboardInterrupt: 

#### **Step 2: Create Correlation Matrix**

## **Data Pipeline (Preparation and Wrangling)**

# ---------------------------------------------------------------------------
#### **Step 1: Use Intuition to Remove Irrelevant Features**

In [11]:
# columns_to_be_removed = ["latitude","longitude","city","vin","back_legroom","bed","bed_height","bed_length","body_type","cabin","city_fuel_economy","combine_fuel_economy","daysonmarket","description","engine_cylinders","engine_displacement","engine_type","franchise_dealer","franchise_make","front_legroom","fuel_tank_volume","fuel_type","height","highway_fuel_economy","horsepower","interior_color","isCab","is_certified","is_cpo","is_oemcpo","length","listed_date","listing_id","main_picture_url","maximum_seating","power","savings_amount","seller_rating","sp_id","sp_name","theft_title","torque","transmission","transmission_display","vehicle_damage_category","wheel_system_display","wheelbase","width",'trimId', 'major_options']
# Drop the exterior_color column from df.
# NOTE(review): presumably listing_color is kept as the normalised colour
# feature — confirm.
columns_to_be_removed = ["exterior_color"]
df = df.drop(columns=columns_to_be_removed)

In [13]:
# Preview the first two rows of df_cleaned.
# NOTE(review): no visible cell assigns df_cleaned — confirm where it is created.
df_cleaned.head(2)


Unnamed: 0,dealer_zip,exterior_color,fleet,frame_damaged,has_accidents,is_new,listing_color,make_name,mileage,model_name,owner_count,price,salvage,trim_name,wheel_system,year
0,960,Solar Yellow,,,,True,YELLOW,Jeep,7.0,Renegade,,23141.0,,Latitude FWD,FWD,2019
1,922,Narvik Black,,,,True,BLACK,Land Rover,8.0,Discovery Sport,,46500.0,,S AWD,AWD,2020


#### **Remove outlier from major columns**

#### **Convert all data in each column to target type**

In [None]:

# Target dtype per column. The pandas nullable dtypes ('boolean', 'Int64')
# are used because plain bool turns a missing flag into True and plain int
# cannot represent a missing value at all.
data_types = {
    'fleet': 'boolean',
    'frame_damaged': 'boolean',
    'has_accidents': 'boolean',
    'is_new': 'boolean',
    'mileage': 'Int64',
    'owner_count': 'Int64',
    'price': 'Int64',
    'salvage': 'boolean',
    'year': 'Int64',
}

# Truncate toward zero before the integer cast. This matches what a plain int
# cast does for valid values and keeps fractional values from being rejected.
for column_name, target_dtype in data_types.items():
    if target_dtype == 'Int64':
        df[column_name] = np.trunc(pd.to_numeric(df[column_name]))

# Use the astype method to convert columns to the specified data types
df = df.astype(data_types)

# Text columns use the nullable 'string' dtype so missing values stay missing
# instead of becoming the literal text 'nan'.
string_columns = ['listing_color', 'make_name', 'model_name', 'trim_name', 'wheel_system']
df[string_columns] = df[string_columns].astype('string')


#### **get rid of redundant year values in trim name col**

In [None]:

# Remove half-model-year tokens such as "2019.5 " from trim names.
# NOTE(review): the pattern only matches 20xx.5 followed by a space; plain
# years like "2019 " are left untouched — confirm that is intended.
df_cleaned['trim_name'] = df_cleaned['trim_name'].str.replace(r'20\d{2}\.5 ', '', regex=True)

#### **Simple data imputation: fill each missing mileage value with the average mileage of vehicles from that same year**

In [None]:
# Calculate the average mileage for each year, aligned row-for-row with df
average_mileage_by_year = df.groupby('year')['mileage'].transform('mean')

# Remember whether mileage is an integer column so the dtype can be restored.
mileage_dtype = df['mileage'].dtype
mileage_is_integer = pd.api.types.is_integer_dtype(mileage_dtype)

# Fill in float space so fractional yearly means are always representable.
filled_mileage = df['mileage'].astype('float64').fillna(
    average_mileage_by_year.astype('float64')
)
if mileage_is_integer:
    # Integer columns cannot hold fractional means: round, then cast back.
    filled_mileage = filled_mileage.round().astype(mileage_dtype)

# Assign the result back to the frame. Calling fillna(inplace=True) on the
# column selection is chained assignment and may leave df unchanged.
df['mileage'] = filled_mileage

# Print a preview only: the display row limit was lifted earlier in the
# notebook, so printing the whole frame would dump every row.
print(df.head())
print(f"Missing mileage values remaining: {df['mileage'].isna().sum()}")

# ---------------------------------------------------------------------------

## **MLOps Phase 6: Feature Engineering**

#### **Turn the zip code into a US state field and remove all rows with no matching state**

In [None]:


# Global pandas setting: silences SettingWithCopyWarning for the whole session.
pd.options.mode.chained_assignment = None

# First five characters of each dealer ZIP as a number. Values that cannot be
# parsed become NaN and end up as 'Not Found' instead of aborting the cell.
zip_codes = pd.to_numeric(
    df_cleaned['dealer_zip'].astype(str).str[:5], errors='coerce'
)


def _lookup_state(zip_code):
    """Return the first state whose inclusive ZIP range contains zip_code."""
    # Uses zip_to_state, the range table imported at the top of the notebook.
    for (start, end), state_name in zip_to_state.items():
        if start <= zip_code <= end:
            return state_name
    return 'Not Found'


# Turn the zipCode Column into a new state column. Each distinct ZIP is
# resolved once and then mapped onto the rows, avoiding a row-by-row loop.
state_by_zip = {
    zip_code: _lookup_state(zip_code)
    for zip_code in zip_codes.dropna().unique()
}
df_cleaned['state'] = zip_codes.map(state_by_zip).fillna('Not Found')

# Remove NULL or Unknown states
undesired_values = ['Not Found']
mask = ~df_cleaned['state'].isin(undesired_values)
df_cleaned = df_cleaned[mask]
df_cleaned = df_cleaned.reset_index(drop=True)

# Number of distinct states left after filtering.
unique_values = df_cleaned['state'].unique()
print(len(unique_values))

In [None]:
# Drop dealer_zip from df_cleaned.
# NOTE(review): presumably redundant once the 'state' column exists — confirm.
df_cleaned = df_cleaned.drop(columns=['dealer_zip'])

#### **Step 3: Create DataFrame that Merges Pre-Preprocessed Data and Preprocessed Data**

# ---------------------------------------------------------------------------

## **MLOps Phase 7: Model Development**

#### **Step 1: Identify Features and Label**

#### **Step 2: Split Training Data into Training and Evaluation Subsets**

#### **Step 3: Fit Training Subset to Model**

#### **Step 4: Show Feature Importance Metrics**