# ICBC 2023 Lower Mainland Dataset - EDA and data cleaning 2

In [17]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

In [18]:
 # Check if file path is valid.
file_path = './icbc/Vehicle Population - 2023 Passenger Vehicles_Makes_cleaned.csv'

if not os.path.isfile(file_path):
    fnf_err = f'{file_path} not found.'
    raise FileNotFoundError(fnf_err)

vehicle_df = pd.read_csv(file_path)
# Take a peek at the first couple of rows in the dataset.
print(vehicle_df.head(2))

dataset_columns = vehicle_df.columns.tolist()
print(f"This dataset contains the following columns {dataset_columns}")

num_rows, num_cols = vehicle_df.shape
print(f"Data set shape {num_rows} rows x {num_cols} cols")

  Vehicle Use Anti Theft Device Indicator            Body Style  \
0    Business                          No         Fourdoorsedan   
1    Business                          No  Fourdoorstationwagon   

  Electric_Vehicle_Indicator Fleet Vehicle Indicator Fuel Type  \
0                         No                      No    Diesel   
1                         No                      No    Diesel   

  Hybrid Vehicle Indicator        Make                Model  Model Year  \
0                       No  VOLKSWAGEN  RABBIT OTHER MODELS        1978   
1                       No      TOYOTA   LAND CRUISER WAGON        1991   

  Municipality             Owner Type  Vehicle Count  
0      Langley  External organization              1  
1      Burnaby  External organization              1  
This dataset contains the following columns ['Vehicle Use', 'Anti Theft Device Indicator', 'Body Style', 'Electric_Vehicle_Indicator', 'Fleet Vehicle Indicator', 'Fuel Type', 'Hybrid Vehicle Indicator', 'Make

## Exploratory Data Analysis
- Comment on the number of rows and the columns.

In [19]:
model_years = vehicle_df['Model Year'].unique()
oldest, newest = min(model_years), max(model_years)
print(f"oldest and newest are {oldest}, {newest}")
all_years = [_ for _ in range(oldest, newest+1)]
print(f"Model years = {model_years}")
missing_years = [_ for _ in all_years if _ not in all_years]
print(f"Missing years = {missing_years}")

oldest and newest are 1908, 2024
Model years = [1978 1991 1997 1998 2006 2011 2005 1993 1994 2015 1964 1996 1999 2001
 2003 2004 2007 2009 2010 2012 2014 2018 1995 2000 2002 2016 2017 1990
 1992 1966 1972 1981 1930 1965 1968 1986 1963 1969 1976 1980 1988 1970
 1971 1967 2013 2021 2019 2024 2023 2022 2020 2008 1989 1987 1935 1959
 1962 1979 1955 1973 1983 1950 1951 1982 1984 1985 1949 1952 1953 1956
 1957 1958 1960 1928 1954 1974 1923 1947 1977 1929 1936 1933 1932 1937
 1938 1939 1946 1948 1942 1961 1926 1975 1931 1934 1941 1927 1915 1940
 1924 1909 1913 1908 1912 1920 1916 1925 1921 1944 1911 1943 1910 1918
 1917 1922 1945 1914]
Missing years = []


## Filtering the dataframe for Model year
- We have 719144 number of rows. The oldest vehicles are from model year 1908.
- For our analysis, we are interested in vehicles which are daily driven the most.
- These are vehicles which are relatively not too old, mostly less than 20 years old.
- We're in 2025, so we'll only consider vehicle years from 2005 to 2024.

In [20]:
years_limits = (2005, 2024)
years_df = vehicle_df.loc[(vehicle_df['Model Year'] >= years_limits[0]) & (vehicle_df['Model Year'] <= years_limits[1])]

num_rows, num_cols = years_df.shape
print(f"Data set shape {num_rows} rows x {num_cols} cols")

Data set shape 598987 rows x 13 cols


## Filtering the dataframe for Body Style
- Specialty vehicles aren't driven on a daily basis on public roads.
- We therefore can exclude snowmobiles and similar vehicles.

In [21]:
bodystyles = years_df['Body Style'].unique()
print(f"Unique body styles = {bodystyles}")

Unique body styles = ['Fourdoorstationwagon' 'Twodoorfastback' 'Fourdoorcoupe'
 'Fourdoorfastback' 'Fourdoorsedan' 'Snowmobile' 'Twodoorconvertible'
 'Twodoorstationwagon' 'Lowspeedvehicle' 'Dualpurpose' 'Fourdoorhardtop'
 'Hatchback' 'Twodoorcoupe' 'Golfcart' 'Wheeledatv' 'Twodoorsedan'
 'Limousinepassenger' 'Sportconvertible' 'Threewheeled' 'Twodoorhardtop'
 'Workutilitypassengervehicle' 'Fourdoorconvertible' 'Dunebuggy'
 'Amphibiousvehicle']


### filter out
- Wheeledatv
- Lowspeedvehicle
- Snowmobile
- Dualpurpose
- Golfcart
- Dunebuggy
- Amphibiousvehicle
- Threewheeled
- Limousinepassenger
- Workutilityvehicle

In [22]:
rejected_bodystyles = ('Wheeledatv', 'Lowspeedvehicle', 'Snowmobile', 'Dualpurpose', 'Golfcart',
                       'Dunebuggy', 'Amphibiousvehicle', 'Threewheeled', 'Limousinepassenger',
                      'Workutilityvehicle')

years_df = years_df.loc[~years_df['Body Style'].isin(rejected_bodystyles)]
bodystyles = years_df['Body Style'].unique()
print(f"Unique body styles = {bodystyles}")

num_rows, num_cols = years_df.shape
print(f"Data set shape {num_rows} rows x {num_cols} cols")

Unique body styles = ['Fourdoorstationwagon' 'Twodoorfastback' 'Fourdoorcoupe'
 'Fourdoorfastback' 'Fourdoorsedan' 'Twodoorconvertible'
 'Twodoorstationwagon' 'Fourdoorhardtop' 'Hatchback' 'Twodoorcoupe'
 'Twodoorsedan' 'Sportconvertible' 'Twodoorhardtop'
 'Workutilitypassengervehicle' 'Fourdoorconvertible']
Data set shape 586220 rows x 13 cols


## Fuel type and hybrid investigation
- The hybrid vehicle indicator field is a Boolean field.
- The fuel type field is a string field.
- Let's see how these two fields relate to each other.

In [23]:
hybrids_df = years_df.loc[years_df['Hybrid Vehicle Indicator'] == 'Yes']
hybrid_fuel_types = hybrids_df['Fuel Type'].unique()
print(f"{hybrid_fuel_types=}")

hybrid_fuel_types=array(['Gasoline', 'Gasoline Electric', 'Electric', 'Multifuels',
       'Gasoline Alcohol', 'Gasoline Natural Gas', 'Natural Gas',
       'Butane', 'Diesel', 'Hydrogen', 'Diesel Natural Gas',
       'Gasoline Propane'], dtype=object)


In [24]:
# Check if there's an error in hybrid with Fuel Type = 'Electric'
print(hybrids_df[hybrids_df['Fuel Type'] == 'Electric'])

       Vehicle Use Anti Theft Device Indicator            Body Style  \
556       Personal                          No  Fourdoorstationwagon   
12069     Personal                         Yes  Fourdoorstationwagon   
30585     Personal                         Yes         Fourdoorsedan   
30586     Personal                         Yes         Fourdoorsedan   
30587     Personal                         Yes         Fourdoorsedan   
...            ...                         ...                   ...   
684031    Personal                         Yes             Hatchback   
684032    Personal                         Yes             Hatchback   
684033    Personal                         Yes             Hatchback   
690368    Business                         Yes  Fourdoorstationwagon   
690527    Business                         Yes         Fourdoorsedan   

       Electric_Vehicle_Indicator Fleet Vehicle Indicator Fuel Type  \
556                            No                      No  Elect

## Hybrid vehicles fuel type
- It seems that there was some confusion between gasoline hybrid vehicles (you only put in gasoline) and gasoline PHEVs/plug-ins (you put in both gasoline and electricity).
- To fix this field, we can use our subject matter knowledge. Honda made only one PHEV - the Clarity. If Clarity does not appear in the Model string, then it's a regular gasoline hybrid. Toyota makes two PHEVs, the Rav4 Prime/Plug In and the Prius Prime/Plug In. Unfortunately, these also come with regular gasoline hybrid powetrains. Other Toyotas must be regular gasoline hybrids. 