In [None]:
import pandas as pd
import numpy as np
import math as mt
import streamlit as st
import plotly.express as px

In [None]:
# Read the vehicles CSV into the working DataFrame.
# The path is relative, so it is resolved against the notebook's working directory.
csv_path = 'vehicles_us.csv'
df = pd.read_csv(csv_path)

In [None]:
# EDA
# Print the frame summary (columns, non-null counts, dtypes) to see which
# columns contain missing values before any cleaning is done.
df.info()

**Initial Observations**
- `model_year`, `cylinders`, `odometer`, `paint_color`, and `is_4wd` have null values.
- All columns with null values except `is_4wd` can be filled with N/A. `is_4wd` can be filled with zero to indicate no 4-wheel drive.
- The `model` column currently contains the make and model of the car in the same value. It would be better to split this into separate `make` and `model` columns so better analysis can be performed.

In [None]:
# Preview the first 10 rows of the raw data.
df.head(10)

In [None]:
# The `model` column is renamed to `make_model` so that the names `make` and
# `model` are free for the split columns created afterwards.
df = df.rename({'model': 'make_model'}, axis='columns')
# Confirm the new column name in the frame summary.
df.info()

In [None]:
# Show the first 10 combined make/model values; kept as a reference point to
# compare against once the column has been split.
df['make_model'].head(10)

In [None]:
# List the distinct values of make_model to see how the combined strings are formed.
df['make_model'].unique()

In [None]:
# Tokenise make_model on whitespace once, then use the first token as the make
# and the remaining tokens as the model.
name_tokens = df['make_model'].str.split()
df['make'] = name_tokens.str[0]
df['model'] = name_tokens.str[1:]  # at this point each value is still a list of words
df.info()

In [None]:
# Store the model name as a single space-separated string instead of a list of words.
#
# The value is derived from make_model rather than from the current contents of
# `model`, so the cell can be re-run safely: joining an already-joined string
# would insert a space between every character. The .str accessors also
# propagate missing values instead of raising on them.
df['model'] = df['make_model'].str.split().str[1:].str.join(' ')
df['model'].head(10)

In [None]:
# Handle missing values column by column.

# paint_color is a text column, so a literal 'NA' placeholder is safe.
df['paint_color'] = df['paint_color'].fillna('NA')

# cylinders and odometer are numeric. Writing the string 'NA' into them would
# turn them into mixed-type object columns, which blocks numeric aggregation
# and plotting. Keep them numeric and let NaN mark the missing entries;
# to_numeric also restores the columns if an earlier run already wrote 'NA'.
df['cylinders'] = pd.to_numeric(df['cylinders'], errors='coerce')
df['odometer'] = pd.to_numeric(df['odometer'], errors='coerce')

# model_year uses 0 as an "unknown year" sentinel so that the column has no
# nulls left and can later be cast to an integer dtype.
df['model_year'] = df['model_year'].fillna(0.0)

# Fill null values in is_4wd column with 0.0 to indicate no 4-wheel drive
df['is_4wd'] = df['is_4wd'].fillna(0.0)

# Check for duplicated rows: report how many there are, not just whether any exist
print('Duplicated rows:', df.duplicated().sum())

# Check vehicle info again (info() prints its own report and returns None,
# so it must not be wrapped in print())
df.info()

In [None]:
# Convert model_year to whole numbers. The cast requires the column to be free
# of nulls, which the earlier fillna(0.0) on model_year provides.
df = df.astype({'model_year': 'int64'})

In [None]:
# Test Streamlit Apps
# Show the cleaned frame as an interactive table.
st.header('Data viewer')
st.dataframe(df)

# Create vehicle types by manufacturer histogram:
# one bar per make, coloured by vehicle type.
st.header('Vehicle type by Manufacturer')
fig = px.histogram(
    df,
    x='make',
    color='type',
    # Human-readable axis/legend titles instead of the raw column names.
    labels={'make': 'Manufacturer', 'type': 'Vehicle type'},
)
# Display with streamlit using the dedicated Plotly element
st.plotly_chart(fig)

In [None]:
# Final check of column dtypes and non-null counts after cleaning.
df.info()