In [11]:
import plotly.express as px
import pandas as pd

# Load the dataset
vehicles_us = pd.read_csv('vehicles_us.csv')
print(vehicles_us.head())
# Report dtypes and non-null counts. DataFrame.info() writes its report to
# stdout itself and returns None, so it is called bare: wrapping it in print()
# would append a stray "None" line to the output.
vehicles_us.info()
# Check for duplicates
print(f"Number of duplicate rows: {vehicles_us.duplicated().sum()}")
# Check for missing values
print(f"Number of missing values:\n{vehicles_us.isnull().sum()}")
# Handle missing values.
# A blanket dropna() discards every row that has a NaN in *any* column. In the
# preview printed above, is_4wd only ever holds 1.0 or NaN, so dropping on it
# would keep nothing but rows flagged as 4WD and skew every later chart.
#   - is_4wd: NaN is filled with 0.0
#     (assumes NaN means "not four-wheel drive" -- TODO confirm against the
#     data dictionary).
#   - paint_color: NaN is filled with the placeholder 'unknown' so that a
#     missing colour alone does not remove a listing.
# Rows still missing any other value are dropped, so the resulting frame
# contains no NaN, exactly as before.
vehicles_us = (
    vehicles_us
    .fillna({'is_4wd': 0.0, 'paint_color': 'unknown'})
    .dropna()
)
# Print column names
print(f"Column names: {vehicles_us.columns.tolist()}")

   price  model_year           model  condition  cylinders fuel  odometer  \
0   9400      2011.0          bmw x5       good        6.0  gas  145000.0   
1  25500         NaN      ford f-150       good        6.0  gas   88705.0   
2   5500      2013.0  hyundai sonata   like new        4.0  gas  110000.0   
3   1500      2003.0      ford f-150       fair        8.0  gas       NaN   
4  14900      2017.0    chrysler 200  excellent        4.0  gas   80903.0   

  transmission    type paint_color  is_4wd date_posted  days_listed  
0    automatic     SUV         NaN     1.0  2018-06-23           19  
1    automatic  pickup       white     1.0  2018-10-19           50  
2    automatic   sedan         red     NaN  2019-02-07           79  
3    automatic  pickup         NaN     NaN  2019-03-22            9  
4    automatic   sedan       black     NaN  2019-04-02           28  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Col

In [10]:
# Distribution of price
# Upper edge of the visible x-axis window and the number of bins wanted
# inside that window.
PRICE_AXIS_MAX = 50000
PRICE_BIN_COUNT = 100

# The title is set (in bold, centred) by update_layout() below, so it is not
# passed here as well.
fig1 = px.histogram(vehicles_us, x='price')

# Bin explicitly across the visible window. px.histogram's `nbins` is only an
# upper bound and Plotly derives the bin edges from the full data range, not
# from the axis range; if the column holds prices far above PRICE_AXIS_MAX,
# only a small fraction of the requested bins would land in view.
fig1.update_traces(
    xbins={
        'start': 0,
        'end': PRICE_AXIS_MAX,
        'size': PRICE_AXIS_MAX / PRICE_BIN_COUNT
    }
)

# Clamp the x-axis to the same window the bins cover.
fig1.update_xaxes(range=[0, PRICE_AXIS_MAX])

# Bold, centred title and bold axis labels.
fig1.update_layout(
    title={
        'text': '<b>Distribution of Vehicle Prices</b>',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 16}
    },
    xaxis_title={
        'text': '<b>Price</b>',
        'font': {'size': 12}
    },
    yaxis_title={
        'text': '<b>Vehicle Count</b>',
        'font': {'size': 12}
    }
)

fig1.show()

In [12]:
# Distribution of odometer readings
# Upper edge of the visible x-axis window and the number of bins wanted
# inside that window.
ODOMETER_AXIS_MAX = 300000
ODOMETER_BIN_COUNT = 100

# The title is set (in bold, centred) by update_layout() below, so it is not
# passed here as well.
fig2 = px.histogram(vehicles_us, x='odometer')

# Bin explicitly across the visible window. px.histogram's `nbins` is only an
# upper bound and Plotly derives the bin edges from the full data range, not
# from the axis range; if the column holds readings far above
# ODOMETER_AXIS_MAX, only a small fraction of the requested bins would land
# in view.
fig2.update_traces(
    xbins={
        'start': 0,
        'end': ODOMETER_AXIS_MAX,
        'size': ODOMETER_AXIS_MAX / ODOMETER_BIN_COUNT
    }
)

# Limit the x-axis to 300,000 (the same window the bins cover).
fig2.update_xaxes(range=[0, ODOMETER_AXIS_MAX])

# Customize the title and the axis labels.
fig2.update_layout(
    title={
        'text': '<b>Distribution of Odometer Readings</b>',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 16}
    },
    xaxis_title={
        'text': '<b>Odometer</b>',
        'font': {'size': 12}
    },
    yaxis_title={
        'text': '<b>Vehicle Count</b>',
        'font': {'size': 12}
    }
)

fig2.show()

In [21]:
# Count of vehicles by manufacturer and model
# Derive 'manufacturer' from the first space-separated word of 'model' and
# store it on the shared frame.
# NOTE(review): str.capitalize() lower-cases everything after the first
# letter, so an acronym such as 'bmw' is rendered as 'Bmw'.
vehicles_us['manufacturer'] = (
    vehicles_us['model'].str.split(' ').str[0].str.capitalize()
)

# Manufacturers ranked from most to fewest listings.
manufacturer_total_counts_ordered = list(
    vehicles_us['manufacturer'].value_counts().index
)

# One row per (manufacturer, model) pair with its listing count. The
# manufacturer column is turned into an ordered categorical so that sorting
# follows the listing-volume ranking; within a manufacturer the models are
# ordered from most to fewest listings.
model_manufacturer_counts = (
    vehicles_us
    .groupby(['manufacturer', 'model'])
    .size()
    .reset_index(name='Count')
    .assign(
        manufacturer=lambda counts: pd.Categorical(
            counts['manufacturer'],
            categories=manufacturer_total_counts_ordered,
            ordered=True
        )
    )
    .sort_values(by=['manufacturer', 'Count'], ascending=[True, False])
)

# Generate the bar chart: one horizontal bar per manufacturer, coloured by model.
fig3 = px.bar(
    model_manufacturer_counts,
    x='Count',
    y='manufacturer',
    color='model',
    labels=dict(manufacturer='Manufacturer', model='Model', Count='Number of Vehicles'),
    title='Number of Vehicles by Manufacturer (Stacked by Model)'
)

# Bold, centred title; bold axis labels; stacked bars; legend hidden; the
# y-axis categories are ordered by each bar's total ('total ascending').
fig3.update_layout(
    title=dict(
        text='<b>Number of Vehicles by Manufacturer (Stacked by Model)</b>',
        x=0.5,
        xanchor='center',
        font=dict(size=14)
    ),
    xaxis_title=dict(text='<b>Number of Vehicles</b>', font=dict(size=12)),
    yaxis_title=dict(text='<b>Manufacturer</b>', font=dict(size=12)),
    barmode='stack',
    showlegend=False,
    yaxis=dict(categoryorder='total ascending')
)

fig3.show()

In [19]:
# Scatter plot: price vs. model year colored by condition
# Each listing is one point; 'model' is added to the hover tooltip.
fig4 = px.scatter(
    vehicles_us,
    x='model_year',
    y='price',
    color='condition',
    title='Price vs. Model Year Colored by Condition',
    hover_data=['model']
)

# Styling specs: bold, centred 14pt title and bold 12pt axis labels.
scatter_title = dict(
    text='<b>Price vs. Model Year Colored by Condition</b>',
    x=0.5,
    xanchor='center',
    font=dict(size=14)
)
scatter_x_label = dict(text='<b>Model Year</b>', font=dict(size=12))
scatter_y_label = dict(text='<b>Price</b>', font=dict(size=12))

fig4.update_layout(
    title=scatter_title,
    xaxis_title=scatter_x_label,
    yaxis_title=scatter_y_label
)

fig4.show()

In [26]:
# Box plot: price by type
# Display labels for the vehicle type. Only the first character is
# upper-cased: str.capitalize() would also lower-case the rest of the string
# and turn an acronym such as 'SUV' into 'Suv'.
type_labels = vehicles_us['type'].str[:1].str.upper() + vehicles_us['type'].str[1:]

# assign() returns a new frame, so the label column exists only in the frame
# handed to Plotly; the shared vehicles_us frame is never modified and no
# clean-up drop is needed afterwards.
fig5 = px.box(vehicles_us.assign(type_capitalized=type_labels),
              x='price',
              y='type_capitalized',
              title='Price Distribution by Vehicle Type')

# Clamp the price axis to 0-80,000.
fig5.update_xaxes(range=[0, 80000])

# Bold, centred title and bold axis labels.
fig5.update_layout(
    title={
        'text': '<b>Price Distribution by Vehicle Type</b>',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 14}
    },
    xaxis_title={
        'text': '<b>Price</b>',
        'font': {'size': 12}
    },
    yaxis_title={
        'text': '<b>Vehicle Type</b>',
        'font': {'size': 12}
    }
)

fig5.show()