# In [None]:  (Jupyter cell prompt left over from the notebook export)
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# 1. Set up the output folder and load the data
# Every plt.savefig() call further down writes into 'images/'. savefig does not
# create missing directories, so make sure the folder exists before any plot runs.
os.makedirs('images', exist_ok=True)

# The relative path is resolved against the current working directory.
df = pd.read_csv('colorado_vehicle_sales.csv')

# 2. Basic Data Inspection
# Print a quick overview of the raw frame: sample rows, schema, summary
# statistics, missing-value counts and the distinct county names.
print("First 5 rows of the dataframe:")
print(df.head())
print("\nDataframe information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print("\nSummary statistics of numerical columns:")
print(df.describe())
print("\nNumber of missing values per column:")
print(df.isnull().sum())
print("\nUnique values in 'County' column:")
print(df['County'].unique())

# 3. Data Cleaning and Preprocessing
# Force 'Year' and 'Quarter' to integers so they render as e.g. "2020" and "1"
# (not "2020.0") when the quarter labels are built below.
df['Year'] = df['Year'].astype(int)
df['Quarter'] = df['Quarter'].astype(int)

# Create a 'Date' column for time series analysis.
# Labels such as "2020Q1" are parsed explicitly as quarterly periods and then
# converted to the timestamp at the start of each quarter. This avoids relying
# on pd.to_datetime guessing the format of a non-standard date string.
quarter_labels = df['Year'].astype(str) + 'Q' + df['Quarter'].astype(str)
df['Date'] = pd.PeriodIndex(quarter_labels, freq='Q').to_timestamp()

# Check for duplicates (fully identical rows)
print("\nNumber of duplicate rows:", df.duplicated().sum())

# 4. Univariate Analysis
# Histogram of 'Sales' with a kernel-density curve overlaid (kde=True);
# the figure is written to disk before being displayed.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Sales'], kde=True, ax=ax)
ax.set(title='Distribution of Sales', xlabel='Sales', ylabel='Frequency')
fig.savefig('images/sales_distribution.png')
plt.show()

# Number of rows recorded for each 'Year' (one bar per distinct year)
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='Year', data=df, ax=ax)
ax.set(title='Sales Count by Year', xlabel='Year', ylabel='Count')
# Tilt the year labels so neighbouring ticks do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
fig.savefig('images/sales_count_by_year.png')
plt.show()

# Number of rows recorded for each 'Quarter' (one bar per distinct quarter)
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Quarter', data=df, ax=ax)
ax.set(title='Sales Count by Quarter', xlabel='Quarter', ylabel='Count')
fig.savefig('images/sales_count_by_quarter.png')
plt.show()

# How many rows each county contributes, most frequent first
print("\nValue counts for 'County':", df['County'].value_counts(), sep="\n")

# 5. Bivariate Analysis
# Every row plotted as one point: 'Date' on the x-axis, 'Sales' on the y-axis
fig, ax = plt.subplots(figsize=(15, 6))
sns.scatterplot(x='Date', y='Sales', data=df, ax=ax)
ax.set(title='Sales Over Time', xlabel='Date', ylabel='Sales')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.savefig('images/sales_over_time.png')
plt.show()

# One box per county summarising that county's 'Sales' values
fig, ax = plt.subplots(figsize=(18, 8))
sns.boxplot(x='County', y='Sales', data=df, ax=ax)
ax.set(title='Sales Distribution by County', xlabel='County', ylabel='Sales')
# Right-align the rotated county names so each label ends under its own box
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
# Re-fit the layout so the rotated x labels are not cut off in the saved image
fig.tight_layout()
fig.savefig('images/sales_distribution_by_county.png')
plt.show()

# Line plots of 'Sales' over time for different counties
plt.figure(figsize=(18, 10))
# A single groupby pass yields one sub-frame per county instead of re-filtering
# the whole frame for every county; sort=False keeps the counties (and hence
# the legend entries) in order of first appearance in the data.
for county, county_data in df.groupby('County', sort=False):
    # plt.plot connects points in row order, so put each county's rows in
    # chronological order first; otherwise an unsorted CSV makes the line
    # double back on itself. The stable sort keeps same-date rows in file order.
    county_data = county_data.sort_values('Date', kind='stable')
    plt.plot(county_data['Date'], county_data['Sales'], label=county)
plt.xlabel('Date')
plt.ylabel('Sales')
plt.title('Sales Trends Over Time by County')
plt.legend(loc="upper left", fontsize="small")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('images/sales_trends_by_county.png')
plt.show()

# Pairwise correlation between the numeric columns, drawn as an annotated heatmap
numeric_df = df.select_dtypes(include='number')  # non-numeric columns are left out
correlation_matrix = numeric_df.corr()
fig, ax = plt.subplots(figsize=(10, 8))
# annot/fmt print each coefficient inside its cell with two decimals
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix')
fig.savefig('images/correlation_matrix.png')
plt.show()

# 6. Multivariate Analysis (Example: Sales by County and Year)
# Grouped bars: one cluster per 'Year', one coloured bar per 'County'
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x='Year', y='Sales', hue='County', data=df, ax=ax)
ax.set(title='Sales by County and Year', xlabel='Year', ylabel='Sales')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
fig.savefig('images/sales_county_year.png')
plt.show()

# 7. Time Series Analysis (Example: Total Sales Over Time)
# Sum 'Sales' across all rows sharing a 'Date'; as_index=False keeps 'Date'
# as a regular column next to the summed 'Sales'.
total_sales_time = df.groupby('Date', as_index=False)['Sales'].sum()
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(total_sales_time['Date'], total_sales_time['Sales'])
ax.set(title='Total Sales Over Time', xlabel='Date', ylabel='Total Sales')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.savefig('images/total_sales_time.png')
plt.show()

# Add more time series analysis as needed (e.g., decomposition).

# 8. Insights and Storytelling (Add text explanations throughout the notebook)
# Example summary; each bullet is printed on its own line under the heading.
print(
    "\nInsights:",
    "- Sales fluctuate over time, with potential seasonal patterns.",
    "- Some counties consistently have higher sales than others.",
    "- The correlation matrix shows...",  # Placeholder: describe any correlations you find.
    sep="\n",
)