In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
from bokeh.io import output_notebook, show
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, CheckboxButtonGroup, CustomJS, HoverTool
from bokeh.layouts import column
from bokeh.palettes import Spectral5
from sklearn.linear_model import LinearRegression
from bokeh.palettes import Colorblind5
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource, Span, Legend, LegendItem
from bokeh.plotting import figure, show, output_notebook
from bokeh.palettes import Category20, Category20b, Category20c
from math import pi
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

## Table of Contents
* [1. Motivation](#chapter1)
* [2. Basic stats](#chapter2)
* [3. Data Analysis](#chapter3)
* [4. Genre](#chapter4)
* [5. Visualizations](#chapter5)
    * [5.1 Current Sales/Trends with Electric Vehicles Worldwide](#section_3_1)
        * [5.1.1 Market Growth](#sub_section_3_1_1)
        * [5.1.2 Fully-electric cars vs. plug-in hybrids](#sub_section_3_1_1)
        * [5.1.3 Top Manufacturers](#sub_section_3_1_1)
    * [5.2 ML - Random Forest](#section_3_1)
* [6. Discussion](#chapter6)
* [7. Contributions](#chapter7)

## 1. Motivation

## 2. Basic Stats

## 3. Data Analysis

## 4. Genre

## 5. Visualizations

## 5.1 Current Sales/Trends with Electric Vehicles Worldwide
Electric vehicle sales are on the rise globally, as more people decide to switch to "greener" forms of transportation. This trend is driven by a growing awareness of environmental issues and a shared interest in eco-friendly technology. It is clear that both the hype around and the actual move toward electric mobility are accelerating, reflecting a collective effort to tackle climate change.

### 5.1.1 Market Growth:

- Sales of electric cars started from a low base but are growing quickly in many markets.
- Globally, around 1-in-4 new cars sold were electric in 2023. In **Norway**, this share was over 90%, and in **China**, it was almost 40%.
- In the following visualization, “electric cars” include fully battery-electric vehicles and plug-in hybrids.

The dataset gives the share of new cars sold that are electric over a 13-year period (2010–2023); in other words, what percentage of all cars sold within a year were EVs. In the following section, the USA (where Palo Alto is located) is compared to Denmark, the EU, Norway, China, and the world average.

[1] https://www.iea.org/reports/global-ev-outlook-2023

In [2]:
# Load the table of electric-car sales shares (one row per entity and year;
# the cells below read its 'Entity', 'Year' and
# 'Share of new cars that are electric' columns).
data = pd.read_csv(filepath_or_buffer='electric-car-sales-share.csv')

In [3]:
# Entities compared throughout section 5.1. The labels must match the values
# of the 'Entity' column exactly, because they are used both as the row filter
# and as the checkbox labels that the JS callback looks renderers up by.
required_countries = ["United States", "European Union (27)", "China", "Denmark", "Norway","World"]
required_countries_data = data[data['Entity'].isin(required_countries)]

output_notebook()

# Bokeh figure
p = figure(title="Electric Car Sales Share Comparison",
           x_axis_label='Year', y_axis_label='Electric Cars Sales Share (%)')

# One colour per country; cycled with modulo if there are more countries than colours.
colors = [
    '#1f77b4',  # muted blue
    '#ff7f0e',  # safety orange
    '#2ca02c',  # cooked asparagus green
    '#d62728',  # brick red
    '#9467bd',  # muted purple
    '#8c564b',  # chestnut brown
]

# country label -> line renderer, so the JS callback can toggle visibility by label.
renderers = {}
for i, country in enumerate(required_countries):
    # Sort by year so the line is drawn left-to-right even if the CSV rows
    # are not in chronological order.
    country_data = (
        required_countries_data[required_countries_data['Entity'] == country]
        .sort_values('Year')
    )
    cds = ColumnDataSource(country_data)
    renderers[country] = p.line(x='Year', y='Share of new cars that are electric',
                                source=cds, color=colors[i % len(colors)],
                                line_width=2, legend_label=country)

p.legend.click_policy = "hide"
p.legend.location = 'top_left'

# CheckboxButtonGroup with all countries selected by default
checkbox_button_group = CheckboxButtonGroup(labels=required_countries,
                                            active=list(range(len(required_countries))))

# JavaScript callback to show/hide lines: hide everything first, then re-show
# only the lines whose checkbox is currently active.
callback = CustomJS(args=dict(renderers=renderers), code="""
    for (var country in renderers) {
        renderers[country].visible = false;
    }
    for (var i of cb_obj.active) {
        var country = cb_obj.labels[i];
        renderers[country].visible = true;
    }
""")
checkbox_button_group.js_on_change('active', callback)
layout = column(checkbox_button_group, p)
show(layout)

In [4]:
output_notebook()
p = figure(title="Electric Car Sales Share Comparison", x_axis_label='Year', y_axis_label='Electric Cars Sales Share (%)')

# One colour per country; cycled with modulo if there are more countries than colours.
colors = [
    '#1f77b4',  # muted blue
    '#ff7f0e',  # safety orange
    '#2ca02c',  # cooked asparagus green
    '#d62728',  # brick red
    '#9467bd',  # muted purple
    '#8c564b',  # chestnut brown
]

share_column = 'Share of new cars that are electric'
# Last observed year of every plotted country; used to place the vertical
# "forecast starts here" marker after the loop.
last_observed_years = []

for i, country in enumerate(required_countries):
    country_data = data[data['Entity'] == country].sort_values('Year')
    years = country_data['Year'].values
    shares = country_data[share_column].values
    last_year = years[-1]
    last_observed_years.append(last_year)
    color = colors[i % len(colors)]

    # Fit a straight line through all observations except the most recent one.
    # NOTE(review): the newest data point is left out of the fit even though the
    # model is then used to extrapolate one year *past* it -- confirm this
    # hold-out is intentional rather than an off-by-one.
    X = years[:-1].reshape(-1, 1)
    y = shares[:-1]
    reg = LinearRegression().fit(X, y)
    # Predict the value for the year after the last observation
    predicted_value = reg.predict([[last_year + 1]])

    # Observed series (solid)
    p.line(x='Year', y=share_column, source=ColumnDataSource(country_data),
           color=color, line_width=2, legend_label=country)
    # Extrapolated segment (dashed) from the last observation to the prediction.
    # It shares the legend label so a legend click hides both pieces together.
    p.line(x=[last_year, last_year + 1],
           y=[shares[-1], predicted_value[0]],
           color=color, line_width=2, line_dash='dashed', legend_label=country)

p.legend.click_policy = "hide"
p.legend.location = 'top_left'
# Vertical marker at the latest observed year across the plotted countries
# (computed explicitly instead of relying on the leaked loop variable).
p.add_layout(Span(location=int(max(last_observed_years)), dimension='height', line_dash='dashed', line_color='gray'))
show(p)

From the plot above it can be observed that Norway has by far the highest share of new cars that are electric compared to the other countries. Such a result is expected, since Norway is known for its very generous EV subsidies, as explained here: https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/norway/incentives-legislations. Surprisingly, the USA is below the world average, mostly due to cheaper gas prices compared to other countries. Interestingly, China surpassed the EU in 2018 but still has a lower share than Denmark, which has the highest share of new cars that are electric within the EU.

### 5.1.2 Fully-electric cars vs. plug-in hybrids:

“Electric cars” include battery-electric and plug-in hybrid vehicles. The difference is that fully battery-electric cars do not have an internal combustion engine, whereas plug-in hybrids have a rechargeable battery and electric motor and an internal combustion engine that runs on gasoline.

- A plug-in hybrid can be driven as a standard petrol car if the owner does not charge the battery.

- The battery in plug-in hybrids is smaller and has a shorter range than battery-electric cars, so over longer distances, the car starts running on gasoline once the battery has run out.

- Since plug-in hybrids will often run on petrol, they tend to emit more carbon than battery-electric cars. However, they do usually have lower emissions than petrol or diesel cars.

In the bar charts below we compare the share of battery-electric and plug-in hybrid cars sold across different countries and worldwide.

[2] https://www.iea.org/reports/global-ev-outlook-2023

In [5]:
car_sales_data = pd.read_csv('share-car-sales-battery-plugin.csv')

# The two series stacked in every chart, listed bottom-to-top.
share_columns = ['Battery-electric as a share of cars sold', 'Plug-in hybrid as a share of cars sold']

# Collapse to one row per (Entity, Year). Only the two plotted share columns
# are aggregated, so any other column in the CSV is never summed.
car_sales_grouped = (
    car_sales_data
    .groupby(['Entity', 'Year'], as_index=False)[share_columns]
    .sum()
)
countries = ["United States", "China", "European Union (27)", "Denmark", "Norway","World"]
output_notebook()

# list to hold the individual plots
plots = []

# First two colours of a colour-blind-safe palette: battery-electric, plug-in hybrid
colors = Colorblind5[:2]

# One stacked bar chart per country
for country in countries:
    # Filter the data for the country
    country_sales = car_sales_grouped[car_sales_grouped['Entity'] == country]
    source = ColumnDataSource(country_sales)

    # figure with a title and axis labels; tools="" disables the toolbar
    p = figure(title=f"Share of New Cars Sold in {country} that are Battery-Electric and Plug-in Hybrid",
               x_axis_label='Year', y_axis_label='Share of Cars Sold (%)',
               height=450, width=600, tools="")

    # Stack the bars for 'Battery-electric' and 'Plug-in hybrid'
    p.vbar_stack(share_columns,
                 x='Year', width=0.9, color=colors, source=source,
                 legend_label=['Battery-electric', 'Plug-in hybrid'])

    # Tilt the year labels slightly (pi/8 rad = 22.5 degrees) so they do not overlap
    p.xaxis.major_label_orientation = pi / 8
    p.legend.location = "top_left"
    p.legend.click_policy = "hide"
    plots.append(p)

# grid layout of plots, two per row
grid = gridplot(plots, ncols=2)
show(grid)

It can be observed that Norway and Denmark are the only ones of the above with a decline in the share of plug-in hybrid cars sold in the last year. In all of the displayed bar plots, exponential growth of battery-electric vehicles is observed. That implies growing awareness of environmental issues and a shared interest in eco-friendly technology, together with a partial transition towards fully electric cars by a large share of the population, represented by plug-in hybrid sales. The transition to fully electric cars is not yet complete due to concerns about vehicle range, and many opt for hybrids or gasoline-fueled vehicles to travel longer distances.

### 5.1.3 Top Manufacturers:

Sales of plug-in electric vehicles (PEVs) grew rapidly from 2011 to 2018. Technology improvements, cost reduction, increasing model choice, maturing charging infrastructure, and economic recovery have continued to influence and support increased sales. Until 2018, the Chevrolet Volt had been on the market the longest and had the most overall sales, but the model was discontinued in 2019. In 2018, the newly introduced Tesla Model 3 rapidly increased vehicle sales and established itself as the best-selling plug-in electric vehicle, with nearly 50% of the market share. In the plot below, the top 10 EV manufacturers by total sales for the period 2012–2019 are displayed.

[3] https://www.anl.gov/esia/light-duty-electric-drive-vehicles-monthly-sales-updates

In [6]:
sales_data = pd.read_csv('Electric Car Sales by Model in USA.csv')

output_notebook()

# Year columns of the CSV, in chronological order. They are strings because
# they are column labels, and they double as the categorical x-axis range.
years_sorted = [str(year) for year in range(2012, 2020)]

# Coerce the yearly sales columns to numbers: strip thousands separators and
# turn anything unparseable into NaN.
for year_col in years_sorted:
    sales_data[year_col] = pd.to_numeric(sales_data[year_col].astype(str).str.replace(',', ''), errors='coerce')

# Sum the sales per 'Make', restricted to the year columns so that no
# non-numeric column is aggregated. min_count=1 keeps a make/year NaN when
# every contributing value is missing, instead of reporting 0.
sales_by_make = sales_data.groupby('Make')[years_sorted].sum(min_count=1)

# Total sales per make across all years
sales_by_make['Total_Sales'] = sales_by_make[years_sorted].sum(axis=1)

# Top 10 makes by total sales, in descending order of total
top_makes = sales_by_make['Total_Sales'].nlargest(10).index.tolist()
top_makes_sales = sales_by_make.loc[top_makes, years_sorted].reset_index()

# Long format for plotting: one row per (Make, Year). value_vars pins both the
# set and the order of the melted columns, so 'Total_Sales' never leaks in as a "year".
top_makes_long = pd.melt(top_makes_sales, id_vars=['Make'], value_vars=years_sorted,
                         var_name='Year', value_name='Sales')

# Colour map make -> colour. Category20 has no palette smaller than 3, so ask
# for at least 3 and let zip() ignore the surplus; at most 10 makes exist here.
makes = top_makes_long['Make'].unique().tolist()
colors = Category20[max(3, len(makes))]
make_color = {make: color for make, color in zip(makes, colors)}

p = figure(x_range=years_sorted, title="Total Car Sales by Make and Year (Top 10)",
           x_axis_label='Year', y_axis_label='Total Sales', sizing_mode="stretch_width", height=600)

# One line plus circular markers per make, sharing a legend entry so a legend
# click hides both together.
for make in makes:
    make_data = top_makes_long[top_makes_long['Make'] == make]
    make_source = ColumnDataSource(make_data)
    p.line(x='Year', y='Sales', source=make_source, legend_label=make, color=make_color[make])
    p.scatter(x='Year', y='Sales', source=make_source, legend_label=make, color=make_color[make],
              marker='circle', size=8)

p.legend.click_policy = "hide"
p.legend.title = 'Make'
hover = HoverTool()
hover.tooltips = [("Make", "@Make"), ("Sales", "@Sales{0,0}"), ("Year", "@Year")]
p.add_tools(hover)
p.xaxis.major_label_orientation = pi / 4
show(p)

From the plot above it can be observed that all the manufacturers increased their sales over the 2012–2019 period. Chevrolet had the most sales until 2017, when Tesla took the lead with further exponential growth. This mirrors the (closely correlated) growth in the share of new EVs sold worldwide.

## 5.2 ML - Random Forest

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

# Load the charging-station usage records used for the forecast below.
# NOTE: this rebinds `data`, the same name the sales-share cells earlier in
# the notebook read from.
data = pd.read_csv(filepath_or_buffer='EVChargingStationUsage.csv')

#create lagged features
def buildLaggedFeatures(s, lag=30, dropna=True):
    """
    Return a DataFrame holding `s` next to `lag` shifted copies of itself.

    Assumes that 's' is a pandas Series with a datetime index.

    The first column is the unshifted series and keeps the name `s.name`;
    columns `lag_1` ... `lag_<lag>` contain `s` shifted down by that many rows
    (a positional shift, not a calendar shift). When `dropna` is true, every
    row containing a missing value is removed, which includes the first `lag`
    rows.
    """
    shifted_copies = []
    labels = []
    for offset in range(lag + 1):
        shifted_copies.append(s.shift(offset))
        # Offset 0 is the series itself, so it keeps its own name.
        labels.append(s.name if offset == 0 else 'lag_{}'.format(offset))

    lagged = pd.concat(shifted_copies, axis=1)
    lagged.columns = labels
    return lagged.dropna() if dropna else lagged

# Tunable windows (all measured in rows of the daily series).
lag_days = 30      # number of lagged values fed to the model as features
test_days = 30     # length of the evaluation window
cutoff_days = 10   # most recent rows left out of both training and evaluation

# Parse each session's start timestamp from the date and time-zone columns.
# The result goes into a new frame so the raw `data` table is left untouched
# and this cell can be re-run safely.
# NOTE(review): rows whose combined string cannot be parsed become NaT and are
# dropped silently -- worth checking how many rows that removes.
start_datetime = pd.to_datetime(data['Start Date'] + ' ' + data['Start Time Zone'], errors='coerce')
charging_sessions = (
    data
    .assign(**{'Start DateTime': start_datetime})
    .dropna(subset=['Start DateTime'])
)

# Total 'Energy (kWh)' per calendar day across all stations
daily_energy = (
    charging_sessions
    .groupby(charging_sessions['Start DateTime'].dt.floor('D'))['Energy (kWh)']
    .sum()
)

# Chronological split: train on everything before the evaluation window,
# evaluate on `test_days` rows, and ignore the final `cutoff_days` rows.
# The end position is spelled out (rather than sliced with `-cutoff_days`) so
# that cutoff_days = 0 does not produce an empty test set.
lagged_features = buildLaggedFeatures(daily_energy, lag=lag_days)
holdout_start = -(test_days + cutoff_days)
holdout_end = len(lagged_features) - cutoff_days
train_data = lagged_features.iloc[:holdout_start]
test_data = lagged_features.iloc[holdout_start:holdout_end]

# Model training and predictions
X_train = train_data.drop(columns=['Energy (kWh)'])
y_train = train_data['Energy (kWh)']
X_test = test_data.drop(columns=['Energy (kWh)'])
y_test = test_data['Energy (kWh)']
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

# Prediction and RMSE calculation.
# NOTE: each test row's features are the *observed* previous values, so these
# are one-step-ahead predictions rather than a recursive multi-day forecast.
y_pred = rf_regressor.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE for All Stations: {rmse}')

# Columns for the plot. Kept under its own name instead of rebinding `data`,
# which would replace the DataFrame with a plain dict.
forecast_columns = {'date': y_test.index,
                    'actual': y_test.values,
                    'forecasted': y_pred}
source = ColumnDataSource(forecast_columns)

p = figure(width=1000, height=600, x_axis_type='datetime',
           title=f'Daily Energy Consumption for All Stations - {test_days} Days Forecast')
p.line(x='date', y='actual', line_width=2, color="#6baed6", legend_label='Actual', source=source)
p.line(x='date', y='forecasted', line_width=2, color="#fb6a4a", legend_label='Forecasted', source=source)
# vline mode shows both values for the date under the cursor
p.add_tools(HoverTool(tooltips=[("Date", "@date{%F}"),
                                ("Actual", "@actual{0.2f} kWh"),
                                ("Forecasted", "@forecasted{0.2f} kWh")],
                     formatters={'@date': 'datetime'}, mode='vline'))
p.legend.location = 'top_left'
p.legend.click_policy = "hide"
p.xaxis.axis_label = "Date"
p.yaxis.axis_label = "Energy (kWh)"
p.xaxis.formatter.days = '%Y-%m-%d'
show(p)

RMSE for All Stations: 128.9409593573127


## 6. Discussion

## 7. Contributions

| Section       | Andro | Matija | Lucian |
|---------------|----------|----------|----------|
| Motivation  |          |          |          |
| Basic Stats |      |          |          |
| Data Analysis   |          |          |          |
| Genre   |     100%     |    0%      |     0%     |
| Visualizations       |          |          |          |
| Discussion    |     0%     |    100%      |    0%      |