# Test Notebook


# Introduction

<!--  -->TODO: describe topic and perspectives


In [2]:
# Import packages
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd


In [3]:
# Retrieve dataset data.
chunk_size = 1000000
dtypes = {"Country": str, "Mk": str, "Cn": str, "m (kg)": float, "Enedc (g/km)": float, "Ewltp (g/km)": float, "W (mm)": float, "Ft": str, "Ernedc (g/km)": float, "Erwltp (g/km)": float, "year": int}
chunk_container = pd.read_csv("CO2_data.csv", dtype=dtypes, chunksize=chunk_size)

# Split the dataset based on year.
co2_2015 = pd.DataFrame()
co2_2016 = pd.DataFrame()
co2_2017 = pd.DataFrame()
co2_2018 = pd.DataFrame()
co2_2019 = pd.DataFrame()
co2_2020 = pd.DataFrame()
co2_2021 = pd.DataFrame()
for chunk in chunk_container:
    co2_2015 = pd.concat([co2_2015, chunk[chunk["year"] == 2015]], ignore_index=True)
    co2_2016 = pd.concat([co2_2016, chunk[chunk["year"] == 2016]], ignore_index=True)
    co2_2017 = pd.concat([co2_2017, chunk[chunk["year"] == 2017]], ignore_index=True)
    co2_2018 = pd.concat([co2_2018, chunk[chunk["year"] == 2018]], ignore_index=True)
    co2_2019 = pd.concat([co2_2019, chunk[chunk["year"] == 2019]], ignore_index=True)
    co2_2020 = pd.concat([co2_2020, chunk[chunk["year"] == 2020]], ignore_index=True)
    co2_2021 = pd.concat([co2_2021, chunk[chunk["year"] == 2021]], ignore_index=True)
co2_2021.head()


Unnamed: 0,Cn,Country,Enedc (g/km),Ernedc (g/km),Erwltp (g/km),Ewltp (g/km),Ft,Mk,W (mm),m (kg),year
0,MITSUBISHI OUTLANDER,IS,40.0,,,46.0,petrol-electric,Mitsubishi,2670.0,1965.0,2021
1,MITSUBISHI OUTLANDER,IS,40.0,,,46.0,petrol-electric,Mitsubishi,2670.0,1965.0,2021
2,MITSUBISHI OUTLANDER,IS,40.0,,,46.0,petrol-electric,Mitsubishi,2670.0,1965.0,2021
3,MITSUBISHI OUTLANDER,IS,40.0,,,46.0,petrol-electric,Mitsubishi,2670.0,1965.0,2021
4,MITSUBISHI OUTLANDER,IS,40.0,,,46.0,petrol-electric,Mitsubishi,2670.0,1965.0,2021


# Dataset and preprocessing

<!--  -->TODO: describe dataset and how we preprocess them


In [4]:
# Set to true to convert data to csv. False otherwise.
to_csv = False

In [5]:
# Compute data for bar graph with average CO2 emission per country
country_emission = pd.DataFrame()
for df in [co2_2018, co2_2019, co2_2020, co2_2021]:
    # Get subset of columns.
    df = df[["Country", "year", "Ewltp (g/km)"]]
    # Group by country and year and compute mean and std.
    df = df.groupby(["Country", "year"]).agg({"Ewltp (g/km)": ['mean', 'std']}).reset_index()
    # Change mutliindex to single index.
    df.columns = df.columns.map(' '.join).str.strip()
    print(df)
    country_emission = pd.concat([country_emission, df], ignore_index=True)

if to_csv:
    country_emission.to_csv("country_emission.csv")


   Country  year  Ewltp (g/km) mean  Ewltp (g/km) std
0       AT  2018         145.972987         47.059240
1       BE  2018         146.957196         27.408963
2       BG  2018         149.522771         41.876921
3       CY  2018         143.808365         30.649560
4       CZ  2018         159.296766         36.592046
5       DE  2018         154.979487         40.299402
6       DK  2018         139.274798         25.036228
7       EE  2018         157.704236         27.920213
8       ES  2018         146.405005         26.798571
9       FI  2018         140.103806         37.205318
10      FR  2018         138.370309         27.845793
11      GB  2018         151.319423         37.638863
12      GR  2018         136.292338         18.582752
13      HR  2018         141.044349         24.246063
14      HU  2018         150.811770         38.868325
15      IE  2018         137.711411         25.155394
16      IS  2018         120.500000         71.624866
17      IT  2018         144

In [6]:
# Plot bar graph with average CO2 emission reduced by innovative technologies per country
fig = px.bar(country_emission,
    x="Country",
    y="Ewltp (g/km) mean",
    error_y="Ewltp (g/km) std",
    facet_col="year",
    facet_col_wrap=1,
    title="Visualisatie 2: Gemiddelde CO2-uitstoot vermindering vanwege innovatieve technologieën per land in de EU tussen 2019 en 2021",
    height=800,
    labels={
        "Ewltp (g/km) mean": "CO2 emission WLTP (g/km)"
    },
)
fig.for_each_annotation(lambda a: a.update(text=a.text.replace('year', 'Jaar')))
fig.show()


Caption

In [7]:
# Compute data for bar graph with average CO2 emission reduced by innovative technologies per country
country_emission_reduction = pd.DataFrame()
for df in [co2_2019, co2_2020, co2_2021]:
    # Get subset of columns.
    df = df[["Country", "year", "Erwltp (g/km)"]]
    # Group by country and year and compute mean and std.
    df = df.groupby(["Country", "year"]).agg({"Erwltp (g/km)": ['mean', 'std']}).reset_index()
    # Change mutliindex to single index.
    df.columns = df.columns.map(' '.join).str.strip()
    country_emission_reduction = pd.concat([country_emission_reduction, df], ignore_index=True)

if to_csv:
    country_emission_reduction.to_csv("country_emission_reduction.csv")


In [8]:
# Plot bar graph with average CO2 emission reduced by innovative technologies per country
fig = px.bar(country_emission_reduction,
    x="Country",
    y="Erwltp (g/km) mean",
    error_y="Erwltp (g/km) std",
    facet_col="year",
    facet_col_wrap=1,
    title="CO2 emission reduction by country in EU from 2019-2021",
    height=800,
    labels={
        "Erwltp (g/km) mean": "CO2 emission reduction WLTP (g/km)"
    },
)
fig.show()


In [9]:
# Compute pie chart with average CO2 emission per fuel type.
ft_mean_emission = co2_2021.groupby(co2_2021["Ft"].str.lower())["Ewltp (g/km)"].mean()

if to_csv:
    ft_mean_emission.to_csv("ft_mean_emission.csv")


In [10]:
# Plot pie chart with average CO2 emission per fuel type.
fig = px.pie(ft_mean_emission,
    names=ft_mean_emission.index,
    values="Ewltp (g/km)",
    title="CO2 emission by fuel type in 2021",
    hole=0.8,
    labels={
        "Ewltp (g/km)": "CO2 emission WLTP (g/km)"
    },
)
fig.update_layout(showlegend=False)
fig.update_traces(textposition='outside', textinfo="label + percent")
fig.show()


In [11]:
# Compute average emission in EU.
years = []
mean_emission = []
for df in [co2_2017, co2_2018, co2_2019, co2_2020, co2_2021]:
    mean_emission.append(df["Ewltp (g/km)"].mean())
    years.append(df["year"][0])

eu_emission = pd.DataFrame({"year": years, "Ewltp (g/km)": mean_emission})

if to_csv:
    eu_emission.to_csv("eu_emission.csv")


In [12]:
# Plot average emission in EU over the years.
fig = px.line(eu_emission, x="year", y="Ewltp (g/km)", title="Average emission over the years in EU.")
fig.update_xaxes(type='category')
fig.show()


In [13]:
# Lijst om de resultaten in op te slaan
emissions_results = []

# Itereer over elke chunk
for chunk in pd.read_csv("CO2_data.csv", chunksize=1000000):
    # Doe de berekeningen
    emission_result = chunk.groupby(['year', 'Ft'])["Ewltp (g/km)"].sum().reset_index(name='total_emission')
    # Voeg het resultaat toe aan de lijst
    emissions_results.append(emission_result)



Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.



In [14]:

# Concateneer alle resultaten in een dataframe
df_emissions = pd.concat(emissions_results)

# Bereken de totale CO2-uitstoot per jaar
total_emission = df_emissions.groupby('year')["total_emission"].sum().reset_index()
total_emission.to_csv("total_emission.csv")

In [15]:
# Maak de CO2-uitstoot lijndiagram
fig = px.line(total_emission, x="year", y="total_emission", 
              title="Total CO2 Emission (2017-2021)", 
              labels={'total_emission':'Total CO2 Emission', 'year':'Year'})
fig.show()

In [16]:
# Lijst om de resultaten in op te slaan
counts_results = []

# Itereer over elke chunk
for chunk in pd.read_csv("CO2_data.csv", chunksize=1000000):
    # Classificeer elektrische en hybride auto's als 'Elektrisch', en alle anderen als 'Anders'
    chunk['Ft'] = chunk['Ft'].apply(lambda x: 'Elektrisch' if 'electric' in x or 'hybrid' in x else 'Anders')
    
    # Doe de berekeningen
    count_result = chunk.groupby(['year', 'Ft'], group_keys=False).size().reset_index(name='counts')
    # Voeg het resultaat toe aan de lijst
    counts_results.append(count_result)



Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.



In [17]:
# Concateneer alle resultaten in een dataframe
df_counts = pd.concat(counts_results)

# Splits bij jaar en brandstoftype
df_counts = df_counts.groupby(["year", "Ft"]).agg({"counts": 'sum'}).reset_index()

# Bereken totale emissie per jaar.
df_counts["sum"] = df_counts.groupby("year")["counts"].transform('sum')

# Bereken de percentages
df_counts["percent"] = 100 * df_counts["counts"] / df_counts["sum"]
df_counts.to_csv("fuel_type_distribution.csv")

print(df_counts)

    year          Ft    counts       sum    percent
0   2015      Anders    415525    419642  99.018926
1   2015  Elektrisch      4117    419642   0.981074
2   2016      Anders    473778    479015  98.906715
3   2016  Elektrisch      5237    479015   1.093285
4   2017      Anders   4860566   4936747  98.456858
5   2017  Elektrisch     76181   4936747   1.543142
6   2018      Anders  14600310  14900305  97.986652
7   2018  Elektrisch    299995  14900305   2.013348
8   2019      Anders   7996139   9864373  81.060793
9   2019  Elektrisch   1868234   9864373  18.939207
10  2020      Anders   7996139   9864373  81.060793
11  2020  Elektrisch   1868234   9864373  18.939207
12  2021      Anders   7996139   9864373  81.060793
13  2021  Elektrisch   1868234   9864373  18.939207


In [18]:
# Definieer kleuren voor de brandstoftypes
unique_ft = df_counts['Ft'].unique()
colors = ['blue' if ft == 'Elektrisch' else 'grey' for ft in unique_ft]
color_dict = dict(zip(unique_ft, colors))

# Maak het percentage staafdiagram
fig = px.bar(df_counts, x="year", y="percent", color="Ft", 
             title="Fuel Type Distribution (2017-2021)", 
             labels={'percent':'Percentage of Cars', 'year':'Year', 'Ft':'Fuel Type'}, 
             hover_data=['counts'], 
             color_discrete_map=color_dict) # Gebruik de kleurenkaart
fig.show()

In [51]:

total_emission_per_ft = co2_2021.groupby('Ft').agg({"Ewltp (g/km)": ['sum', 'size']})
total_emission_per_ft.columns = total_emission_per_ft.columns.map(' '.join).str.strip()
total_emission_per_ft = total_emission_per_ft.rename(columns={
    "Ewltp (g/km) sum": "Ewltp (g/km)",
    "Ewltp (g/km) size": "n",
})
total_emission_per_ft = total_emission_per_ft.sort_values(by="Ewltp (g/km)", ascending=False).reset_index()
total_emission_per_ft.to_csv("total_emission_per_ft.csv")

print(total_emission_per_ft)

# Plot settings
fig = px.bar(total_emission_per_ft,
    x="Ft",
    y="Ewltp (g/km)",
    title="Totale CO2-uitstoot van personenauto's in de EU per brandstof type (2021)",
    custom_data=[total_emission_per_ft["n"]],
    labels={
        "Ewltp (g/km)": "CO2 emissie in WLTP (g/km)",
        "Ft": "Brandstof type",
    }
)
fig.update_traces(hovertemplate="CO2 emissie in WLTP (g/km)=%{y}<br>Aantal auto's=%{customdata[0]}")
fig.show()


                Ft  Ewltp (g/km)        n
0           petrol   735097381.0  5480068
1           diesel   322874600.0  2237878
2  petrol-electric    32925459.0   832367
3              lpg    26000186.0   222429
4               ng     4100930.0    39123
5  diesel-electric     1984472.0    55304
6              e85     1150701.0     7210
7    ng-biomethane      862505.0     7630
8            other      220954.0     1801
9         electric           0.0   980563


# Links
Cleaned dataset: https://www.kaggle.com/datasets/yoshifu/co2-emission-from-cars-2015-2021

GitHub repository: https://github.com/FuYoshi/data_story_project

Original dataset: https://www.eea.europa.eu/data-and-maps/data/co2-cars-emission-18
