# Data Preprocessing

Student names: Laiba Shamsul, Popke Snoek, Yoshi Fu, Pepeyn Velthuijse

Team number: G4

The following code is used to preprocess the datasets into files that are used for the data story. The kernel might die due to
the dataset being very large.

Note that the dataset must be downloaded and placed in the same folder as this notebook. You may find visualizations that did not make it into the actual data story. The dataset is too big to put on Git or Git LFS, and too big for pandas.read_csv() to read through a share link on Google Drive/OneDrive.

The entire cleaned dataset can be downloaded from: https://drive.google.com/file/d/1sTGPzVfk017Y8n2KPgsIxP9eXdl02YgH/view

If you want to see the code that produced the cleaned dataset, look at the 'clean.py' file in the GitHub repository: https://github.com/FuYoshi/data_story_project


In [None]:
# Import packages
import pandas as pd


In [None]:
# Read the big dataset in chunks.
chunk_size = 1000000
dtypes = {"Country": str, "Mk": str, "Cn": str, "m (kg)": float, "Enedc (g/km)": float, "Ewltp (g/km)": float, "W (mm)": float, "Ft": str, "Ernedc (g/km)": float, "Erwltp (g/km)": float, "year": int}
chunk_container = pd.read_csv("CO2_data.csv", dtype=dtypes, chunksize=chunk_size)

# Split the dataset by year.
# This ensures that the kernel does not die due to memory shortage and some plots only require data of 2021.
# The per-year pieces are collected in lists and concatenated once per year:
# calling pd.concat inside the chunk loop would re-copy all rows gathered so
# far on every chunk, and seeding the result with an empty DataFrame depends
# on how pandas treats empty frames when it determines the result dtypes.
pieces_by_year = {year: [] for year in range(2015, 2022)}
for chunk in chunk_container:
    for year, pieces in pieces_by_year.items():
        pieces.append(chunk[chunk["year"] == year])


def _combine_pieces(pieces):
    """Concatenate the per-chunk pieces of one year into a single frame.

    Returns an empty DataFrame when no chunks were read, because pd.concat
    raises ValueError when it is given an empty list.
    """
    if not pieces:
        return pd.DataFrame()
    return pd.concat(pieces, ignore_index=True)


# Pop each list before combining so its pieces can be released as soon as the
# combined frame for that year exists.
co2_2015 = _combine_pieces(pieces_by_year.pop(2015))
co2_2016 = _combine_pieces(pieces_by_year.pop(2016))
co2_2017 = _combine_pieces(pieces_by_year.pop(2017))
co2_2018 = _combine_pieces(pieces_by_year.pop(2018))
co2_2019 = _combine_pieces(pieces_by_year.pop(2019))
co2_2020 = _combine_pieces(pieces_by_year.pop(2020))
co2_2021 = _combine_pieces(pieces_by_year.pop(2021))
co2_2021.head()


In [None]:
# Set to True to write each computed dataset to a CSV file in the cells below.
# Set to False to only compute and display the datasets.
to_csv = True

In [None]:
# Compute data for bar graph with average CO2 emission per country
emission_columns = ["Country", "year", "Ewltp (g/km)", "Erwltp (g/km)"]
yearly_summaries = []
for yearly in [co2_2015, co2_2016, co2_2017, co2_2018, co2_2019, co2_2020, co2_2021]:
    # Group by country and year; compute the mean and sum of "Ewltp (g/km)"
    # and the mean of "Erwltp (g/km)" on the needed subset of columns.
    summary = (
        yearly[emission_columns]
        .groupby(["Country", "year"])
        .agg({"Ewltp (g/km)": ['mean', 'sum'], "Erwltp (g/km)": 'mean'})
        .reset_index()
    )
    # Flatten the two-level column labels into single strings such as
    # "Ewltp (g/km) mean"; strip() removes the trailing space left on the
    # group-key columns, whose second level is empty.
    summary.columns = summary.columns.map(' '.join).str.strip()
    yearly_summaries.append(summary)

# Concatenate once instead of growing a DataFrame inside the loop, so the
# result never starts from an empty, column-less frame.
country_emission = pd.concat(yearly_summaries, ignore_index=True)

if to_csv:
    country_emission.to_csv("country_emission.csv")

country_emission.head()


In [None]:
# List that collects the per-chunk count results
counts_results = []

def classify_electric(x: object) -> str:
    """Map a fuel-type string to one of three labels.

    Args:
        x: The fuel-type value to classify. Anything that is not a string
            (for example a NaN produced by a missing CSV field) is treated
            as not electric instead of raising TypeError.

    Returns:
        "Semi-Elektrisch" if the text contains "-electric" (for example
        "petrol-electric"), "Elektrisch" if it otherwise contains
        "electric", and "Anders" in every other case.
    """
    if not isinstance(x, str):
        return "Anders"
    # Check the more specific "-electric" first: every string that contains
    # it also contains "electric".
    if "-electric" in x:
        return "Semi-Elektrisch"
    if "electric" in x:
        return "Elektrisch"
    return "Anders"

# Iterate over the dataset chunk by chunk. Only the two columns that are used
# below are loaded, so the other columns are not kept in memory for each chunk.
for chunk in pd.read_csv("CO2_data.csv", usecols=["year", "Ft"], chunksize=1000000):
    # Classify every fuel type as 'Semi-Elektrisch', 'Elektrisch' or 'Anders'
    chunk['Ft'] = chunk['Ft'].apply(classify_electric)

    # Count the rows per year and fuel class within this chunk
    count_result = chunk.groupby(['year', 'Ft']).size().reset_index(name='counts')
    # Add the result to the list
    counts_results.append(count_result)


In [None]:
# Stack all per-chunk results into one dataframe
stacked_counts = pd.concat(counts_results)

# Add up the chunk counts for every combination of year and fuel class
df_counts = stacked_counts.groupby(["year", "Ft"])[["counts"]].sum().reset_index()

# Total count per year, repeated on every row that belongs to that year
yearly_totals = df_counts.groupby("year")["counts"].transform('sum')
df_counts["sum"] = yearly_totals

# Share of each fuel class within its year, as a percentage
df_counts["percent"] = (100 * df_counts["counts"]).div(df_counts["sum"])

if to_csv:
    df_counts.to_csv("fuel_type_distribution.csv")

df_counts.head()

In [None]:
# Per fuel type in the 2021 data: the sum of "Ewltp (g/km)" and the group size.
fuel_totals = co2_2021.groupby('Ft').agg({"Ewltp (g/km)": ['sum', 'size']})

# The aggregation yields two-level column labels, ("Ewltp (g/km)", "sum") and
# ("Ewltp (g/km)", "size"), in that order; give them flat names.
fuel_totals.columns = ["Ewltp (g/km)", "n"]

# Largest summed emission first; reset_index turns 'Ft' back into a column.
total_emission_per_ft = fuel_totals.sort_values(by="Ewltp (g/km)", ascending=False)
total_emission_per_ft = total_emission_per_ft.reset_index()

if to_csv:
    total_emission_per_ft.to_csv("total_emission_per_ft.csv")

total_emission_per_ft.head()


In [None]:
# Get a random sample of cars in 2021. Get their emission and mass.
# We take a random sample because there is so much data that the site becomes slow otherwise.
# The sample size is capped at the number of available rows: sampling without
# replacement raises ValueError when more rows are requested than exist.
# No random_state is passed, so every run draws a different sample.
sample_size = min(100000, len(co2_2021))
car_emission_mass = co2_2021.sample(n=sample_size, replace=False)
car_emission_mass = car_emission_mass[["Ft", "Ewltp (g/km)", "m (kg)"]]

if to_csv:
    car_emission_mass.to_csv("car_emission_mass.csv")

car_emission_mass.head()


In [None]:
# Get the emission per car brand
# 'Cn' is selected here but is not used by the aggregation below.
brand_columns = ['Mk', 'Cn', 'Ewltp (g/km)', 'year', 'Ft']
frames = [
    yearly[brand_columns]
    for yearly in (co2_2015, co2_2016, co2_2017, co2_2018, co2_2019, co2_2020, co2_2021)
]
# Keep the per-year names available for any cell that refers to them.
(
    bycarbrand_2015,
    bycarbrand_2016,
    bycarbrand_2017,
    bycarbrand_2018,
    bycarbrand_2019,
    bycarbrand_2020,
    bycarbrand_2021,
) = frames
result = pd.concat(frames)

# Compute the average "Ewltp (g/km)" for every combination of car brand ('Mk'),
# year and fuel type ('Ft').
car_emission = (
    result.groupby(["Mk", "year", "Ft"])
    .agg({"Ewltp (g/km)": ['mean']})
    .reset_index()
)
# Change multiindex columns to single-level names such as "Ewltp (g/km) mean";
# strip() removes the trailing space left on the group-key columns.
car_emission.columns = car_emission.columns.map(' '.join).str.strip()

if to_csv:
    car_emission.to_csv("car_emission.csv")

car_emission.head()