# Data Preprocessing

Student namen: Laiba Shamsul, Popke Snoek, Yoshi Fu, Pepeyn Velthuijse

Team nummer: G4

Hieronder is de code weergegeven die is gebruikt om de gefilterde dataset te verwerken tot de bestanden die in de data story worden gebruikt.

We verwijzen naar de GitHub-repository voor de code waarmee de verschillende originele datasets zijn samengevoegd. Die code staat in het Python-bestand 'clean.py': https://github.com/FuYoshi/data_story_project


In [1]:
# Import packages
import pandas as pd


In [2]:
# Read the big dataset in chunks so the whole file is never parsed in one go.
chunk_size = 1000000
dtypes = {"Country": str, "Mk": str, "Cn": str, "m (kg)": float, "Enedc (g/km)": float, "Ewltp (g/km)": float, "W (mm)": float, "Ft": str, "Ernedc (g/km)": float, "Erwltp (g/km)": float, "year": int}
chunk_container = pd.read_csv("CO2_data.csv", dtype=dtypes, chunksize=chunk_size)

# Split the dataset by year.
# This ensures that the kernel does not die due to memory shortage and some plots only require data of 2021.
# The per-year slices of every chunk are collected in lists and concatenated
# once at the end: calling pd.concat inside the loop would re-copy every
# already accumulated row for each new chunk.
years = range(2015, 2022)
parts_by_year = {year: [] for year in years}
for chunk in chunk_container:
    for year in years:
        parts_by_year[year].append(chunk[chunk["year"] == year])


def _combine_parts(parts):
    """Concatenate the slices of one year; return an empty frame when there are none."""
    # pd.concat raises ValueError on an empty list (a file without any chunk).
    if not parts:
        return pd.DataFrame()
    return pd.concat(parts, ignore_index=True)


# pop() releases the slices of a year as soon as its frame has been built,
# which keeps the peak memory use down.
(
    co2_2015,
    co2_2016,
    co2_2017,
    co2_2018,
    co2_2019,
    co2_2020,
    co2_2021,
) = (_combine_parts(parts_by_year.pop(year)) for year in years)
co2_2021.head()


Unnamed: 0,Cn,Country,Enedc (g/km),Ernedc (g/km),Erwltp (g/km),Ewltp (g/km),Ft,Mk,W (mm),m (kg),year
0,MITSUBISHI OUTLANDER,IS,40.0,,,46.0,petrol-electric,Mitsubishi,2670.0,1965.0,2021
1,MITSUBISHI OUTLANDER,IS,40.0,,,46.0,petrol-electric,Mitsubishi,2670.0,1965.0,2021
2,MITSUBISHI OUTLANDER,IS,40.0,,,46.0,petrol-electric,Mitsubishi,2670.0,1965.0,2021
3,MITSUBISHI OUTLANDER,IS,40.0,,,46.0,petrol-electric,Mitsubishi,2670.0,1965.0,2021
4,MITSUBISHI OUTLANDER,IS,40.0,,,46.0,petrol-electric,Mitsubishi,2670.0,1965.0,2021


In [3]:
# Export switch: when True, the cells below write their result tables to CSV
# files; when False, the tables are only computed and displayed.
to_csv: bool = True

In [4]:
# Compute data for the bar graph with the average CO2 emission per country.
emission_columns = ["Country", "year", "Ewltp (g/km)", "Erwltp (g/km)"]
yearly_summaries = []
for year_frame in [co2_2015, co2_2016, co2_2017, co2_2018, co2_2019, co2_2020, co2_2021]:
    # Group by country and year; compute the mean and the sum of "Ewltp (g/km)"
    # and the mean of "Erwltp (g/km)".
    summary = (
        year_frame[emission_columns]
        .groupby(["Country", "year"])
        .agg({"Ewltp (g/km)": ['mean', 'sum'], "Erwltp (g/km)": 'mean'})
        .reset_index()
    )
    # Flatten the two-level column index into single names,
    # e.g. ("Ewltp (g/km)", "mean") -> "Ewltp (g/km) mean".
    summary.columns = summary.columns.map(' '.join).str.strip()
    yearly_summaries.append(summary)

# Concatenate once instead of growing the frame inside the loop.
country_emission = pd.concat(yearly_summaries, ignore_index=True)

if to_csv:
    country_emission.to_csv("country_emission.csv")

country_emission.head()


Unnamed: 0,Country,year,Ewltp (g/km) mean,Ewltp (g/km) sum,Erwltp (g/km) mean
0,AT,2015,167.352565,9610053.7,1.740408
1,BE,2015,168.882209,4642909.7,2.788953
2,BG,2015,179.490853,665193.1,3.076667
3,CY,2015,173.255686,251394.0,
4,CZ,2015,182.923737,2269534.8,1.70777


In [5]:
# List that collects the per-chunk count tables produced by the loop below.
counts_results = []

def classify_electric(x: object) -> str:
    """Map a raw fuel-type label to one of three display categories.

    Args:
        x: Fuel-type value taken from the "Ft" column. Normally a string;
            anything else (for example the float NaN that pandas uses for a
            missing cell, or None) is treated as not electric.

    Returns:
        "Semi-Elektrisch" when the label contains "-electric" (for example
        "petrol-electric"), "Elektrisch" when it contains "electric" without
        that hyphenated form, and "Anders" in every other case.
    """
    # A non-string value cannot be searched with `in`; without this guard a
    # missing fuel type raises TypeError and aborts the whole chunk loop.
    if not isinstance(x, str):
        return "Anders"
    # The hyphenated form must be tested first, because such labels also
    # contain the plain substring "electric".
    if "-electric" in x:
        return "Semi-Elektrisch"
    if "electric" in x:
        return "Elektrisch"
    return "Anders"

# Iterate over the dataset chunk by chunk. Only the two columns needed for the
# tally are parsed, which saves memory and parsing time.
for chunk in pd.read_csv("CO2_data.csv", usecols=["year", "Ft"], chunksize=1000000):
    # Collapse the raw fuel types into the three categories returned by
    # classify_electric: 'Elektrisch', 'Semi-Elektrisch' and 'Anders'.
    chunk['Ft'] = chunk['Ft'].apply(classify_electric)

    # Count the rows per year and fuel category within this chunk.
    count_result = chunk.groupby(['year', 'Ft']).size().reset_index(name='counts')
    # Keep the partial tally; the partial tallies are summed after the loop.
    counts_results.append(count_result)


In [6]:
# Stack the per-chunk tallies into a single frame.
df_counts = pd.concat(counts_results)

# A (year, fuel category) pair can occur once per chunk, so add the partial
# counts together to get one row per pair.
counts_per_year_and_fuel = df_counts.groupby(["year", "Ft"])
df_counts = counts_per_year_and_fuel.agg({"counts": 'sum'}).reset_index()

# Total number of counted rows per year, repeated on every row of that year.
yearly_total = df_counts.groupby("year")["counts"].transform('sum')
df_counts["sum"] = yearly_total

# Share of each fuel category within its year, as a percentage.
df_counts["percent"] = df_counts["counts"].mul(100).div(df_counts["sum"])

if to_csv:
    df_counts.to_csv("fuel_type_distribution.csv")

df_counts.head()

Unnamed: 0,year,Ft,counts,sum,percent
0,2015,Anders,415321,419369,99.03474
1,2015,Elektrisch,1899,419369,0.452823
2,2015,Semi-Elektrisch,2149,419369,0.512437
3,2016,Anders,473519,478728,98.911908
4,2016,Elektrisch,2452,478728,0.512191


In [7]:
# Per fuel type in the 2021 data: the summed "Ewltp (g/km)" values and the
# number of rows ("n"), ordered from the largest to the smallest sum.
emission_column = "Ewltp (g/km)"
rows_per_fuel_type = co2_2021.groupby('Ft')
# Named aggregation yields the final column names directly, so no flattening
# or renaming of the columns is needed afterwards.
total_emission_per_ft = rows_per_fuel_type.agg(**{
    emission_column: (emission_column, 'sum'),
    "n": (emission_column, 'size'),
})
total_emission_per_ft = total_emission_per_ft.sort_values(by=emission_column, ascending=False)
total_emission_per_ft = total_emission_per_ft.reset_index()

if to_csv:
    total_emission_per_ft.to_csv("total_emission_per_ft.csv")

total_emission_per_ft.head()


Unnamed: 0,Ft,Ewltp (g/km),n
0,petrol,735257118.5,5469354
1,diesel,322991801.5,2235397
2,petrol-electric,32944236.2,786478
3,lpg,26007982.1,214743
4,ng,4101197.8,39088


In [8]:
# Get a random sample of cars in 2021. Get their fuel type, emission and mass.
# We take a random sample because there is so much data that the site becomes slow otherwise.
sample_columns = ["Ft", "Ewltp (g/km)", "m (kg)"]
# Sampling without replacement raises ValueError when more rows are requested
# than exist, so cap the sample size at the number of available rows.
sample_size = min(100000, len(co2_2021))
# Select the columns first so the unused columns are never copied, and fix the
# seed so that re-running the notebook exports the same sample.
car_emission_mass = co2_2021[sample_columns].sample(n=sample_size, replace=False, random_state=42)

if to_csv:
    car_emission_mass.to_csv("car_emission_mass.csv")

car_emission_mass.head()


Unnamed: 0,Ft,Ewltp (g/km),m (kg)
119630,diesel,128.0,1576.0
7062371,petrol,123.0,1280.0
1505641,petrol,110.0,915.0
422397,diesel,188.0,2165.0
361343,petrol,131.0,1325.0


In [9]:
# Get the emission per car brand.
# The same column subset is taken from every yearly frame.
brand_columns = ['Mk', 'Cn', 'Ewltp (g/km)', 'year', 'Ft']
bycarbrand_2015 = co2_2015[brand_columns]
bycarbrand_2016 = co2_2016[brand_columns]
bycarbrand_2017 = co2_2017[brand_columns]
bycarbrand_2018 = co2_2018[brand_columns]
bycarbrand_2019 = co2_2019[brand_columns]
bycarbrand_2020 = co2_2020[brand_columns]
bycarbrand_2021 = co2_2021[brand_columns]

frames = [bycarbrand_2015, bycarbrand_2016, bycarbrand_2017, bycarbrand_2018, bycarbrand_2019, bycarbrand_2020, bycarbrand_2021]
result = pd.concat(frames)

# Compute the average "Ewltp (g/km)" per brand ("Mk"), year and fuel type.
car_emission = result.groupby(["Mk", "year", "Ft"]).agg({"Ewltp (g/km)": ['mean']}).reset_index()
# Change the multi-index columns to a single level,
# e.g. ("Ewltp (g/km)", "mean") -> "Ewltp (g/km) mean".
car_emission.columns = car_emission.columns.map(' '.join).str.strip()

if to_csv:
    car_emission.to_csv("car_emission.csv")

car_emission.head()

Unnamed: 0,Mk,year,Ft,Ewltp (g/km) mean
0,Aiways,2019,electric,0.0
1,Aiways,2020,electric,0.0
2,Aiways,2021,electric,0.0
3,Alfa Romeo,2015,diesel,137.995311
4,Alfa Romeo,2015,lpg,171.486957
