# Jupyter Notebook for cleaning the complete dataset

The objective of this notebook is to clean the dataset containing all the registered cars so that it can be used for different purposes. This is done by applying the corresponding filters and casting the data to their proper formats.

## Read and filter the file

In [None]:
import polars as pl
import os

In [4]:
# Root folder of the DGT data; being relative, it resolves against the
# kernel's current working directory.
path = os.path.join("..", "Data", "DGT")

# Raw input file (mat_2023.txt) inside the "Parque_exacto" sub-folder.
park_dir = os.path.join(path, "Parque_exacto")
complete_park = os.path.join(park_dir, "mat_2023.txt")

Cast columns to their correct types and apply the necessary filters.

In [None]:
from dictionaries import types_parque
from dictionaries import sub_tipo_mapping

# Lazily scan the pipe-separated input file. The column dtypes come from
# `types_parque`, so polars performs no schema inference.
parque = pl.scan_csv(
    complete_park,
    separator="|",
    schema=types_parque,
)

# Keep only rows whose FECHA_MATR is not earlier than FECHA_PRIM_MATR. Rows
# where the comparison evaluates to null are discarded by `filter` as well.
# NOTE(review): the comparison is only chronological if `types_parque`
# declares both columns with a date-like dtype -- confirm.
parque = parque.filter(
    pl.col("FECHA_MATR") >= pl.col("FECHA_PRIM_MATR")
).with_columns(
    # Decimal comma -> decimal point, then parse as float.
    pl.col("POTENCIA").str.replace(",", ".", literal=True).cast(pl.Float64),
    pl.col("PROPULSION").str.replace(",", ".", literal=True).cast(pl.Float64),
    # Translate subtype values through `sub_tipo_mapping`; values that are not
    # keys of the mapping are left unchanged.
    pl.col("SUBTIPO_DGT").replace(sub_tipo_mapping).alias("SUBTIPO_DGT"),
    pl.col("AUTONOMIA")
    # `str.replace` only touches the first match, so a value with more than
    # one space would still fail the float cast; strip every space instead.
    .str.replace_all(" ", "", literal=True)
    .str.replace(",", ".", literal=True)
    # Repair the mis-encoded value found in the raw file.
    .str.replace("0000Û0", "000000", literal=True)
    .cast(pl.Float64),
)

# Execute the lazy query and materialise the result in memory.
park = parque.collect()

In [16]:
# All target values of `sub_tipo_mapping`; judging by the variable name these
# are presumably the trailer subtypes -- verify against `dictionaries`.
remolques = [*sub_tipo_mapping.values()]

# A row is flagged when its (already mapped) subtype is one of those values
# or when its registration class is "Remolque".
is_remolque = pl.col("SUBTIPO_DGT").is_in(remolques) | (
    pl.col("CLASE_MATR") == "Remolque"
)

# Flagged rows get the fixed "NO_EMISIONS" label; every other row keeps its
# existing EMISIONES_EURO value.
emisiones_euro = (
    pl.when(is_remolque)
    .then(pl.lit("NO_EMISIONS"))
    .otherwise(pl.col("EMISIONES_EURO"))
    .alias("EMISIONES_EURO")
)
park = park.with_columns(emisiones_euro)

# Swap keys and values so the mapped subtype values can be translated back.
# NOTE(review): if two keys share the same value, only the last key survives
# the inversion -- confirm the mapping is one-to-one.
inverse_sub_tipo_mapping = dict(
    zip(sub_tipo_mapping.values(), sub_tipo_mapping.keys())
)

# Restore SUBTIPO_DGT to the original keys of `sub_tipo_mapping`.
subtipo_original = (
    pl.col("SUBTIPO_DGT").replace(inverse_sub_tipo_mapping).alias("SUBTIPO_DGT")
)
park = park.with_columns(subtipo_original)

## Write the file

In [18]:
# Write the cleaned frame as CSV into the "Parque_exacto" folder under `path`.
output_dir = os.path.join(path, "Parque_exacto")
clean_park = os.path.join(output_dir, "clean_park.csv")
park.write_csv(clean_park)