In [27]:
import pandas as pd
import numpy as np
import seaborn as sns
datapath = "./data/"

In [28]:
df = pd.read_csv("Fuel_Consumption_2000-2022.csv") # https://www.kaggle.com/datasets/ahmettyilmazz/fuel-consumption
df_prices = pd.read_csv("Fuel_prices_Canada.csv") # https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1810000101

In [29]:

def subset_price_df(df_prices):
    # Only keep the date, type of fuel and cost
    df_prices = df_prices[["REF_DATE","Type of fuel", "VALUE"]]

    # Only keep values in the interval 2000-2022
    df_prices['REF_DATE'] = pd.to_datetime(df_prices['REF_DATE'], format='%Y-%m')
    # df_prices.loc[:, 'REF_DATE'] = pd.to_datetime(df_prices['REF_DATE'], format='%Y-%m')

    df_prices = df_prices.loc[(df_prices['REF_DATE'] >= '2000-01')
                        & (df_prices['REF_DATE'] < '2023-01')]

    # Only keep the values from self-service stations
    df_prices = df_prices[df_prices['Type of fuel'].str.contains('self service')]
    return df_prices

def reformat_price_df(df):
    df['REF_DATE'] = df['REF_DATE'].dt.year
    replacement_dict = {
        'Diesel fuel at self service filling stations': 'D',
        'Premium unleaded gasoline at self service filling stations': 'Z',
        'Regular unleaded gasoline at self service filling stations': 'X'
    }
    df['Type of fuel'] = df['Type of fuel'].map(replacement_dict)

    df = df.rename(columns={"REF_DATE":"YEAR", "Type of fuel":"FUEL", "VALUE":"FUEL COST"})
    return df

def aggregate_price_df(df):
    return df.groupby(['FUEL','YEAR']).mean()


Preprocessing

In [30]:
# Drop records with NAs
def clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
    return df.dropna()

# Drop unused columns
def remove_features(df: pd.DataFrame) -> pd.DataFrame:
    return df.drop(["CYLINDERS", "TRANSMISSION", "HWY (L/100 km)", "COMB (L/100 km)", "COMB (mpg)"], axis=1)

# Remove subsections of classes
def remove_class_subsections(df: pd.DataFrame) -> pd.DataFrame:
    df['VEHICLE CLASS'] = df['VEHICLE CLASS'].str.split(': ').str[0]
    df['VEHICLE CLASS'] = df['VEHICLE CLASS'].str.split(' - ').str[0]
    df['VEHICLE CLASS'] = df['VEHICLE CLASS'].str.upper()
    return df
def capitalize_make_column(df:pd.DataFrame) -> pd.DataFrame:
    df['MAKE'] = df['MAKE'].str.upper()
    return df
def normalize_model_column(df:pd.DataFrame) ->pd.DataFrame:
    df['MODEL'] = df['MODEL'].str.lower()

    df['MODEL'] = df['MODEL'].str.capitalize()

# Subset price dataset to match vehicle data
def subset_price_df(df_prices:pd.DataFrame) -> pd.DataFrame:
    # Only keep the date, type of fuel and cost
    df_prices = df_prices[["REF_DATE","Type of fuel", "VALUE"]]

    # Only keep values in the interval 2000-2022
    df_prices['REF_DATE'] = pd.to_datetime(df_prices['REF_DATE'], format='%Y-%m')
    # df_prices.loc[:, 'REF_DATE'] = pd.to_datetime(df_prices['REF_DATE'], format='%Y-%m')

    df_prices = df_prices.loc[(df_prices['REF_DATE'] >= '2000-01')
                        & (df_prices['REF_DATE'] < '2023-01')]

    # Only keep the values from self-service stations
    df_prices = df_prices[df_prices['Type of fuel'].str.contains('self service')]
    return df_prices

# Reformat price dataframe to match vehicle dataframe
def reformat_price_df(df:pd.DataFrame) -> pd.DataFrame:
    df['REF_DATE'] = df['REF_DATE'].dt.year
    replacement_dict = {
        'Diesel fuel at self service filling stations': 'D',
        'Premium unleaded gasoline at self service filling stations': 'Z',
        'Regular unleaded gasoline at self service filling stations': 'X'
    }
    df['Type of fuel'] = df['Type of fuel'].map(replacement_dict)

    df = df.rename(columns={"REF_DATE":"YEAR", "Type of fuel":"FUEL", "VALUE":"FUEL COST"})
    return df

# Aggregate price dataframe based on fuel type and year
def aggregate_price_df(df: pd.DataFrame) -> pd.DataFrame:
    return df.groupby(['FUEL','YEAR']).mean()




In [31]:
def preprocess_data(df: pd.DataFrame):
    return(
        df
        .pipe(clean_dataset)
        .pipe(remove_features)
        .pipe(remove_class_subsections)
        .pipe(capitalize_make_column)
        # .pipe(normalize_model_column)
    )

def preprocess_price_data(df: pd.DataFrame):
    return(
        df
        .pipe(subset_price_df)
        .pipe(reformat_price_df)
        .pipe(aggregate_price_df)
    )

df = preprocess_data(df)
df_prices = preprocess_price_data(df_prices)

df_merged = df.merge(df_prices, on=['FUEL', 'YEAR'], how='left')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prices['REF_DATE'] = pd.to_datetime(df_prices['REF_DATE'], format='%Y-%m')


In [32]:
# df_merged.drop_duplicates(subset=['MODEL'], inplace=True)
df_merged['MODEL'] = df_merged['MODEL'].str.upper()
df_merged[df_merged['MODEL']=='AVENTADOR ROADSTER']
df_merged.loc[df_merged['VEHICLE CLASS'].isin(['MINICOMPACT', 'SUBCOMPACT']), 'VEHICLE CLASS'] = 'COMPACT'
df_merged['ENGINE SIZE'] = df_merged['ENGINE SIZE'].astype(str)

In [33]:
df_merged.to_json('data.json', index=False, orient="records")

In [34]:
sns.color_palette("Paired", 12)
