In [1]:
# List every dataset and parameter entry registered in the data catalog.
# NOTE(review): `catalog` is not defined in this notebook — presumably injected by the session; confirm.
catalog.list()


[1m[[0m
    [32m'companies'[0m,
    [32m'reviews'[0m,
    [32m'shuttles'[0m,
    [32m'preprocessed_companies'[0m,
    [32m'preprocessed_shuttles'[0m,
    [32m'model_input_table'[0m,
    [32m'regressor'[0m,
    [32m'scope3_data_3'[0m,
    [32m'scope3_data_0'[0m,
    [32m'preprocessed_scope3'[0m,
    [32m'parameters'[0m,
    [32m'params:feature_options'[0m,
    [32m'params:feature_options.features'[0m,
    [32m'params:model_options'[0m,
    [32m'params:model_options.test_size'[0m,
    [32m'params:model_options.random_state'[0m,
    [32m'params:model_options.features'[0m
[1m][0m

In [2]:
import numpy as np
import pandas as pd
from typing import Dict
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the raw Scope 3 dataset registered in the catalog as "scope3_data_3".
df = catalog.load("scope3_data_3")

In [4]:
# Load the "feature_options" parameter group; its "features" entry is the
# column selection later read by preprocess_scope3.
features = catalog.load("params:feature_options")

In [5]:
#pipelines

In [6]:
#%run_viz

## Handle missing data

In [7]:
def _remove_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Function to remove all rows with missing values in a pandas dataframe.

    Args:
        df (pd.DataFrame): Input pandas DataFrame

    Returns:
        pd.DataFrame: Output DataFrame with rows containing missing values removed.
    """

    df_cleaned = df.dropna()

    return df_cleaned

In [8]:
def preprocess_scope3(scope3_data: pd.DataFrame, parameters: Dict) -> pd.DataFrame:
    """Select the configured feature columns and drop incomplete rows.

    Args:
        scope3_data: Raw Scope 3 data.
        parameters: Mapping whose ``"features"`` entry is used to index
            ``scope3_data`` (i.e. the column selection to keep).

    Returns:
        The selected columns of ``scope3_data`` with every row that
        contains a missing value removed.
    """
    selected_columns = parameters["features"]
    return _remove_missing_values(scope3_data[selected_columns])

In [9]:
# Keep only the configured feature columns and drop rows with missing values.
preprocessed_df = preprocess_scope3(df, features)

## Feature engineering

In [10]:
def _remap_countries(df: pd.DataFrame) -> pd.DataFrame:
    # Define countries to keep based on a threshold of 50
    countries_to_keep = df['Country'].value_counts()[df['Country'].value_counts() > 50].index

    # Remap countries with counts less than 50 to 'Other'
    df['Country'] = df['Country'].apply(lambda x: x if x in countries_to_keep else 'Other')
    
    return df

In [11]:
def _remap_industry(df: pd.DataFrame) -> pd.DataFrame:
    # Define industries to keep based on a threshold of 50
    industries_to_keep = df['Industry (Exiobase)'].value_counts()[df['Industry (Exiobase)'].value_counts() > 50].index

    # Remap industries with counts less than 50 to 'Other'
    df['Industry (Exiobase)'] = df['Industry (Exiobase)'].apply(lambda x: x if x in industries_to_keep else 'Other')
    
    return df

In [12]:
def _create_interaction_terms(df: pd.DataFrame) -> pd.DataFrame:
    # Creating interaction terms
    df['Use_Sold_x_Processing'] = df['Use of Sold Products'] * df['Processing of Sold Products']
    df['Use_Sold_x_Purchased'] = df['Use of Sold Products'] * df['Purchased Goods and Services']
    df['Processing_x_Purchased'] = df['Processing of Sold Products'] * df['Purchased Goods and Services']
    df['Purchased_x_End_of_Life'] = df['Purchased Goods and Services'] * df['End of Life Treatment of Sold Products']
    
    return df

In [13]:
def _create_polynomial_features(df: pd.DataFrame) -> pd.DataFrame:
    # Creating polynomial features (squared)
    df['Use_Sold_Squared'] = df['Use of Sold Products']**2
    df['Processing_Squared'] = df['Processing of Sold Products']**2
    df['Purchased_Squared'] = df['Purchased Goods and Services']**2
    df['End_of_Life_Squared'] = df['End of Life Treatment of Sold Products']**2
    
    return df

## Encoding Categorical Variables

In [14]:
def _one_hot_encode(df: pd.DataFrame) -> pd.DataFrame:
    # One-hot encode 'Country' and 'Industry (Exiobase)' columns
    df_encoded = pd.get_dummies(df, columns=['Country', 'Industry (Exiobase)'])
    return df_encoded

## Normalization/Standardization 

In [21]:
def _normalization(df: pd.DataFrame) -> pd.DataFrame:
    """Standardise every column of ``df`` with scikit-learn's ``StandardScaler``.

    The scaler is fitted on, and applied to, the whole frame: no column is
    excluded, so any one-hot indicator columns present are scaled as well.

    Args:
        df (pd.DataFrame): Frame to scale. All columns are passed to the
            scaler, so they are expected to be numeric (or boolean).

    Returns:
        pd.DataFrame: New frame of scaled values with the same column names
        and the same row index as ``df``.
    """
    scaler_standard = StandardScaler()
    scaled_values = scaler_standard.fit_transform(df)

    # The scaler output is wrapped back into a frame here; passing the
    # original index keeps the row labels. Without it the rows would be
    # renumbered 0..n-1 and would no longer line up with the frame they were
    # derived from (e.g. after rows were dropped upstream).
    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)

In [22]:
def feature_engineering(scope3_data: pd.DataFrame) -> pd.DataFrame:
    """Run the full feature-engineering chain on preprocessed Scope 3 data.

    Steps, in order: remap infrequent countries, remap infrequent industries,
    add interaction terms, add squared terms, one-hot encode the categorical
    columns, and finally standardise all columns.

    Args:
        scope3_data (pd.DataFrame): Preprocessed data. It is not modified.

    Returns:
        pd.DataFrame: The engineered and standardised feature table.
    """
    # Start from a copy so the caller's frame is left untouched regardless of
    # how the individual steps are implemented; this keeps the notebook cell
    # idempotent when it is re-run on the same input.
    df_feature_engineered = (
        scope3_data.copy()
        .pipe(_remap_countries)
        .pipe(_remap_industry)
        .pipe(_create_interaction_terms)
        .pipe(_create_polynomial_features)
        .pipe(_one_hot_encode)
        .pipe(_normalization)
    )

    return df_feature_engineered

In [23]:
# Run the full feature-engineering chain on the preprocessed data.
df_feature_engineered = feature_engineering(preprocessed_df)

In [24]:
# Display the engineered feature table as the cell output.
df_feature_engineered

Unnamed: 0,Business Travel,Capital Goods,Downstream Leased Assets,Downstream Transportation and Distribution,Employee Commuting,End of Life Treatment of Sold Products,Franchises,Fuel-and-energy-related activities (not included in Scope 1 or 2),Investments,Processing of Sold Products,...,"Industry (Exiobase)_Publishing, printing and reproduction of recorded media (22)",Industry (Exiobase)_Quarrying of sand and clay,Industry (Exiobase)_Real estate activities (70),"Industry (Exiobase)_Recreational, cultural and sporting activities (92)",Industry (Exiobase)_Renting of machinery and equipment without operator and of personal and household goods (71),Industry (Exiobase)_Research and development (73),"Industry (Exiobase)_Retail trade, except of motor vehicles and motorcycles; repair of personal and household goods (52)",Industry (Exiobase)_Sea and coastal water transport,Industry (Exiobase)_Transport via railways,"Industry (Exiobase)_Wholesale trade and commission trade, except of motor vehicles and motorcycles (51)"
0,0.351674,0.285463,0.089172,0.193420,0.258834,0.190253,0.076924,0.125468,0.043358,0.093035,...,-0.13141,-0.171665,-0.157438,13.002177,-0.096993,-0.108569,-0.230846,-0.08523,-0.099297,-0.078357
1,0.343210,0.285606,0.088254,0.190489,0.260545,0.190324,0.076924,0.125465,0.043358,0.094346,...,-0.13141,-0.171665,-0.157438,13.002177,-0.096993,-0.108569,-0.230846,-0.08523,-0.099297,-0.078357
2,0.353495,0.250433,0.081238,0.119891,0.236674,0.189058,0.076924,0.122606,0.043358,0.049566,...,-0.13141,-0.171665,-0.157438,-0.076910,-0.096993,-0.108569,-0.230846,-0.08523,-0.099297,-0.078357
3,0.345998,-0.242712,0.084694,0.191685,0.247654,0.190324,0.076924,0.122932,0.036937,0.090888,...,-0.13141,-0.171665,-0.157438,-0.076910,-0.096993,-0.108569,-0.230846,-0.08523,-0.099297,-0.078357
4,0.163299,0.257358,0.086375,0.183810,0.233668,0.177002,0.076924,0.123387,0.043358,0.091751,...,-0.13141,-0.171665,-0.157438,-0.076910,-0.096993,-0.108569,-0.230846,-0.08523,-0.099297,-0.078357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9008,0.371776,0.277918,0.088254,0.175243,0.139234,0.188450,0.076924,0.123530,0.043358,0.094346,...,-0.13141,-0.171665,-0.157438,-0.076910,-0.096993,-0.108569,4.331899,-0.08523,-0.099297,-0.078357
9009,0.366116,0.271425,0.087859,0.065963,0.117874,0.176952,0.076924,0.048045,0.043358,0.066135,...,-0.13141,-0.171665,-0.157438,-0.076910,-0.096993,-0.108569,-0.230846,-0.08523,-0.099297,-0.078357
9010,0.377083,0.246532,0.061120,0.191431,0.222800,0.190815,0.076924,0.125353,0.043358,0.089885,...,-0.13141,5.825299,-0.157438,-0.076910,-0.096993,-0.108569,-0.230846,-0.08523,-0.099297,-0.078357
9011,0.366818,0.272850,0.087764,0.177369,0.256104,0.167414,0.076924,0.123390,0.043358,0.081429,...,-0.13141,5.825299,-0.157438,-0.076910,-0.096993,-0.108569,-0.230846,-0.08523,-0.099297,-0.078357
