In [1]:
# List the names of every dataset and parameter entry registered in `catalog`.
# NOTE(review): `catalog` is not defined in this notebook; presumably it is
# injected by the Kedro IPython/Jupyter extension — confirm before a fresh run.
catalog.list()


[1m[[0m
    [32m'companies'[0m,
    [32m'reviews'[0m,
    [32m'shuttles'[0m,
    [32m'preprocessed_companies'[0m,
    [32m'preprocessed_shuttles'[0m,
    [32m'model_input_table'[0m,
    [32m'regressor'[0m,
    [32m'scope3_data_3'[0m,
    [32m'scope3_data_0'[0m,
    [32m'preprocessed_scope3'[0m,
    [32m'parameters'[0m,
    [32m'params:feature_options'[0m,
    [32m'params:feature_options.features'[0m,
    [32m'params:model_options'[0m,
    [32m'params:model_options.test_size'[0m,
    [32m'params:model_options.random_state'[0m,
    [32m'params:model_options.features'[0m
[1m][0m

In [4]:
import numpy as np
import pandas as pd
from typing import Dict

In [5]:
# Load the "scope3_data_3" entry from the catalog; later cells treat it as the
# raw Scope 3 DataFrame passed to `preprocess_scope3`.
df = catalog.load("scope3_data_3")

In [6]:
# Load the "params:feature_options" entry; `preprocess_scope3` reads its
# "features" key to decide which columns to keep.
features = catalog.load("params:feature_options")

In [5]:
#pipelines

In [6]:
#%run_viz

In [7]:
def _remove_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Function to remove all rows with missing values in a pandas dataframe.

    Args:
        df (pd.DataFrame): Input pandas DataFrame

    Returns:
        pd.DataFrame: Output DataFrame with rows containing missing values removed.
    """

    df_cleaned = df.dropna()

    return df_cleaned

In [8]:
def preprocess_scope3(scope3_data: pd.DataFrame, parameters: Dict) -> pd.DataFrame:
    """Select the configured feature columns and drop incomplete rows.

    Args:
        scope3_data: Raw Scope 3 data.
        parameters: Mapping whose ``"features"`` entry is used to index the
            columns of ``scope3_data``.

    Returns:
        The selected columns, with every row that contains a missing value
        removed.
    """
    feature_columns = parameters["features"]
    return _remove_missing_values(scope3_data[feature_columns])

In [9]:
# Keep only the columns listed under features["features"] and drop rows with
# missing values.
preprocessed_df = preprocess_scope3(df, features)

In [10]:
def _remap_countries(df: pd.DataFrame) -> pd.DataFrame:
    # Define countries to keep based on a threshold of 50
    countries_to_keep = df['Country'].value_counts()[df['Country'].value_counts() > 50].index

    # Remap countries with counts less than 50 to 'Other'
    df['Country'] = df['Country'].apply(lambda x: x if x in countries_to_keep else 'Other')
    
    return df
    

In [14]:
def feature_engineering(scope3_data: pd.DataFrame) -> pd.DataFrame:
    """Run the feature-engineering steps on the Scope 3 data.

    The only step at present is the country remapping done by
    ``_remap_countries``.

    Args:
        scope3_data: Scope 3 data containing a ``Country`` column.

    Returns:
        The frame returned by ``_remap_countries``.
    """
    return _remap_countries(scope3_data)
    

In [16]:
# Apply the feature-engineering step (country remapping) to the preprocessed data.
df_feature_engineered = feature_engineering(preprocessed_df)

In [17]:
# Display the feature-engineered frame as the cell's rich output.
df_feature_engineered

Unnamed: 0,Country,Industry (Exiobase),Business Travel,Capital Goods,Downstream Leased Assets,Downstream Transportation and Distribution,Employee Commuting,End of Life Treatment of Sold Products,Franchises,Fuel-and-energy-related activities (not included in Scope 1 or 2),Investments,Processing of Sold Products,Purchased Goods and Services,Upstream Leased Assets,Upstream Transportation and Distribution,Use of Sold Products,Waste Generated in Operations
0,AUSTRALIA,"Recreational, cultural and sporting activities...",-2.496564e+05,-4.757744e+04,-3.443104e+04,-2.998176e+04,-7.278893e+04,-1.405697e+05,0.0,-1.005045e+04,0.000000e+00,-3.800732e+06,-4.032700e+04,-115870.418895,-1.622937e+05,-2.517427e+05,-1.680007e+04
1,AUSTRALIA,"Recreational, cultural and sporting activities...",-3.122127e+05,-3.505706e+04,-1.100655e+05,-5.560756e+05,-4.256929e+04,-1.260005e+05,0.0,-1.206509e+04,0.000000e+00,-7.884424e+05,-9.811424e+04,-135220.095921,-3.835789e+05,-4.218799e+06,-8.513858e+04
2,UNITED STATES,Manufacture of basic iron and steel and of fer...,-2.361935e+05,-3.104402e+06,-6.883932e+05,-1.322733e+07,-4.641145e+05,-3.834651e+05,0.0,-2.235343e+06,0.000000e+00,-1.037129e+08,-3.317590e+07,-164244.611459,-4.439044e+06,-3.844370e+08,-8.435292e+07
3,AUSTRALIA,Other land transport,-2.916110e+05,-4.613737e+07,-4.035421e+05,-3.414649e+05,-2.702125e+05,-1.260005e+05,0.0,-1.981635e+06,-5.298055e+06,-8.735469e+06,-1.839437e+07,-223065.353185,-9.207954e+05,-5.275860e+04,-2.550743e+05
4,AUSTRALIA,Extraction of crude petroleum and services rel...,-1.641991e+06,-2.500092e+06,-2.649768e+05,-1.754950e+06,-5.172055e+05,-2.835297e+06,0.0,-1.627649e+06,0.000000e+00,-6.753308e+06,-7.203999e+06,-113144.390867,-2.048789e+06,-4.953517e+07,-4.679207e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9008,SOUTH AFRICA,"Retail trade, except of motor vehicles and mot...",-1.010736e+05,-7.059217e+05,-1.100655e+05,-3.292632e+06,-2.184806e+06,-5.071892e+05,0.0,-1.516787e+06,0.000000e+00,-7.884424e+05,-1.724398e+06,-222065.999277,-5.627341e+05,-1.586013e+07,-8.513858e+04
9009,SOUTH AFRICA,Manufacture of basic iron and steel and of fer...,-1.429076e+05,-1.272571e+06,-1.426185e+05,-2.290660e+07,-2.562011e+06,-2.845541e+06,0.0,-6.021346e+07,0.000000e+00,-6.562955e+07,-3.528015e+07,-335318.520693,-1.596656e+07,-2.102340e+08,-1.083696e+06
9010,SOUTH AFRICA,Quarrying of sand and clay,-6.184726e+04,-3.444812e+06,-2.346714e+06,-3.869935e+05,-7.091088e+05,-2.617897e+04,0.0,-9.948010e+04,0.000000e+00,-1.104070e+07,-1.024349e+05,-72067.302974,-3.921155e+05,-2.190611e+05,-1.214363e+06
9011,SOUTH AFRICA,Quarrying of sand and clay,-1.377242e+05,-1.148233e+06,-1.504722e+05,-2.910874e+06,-1.210038e+05,-4.785289e+06,0.0,-1.625373e+06,0.000000e+00,-3.047654e+07,-6.031294e+06,-153659.199910,-8.957421e+05,-3.301965e+08,-8.637923e+04


In [None]:
# Display the feature-engineered frame again.
# NOTE(review): this cell has no execution count and repeats the previous
# display cell — consider removing it.
df_feature_engineered