In [1]:
# List every dataset and parameter entry registered in the session's `catalog`
# object (presumably the Kedro DataCatalog injected into the kernel - confirm).
catalog.list()


[1m[[0m
    [32m'companies'[0m,
    [32m'reviews'[0m,
    [32m'shuttles'[0m,
    [32m'preprocessed_companies'[0m,
    [32m'preprocessed_shuttles'[0m,
    [32m'model_input_table'[0m,
    [32m'regressor'[0m,
    [32m'scope3_data_3'[0m,
    [32m'scope3_data_0'[0m,
    [32m'preprocessed_scope3'[0m,
    [32m'parameters'[0m,
    [32m'params:feature_options'[0m,
    [32m'params:feature_options.features'[0m,
    [32m'params:model_options'[0m,
    [32m'params:model_options.test_size'[0m,
    [32m'params:model_options.random_state'[0m,
    [32m'params:model_options.features'[0m
[1m][0m

In [2]:
import numpy as np
import pandas as pd
from typing import Dict
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the raw Scope 3 dataset registered in the catalog as "scope3_data_3".
df = catalog.load("scope3_data_3")

In [4]:
# Load the "feature_options" parameter group. Despite the variable name this is
# a dict; the column names themselves sit under its "features" key.
features = catalog.load("params:feature_options")

In [5]:
# Show the loaded parameter dict (a single "features" key holding column names).
features


[1m{[0m
    [32m'features'[0m: [1m[[0m
        [32m'Industry [0m[32m([0m[32mExiobase[0m[32m)[0m[32m'[0m,
        [32m'Business Travel'[0m,
        [32m'Capital Goods'[0m,
        [32m'Downstream Leased Assets'[0m,
        [32m'Downstream Transportation and Distribution'[0m,
        [32m'Employee Commuting'[0m,
        [32m'End of Life Treatment of Sold Products'[0m,
        [32m'Franchises'[0m,
        [32m'Fuel-and-energy-related activities [0m[32m([0m[32mnot included in Scope 1 or 2[0m[32m)[0m[32m'[0m,
        [32m'Investments'[0m,
        [32m'Processing of Sold Products'[0m,
        [32m'Purchased Goods and Services'[0m,
        [32m'Upstream Leased Assets'[0m,
        [32m'Upstream Transportation and Distribution'[0m,
        [32m'Use of Sold Products'[0m,
        [32m'Waste Generated in Operations'[0m,
        [32m'Scope 3'[0m
    [1m][0m
[1m}[0m

In [6]:
#pipelines

In [7]:
#%run_viz

## Handle missing data

In [8]:
def _remove_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Function to remove all rows with missing values in a pandas dataframe.

    Args:
        df (pd.DataFrame): Input pandas DataFrame

    Returns:
        pd.DataFrame: Output DataFrame with rows containing missing values removed.
    """

    df_cleaned = df.dropna()

    return df_cleaned

In [9]:
def preprocess_scope3(scope3_data: pd.DataFrame, parameters: Dict) -> pd.DataFrame:
    """Select the configured columns of the Scope 3 data and drop incomplete rows.

    Args:
        scope3_data: Raw data.
        parameters: Mapping whose ``"features"`` entry is used to index
            ``scope3_data`` (a list of column names in this notebook).

    Returns:
        The selected columns of ``scope3_data``, with every row that contains
        a missing value removed.
    """
    selected_columns = parameters["features"]
    return _remove_missing_values(scope3_data[selected_columns])

In [10]:
# Keep only the configured feature columns and drop rows with missing values.
# `features` is the parameter dict loaded above, not a list of column names.
preprocessed_df = preprocess_scope3(df, features)

In [11]:
# Inspect the preprocessed table (the industry label, the emission category
# columns and the 'Scope 3' total).
preprocessed_df

Unnamed: 0,Industry (Exiobase),Business Travel,Capital Goods,Downstream Leased Assets,Downstream Transportation and Distribution,Employee Commuting,End of Life Treatment of Sold Products,Franchises,Fuel-and-energy-related activities (not included in Scope 1 or 2),Investments,Processing of Sold Products,Purchased Goods and Services,Upstream Leased Assets,Upstream Transportation and Distribution,Use of Sold Products,Waste Generated in Operations,Scope 3
0,"Recreational, cultural and sporting activities...",-2.496564e+05,-4.757744e+04,-3.443104e+04,-2.998176e+04,-7.278893e+04,-1.405697e+05,0.0,-1.005045e+04,0.000000e+00,-3.800732e+06,-4.032700e+04,-115870.418895,-1.622937e+05,-2.517427e+05,-1.680007e+04,-4.972821e+06
1,"Recreational, cultural and sporting activities...",-3.122127e+05,-3.505706e+04,-1.100655e+05,-5.560756e+05,-4.256929e+04,-1.260005e+05,0.0,-1.206509e+04,0.000000e+00,-7.884424e+05,-9.811424e+04,-135220.095921,-3.835789e+05,-4.218799e+06,-8.513858e+04,-6.903339e+06
2,Manufacture of basic iron and steel and of fer...,-2.361935e+05,-3.104402e+06,-6.883932e+05,-1.322733e+07,-4.641145e+05,-3.834651e+05,0.0,-2.235343e+06,0.000000e+00,-1.037129e+08,-3.317590e+07,-164244.611459,-4.439044e+06,-3.844370e+08,-8.435292e+07,-6.306213e+08
3,Other land transport,-2.916110e+05,-4.613737e+07,-4.035421e+05,-3.414649e+05,-2.702125e+05,-1.260005e+05,0.0,-1.981635e+06,-5.298055e+06,-8.735469e+06,-1.839437e+07,-223065.353185,-9.207954e+05,-5.275860e+04,-2.550743e+05,-8.343143e+07
4,Extraction of crude petroleum and services rel...,-1.641991e+06,-2.500092e+06,-2.649768e+05,-1.754950e+06,-5.172055e+05,-2.835297e+06,0.0,-1.627649e+06,0.000000e+00,-6.753308e+06,-7.203999e+06,-113144.390867,-2.048789e+06,-4.953517e+07,-4.679207e+05,-7.726450e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9008,"Retail trade, except of motor vehicles and mot...",-1.010736e+05,-7.059217e+05,-1.100655e+05,-3.292632e+06,-2.184806e+06,-5.071892e+05,0.0,-1.516787e+06,0.000000e+00,-7.884424e+05,-1.724398e+06,-222065.999277,-5.627341e+05,-1.586013e+07,-8.513858e+04,-2.766139e+07
9009,Manufacture of basic iron and steel and of fer...,-1.429076e+05,-1.272571e+06,-1.426185e+05,-2.290660e+07,-2.562011e+06,-2.845541e+06,0.0,-6.021346e+07,0.000000e+00,-6.562955e+07,-3.528015e+07,-335318.520693,-1.596656e+07,-2.102340e+08,-1.083696e+06,-4.186150e+08
9010,Quarrying of sand and clay,-6.184726e+04,-3.444812e+06,-2.346714e+06,-3.869935e+05,-7.091088e+05,-2.617897e+04,0.0,-9.948010e+04,0.000000e+00,-1.104070e+07,-1.024349e+05,-72067.302974,-3.921155e+05,-2.190611e+05,-1.214363e+06,-2.011587e+07
9011,Quarrying of sand and clay,-1.377242e+05,-1.148233e+06,-1.504722e+05,-2.910874e+06,-1.210038e+05,-4.785289e+06,0.0,-1.625373e+06,0.000000e+00,-3.047654e+07,-6.031294e+06,-153659.199910,-8.957421e+05,-3.301965e+08,-8.637923e+04,-3.787191e+08


## Feature engineering

In [12]:
# def _remap_countries(df: pd.DataFrame) -> pd.DataFrame:
#     # Define countries to keep based on a threshold of 50
#     countries_to_keep = df['Country'].value_counts()[df['Country'].value_counts() > 50].index

#     # Remap countries with counts less than 50 to 'Other'
#     df['Country'] = df['Country'].apply(lambda x: x if x in countries_to_keep else 'Other')
    
#     return df

In [13]:
def _remap_industry(df: pd.DataFrame) -> pd.DataFrame:
    # Define industries to keep based on a threshold of 50
    industries_to_keep = df['Industry (Exiobase)'].value_counts()[df['Industry (Exiobase)'].value_counts() > 50].index

    # Remap industries with counts less than 50 to 'Other'
    df['Industry (Exiobase)'] = df['Industry (Exiobase)'].apply(lambda x: x if x in industries_to_keep else 'Other')
    
    return df

In [14]:
def _create_interaction_terms(df: pd.DataFrame) -> pd.DataFrame:
    # Creating interaction terms
    df['Use_Sold_x_Processing'] = df['Use of Sold Products'] * df['Processing of Sold Products']
    df['Use_Sold_x_Purchased'] = df['Use of Sold Products'] * df['Purchased Goods and Services']
    df['Processing_x_Purchased'] = df['Processing of Sold Products'] * df['Purchased Goods and Services']
    df['Purchased_x_End_of_Life'] = df['Purchased Goods and Services'] * df['End of Life Treatment of Sold Products']
    
    return df

In [15]:
def _create_polynomial_features(df: pd.DataFrame) -> pd.DataFrame:
    # Creating polynomial features (squared)
    df['Use_Sold_Squared'] = df['Use of Sold Products']**2
    df['Processing_Squared'] = df['Processing of Sold Products']**2
    df['Purchased_Squared'] = df['Purchased Goods and Services']**2
    df['End_of_Life_Squared'] = df['End of Life Treatment of Sold Products']**2
    
    return df

## Encoding Categorical Variables

In [23]:
def _one_hot_encode(df: pd.DataFrame) -> pd.DataFrame:
    # One-hot encode 'Country' and 'Industry (Exiobase)' columns
    df_encoded = pd.get_dummies(df, columns=['Industry (Exiobase)'])
    return df_encoded

## Normalization/Standardization 

In [24]:
def _normalization(df: pd.DataFrame) -> pd.DataFrame:
    """Standardize every column of ``df`` with a freshly fitted StandardScaler.

    The scaler is fitted on the whole frame it receives and applied to all of
    its columns. Categorical columns must therefore already be encoded, and any
    target or indicator column present in ``df`` is scaled as well.

    Args:
        df: Fully numeric frame to scale. It is not modified.

    Returns:
        A new frame of scaled values with the same column labels and the same
        row index as ``df``.
    """
    scaler_standard = StandardScaler()
    scaled_values = scaler_standard.fit_transform(df)

    # fit_transform returns a bare array. Re-attach the columns and the row
    # index so rows stay aligned with the source data (the index has gaps once
    # rows have been dropped upstream).
    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)

In [25]:
def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Run the full feature-engineering sequence on preprocessed Scope 3 data.

    Steps, in order: remap rare industries, add interaction terms, add squared
    terms, one-hot encode the industry column, standardize all columns.

    Args:
        df: Preprocessed frame containing 'Industry (Exiobase)' and the
            emission category columns used by the helper steps. It is not
            modified.

    Returns:
        The engineered, standardized feature table.
    """
    # Work on a private copy: the helper steps may write columns onto the frame
    # they are given, and the caller's data must stay as it was displayed.
    working = df.copy()

    # NOTE: country remapping (_remap_countries) is currently disabled.
    return (
        working
        .pipe(_remap_industry)
        .pipe(_create_interaction_terms)
        .pipe(_create_polynomial_features)
        .pipe(_one_hot_encode)
        .pipe(_normalization)
    )

In [27]:
# Build the engineered feature table from the preprocessed data. Without this
# assignment the name is undefined on a fresh kernel (Restart & Run All fails).
df_feature_engineered = feature_engineering(preprocessed_df)
df_feature_engineered

Unnamed: 0,Business Travel,Capital Goods,Downstream Leased Assets,Downstream Transportation and Distribution,Employee Commuting,End of Life Treatment of Sold Products,Franchises,Fuel-and-energy-related activities (not included in Scope 1 or 2),Investments,Processing of Sold Products,...,"Industry (Exiobase)_Publishing, printing and reproduction of recorded media (22)",Industry (Exiobase)_Quarrying of sand and clay,Industry (Exiobase)_Real estate activities (70),"Industry (Exiobase)_Recreational, cultural and sporting activities (92)",Industry (Exiobase)_Renting of machinery and equipment without operator and of personal and household goods (71),Industry (Exiobase)_Research and development (73),"Industry (Exiobase)_Retail trade, except of motor vehicles and motorcycles; repair of personal and household goods (52)",Industry (Exiobase)_Sea and coastal water transport,Industry (Exiobase)_Transport via railways,"Industry (Exiobase)_Wholesale trade and commission trade, except of motor vehicles and motorcycles (51)"
0,0.351674,0.285463,0.089172,0.193420,0.258834,0.190253,0.076924,0.125468,0.043358,0.093035,...,-0.13141,-0.171665,-0.157438,13.002177,-0.096993,-0.108569,-0.230846,-0.08523,-0.099297,-0.078357
1,0.343210,0.285606,0.088254,0.190489,0.260545,0.190324,0.076924,0.125465,0.043358,0.094346,...,-0.13141,-0.171665,-0.157438,13.002177,-0.096993,-0.108569,-0.230846,-0.08523,-0.099297,-0.078357
2,0.353495,0.250433,0.081238,0.119891,0.236674,0.189058,0.076924,0.122606,0.043358,0.049566,...,-0.13141,-0.171665,-0.157438,-0.076910,-0.096993,-0.108569,-0.230846,-0.08523,-0.099297,-0.078357
3,0.345998,-0.242712,0.084694,0.191685,0.247654,0.190324,0.076924,0.122932,0.036937,0.090888,...,-0.13141,-0.171665,-0.157438,-0.076910,-0.096993,-0.108569,-0.230846,-0.08523,-0.099297,-0.078357
4,0.163299,0.257358,0.086375,0.183810,0.233668,0.177002,0.076924,0.123387,0.043358,0.091751,...,-0.13141,-0.171665,-0.157438,-0.076910,-0.096993,-0.108569,-0.230846,-0.08523,-0.099297,-0.078357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9008,0.371776,0.277918,0.088254,0.175243,0.139234,0.188450,0.076924,0.123530,0.043358,0.094346,...,-0.13141,-0.171665,-0.157438,-0.076910,-0.096993,-0.108569,4.331899,-0.08523,-0.099297,-0.078357
9009,0.366116,0.271425,0.087859,0.065963,0.117874,0.176952,0.076924,0.048045,0.043358,0.066135,...,-0.13141,-0.171665,-0.157438,-0.076910,-0.096993,-0.108569,-0.230846,-0.08523,-0.099297,-0.078357
9010,0.377083,0.246532,0.061120,0.191431,0.222800,0.190815,0.076924,0.125353,0.043358,0.089885,...,-0.13141,5.825299,-0.157438,-0.076910,-0.096993,-0.108569,-0.230846,-0.08523,-0.099297,-0.078357
9011,0.366818,0.272850,0.087764,0.177369,0.256104,0.167414,0.076924,0.123390,0.043358,0.081429,...,-0.13141,5.825299,-0.157438,-0.076910,-0.096993,-0.108569,-0.230846,-0.08523,-0.099297,-0.078357


In [32]:
# Persist the engineered features as CSV at a path relative to the current
# working directory, omitting the row index.
df_feature_engineered.to_csv('df_feature_engineered.csv', index=False)

In [29]:
# from tpot import TPOTRegressor
# from sklearn.model_selection import train_test_split

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# # Initialize TPOT
# tpot = TPOTRegressor(generations=5, population_size=20, verbosity=2, random_state=42)

# # Fit TPOT
# tpot.fit(X_train, y_train)

# # Evaluate TPOT
# print("TPOT Score:", tpot.score(X_test, y_test))

# # Export the pipeline
# tpot.export('tpot_exported_pipeline.py')

In [52]:
import numpy as np
from sklearn.linear_model import LassoLarsCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Separate the features (X) from the target variable (y).
# WARNING: in the rows of preprocessed_df displayed above, 'Scope 3' equals the
# sum of the 15 emission category columns, and all of those columns remain in X.
# The target is therefore an exact linear combination of the features, so the
# R^2 of 1.0 reported below reflects that identity rather than predictive skill.
X = df_feature_engineered.drop("Scope 3", axis=1)
y = df_feature_engineered["Scope 3"]

# Split the data into training and testing sets (only used by the disabled
# hold-out evaluation below).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Create a pipeline with StandardScaler and LassoLarsCV (built once; it is
# cloned and refitted for every cross-validation fold).
pipeline = make_pipeline(StandardScaler(), LassoLarsCV())

# Hold-out evaluation, currently disabled:
# # Train the model
# pipeline.fit(X_train, y_train)

# # Make predictions
# y_pred = pipeline.predict(X_test)

# # Evaluate the model
# mae = mean_absolute_error(y_test, y_pred)
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# # Print the metrics
# print(f'Mean Absolute Error: {mae}')
# print(f'Mean Squared Error: {mse}')
# print(f'R-squared: {r2}')

# Initialize KFold with shuffling
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform 5-fold cross-validation with shuffling (default scorer of the
# estimator, i.e. R^2 for a regressor)
cv_scores = cross_val_score(pipeline, X, y, cv=kf)

# Calculate the mean and standard deviation of the scores
cv_mean = np.mean(cv_scores)
cv_std = np.std(cv_scores)

# Print the cross-validation metrics
print(f"Cross-Validation Mean with KFold: {cv_mean}")
print(f"Cross-Validation Standard Deviation with KFold: {cv_std}")



Cross-Validation Mean with KFold: 1.0
Cross-Validation Standard Deviation with KFold: 0.0


In [44]:
# Correlation of every engineered column with the 'Scope 3' target, strongest
# positive first.
correlations = (
    df_feature_engineered
    .corr()
    .loc[:, 'Scope 3']
    .sort_values(ascending=False)
)
correlations


Scope [1;36m3[0m                                   [1;36m1.000000[0m
Use of Sold Products                      [1;36m0.912442[0m
Processing of Sold Products               [1;36m0.417482[0m
Purchased Goods and Services              [1;36m0.382445[0m
End of Life Treatment of Sold Products    [1;36m0.371246[0m
                                            [33m...[0m   
End_of_Life_Squared                      [1;36m-0.271452[0m
Processing_Squared                       [1;36m-0.342920[0m
Use_Sold_x_Processing                    [1;36m-0.423886[0m
Use_Sold_x_Purchased                     [1;36m-0.558725[0m
Use_Sold_Squared                         [1;36m-0.768669[0m
Name: Scope [1;36m3[0m, Length: [1;36m67[0m, dtype: float64

In [49]:
from sklearn.linear_model import LinearRegression

# Baseline check: plain linear regression on the same X and y, scored with
# scikit-learn's default splitting for cv=5.
simple_pipeline = make_pipeline(StandardScaler(), LinearRegression())
cv_scores_simple = cross_val_score(simple_pipeline, X, y, cv=5)
cv_mean_simple, cv_std_simple = cv_scores_simple.mean(), cv_scores_simple.std()

print(f"Cross-Validation Mean with Simple Model: {cv_mean_simple}")
print(f"Cross-Validation Standard Deviation with Simple Model: {cv_std_simple}")


Cross-Validation Mean with Simple Model: 1.0
Cross-Validation Standard Deviation with Simple Model: 0.0


In [50]:
import numpy as np

# Add a random feature: uniform noise in [0, 1) drawn from a seeded generator,
# so the column (and the scores below) are identical on every run.
X_random = X.copy()
X_random['random'] = np.random.default_rng(42).random(X_random.shape[0])

# Train and evaluate model. A score that stays at 1.0 here means the noise
# column did not change the result; it does not by itself validate the model.
cv_scores_random = cross_val_score(pipeline, X_random, y, cv=5)
cv_mean_random = np.mean(cv_scores_random)
cv_std_random = np.std(cv_scores_random)

print(f"Cross-Validation Mean with Random Feature: {cv_mean_random}")
print(f"Cross-Validation Standard Deviation with Random Feature: {cv_std_random}")


Cross-Validation Mean with Random Feature: 1.0
Cross-Validation Standard Deviation with Random Feature: 0.0
