In [1]:
# Show every dataset and parameter entry registered in the catalog.
# NOTE(review): `catalog` is never defined in this notebook; presumably it is
# injected by the session that launches the kernel — confirm.
catalog.list()


[1m[[0m
    [32m'companies'[0m,
    [32m'reviews'[0m,
    [32m'shuttles'[0m,
    [32m'preprocessed_companies'[0m,
    [32m'preprocessed_shuttles'[0m,
    [32m'model_input_table'[0m,
    [32m'regressor'[0m,
    [32m'scope3_data_3'[0m,
    [32m'scope3_data_0'[0m,
    [32m'preprocessed_scope3'[0m,
    [32m'parameters'[0m,
    [32m'params:feature_options'[0m,
    [32m'params:feature_options.features'[0m,
    [32m'params:model_options'[0m,
    [32m'params:model_options.test_size'[0m,
    [32m'params:model_options.random_state'[0m,
    [32m'params:model_options.features'[0m
[1m][0m

In [2]:
import logging
from typing import Dict, Tuple

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the "scope3_data_3" dataset from the catalog.
# Presumably a pandas DataFrame, since it is passed to preprocess_scope3 below — confirm.
df = catalog.load("scope3_data_3")

In [None]:
# Load the "feature_options" parameter group; its "features" key lists the
# column names to keep during preprocessing.
features = catalog.load("params:feature_options")

In [None]:
features


[1m{[0m
    [32m'features'[0m: [1m[[0m
        [32m'Industry [0m[32m([0m[32mExiobase[0m[32m)[0m[32m'[0m,
        [32m'Business Travel'[0m,
        [32m'Capital Goods'[0m,
        [32m'Downstream Leased Assets'[0m,
        [32m'Downstream Transportation and Distribution'[0m,
        [32m'Employee Commuting'[0m,
        [32m'End of Life Treatment of Sold Products'[0m,
        [32m'Fuel-and-energy-related activities [0m[32m([0m[32mnot included in Scope 1 or 2[0m[32m)[0m[32m'[0m,
        [32m'Processing of Sold Products'[0m,
        [32m'Purchased Goods and Services'[0m,
        [32m'Upstream Leased Assets'[0m,
        [32m'Upstream Transportation and Distribution'[0m,
        [32m'Use of Sold Products'[0m,
        [32m'Waste Generated in Operations'[0m,
        [32m'Scope 3'[0m
    [1m][0m
[1m}[0m

In [None]:
#pipelines

In [None]:
#%run_viz

## Handle missing data

In [None]:
def _remove_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Function to remove all rows with missing values in a pandas dataframe.

    Args:
        df (pd.DataFrame): Input pandas DataFrame

    Returns:
        pd.DataFrame: Output DataFrame with rows containing missing values removed.
    """

    df_cleaned = df.dropna()

    return df_cleaned

In [None]:
def preprocess_scope3(scope3_data: pd.DataFrame, parameters: Dict) -> pd.DataFrame:
    """Select the configured columns and drop incomplete rows.

    Args:
        scope3_data: Raw data.
        parameters: Mapping whose "features" entry lists the columns to keep.

    Returns:
        The selected columns of ``scope3_data`` with every row that contains
        a missing value removed.
    """
    selected_columns = parameters["features"]
    return _remove_missing_values(scope3_data[selected_columns])

In [None]:
# Keep only the configured feature columns and drop rows with missing values.
preprocessed_df = preprocess_scope3(df, features)

In [None]:
preprocessed_df

Unnamed: 0,Industry (Exiobase),Business Travel,Capital Goods,Downstream Leased Assets,Downstream Transportation and Distribution,Employee Commuting,End of Life Treatment of Sold Products,Fuel-and-energy-related activities (not included in Scope 1 or 2),Processing of Sold Products,Purchased Goods and Services,Upstream Leased Assets,Upstream Transportation and Distribution,Use of Sold Products,Waste Generated in Operations,Scope 3
0,"Recreational, cultural and sporting activities...",-2.496564e+05,-4.757744e+04,-3.443104e+04,-2.998176e+04,-7.278893e+04,-1.405697e+05,-1.005045e+04,-3.800732e+06,-4.032700e+04,-115870.418895,-1.622937e+05,-2.517427e+05,-1.680007e+04,-4.972821e+06
1,"Recreational, cultural and sporting activities...",-3.122127e+05,-3.505706e+04,-1.100655e+05,-5.560756e+05,-4.256929e+04,-1.260005e+05,-1.206509e+04,-7.884424e+05,-9.811424e+04,-135220.095921,-3.835789e+05,-4.218799e+06,-8.513858e+04,-6.903339e+06
2,Manufacture of basic iron and steel and of fer...,-2.361935e+05,-3.104402e+06,-6.883932e+05,-1.322733e+07,-4.641145e+05,-3.834651e+05,-2.235343e+06,-1.037129e+08,-3.317590e+07,-164244.611459,-4.439044e+06,-3.844370e+08,-8.435292e+07,-6.306213e+08
3,Other land transport,-2.916110e+05,-4.613737e+07,-4.035421e+05,-3.414649e+05,-2.702125e+05,-1.260005e+05,-1.981635e+06,-8.735469e+06,-1.839437e+07,-223065.353185,-9.207954e+05,-5.275860e+04,-2.550743e+05,-8.343143e+07
4,Extraction of crude petroleum and services rel...,-1.641991e+06,-2.500092e+06,-2.649768e+05,-1.754950e+06,-5.172055e+05,-2.835297e+06,-1.627649e+06,-6.753308e+06,-7.203999e+06,-113144.390867,-2.048789e+06,-4.953517e+07,-4.679207e+05,-7.726450e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9008,"Retail trade, except of motor vehicles and mot...",-1.010736e+05,-7.059217e+05,-1.100655e+05,-3.292632e+06,-2.184806e+06,-5.071892e+05,-1.516787e+06,-7.884424e+05,-1.724398e+06,-222065.999277,-5.627341e+05,-1.586013e+07,-8.513858e+04,-2.766139e+07
9009,Manufacture of basic iron and steel and of fer...,-1.429076e+05,-1.272571e+06,-1.426185e+05,-2.290660e+07,-2.562011e+06,-2.845541e+06,-6.021346e+07,-6.562955e+07,-3.528015e+07,-335318.520693,-1.596656e+07,-2.102340e+08,-1.083696e+06,-4.186150e+08
9010,Quarrying of sand and clay,-6.184726e+04,-3.444812e+06,-2.346714e+06,-3.869935e+05,-7.091088e+05,-2.617897e+04,-9.948010e+04,-1.104070e+07,-1.024349e+05,-72067.302974,-3.921155e+05,-2.190611e+05,-1.214363e+06,-2.011587e+07
9011,Quarrying of sand and clay,-1.377242e+05,-1.148233e+06,-1.504722e+05,-2.910874e+06,-1.210038e+05,-4.785289e+06,-1.625373e+06,-3.047654e+07,-6.031294e+06,-153659.199910,-8.957421e+05,-3.301965e+08,-8.637923e+04,-3.787191e+08


## Outlier detection

In [None]:
def _outlier_removal(
    df: pd.DataFrame, contamination: float = 0.2, random_state: int = 42
) -> pd.DataFrame:
    """Drop the rows that an IsolationForest flags as outliers.

    The forest is fitted on the numerical columns of ``df`` only; the returned
    frame keeps all columns of ``df`` for the rows predicted as inliers.

    Args:
        df: Input data.
        contamination: Proportion of rows the model should treat as outliers.
        random_state: Seed for the IsolationForest, so the same rows are
            removed on every run.

    Returns:
        The rows of ``df`` that were predicted as inliers.
    """
    # Identify numerical columns; the model cannot be fitted on text columns.
    numerical_cols = df.select_dtypes(include=['number']).columns

    # Seed the model: without a fixed random_state the set of removed rows
    # changes between runs, which in turn changes everything derived from
    # the filtered data.
    clf = IsolationForest(contamination=contamination, random_state=random_state)

    # Fit the model on numerical columns
    clf.fit(df[numerical_cols])

    # predict() returns 1 for inliers and -1 for outliers.
    outlier_predictions = clf.predict(df[numerical_cols])

    # Filter the frame that was passed in (not a notebook-level global), so
    # the mask always lines up with the rows it was computed from.
    df_filtered = df[outlier_predictions == 1]

    return df_filtered


## Feature engineering

In [None]:
def _remap_industry(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    industries_to_keep = df['Industry (Exiobase)'].value_counts()[df['Industry (Exiobase)'].value_counts() > 50].index
    df['Industry (Exiobase)'] = df['Industry (Exiobase)'].apply(lambda x: x if x in industries_to_keep else 'Other')
    return df

In [None]:
def _create_interaction_terms(df: pd.DataFrame) -> pd.DataFrame:
    interaction_pairs = [
        ('Use of Sold Products', 'Processing of Sold Products'),
        ('Use of Sold Products', 'Purchased Goods and Services'),
        ('Processing of Sold Products', 'Purchased Goods and Services'),
        ('Purchased Goods and Services', 'End of Life Treatment of Sold Products')
    ]
    
    for col1, col2 in interaction_pairs:
        new_col_name = f"{col1}_x_{col2}"
        df[new_col_name] = df[col1] * df[col2]
        
    return df

In [None]:
def _create_polynomial_features(df: pd.DataFrame) -> pd.DataFrame:
    cols_to_square = [
        'Use of Sold Products', 
        'Processing of Sold Products', 
        'Purchased Goods and Services', 
        'End of Life Treatment of Sold Products'
    ]
    
    for col in cols_to_square:
        new_col_name = f"{col}_Squared"
        df[new_col_name] = df[col] ** 2
    
    return df


## Encoding Categorical Variables

In [None]:
def _one_hot_encode(df: pd.DataFrame) -> pd.DataFrame:
    # One-hot encode 'Country' and 'Industry (Exiobase)' columns
    df_encoded = pd.get_dummies(df, columns=['Industry (Exiobase)'])
    return df_encoded

## Normalization/Standardization 

In [None]:
def _normalization(df: pd.DataFrame) -> pd.DataFrame:
    """Standardize every column of ``df`` with a StandardScaler.

    The scaler is fitted on, and applied to, the whole frame, so all columns
    must already be numeric (encode categorical columns first). Callers that
    need a column left unscaled must remove it before calling this function.

    Args:
        df: Fully numeric input data.

    Returns:
        A frame with the same columns and row index as ``df`` holding the
        standardized values.
    """
    # Create the scaler
    scaler_standard = StandardScaler()

    # fit_transform returns a bare array, so re-attach the column names and
    # the original row index; otherwise rows can no longer be traced back to
    # the input once earlier steps have filtered some of them out.
    scaled_values = scaler_standard.fit_transform(df)
    df_normalized_standard = pd.DataFrame(
        scaled_values, columns=df.columns, index=df.index
    )

    return df_normalized_standard

In [None]:
def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Run the feature-engineering steps in order.

    The steps are outlier removal, industry remapping, interaction terms,
    squared terms, one-hot encoding and standardization; each step receives
    the output of the previous one.

    Args:
        df: Preprocessed input data.

    Returns:
        The frame produced by the last step.
    """
    steps = (
        _outlier_removal,
        _remap_industry,
        _create_interaction_terms,
        _create_polynomial_features,
        _one_hot_encode,
        _normalization,
    )

    df_feature_engineered = df
    for step in steps:
        df_feature_engineered = step(df_feature_engineered)

    return df_feature_engineered

In [None]:
# TODO: the number of output columns varies between runs (sometimes 61,
# sometimes 62) — investigate why.
df_feature_engineered = feature_engineering(preprocessed_df)

In [None]:
df_feature_engineered

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# # Separate the features (X) from the target variable (y)
# X = df_feature_engineered.drop("Scope 3", axis=1)
# y = df_feature_engineered["Scope 3"]

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# Load the "model_options" parameter group; the catalog lists test_size,
# random_state and features entries under it.
parameters = catalog.load("params:model_options")

In [None]:
def split_data(data: pd.DataFrame, model_options: Dict) -> Tuple:
    """Splits data into features and targets training and test sets.

    Args:
        data: Data containing features and target ("Scope 3").
        model_options: Mapping with the keys "features" (columns to use as
            model inputs), "test_size" and "random_state"; the latter two are
            passed to train_test_split.
    Returns:
        Split data as (X_train, X_test, y_train, y_test).
    """
    # Read the options from the argument rather than from a notebook-level
    # global, so the function behaves the same wherever it is called from.
    # The target is removed from the inputs whether or not it is listed in
    # the configured features.
    X = data[model_options["features"]].drop(columns="Scope 3", errors="ignore")
    y = data["Scope 3"]
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=model_options["test_size"],
        random_state=model_options["random_state"],
    )
    return X_train, X_test, y_train, y_test

In [None]:
# Split the engineered data into (X_train, X_test, y_train, y_test) using the
# model options loaded into `parameters`.
split_df = split_data(df_feature_engineered, parameters)

In [None]:
split_df

In [None]:
def train_model(X_train: pd.DataFrame, y_train: pd.Series) -> LinearRegression:
    """Fit an ordinary least-squares linear regression.

    Args:
        X_train: Training data of independent features.
        y_train: Training target values.

    Returns:
        The fitted LinearRegression model.
    """
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model

In [None]:
def evaluate_model(
    regressor: LinearRegression, X_test: pd.DataFrame, y_test: pd.Series
):
    """Log the model's coefficient of determination on the test set.

    Args:
        regressor: Trained model.
        X_test: Testing data of independent features.
        y_test: Testing target values.
    """
    predictions = regressor.predict(X_test)
    r2 = r2_score(y_test, predictions)
    logging.getLogger(__name__).info(
        "Model has a coefficient R^2 of %.3f on test data.", r2
    )

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import numpy as np

# Separate the features (X) from the target variable (y). These names were
# previously only defined in commented-out code, so this cell failed with a
# NameError on a fresh kernel.
X = df_feature_engineered.drop(columns="Scope 3")
y = df_feature_engineered["Scope 3"]

# Create a pipeline with StandardScaler and LassoLarsCV
# NOTE(review): df_feature_engineered has already been standardized by
# _normalization (fitted on all rows, target included), so the scaler below
# is applied a second time — confirm whether that is intended.
pipeline = make_pipeline(StandardScaler(), LassoLarsCV())

# Initialize KFold with shuffling; the fixed seed keeps the folds reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform 5-fold cross-validation with shuffling
cv_scores = cross_val_score(pipeline, X, y, cv=kf)

# Calculate the mean and standard deviation of the scores
cv_mean = np.mean(cv_scores)
cv_std = np.std(cv_scores)

# Print the cross-validation metrics
print(f"Cross-Validation Mean with KFold: {cv_mean}")
print(f"Cross-Validation Standard Deviation with KFold: {cv_std}")