# Plan
1.
2.
3.

# Project Structure:
1. Abstract
2. Load data
3. EDA
4. Preprocess data
5. Feature engineering
6. Train, test split
7. Modelling
8. Model evaluation
9. Hyperparameter tuning 
10. Model Interpretation 
11. Results and conclusions
12. References and Acknowledgments

# Abstract

In [None]:
import logging
from typing import Any, Dict, Optional, Sequence, Tuple

import numpy as np
import pandas as pd
import polars as pl
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from ydata_profiling import ProfileReport

# Load data

In [None]:
# Show what `catalog` exposes via its `list()` method.
# NOTE(review): `catalog` is never defined in this notebook — presumably it is the
# data catalog injected by a Kedro Jupyter session; confirm before running standalone.
catalog.list()

In [None]:
# Load the "data_from_catalog" entry into `df`; the cells below pass it to
# functions annotated as taking a pandas DataFrame.
df = catalog.load("data_from_catalog")

# EDA

1. Data profiling

In [None]:
def data_profiling(
    df: pd.DataFrame,
    name: str = "data_profiling_report",
    interface: str = "html",
) -> None:
    """
    Generate a data profiling report with the ydata_profiling package.

    Args:
        df (pd.DataFrame): The DataFrame to profile.
        name (str, optional): The title of the profile report. With the 'html'
            interface it is also the output file stem (``<name>.html``).
            Defaults to "data_profiling_report".
        interface (str, optional): The format of the report. Defaults to "html".
            'html' writes the report to a file, 'widget' renders it as widgets.

    Raises:
        ValueError: If df is not a pandas DataFrame or name is not a string or
                    if interface is not 'html' or 'widget'
    """
    # Validate everything up front so a bad argument fails before the
    # (potentially slow) report is built.
    if not isinstance(df, pd.DataFrame):
        raise ValueError("df should be a pandas DataFrame")

    if not isinstance(name, str):
        raise ValueError("name should be a string")

    if not isinstance(interface, str) or interface not in ("html", "widget"):
        raise ValueError("interface should be a string, either 'html' or 'widget'")

    # Module-level logger, matching the logging style used by evaluate_model.
    logger = logging.getLogger(__name__)

    profile = ProfileReport(df, title=name, explorative=True)

    if interface == "html":
        profile.to_file(f"{name}.html")
        logger.info("Report %s generated in html format, check files.", name)
    else:
        # Only 'widget' can reach this branch because of the validation above.
        logger.info(
            "Report %s will be generated as a widget, it might take a while.", name
        )
        profile.to_widgets()

# Preprocess data

## Remove missing values

In [None]:
def _remove_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Function to remove all rows with missing values in a pandas dataframe.

    Args:
        df (pd.DataFrame): Input pandas DataFrame

    Returns:
        pd.DataFrame: Output DataFrame with rows containing missing values removed.
    """

    df_cleaned = df.dropna()

    return df_cleaned

## Outlier detection

In [None]:
def _outlier_removal(df: pd.DataFrame) -> pd.DataFrame:
    # Identify numerical columns
    numerical_cols = df.select_dtypes(include=['number']).columns

    # Initialize the IsolationForest model
    clf = IsolationForest(contamination=0.2)  # contamination: proportion of outliers in the data set

    # Fit the model on numerical columns
    clf.fit(df[numerical_cols])

    # Get outlier predictions
    outlier_predictions = clf.predict(df[numerical_cols])

    # Remove outliers from the original DataFrame based on the predictions
    df_filtered = df[outlier_predictions == 1]

    return df_filtered


In [None]:
def preprocess_data(data: pd.DataFrame, parameters: Dict) -> pd.DataFrame:
    """Keep the configured feature columns and drop incomplete rows.

    Args:
        data: Raw data.
        parameters: Mapping whose "features" entry lists the columns to keep.

    Returns:
        Preprocessed data, with missing values removed.
    """
    selected_columns = parameters["features"]
    preprocessed_data = _remove_missing_values(data[selected_columns])
    return preprocessed_data

In [None]:
# `preprocess_data` is the preprocessing entry point defined above; it expects a
# mapping with a "features" key. The same "params:model_options" entry is read
# for its "features" key by `split_data` further down.
# NOTE(review): confirm that "params:model_options" is the intended source here.
preprocessing_parameters = catalog.load("params:model_options")
preprocessed_df = preprocess_data(df, preprocessing_parameters)

# Feature engineering

## Encoding Categorical Variables

In [None]:
def _one_hot_encode(df: pd.DataFrame) -> pd.DataFrame:
    # One-hot encode 'Country' and 'Industry (Exiobase)' columns
    df_encoded = pd.get_dummies(df, columns=['Industry (Exiobase)'])
    return df_encoded

## Normalization/Standardization 

In [None]:
def _normalization(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardize every column of ``df`` with scikit-learn's ``StandardScaler``.

    The scaler is fitted on the DataFrame passed in and applied to all of its
    columns, so the input must be fully numeric (categorical data already encoded).

    Args:
        df (pd.DataFrame): Numeric DataFrame to standardize.

    Returns:
        pd.DataFrame: Standardized values with the same columns and the same
        index as ``df``.
    """
    scaler_standard = StandardScaler()
    scaled_values = scaler_standard.fit_transform(df)

    # fit_transform returns a bare array; re-attach the original index as well
    # as the columns. Without it the result gets a fresh 0..n-1 index, which no
    # longer matches the input's labels once rows have been dropped upstream.
    df_normalized_standard = pd.DataFrame(
        scaled_values, columns=df.columns, index=df.index
    )

    return df_normalized_standard

In [None]:
def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """
    Run the feature engineering steps on the given DataFrame.

    The steps are applied in this order:
    1. One-hot encoding via ``_one_hot_encode``.
    2. Standardization via ``_normalization``.

    Args:
        df: Original DataFrame.

    Returns:
        df_feature_engineered: DataFrame after feature engineering.
    """
    encoded = _one_hot_encode(df)
    df_feature_engineered = _normalization(encoded)
    return df_feature_engineered

In [None]:
# Apply one-hot encoding followed by standardization to the preprocessed data.
df_feature_engineered = feature_engineering(preprocessed_df)

# Train, test split

In [None]:
def split_data(data: pd.DataFrame, model_options: Dict) -> Tuple:
    """Splits data into features and targets training and test sets.

    Args:
        data: Data containing features and target.
        model_options: Mapping with the keys "features" (columns to select,
            which must include the target column "Scope 3"), "test_size" and
            "random_state".
    Returns:
        Split data as ``(X_train, X_test, y_train, y_test)``.
    """
    # NOTE(review): every name in model_options["features"] must exist in
    # `data`. A column that was one-hot encoded upstream is no longer present
    # under its original name — confirm the configured feature list.
    target = "Scope 3"

    # Read the options from the argument rather than from a notebook global, so
    # the function works with whatever configuration the caller passes in.
    X = data[model_options["features"]].drop(target, axis=1)
    y = data[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=model_options["test_size"],
        random_state=model_options["random_state"],
    )
    return X_train, X_test, y_train, y_test

In [None]:
# Load the "params:model_options" entry; `split_data` reads the "features",
# "test_size" and "random_state" keys from this configuration.
parameters = catalog.load("params:model_options")

In [None]:
# Split the feature-engineered data into train/test features and targets,
# using the configuration loaded into `parameters`.
X_train, X_test, y_train, y_test = split_data(df_feature_engineered, parameters)

# Train

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import logging
import pandas as pd
from typing import Any

def train_model(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    params: Optional[Dict[str, Any]] = None,
    num_boost_round: int = 674,
) -> Any:
    """Trains the XGBoost model.

    Args:
        X_train: Training data of independent features.
        y_train: Training data for target variable.
        params: Optional booster parameters. Entries given here override the
            built-in defaults below; omitted entries keep their default value.
        num_boost_round: Number of boosting rounds. Defaults to 674.

    Returns:
        Trained model.
    """
    # Built-in defaults, used when the caller supplies no override.
    booster_params: Dict[str, Any] = {
        'alpha': 9.418025790529975e-05,
        'colsample_bytree': 0.73850137825373,
        'eta': 0.03756810920990241,
        'gamma': 1.8103086083962833e-05,
        'lambda': 0.006052853661670603,
        'max_depth': 4,
        'min_child_weight': 1.0000000000000004e-06,
        'objective': 'reg:squarederror',
        'subsample': 0.8954379516782436,
        'eval_metric': ['rmse', 'mae'],
    }
    if params:
        # Merge instead of replace so a caller can tweak a single value.
        booster_params.update(params)

    dtrain = xgb.DMatrix(X_train, label=y_train)
    model = xgb.train(booster_params, dtrain, num_boost_round=num_boost_round)
    return model

In [None]:
# Train the XGBoost model on the training split produced by `split_data`.
trained_model = train_model(X_train, y_train)

# Model evaluation

In [None]:
def evaluate_model(model: Any, X_test: pd.DataFrame, y_test: pd.Series) -> None:
    """Calculates, prints and logs the coefficient of determination and RMSE.

    Args:
        model: Trained XGBoost model.
        X_test: Testing data of independent features.
        y_test: Testing data for target variable.
    """
    dtest = xgb.DMatrix(X_test, label=y_test)
    y_pred = model.predict(dtest)
    score = r2_score(y_test, y_pred)

    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # while this form works on every version.
    rmse = float(mean_squared_error(y_test, y_pred)) ** 0.5

    r2_message = f"Model has a coefficient R^2 of {score:.3f} on test data."
    rmse_message = f"Model has a RMSE of {rmse:.3f} on test data."

    # Printed for immediate notebook output and logged for pipeline runs.
    print(r2_message)
    print(rmse_message)

    logger = logging.getLogger(__name__)
    logger.info(r2_message)
    logger.info(rmse_message)
In [None]:
# Evaluate the trained model on the held-out test split (prints and logs R^2 and RMSE).
evaluate_model(trained_model, X_test, y_test)

# Hyperparameter tuning

# Model Interpretation

# Results and conclusions

# References and Acknowledgments