<p align="center">
  <img src="header.png" width="100%">
</p>


<div style="text-align: center;">
    <strong style="display: block; margin-bottom: 10px;">Group P</strong> 
    <table style="margin: 0 auto; border-collapse: collapse; border: 1px solid black;">
        <tr>
            <th style="border: 1px solid white; padding: 8px;">Name</th>
            <th style="border: 1px solid white; padding: 8px;">Student ID</th>
        </tr>
        <tr>
            <td style="border: 1px solid white; padding: 8px;">Beatriz Monteiro</td>
            <td style="border: 1px solid white; padding: 8px;">20240591</td>
        </tr>
        <tr>
            <td style="border: 1px solid white; padding: 8px;">Catarina Nunes</td>
            <td style="border: 1px solid white; padding: 8px;">20230083</td>
        </tr>
        <tr>
            <td style="border: 1px solid white; padding: 8px;">Margarida Raposo</td>
            <td style="border: 1px solid white; padding: 8px;">20241020</td>
        </tr>
        <tr>
            <td style="border: 1px solid white; padding: 8px;">Teresa Menezes</td>
            <td style="border: 1px solid white; padding: 8px;">20240333</td>
        </tr>
    </table>
</div>

### 🔗 Table of Contents <a id='table-of-contents'></a>
1. [Introduction](#introduction)  
2. [Macro's Prediction](#business-understanding)  
3. [Conclusion](#conclusion)

---

In [2]:
import os
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pmdarima import auto_arima
from statsmodels.tsa.arima.model import ARIMA
from scipy.stats import shapiro
from statsmodels.tsa.stattools import adfuller
from scipy.stats import norm
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import StandardScaler
from scipy.stats import spearmanr
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import shap
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [3]:
def load_dfs_from_folder(folder_path):
    """Loads DataFrames from files in a specified folder and returns a dictionary."""
    dfs = {}
    # List all files in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".pkl"):
            key = file_name.replace(".pkl", "")  # Extract key from the file name
            file_path = os.path.join(folder_path, file_name)
            
            # Load the dataframe from the pickle file
            with open(file_path, 'rb') as f:
                dfs[key] = pickle.load(f)
            print(f"Loaded {key} from {file_path}")
    
    return dfs

# Load both product_dfs and lagged_product_dfs from their respective folders
product_dfs = load_dfs_from_folder("product_dfs_folder")
lagged_product_dfs = load_dfs_from_folder("lagged_product_dfs_folder")

Loaded P11 from product_dfs_folder/P11.pkl
Loaded P13 from product_dfs_folder/P13.pkl
Loaded P12 from product_dfs_folder/P12.pkl
Loaded P16 from product_dfs_folder/P16.pkl
Loaded P14 from product_dfs_folder/P14.pkl
Loaded P8 from product_dfs_folder/P8.pkl
Loaded P9 from product_dfs_folder/P9.pkl
Loaded Sales_CPI from product_dfs_folder/Sales_CPI.pkl
Loaded P1 from product_dfs_folder/P1.pkl
Loaded P3 from product_dfs_folder/P3.pkl
Loaded P6 from product_dfs_folder/P6.pkl
Loaded P4 from product_dfs_folder/P4.pkl
Loaded P5 from product_dfs_folder/P5.pkl
Loaded P36 from product_dfs_folder/P36.pkl
Loaded P20 from product_dfs_folder/P20.pkl
Loaded P11 from lagged_product_dfs_folder/P11.pkl
Loaded P13 from lagged_product_dfs_folder/P13.pkl
Loaded P12 from lagged_product_dfs_folder/P12.pkl
Loaded P16 from lagged_product_dfs_folder/P16.pkl
Loaded P8 from lagged_product_dfs_folder/P8.pkl
Loaded P9 from lagged_product_dfs_folder/P9.pkl
Loaded P1 from lagged_product_dfs_folder/P1.pkl
Loaded P3 fro

In [4]:
for product_id in product_dfs.keys():
    product_dfs[product_id] = product_dfs[product_id].rename(columns={product_id: "Sales"})

for product_id in lagged_product_dfs.keys():
    lagged_product_dfs[product_id] = lagged_product_dfs[product_id].rename(columns={product_id: "Sales"})

In [7]:
product_dfs['P1'].head()

Unnamed: 0_level_0,Sales,MAB_ELE_SHP840,PRI27276_org,PRO27826_org,MAB_ELE_PRO276,MAB_ELE_SHP1100
month_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-10-01,35774030.0,127.808839,109.119614,118.670791,124.227879,130.989253
2018-11-01,5063649.0,117.675874,109.224838,120.467019,127.404132,132.93413
2018-12-01,37321270.0,123.280134,109.330063,105.378705,120.518565,131.261348
2019-01-01,27090400.0,111.043755,109.750961,107.174933,104.776326,113.057565
2019-02-01,34132090.0,116.736921,109.856194,110.64764,109.597012,117.704727


In [14]:
product_ids = [1, 3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 16, 20, 36]
macros_list = []

# Iterando sobre os IDs de produtos que você forneceu
for prod_id in product_ids:
    key = f'P{prod_id}'  # Construindo a chave do produto (ex: 'P1', 'P3', etc.)
    
    # Verificando se a chave do produto existe no dicionário
    if key in product_dfs:
        df = product_dfs[key]
        
        # Excluindo a coluna 'sales'
        df_without_sales = df.drop(columns=['Sales'])
        
        # Adicionando os dados de macros à lista
        macros_list.append(df_without_sales)

# Concatenando todos os DataFrames em uma única tabela
macros_combinations = pd.concat(macros_list, ignore_index=True)

# Obtendo todas as combinações únicas de macros
unique_combinations = macros_combinations.drop_duplicates()

# Exibindo as combinações únicas
unique_combinations


Unnamed: 0,MAB_ELE_SHP840,PRI27276_org,PRO27826_org,MAB_ELE_PRO276,MAB_ELE_SHP1100,PRO271000_org,PRI27840_org,PRO27380_org,WKLWEUR840_org,PRO27276_org,...,MAB_ELE_SHP276,MAB_ELE_PRO756,PRO27756_org,PRO27392_org,PRO28380_org,PRO28276_org,PRO28826_org,MAB_ELE_SHP250,PRO27250_org,PRO28392_org
0,127.808839,109.119614,118.670791,124.227879,130.989253,,,,,,...,,,,,,,,,,
1,117.675874,109.224838,120.467019,127.404132,132.934130,,,,,,...,,,,,,,,,,
2,123.280134,109.330063,105.378705,120.518565,131.261348,,,,,,...,,,,,,,,,,
3,111.043755,109.750961,107.174933,104.776326,113.057565,,,,,,...,,,,,,,,,,
4,116.736921,109.856194,110.647640,109.597012,117.704727,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597,,,,,,,,,,,...,,106.704029,114.326241,,,,,,,
598,,,,,,,,,,,...,,103.499260,108.999212,,,,,,,
599,,,,,,,,,,,...,,100.294492,103.672183,,,,,,,
600,,,,,,,,,,,...,,97.089723,98.345154,,,,,,,


In [17]:
import pandas as pd

# Lista dos números dos produtos que você mencionou
product_ids = [1, 3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 16, 20, 36]

# Lista para armazenar os dados de macros de todos os produtos
macros_list = []

# Iterando sobre os IDs de produtos que você forneceu
for prod_id in product_ids:
    key = f'P{prod_id}'  # Construindo a chave do produto (ex: 'P1', 'P3', etc.)
    
    # Verificando se a chave do produto existe no dicionário
    if key in product_dfs:
        df = product_dfs[key]
        
        # Excluindo a coluna 'sales'
        df_sem_sales = df.drop(columns=['Sales'])
        
        # Adicionando os dados de macros à lista
        macros_list.append(df_sem_sales)

# Concatenando todos os DataFrames em uma única tabela
macros_combinados = pd.concat(macros_list, ignore_index=True)

# Obtendo todas as combinações únicas de macros
combinacoes_unicas = macros_combinados.drop_duplicates()

# Exibindo as combinações únicas
combinacoes_unicas


Unnamed: 0,MAB_ELE_SHP840,PRI27276_org,PRO27826_org,MAB_ELE_PRO276,MAB_ELE_SHP1100,PRO271000_org,PRI27840_org,PRO27380_org,WKLWEUR840_org,PRO27276_org,...,MAB_ELE_SHP276,MAB_ELE_PRO756,PRO27756_org,PRO27392_org,PRO28380_org,PRO28276_org,PRO28826_org,MAB_ELE_SHP250,PRO27250_org,PRO28392_org
0,127.808839,109.119614,118.670791,124.227879,130.989253,,,,,,...,,,,,,,,,,
1,117.675874,109.224838,120.467019,127.404132,132.934130,,,,,,...,,,,,,,,,,
2,123.280134,109.330063,105.378705,120.518565,131.261348,,,,,,...,,,,,,,,,,
3,111.043755,109.750961,107.174933,104.776326,113.057565,,,,,,...,,,,,,,,,,
4,116.736921,109.856194,110.647640,109.597012,117.704727,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597,,,,,,,,,,,...,,106.704029,114.326241,,,,,,,
598,,,,,,,,,,,...,,103.499260,108.999212,,,,,,,
599,,,,,,,,,,,...,,100.294492,103.672183,,,,,,,
600,,,,,,,,,,,...,,97.089723,98.345154,,,,,,,


In [20]:
import pandas as pd

# Lista dos números dos produtos que você mencionou
product_ids = [1, 3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 16, 20, 36]


# Dicionário para armazenar as combinações únicas de macros e os produtos onde aparecem
combinacoes_produtos = {}

# Iterando sobre os IDs de produtos que você forneceu
for prod_id in product_ids:
    key = f'P{prod_id}'  # Construindo a chave do produto (ex: 'P1', 'P3', etc.)
    
    # Verificando se a chave do produto existe no dicionário
    if key in product_dfs:
        df = product_dfs[key]
        
        # Excluindo a coluna 'sales'
        df_sem_sales = df.drop(columns=['Sales'])
        
        # Iterando sobre cada linha do DataFrame para identificar as combinações únicas de macros
        for idx, row in df_sem_sales.iterrows():
            combinacao_macros = tuple(row)  # Tupla que representa a combinação de macros
            
            # Adicionando o produto à lista de produtos para essa combinação de macros
            if combinacao_macros not in combinacoes_produtos:
                combinacoes_produtos[combinacao_macros] = []
            
            combinacoes_produtos[combinacao_macros].append(key)

# Convertendo o dicionário em um DataFrame para visualização
combinacoes_df = pd.DataFrame(list(combinacoes_produtos.items()), columns=['Combinação de Macros', 'Produtos'])
combinacoes_df['Produtos'] = combinacoes_df['Produtos'].apply(lambda x: ', '.join(x))  # Lista de produtos como string

# Exibindo o DataFrame final
combinacoes_df


Unnamed: 0,Combinação de Macros,Produtos
0,"(127.80883943812468, 109.1196136474609, 118.67...",P1
1,"(117.67587388309826, 109.2248382568359, 120.46...",P1
2,"(123.28013378640512, 109.3300628662109, 105.37...",P1
3,"(111.04375462185241, 109.7509613037109, 107.17...",P1
4,"(116.73692110885848, 109.8561935424805, 110.64...",P1
...,...,...
597,"(106.70402895817061, 114.32624113475178, 310.7...",P36
598,"(103.49926026575272, 108.99921197793539, 235.9...",P36
599,"(100.29449157333487, 103.672182821119, 235.956...",P36
600,"(97.08972288091698, 98.3451536643026, 329.4133...",P36


In [21]:
import pandas as pd

# Lista dos números dos produtos que você mencionou
product_ids = [1, 3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 16, 20, 36]


# Dicionário para armazenar as macros e os produtos onde elas aparecem
macros_produtos = {}

# Iterando sobre os IDs de produtos que você forneceu
for prod_id in product_ids:
    key = f'P{prod_id}'  # Construindo a chave do produto (ex: 'P1', 'P3', etc.)
    
    # Verificando se a chave do produto existe no dicionário
    if key in product_dfs:
        df = product_dfs[key]
        
        # Iterando sobre cada coluna de macros
        for macro in ['proteinas', 'carboidratos', 'gorduras']:
            # Se a macro estiver no DataFrame, vamos verificar se o valor existe
            if macro in df.columns:
                # Adicionando o produto à lista de produtos para essa macro
                for value in df[macro]:
                    if key not in macros_produtos[macro]:
                        macros_produtos[macro].append(key)

# Convertendo o dicionário em um DataFrame para visualização
macros_df = pd.DataFrame(list(macros_produtos.items()), columns=['Macro', 'Produtos'])
macros_df['Produtos'] = macros_df['Produtos'].apply(lambda x: ', '.join(x))  # Lista de produtos como string

# Exibindo o DataFrame final
print(macros_df)


Empty DataFrame
Columns: [Macro, Produtos]
Index: []


In [23]:
product_dfs['P1'].columns

Index(['Sales', 'MAB_ELE_SHP840', 'PRI27276_org', 'PRO27826_org',
       'MAB_ELE_PRO276', 'MAB_ELE_SHP1100'],
      dtype='object')

In [24]:
# Iterar sobre todos os DataFrames no dicionário `product_dfs` e exibir as colunas
for key, df in product_dfs.items():
    print(f"Colunas do DataFrame {key}:")
    print(df.columns)
    print()  # Apenas para separar as saídas de cada DataFrame


Colunas do DataFrame P11:
Index(['Sales', 'PRO27826_org', 'MAB_ELE_SHP392', 'MAB_ELE_SHP840',
       'MAB_ELE_SHP276'],
      dtype='object')

Colunas do DataFrame P13:
Index(['Sales', 'MAB_ELE_PRO756', 'PRO27756_org', 'MAB_ELE_PRO276',
       'PRI27840_org'],
      dtype='object')

Colunas do DataFrame P12:
Index(['Sales', 'PRI27840_org', 'RohCOPPER1000_org', 'MAB_ELE_PRO156'], dtype='object')

Colunas do DataFrame P16:
Index(['Sales', 'MAB_ELE_PRO756', 'PRO28276_org', 'PRI27276_org',
       'PRO28826_org'],
      dtype='object')

Colunas do DataFrame P14:
Index(['Sales', 'PRO27392_org', 'PRO28380_org', 'PRO27756_org'], dtype='object')

Colunas do DataFrame P8:
Index(['Sales', 'PRI27840_org', 'RohCOPPER1000_org'], dtype='object')

Colunas do DataFrame P9:
Index(['Sales', 'PRO27826_org', 'PRO271000_org', 'PRO28250_org',
       'MAB_ELE_PRO156'],
      dtype='object')

Colunas do DataFrame Sales_CPI:
Index(['Sales', 'MAB_ELE_SHP840', 'PRI27276_org', 'PRO271000_org',
       'PRO27826_org

In [30]:
import pandas as pd

# Lista de produtos para evitar na coluna 'Macro'
produtos = {"P1", "P3", "P4", "P5", "P6", "P8", "P9", "P11", "P12", "P13", "P14", "P16", "P20", "P36"}

# Dicionário de DataFrames com os dados dos produtos
product_dfs = {
    'P1': pd.read_pickle('product_dfs_folder/P1.pkl'),
    'P3': pd.read_pickle('product_dfs_folder/P3.pkl'),
    'P4': pd.read_pickle('product_dfs_folder/P4.pkl'),
    'P5': pd.read_pickle('product_dfs_folder/P5.pkl'),
    'P6': pd.read_pickle('product_dfs_folder/P6.pkl'),
    'P8': pd.read_pickle('product_dfs_folder/P8.pkl'),
    'P9': pd.read_pickle('product_dfs_folder/P9.pkl'),
    'P11': pd.read_pickle('product_dfs_folder/P11.pkl'),
    'P12': pd.read_pickle('product_dfs_folder/P12.pkl'),
    'P13': pd.read_pickle('product_dfs_folder/P13.pkl'),
    'P14': pd.read_pickle('product_dfs_folder/P14.pkl'),
    'P16': pd.read_pickle('product_dfs_folder/P16.pkl'),
    'P20': pd.read_pickle('product_dfs_folder/P20.pkl'),
    'P36': pd.read_pickle('product_dfs_folder/P36.pkl'),
}

# Dicionário para armazenar as macros e os produtos correspondentes
identificadores_produtos = {}

# Iterando sobre os DataFrames
for key, df in product_dfs.items():
    # Iterando sobre as colunas (exceto 'Sales')
    for col in df.columns:
        if col != 'Sales' and col not in produtos:  # Ignora 'Sales' e produtos na coluna 'Macro'
            if col not in identificadores_produtos:
                identificadores_produtos[col] = []
            identificadores_produtos[col].append(key)

# Convertendo o dicionário em um DataFrame para visualização
identificadores_df = pd.DataFrame(list(identificadores_produtos.items()), columns=['Macro', 'Produtos'])
identificadores_df['Produtos'] = identificadores_df['Produtos'].apply(lambda x: ', '.join(x))  # Lista de produtos como string

# Exibindo o DataFrame final
identificadores_df


Unnamed: 0,Macro,Produtos
0,MAB_ELE_SHP840,"P1, P5, P11"
1,PRI27276_org,"P1, P16"
2,PRO27826_org,"P1, P3, P5, P9, P11"
3,MAB_ELE_PRO276,"P1, P13"
4,MAB_ELE_SHP1100,P1
5,PRO271000_org,"P3, P5, P9, P20"
6,PRI27840_org,"P3, P8, P12, P13"
7,PRO27380_org,P4
8,WKLWEUR840_org,P4
9,PRO27276_org,"P4, P6"


In [None]:
from statsmodels.tsa.arima.model import ARIMA

# Fit ARIMA model
model = ARIMA(df_combined['Sales_EUR'], order=(1,1,1))  # Adjust the order as needed
model_fit = model.fit()

# Forecast the next 12 months
forecast = model_fit.forecast(steps=12)
print(forecast)


In [75]:
from IPython.display import display, HTML
def display_side_by_side(dfs: list, captions: list, tablespacing=5, width="auto", height="auto"):
    html = """
    <div style='overflow-x: auto; white-space: nowrap; padding-bottom: 10px; border-bottom: 2px solid #ddd;'>
    """
    for caption, df in zip(captions, dfs):
        styled_df = df.style.set_caption(caption)._repr_html_()
        html += f"""
        <div style='display:inline-block; margin-right:{tablespacing * 5}px; vertical-align:top;'>
            <div style='width:{width}; max-height:{height}; overflow:auto; border:1px solid #ccc; padding:5px;'>
                {styled_df}
            </div>
        </div>
        """
    html += "</div>"  # Fecha a div de rolagem horizontal
    display(HTML(html))

# adapted from Professor João Caldeira from NOVA IMS function

In [None]:
# List of all unique products (already obtained)
print("\nList of Unique Products:")
print(unique_products)

# Create a table (DataFrame) to visualize the relationship Macro -> Products
macro_product_table = {'Macro': [], 'Products': []}

for macro_name, macro_value in macro_variables.items():
    macro_product_table['Macro'].append(macro_name)
    # Since each product is associated with the same set of macro variables,
    # we can simply list all products for each macro variable.
    macro_product_table['Products'].append(unique_products)

df_macro_product = pd.DataFrame(macro_product_table)

print("\nTable Macro Variable -> Products:")
print(df_macro_product)

In [None]:
import pandas as pd

# Definindo as listas de produtos e variáveis macro
product_columns = ['P1', 'P3', 'P4', 'P5', 'P6', 'P8', 'P9', 'P11', 'P12', 'P13', 'P14', 'P16', 'P20', 'P36']
macro_vars_cols = ['MAB_ELE_PRO156', 'MAB_ELE_SHP156', 'MAB_ELE_PRO250', 'MAB_ELE_SHP250', 'MAB_ELE_PRO276',
                   'MAB_ELE_SHP276', 'MAB_ELE_PRO380', 'MAB_ELE_SHP380', 'MAB_ELE_PRO392', 'MAB_ELE_SHP392',
                   'MAB_ELE_PRO756', 'MAB_ELE_SHP756', 'MAB_ELE_PRO826', 'MAB_ELE_PRO840', 'MAB_ELE_SHP840',
                   'MAB_ELE_PRO1100', 'MAB_ELE_SHP1100', 'RohiBASEMET1000_org', 'RohiENERGY1000_org',
                   'RohiMETMIN1000_org', 'RohiNATGAS1000_org', 'RohCRUDE_PETRO1000_org', 'RohCOPPER1000_org',
                   'WKLWEUR840_org', 'PRI27840_org', 'PRI27380_org', 'PRI27250_org', 'PRI27276_org',
                   'PRI27156_org', 'PRO28840_org', 'PRO281000_org', 'PRO28756_org', 'PRO28826_org',
                   'PRO28380_org', 'PRO28392_org', 'PRO28250_org', 'PRO28276_org', 'PRO27840_org',
                   'PRO271000_org', 'PRO27756_org', 'PRO27826_org', 'PRO27380_org', 'PRO27392_org',
                   'PRO27250_org', 'PRO27276_org']

# Criando um dicionário de associação entre variáveis macro e produtos
macro_to_products = {
    'MAB_ELE_PRO156': ['P1'], 'MAB_ELE_SHP156': ['P1'], 'MAB_ELE_PRO250': ['P3'], 'MAB_ELE_SHP250': ['P3'],
    'MAB_ELE_PRO276': ['P4'], 'MAB_ELE_SHP276': ['P4'], 'MAB_ELE_PRO380': ['P5'], 'MAB_ELE_SHP380': ['P5'],
    'MAB_ELE_PRO392': ['P6'], 'MAB_ELE_SHP392': ['P6'], 'MAB_ELE_PRO756': ['P8'], 'MAB_ELE_SHP756': ['P8'],
    'MAB_ELE_PRO826': ['P9'], 'MAB_ELE_PRO840': ['P11'], 'MAB_ELE_SHP840': ['P11'], 'MAB_ELE_PRO1100': ['P12'],
    'MAB_ELE_SHP1100': ['P12'], 'RohiBASEMET1000_org': ['P13'], 'RohiENERGY1000_org': ['P14'],
    'RohiMETMIN1000_org': ['P16'], 'RohiNATGAS1000_org': ['P20'], 'RohCRUDE_PETRO1000_org': ['P36'],
    'RohCOPPER1000_org': ['P13'], 'WKLWEUR840_org': ['P8'], 'PRI27840_org': ['P5'], 'PRI27380_org': ['P6'],
    'PRI27250_org': ['P3'], 'PRI27276_org': ['P4'], 'PRI27156_org': ['P1'], 'PRO28840_org': ['P16'],
    'PRO281000_org': ['P14'], 'PRO28756_org': ['P8'], 'PRO28826_org': ['P9'], 'PRO28380_org': ['P5'],
    'PRO28392_org': ['P6'], 'PRO28250_org': ['P3'], 'PRO28276_org': ['P4'], 'PRO27840_org': ['P5'],
    'PRO271000_org': ['P1'], 'PRO27756_org': ['P8'], 'PRO27826_org': ['P9'], 'PRO27380_org': ['P6'],
    'PRO27392_org': ['P6'], 'PRO27250_org': ['P3'], 'PRO27276_org': ['P4']
}

# Convertendo o dicionário em um DataFrame
data = [(macro, ', '.join(products)) for macro, products in macro_to_products.items()]
df = pd.DataFrame(data, columns=['Macro Variable', 'Products'])

# Exibindo a tabela
print(df)

In [None]:
# Assuming your merged DataFrame is called 'merged_df' with a time index
# and columns for product sales (e.g., 'P1', 'P3') and macro variables
# Let's also assume your macro variables are in columns named like 'Macro1', 'Macro2', etc.

product_columns = ['P1', 'P3', 'P4', 'P5', 'P6', 'P8', 'P9', 'P11', 'P12', 'P13', 'P14', 'P16', 'P20', 'P36']
macro_vars_cols = ['MAB_ELE_PRO156', 'MAB_ELE_SHP156', 'MAB_ELE_PRO250', 'MAB_ELE_SHP250', 'MAB_ELE_PRO276',
                   'MAB_ELE_SHP276', 'MAB_ELE_PRO380', 'MAB_ELE_SHP380', 'MAB_ELE_PRO392', 'MAB_ELE_SHP392',
                   'MAB_ELE_PRO756', 'MAB_ELE_SHP756', 'MAB_ELE_PRO826', 'MAB_ELE_PRO840', 'MAB_ELE_SHP840',
                   'MAB_ELE_PRO1100', 'MAB_ELE_SHP1100', 'RohiBASEMET1000_org', 'RohiENERGY1000_org',
                   'RohiMETMIN1000_org', 'RohiNATGAS1000_org', 'RohCRUDE_PETRO1000_org', 'RohCOPPER1000_org',
                   'WKLWEUR840_org', 'PRI27840_org', 'PRI27380_org', 'PRI27250_org', 'PRI27276_org',
                   'PRI27156_org', 'PRO28840_org', 'PRO281000_org', 'PRO28756_org', 'PRO28826_org',
                   'PRO28380_org', 'PRO28392_org', 'PRO28250_org', 'PRO28276_org', 'PRO27840_org',
                   'PRO271000_org', 'PRO27756_org', 'PRO27826_org', 'PRO27380_org', 'PRO27392_org',
                   'PRO27250_org', 'PRO27276_org']

results = {}

for product in product_columns:
    print(f"--- Processing Product: {product} ---")
    product_data = df_market_sales[[product] + macro_vars_cols].dropna() # Use only data where product sales and macros are available

    product_results = {}
    for macro in macro_vars_cols:
        print(f"  Predicting Macro Variable: {macro}")
        X = product_data[[product]]  # Use product sales as the predictor
        y = product_data[macro]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        model = LinearRegression()
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)

        product_results[macro] = {
            'model': model,
            'mse': mse
        }
        print(f"    MSE: {mse:.4f}")

        # Optional: Visualize predictions
        plt.figure(figsize=(10, 6))
        plt.plot(y_test.index, y_test, label='Actual')
        plt.plot(y_test.index, y_pred, label='Predicted')
        plt.title(f'Product: {product}, Macro: {macro} - Actual vs. Predicted')
        plt.xlabel('Time')
        plt.ylabel(macro)
        plt.legend()
        plt.show()

    results[product] = product_results

# Now the 'results' dictionary contains models and MSEs for each macro variable, specific to each product.

### <span style="background-color:#000027; padding:5px; border-radius:5px;"> 📌 Prediction for the Macro Features used <a id='business-understanding'></a>

##### Click [here](#table-of-contents) ⬆️ to return to the Index.
---

In [None]:
# Define the function to classify variables
def classify_variable(series):
    """Classifies variable based on normality and stationarity tests."""
    
    # Remove NaN values for testing
    clean_series = series.dropna()

    # Check for normality
    if len(clean_series) > 3:
        stat, p_value = shapiro(clean_series)
        is_normal = p_value > 0.05  # p-value > 0.05 means normal
    else:
        is_normal = False  # Not enough data to test normality

    # Check for stationarity
    if len(clean_series) > 3:
        adf_stat, adf_p_value, _, _, _, _ = adfuller(clean_series)
        is_stationary = adf_p_value < 0.05  # p-value < 0.05 means stationary
    else:
        is_stationary = False  # Not enough data

    return is_normal, is_stationary

# Function to automatically fill missing values
def auto_impute_missing_values(df_train, df_test):
    """Automatically selects the best imputation method for each missing variable."""
    
    # Identify missing columns in test set
    missing_columns = df_test.columns[df_test.isnull().any()]
    
    # Iterate through missing columns
    for col in missing_columns:
        print(f"Processing: {col}")

        series = df_train[col]  # Use train data for imputation
        is_normal, is_stationary = classify_variable(series)

        if is_normal:
            # Case 1: Normally distributed → Sample from normal distribution
            print(f" - {col} is normal → Using Mean & Std Sampling")
            mean_value, std_value = series.mean(), series.std()
            num_missing = df_test[col].isnull().sum()
            predictions = norm.rvs(loc=mean_value, scale=std_value, size=num_missing)
        
        elif is_stationary:
            # Case 2: Stationary but non-normal → Simple Exponential Smoothing
            print(f" - {col} is stationary → Using Simple Exponential Smoothing")
            model = SimpleExpSmoothing(series.dropna()).fit()
            predictions = model.forecast(steps=df_test[col].isnull().sum())

        elif not is_stationary:
            # Case 3: Non-Stationary → ARIMA
            print(f" - {col} is non-stationary → Using ARIMA")
            model = ARIMA(series.dropna(), order=(1, 1, 1))  # (p,d,q) chosen based on domain knowledge
            fitted_model = model.fit()
            predictions = fitted_model.forecast(steps=df_test[col].isnull().sum())

        else:
            # Case 4: If nothing works → Use XGBoost Regression
            print(f" - {col} is complex → Using XGBoost Regression")
            train_data = df_train.dropna(subset=[col])  # Drop missing values for training
            X_train = train_data.drop(columns=[col])  # Exclude target column
            y_train = train_data[col]  # Target column

            X_test = df_test.loc[df_test[col].isnull(), X_train.columns]  # Only missing values

            model = XGBRegressor(n_estimators=100, learning_rate=0.1)
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)

        # Assign predictions
        missing_indexes = df_test[df_test[col].isnull()].index
        df_test.loc[missing_indexes, col] = predictions

    return df_test

# Example usage
df_train = remerged_data[1]  # Use remerged train data
df_test = test_1.copy()  # Copy test set

# Apply automatic imputation
df_test_filled = auto_impute_missing_values(df_train, df_test)

# Check results
print(df_test_filled.head())