In [2]:
#LIBRARIES TO BE USED

from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [None]:
#IMPORT DATASETS

# Locations of the two CSV files read by this notebook.
df_90_path = "/_datasets/google_trend_pinhao_90percent.csv"
df_original_path = "/_datasets/google_trend_pinhao_original.csv"

# df_90 is the frame that later receives imputation; df_original is the
# reference the imputed values are scored against.
df_90 = pd.read_csv(df_90_path)
df_original = pd.read_csv(df_original_path)

In [None]:
#APPLICATION OF THE METHOD

# Work on an independent (deep) copy so df_90 itself is never modified.
df_90_emmvi = df_90.copy(deep=True)

# The EMMVI imputation method is applied to df_90_emmvi below.


def expectation_maximization_mvi(df, column="Pinhao: (Parana)"):
    """Return a copy of ``df`` with the missing values of ``column`` imputed.

    Despite the function name, the imputation performed here is a plain
    mean substitution: every NaN in ``column`` is replaced by the mean of
    the observed (non-missing) values of that same column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    column : str, default "Pinhao: (Parana)"
        Name of the column whose missing values are filled.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the same index and columns, where ``column``
        has its NaNs replaced. If the column has no observed values its mean
        is NaN, so the column is returned unchanged.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df``.
    """
    # Copy first so the caller's dataframe is never altered.
    df_imputed = df.copy()

    # Mean of the observed values only (Series.mean skips NaN by default).
    # Computed once: filling gaps with the mean does not change the mean.
    mean_value = df_imputed[column].mean()

    # Assign the whole column back instead of using chained indexing
    # (df[col][i] = ...). Chained assignment does not write through to the
    # frame under pandas Copy-on-Write, and it addressed rows by label while
    # the loop counter was positional.
    df_imputed[column] = df_imputed[column].fillna(mean_value)

    return df_imputed

# Run the EMMVI imputation on the working copy and keep the imputed frame.
df_90_emmvi = expectation_maximization_mvi(df=df_90_emmvi)

In [None]:
#CALCULATE PERFORMANCE INDEXES/METRICS

# Reference series and imputed series for the evaluated column.
y_true = df_original["Pinhao: (Parana)"]
y_pred = df_90_emmvi["Pinhao: (Parana)"]

# NOTE(review): all scores below are computed over every row of the column,
# not only over the rows that were imputed - confirm this is intended.
mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

#CALCULATE HOW MANY IMPUTATION DATA ARE RIGHT
threshold = 0.1  # Maximum absolute error for a value to count as a "hit"
absolute_error = (y_pred - y_true).abs()
# Fraction of rows whose absolute error is within the threshold.
accuracy = (absolute_error <= threshold).mean()


In [None]:
#EXPORT PERFORMANCE INDEXES/METRICS TO A LOG FILE
results_path = "/mvi/_results/results.xlsx"
results_file = Path(results_path)

# One row describing this run.
results_df = pd.DataFrame({
    "Data e Hora": [datetime.now()],
    "Dataframe": ["df_90_emmvi"],
    "MAE": [mae],
    "MSE": [mse],
    "R2": [r2],
    "Accuracy": [accuracy]
})

# DataFrame.to_excel cannot append to an existing sheet (it has no ``mode``
# argument), so the log is emulated: read the rows already stored, add the
# new one at the end, and rewrite the whole file with a single header row.
if results_file.exists():
    results_log = pd.concat(
        [pd.read_excel(results_file), results_df], ignore_index=True
    )
else:
    results_log = results_df

# Make sure the target directory exists before writing.
results_file.parent.mkdir(parents=True, exist_ok=True)
results_log.to_excel(results_file, index=False)


In [None]:
#SCATTERPLOT BETWEEN ORIGINAL DATASET AND IMPUTED DATAFRAME
sns.set(style="whitegrid")

# Draw everything on one explicit Axes instead of the implicit pyplot state.
fig, ax = plt.subplots()

# Reference values (light green) and imputed values (green) over "Mes".
sns.scatterplot(
    data=df_original, x="Mes", y="Pinhao: (Parana)",
    color="lightgreen", label="Valores reais", ax=ax,
)
sns.scatterplot(
    data=df_90_emmvi, x="Mes", y="Pinhao: (Parana)",
    color="green", label="EMMVI", ax=ax,
)

# Regression line only (scatter=False) fitted on the imputed frame.
# NOTE(review): regplot presumably needs a numeric "Mes" column - confirm.
sns.regplot(
    data=df_90_emmvi, x="Mes", y="Pinhao: (Parana)",
    color="darkgreen", scatter=False, ax=ax,
)

ax.set_xlabel("Mês")
ax.set_ylabel("Pinhao: (Parana)")
ax.set_title("Comparação EMMVI vs Valores Reais")
ax.legend()
plt.show()

In [None]:
#EXPORT CHART
# Build the file name from the dataframe's *name* plus a timestamp. The
# original interpolated the DataFrame object itself, which puts the whole
# table repr into the file name.
plot_timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
plot_filename = f"/mvi/_results/df_90_emmvi_scatterplot_{plot_timestamp}.jpg"

# NOTE(review): plt.savefig writes pyplot's *current* figure. If the chart was
# already displayed with plt.show() in an earlier cell, that figure has
# probably been closed and this saves a blank image - confirm, and if so save
# before plt.show() or keep a handle to the figure and call its savefig().
plt.savefig(plot_filename)
plt.close()