In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from datetime import datetime

In [2]:
# Load the dataset to score; the first spreadsheet column is used as the index.
X_test = pd.read_excel('proyectoFibra.xlsx', index_col=0)

In [3]:
class RemoveOutliersTransformer(BaseEstimator, TransformerMixin):
    """Replace IQR outliers with NaN, then fill gaps by linear interpolation.

    For every column, values below ``Q1 - iqr_factor * IQR`` or above
    ``Q3 + iqr_factor * IQR`` are replaced with NaN. Afterwards the whole
    frame is passed through ``DataFrame.interpolate(method='linear')``, which
    also acts on NaN values that were already present in the input.

    The quartiles are computed from the frame given to ``transform``; nothing
    is learned in ``fit``.

    Parameters
    ----------
    iqr_factor : float, default=1.5
        Multiplier applied to the inter-quartile range to build the fences.
    """

    def __init__(self, iqr_factor=1.5):
        self.iqr_factor = iqr_factor

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (kept for scikit-learn API compatibility)."""
        return self

    def _mask_outliers(self, column):
        """Return a new Series where values outside the IQR fences are NaN."""
        q1 = column.quantile(0.25)
        q3 = column.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - self.iqr_factor * iqr
        upper_bound = q3 + self.iqr_factor * iqr
        # ``mask`` builds a new Series instead of assigning NaN into the
        # existing one, so the input column is never modified.
        return column.mask((column < lower_bound) | (column > upper_bound))

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is left untouched.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose columns support ``quantile`` and comparison operators.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with outliers removed and gaps linearly
            interpolated. Any NaN that interpolation leaves behind is
            returned as-is.
        """
        # Work on a copy so the caller's DataFrame is not mutated.
        cleaned = X.copy()
        for name in cleaned.columns:
            cleaned[name] = self._mask_outliers(cleaned[name])

        return cleaned.interpolate(method='linear')

In [4]:
class ImputeNaNTransformer(BaseEstimator, TransformerMixin):
    """Fill NaN values with the mean of the column they belong to.

    The means are computed from the frame given to ``transform``; nothing is
    learned in ``fit``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (kept for scikit-learn API compatibility)."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with NaN values replaced by column means.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame that may contain NaN values.

        Returns
        -------
        pandas.DataFrame
            New frame; the input is not modified. Columns without a numeric
            mean (non-numeric, or entirely NaN) keep their NaN values instead
            of raising.
        """
        # One vectorised pass: per-column means, then fill by column label.
        column_means = X.mean(numeric_only=True)
        return X.fillna(column_means)

In [5]:
class DropColumnsTransformer(BaseEstimator, TransformerMixin):
    """Keep only an allow-list of columns and drop every other column.

    Parameters
    ----------
    columns_to_keep : iterable of str or None, default=None
        Column labels to retain. ``None`` uses ``TRAIN_COLUMNS``.

    Notes
    -----
    Columns of the allow-list that are absent from ``X`` are silently ignored;
    the column order of ``X`` is preserved.

    NOTE(review): '2202537801iI.PNT' (lowercase ``i``) and '20MI661.PNT'
    (prefix differs from the other tags) look unusual - confirm against the
    training data before changing them.
    """

    # Default allow-list, defined once instead of on every transform() call.
    TRAIN_COLUMNS = (
        '220TV111A.OUT', '220PIT072.PNT', '220LIC154.MEAS', '2202301301II.PNT',
        '22023009V3.PNT', '220FV089.OUT', '220FIC103.MEAS', '220SIT008B.PNT',
        '220PIT061.PNT', '22023010V2.PNT', '220PRODUCAO.RO0011',
        '220TIT105.PNT', '220VIT066.PNT', '220SIT008A.PNT', '220TIT851.PNT',
        '220FIT852.PNT', '220TIT850.PNT', '220PIT167.PNT', '220PIC155.MEAS',
        '220FIC065.MEAS', '220TIT092.PNT', '220FIC089.MEAS', '220FV108.OUT',
        '220TV101A.OUT', '220TIT608.PNT', '220FT108.MEAS', '220FX108.PNT',
        '220FV068.OUT', '220TIT090.PNT', '220FV203.OUT', '2202301001SI.PNT',
        '2202300901SI.PNT', '2202300801SI.PNT', '270LIT451.PNT',
        '270LIT452.PNT', '220FIT107.PNT', '220PIT109.PNT', '220FIC207.MEAS',
        '220FIC208.MEAS', '230LIC030.MEAS', '220PIT110.PNT', '2202301401II.PNT',
        '220TIC111.MEAS', '220PIT106.PNT', '220VIT104.PNT', '220PIT156.PNT',
        '220PIT071.PNT', '220LIT154C.PNT', '220LIT154B.PNT', '2202301001II.PNT',
        '2202300901II.PNT', '220PIT063.PNT', '220PIT064.PNT', '220PIT062.PNT',
        '2202300801II.PNT', '220FIC203.MEAS', '2202100401II.PNT',
        '220YIC008.MEAS', '220FIC463.MEAS', '220PC262A.MEAS', '220TIT007.PNT',
        '220LICORBRANCO.RO0003', '220AIT473.PNT', '2202604801SI.PNT',
        '220PIC019.MEAS', '220SI008_MEM.PNT', '220TV007.OUT', '220TIC529.MEAS',
        '220PIC034.MEAS', '220TIC015.MEAS', '210WIT2094.PNT',
        '220SI008_ACA1.OUT', '220TIC533.MEAS', '220PV034B.OUT', '220LV006.OUT',
        '220TIC007.MEAS', '220PIT003.PNT', '220PIT005.PNT', '220TIT011.PNT',
        '220PIT011.PNT', '220TIC014.MEAS', '2202537801iI.PNT', '220PV019B.OUT',
        '2202604801II.PNT', '220LIT526.PNT', '2202100401SI.PNT',
        '2202100101II.PNT', '220TIC002.MEAS', '220FIT009.PNT', '220FIT011.PNT',
        '20MI661.PNT', '220MIT714.PNT',
    )

    def __init__(self, columns_to_keep=None):
        self.columns_to_keep = columns_to_keep

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (kept for scikit-learn API compatibility)."""
        return self

    def transform(self, X):
        """Return a new frame containing only the allow-listed columns of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to filter.

        Returns
        -------
        pandas.DataFrame
            ``X`` without the columns that are not in the allow-list.
        """
        if self.columns_to_keep is None:
            allowed = set(self.TRAIN_COLUMNS)
        else:
            allowed = set(self.columns_to_keep)

        # Ordered list (rather than a set) so the drop is deterministic.
        columnas_descartadas = [col for col in X.columns if col not in allowed]
        return X.drop(columns=columnas_descartadas)

In [6]:
class SeasonPredictor(BaseEstimator, TransformerMixin):
    """Predict a cluster per row and report the most frequent one for a month.

    Parameters
    ----------
    fecha_actual : str or None, default='2024-02'
        Month to report, formatted as ``'%Y-%m'``. ``None`` uses the current
        month taken from ``datetime.now()``.
    model_path : str, default='modelo_entrenado.pkl'
        Path of the pickled model; the loaded object must expose ``predict``.

    Attributes
    ----------
    cluster_maximo_ : object
        Most frequent predicted cluster for ``fecha_actual``; set by
        ``transform``.
    """

    def __init__(self, fecha_actual='2024-02', model_path='modelo_entrenado.pkl'):
        self.fecha_actual = fecha_actual
        self.model_path = model_path

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (kept for scikit-learn API compatibility)."""
        return self

    def transform(self, X):
        """Predict clusters, print the dominant one for the month and return the frame.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature frame containing '220YIC008.MEAS' and
            '220PRODUCAO.RO0011' (both are dropped before predicting). Its
            index must provide ``strftime`` (e.g. a ``DatetimeIndex``).

        Returns
        -------
        pandas.DataFrame
            The model's input features plus the added 'Cluster' and
            'Month-Year' columns. The caller's frame is not modified.

        Raises
        ------
        ValueError
            If no row of ``X`` falls in the requested month.
        """
        # ``drop`` returns a new frame, so the columns added below do not
        # touch the caller's DataFrame.
        X = X.drop(columns=['220YIC008.MEAS', '220PRODUCAO.RO0011'])

        # SECURITY: pickle.load can execute arbitrary code contained in the
        # file; only load model files that come from a trusted source.
        with open(self.model_path, 'rb') as f:
            mejor_rf = pickle.load(f)
        X['Cluster'] = mejor_rf.predict(X)

        X['Month-Year'] = X.index.strftime('%Y-%m')

        fecha_actual = self.fecha_actual
        if fecha_actual is None:
            fecha_actual = datetime.now().strftime('%Y-%m')
        fecha_actual = str(fecha_actual)

        clusters_del_mes = X.loc[X['Month-Year'] == fecha_actual, 'Cluster']
        if clusters_del_mes.empty:
            raise ValueError(
                "No hay datos para el mes {}".format(fecha_actual)
            )

        # Count per cluster for the month. Reindexing by order of first
        # appearance in X makes idxmax resolve ties towards the cluster that
        # appears first, and gives clusters absent from the month a count of 0.
        cluster_counts = (
            clusters_del_mes
            .value_counts(sort=False)
            .reindex(X['Cluster'].unique(), fill_value=0)
        )
        cluster_maximo = cluster_counts.idxmax()
        self.cluster_maximo_ = cluster_maximo

        print("El cluster con mayor conteo para el {} es: {}".format(fecha_actual, cluster_maximo))

        return X

In [7]:
# Chain the preprocessing steps and the season prediction in execution order.
pipeline = Pipeline([
    ('outliers_romover', RemoveOutliersTransformer()),
    ('nan_imputer', ImputeNaNTransformer()),
    ('drop_columns', DropColumnsTransformer()),
    ('seaon_prediction', SeasonPredictor())
])

# Apply the pipeline to the dataset. A copy is passed so X_test stays exactly
# as loaded and this cell gives the same result when it is re-run, regardless
# of whether any step modifies the frame it receives.
pipeline.fit_transform(X_test.copy())

El cluster con mayor conteo para el 2024-02 es: Verano
