In [3]:
import re
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin


class DefineLimits(BaseEstimator, TransformerMixin):
    """
    Split columns that mix point values, ranges and one-sided bounds into
    per-row ``<col>_min`` / ``<col>_max`` columns, plus an optional
    ``<col>_No_Gotea`` indicator column.

    Parameters
    ----------
    columns : list-like of column names, str, or None
        Columns to convert. ``None`` (or an empty list) means every column of
        the dataframe passed to ``fit`` / ``transform``.
    margin : float
        Relative margin used to widen the fitted limits when one-sided values
        are present: ``<x`` proposes ``x - margin * |x|`` as lower limit and
        ``>x`` proposes ``x + margin * |x|`` as upper limit (``margin`` itself
        is used as the offset when ``x == 0``). A proposal is applied only
        when it actually widens the current limit. With ``margin <= 0`` no
        expansion happens.
    verbose : bool
        When true, prints the stage banners, every expansion, the final
        limits and the parsed type counts per column.

    Recognised cell formats (whitespace is ignored)
    -----------------------------------------------
    - ``"300"``, ``"-10"``, ``"1.6"``   -> point value (min = max = value)
    - ``"265-295"``, ``"1.5-2.5"``      -> range (min / max of the two ends)
    - ``"<10"``, ``"<=10"``             -> "less than" bound
    - ``">230"``, ``">=230"``           -> "greater than" bound
    - ``"No Gotea"`` (any casing)       -> min = max = 0 and indicator = 1
    - anything else                     -> invalid, min = max = NaN

    fit
    ---
    Records, per column, the smallest and largest number seen (including
    range ends and the numbers attached to ``<`` / ``>``), widens them by
    ``margin`` where one-sided values require it, and remembers whether the
    column contained "No Gotea".

    transform
    ---------
    Replaces every configured column by its ``_min`` / ``_max`` columns. The
    ``_No_Gotea`` column is only produced for columns that contained
    "No Gotea" during ``fit``; for other columns such rows stay NaN.

    NOTE(review): rows holding ``<x`` or ``>x`` receive the full fitted
    interval ``[global_min, global_max]``; ``x`` itself does not bound the
    row. This is the original behaviour and is kept as is - confirm it is
    intended.

    NOTE(review): ``transform`` may widen the stored limits when it meets more
    extreme one-sided values (``margin > 0``). The widened limits are stored
    after the batch has been converted, so they only affect later calls.
    """

    def __init__(self, columns=None, margin=0.0, verbose=False):
        # Constructor arguments are stored untouched: scikit-learn rebuilds
        # estimators from these attributes (get_params / clone) and clone()
        # verifies that the constructor did not modify them. The numeric
        # conversion of ``margin`` therefore happens where it is used.
        self.columns = columns
        self.margin = margin
        self.verbose = verbose

        # History of every expansion made by fit() and transform().
        # fit() starts a fresh log. Fitted state (attributes ending in "_")
        # is created by fit(), not here.
        self.expansion_log = []

    # ----------------------------------------------------------------------
    #                           SMALL HELPERS
    # ----------------------------------------------------------------------
    def _resolve_columns(self, X):
        """Return the list of columns to process for dataframe ``X``."""
        columns = self.columns
        if columns is None:
            return list(X.columns)
        if isinstance(columns, str):
            # A bare string would otherwise be iterated character by character.
            return [columns]
        columns = list(columns)
        # An empty selection keeps its historical meaning: use every column.
        return columns if columns else list(X.columns)

    def _margin_offset(self, bound):
        """Return the distance by which ``bound`` is pushed outwards."""
        margin = float(self.margin)
        return margin * abs(bound) if bound != 0 else margin

    # ----------------------------------------------------------------------
    #                       VECTORIZED PARSER HELPER
    # ----------------------------------------------------------------------
    @staticmethod
    def _parse_series(series):
        """
        Classify and parse every cell of one column.

        Parameters
        ----------
        series : pandas.Series
            Raw column; values of any dtype are converted to text first.

        Returns
        -------
        types : numpy.ndarray of object, one label per row
            - 'num'      regular numeric value
            - 'lt'       less-than notation (<x)
            - 'gt'       greater-than notation (>x)
            - 'range'    range notation (x-y)
            - 'nogotea'  the text "No Gotea"
            - 'invalid'  anything that cannot be parsed ('N/A', '', NaN, ...)
        numeric : numpy.ndarray of float
            Number attached to each row ('<' / '>' removed, first end of a
            range); NaN where there is none.
        range_mins, range_maxs : numpy.ndarray of float
            Lower / upper end of 'range' rows, NaN elsewhere.
        """
        text = series.astype(str)

        # Normalise look-alike comparison signs and drop all whitespace.
        for odd, plain in (("＜", "<"), ("＞", ">"), ("≤", "<"), ("≥", ">")):
            text = text.str.replace(odd, plain, regex=False)
        text = text.str.replace(r"\s+", "", regex=True)

        # ---------------- BOOLEAN MASKS ----------------
        is_nogotea = np.asarray(text.str.lower() == "nogotea", dtype=bool)
        starts_lt = np.asarray(text.str.startswith("<", na=False), dtype=bool)
        starts_gt = np.asarray(text.str.startswith(">", na=False), dtype=bool)

        # ---------------- RANGES ("265-295", "1.5-2.5") ----------------
        ends = text.str.extract(
            r"^(\d+(?:\.\d+)?)-(\d+(?:\.\d+)?)$", expand=True
        )
        first = pd.to_numeric(ends[0], errors="coerce").to_numpy(
            dtype=float, na_value=np.nan
        )
        second = pd.to_numeric(ends[1], errors="coerce").to_numpy(
            dtype=float, na_value=np.nan
        )
        is_range = np.isfinite(first) & np.isfinite(second)
        # Guard against reversed ranges such as "295-265".
        range_mins = np.minimum(first, second)
        range_maxs = np.maximum(first, second)

        # ---------------- PLAIN NUMBERS ----------------
        # Remove a leading comparison sign, then cut a trailing "-something"
        # ("265-abc" -> "265"). The cut requires at least one non-dash
        # character before the dash, so a leading minus sign ("-10") is kept.
        stripped = text.str.replace(r"^[<>]=?", "", regex=True)
        stripped = stripped.str.replace(r"^([^-]+)-.+$", r"\1", regex=True)
        numeric = pd.to_numeric(stripped, errors="coerce").to_numpy(
            dtype=float, na_value=np.nan
        )

        # ---------------- ASSIGN TYPES ----------------
        # Later assignments win, matching the original precedence.
        types = np.full(len(text), "num", dtype=object)
        types[is_nogotea] = "nogotea"
        types[starts_lt] = "lt"
        types[starts_gt] = "gt"
        types[is_range] = "range"

        # Whatever is still 'num' but has no usable number is invalid.
        types[(types == "num") & ~np.isfinite(numeric)] = "invalid"

        return types, numeric, range_mins, range_maxs

    # ----------------------------------------------------------------------
    #                       LOG EXPANSION HELPER
    # ----------------------------------------------------------------------
    def _log_expansion(self, stage, col, limit_type, old_value, new_value, reason=""):
        """
        Append one expansion record to ``expansion_log`` and, when
        ``verbose`` is set, print it.

        Parameters
        ----------
        stage : str
            "fit" or "transform".
        col : hashable
            Column whose limit changed.
        limit_type : str
            "min" or "max".
        old_value, new_value : float
            Limit before and after the expansion.
        reason : str
            Free-text explanation stored with the record.
        """
        margin = float(self.margin)
        self.expansion_log.append({
            "stage": stage,
            "column": col,
            "limit_type": limit_type,
            "old_value": old_value,
            "new_value": new_value,
            "margin": margin,
            "reason": reason,
        })

        if self.verbose:
            margin_str = f" (margin={margin})" if margin != 0 else ""
            print(f"[{stage.upper()}] {col}: {limit_type.upper()} EXPANDED {old_value} → {new_value}{margin_str} {reason}")

    # ----------------------------------------------------------------------
    #
    #                                FIT
    #
    # ----------------------------------------------------------------------
    def fit(self, X, y=None):
        """
        Learn the global limits and the "No Gotea" flag of every column.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data; must contain the configured columns.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        cols = self._resolve_columns(X)
        margin = float(self.margin)

        # Every fit starts from a clean slate so that refitting does not
        # carry over columns or log entries from a previous dataset.
        self.col_global_min_ = {}
        self.col_global_max_ = {}
        self.col_has_expanded_min_ = {}
        self.col_has_expanded_max_ = {}
        self.col_is_nogotea_ = {}
        self.expansion_log = []

        if self.verbose:
            print("=" * 80)
            print("FITTING STAGE")
            print("=" * 80)

        for col in cols:
            types, nums, range_mins, range_maxs = self._parse_series(X[col])
            self.col_is_nogotea_[col] = bool((types == "nogotea").any())

            # ------------- collect every usable number of the column -------------
            finite = np.isfinite(nums)
            vals_lt = nums[(types == "lt") & finite]
            vals_gt = nums[(types == "gt") & finite]
            range_rows = types == "range"
            observed = np.concatenate([
                nums[(types == "num") & finite],
                vals_lt,
                vals_gt,
                range_mins[range_rows],
                range_maxs[range_rows],
            ])

            if observed.size:
                global_min = float(observed.min())
                global_max = float(observed.max())
            else:
                # Column without any usable number.
                global_min = 0.0
                global_max = 0.0
            expanded_min = False
            expanded_max = False

            # ------------- LOWER limit: "<x" means the value lies below x -------------
            if vals_lt.size:
                shown = sorted(set(vals_lt.tolist()))
                if margin > 0:
                    smallest = float(vals_lt.min())
                    proposed = smallest - self._margin_offset(smallest)
                    # Only ever widen: a proposal inside the observed data
                    # must not pull the limit inwards.
                    if proposed < global_min:
                        reason = f"due to < values (actual < {shown})"
                        self._log_expansion("fit", col, "min", global_min, proposed, reason)
                        global_min = proposed
                        expanded_min = True
                elif self.verbose and margin == 0:
                    print(f"[FIT] {col}: Found < values {shown} but margin=0, so no expansion")

            # ------------- UPPER limit: ">x" means the value lies above x -------------
            if vals_gt.size:
                shown = sorted(set(vals_gt.tolist()))
                if margin > 0:
                    largest = float(vals_gt.max())
                    proposed = largest + self._margin_offset(largest)
                    if proposed > global_max:
                        reason = f"due to > values (actual > {shown})"
                        self._log_expansion("fit", col, "max", global_max, proposed, reason)
                        global_max = proposed
                        expanded_max = True
                elif self.verbose and margin == 0:
                    print(f"[FIT] {col}: Found > values {shown} but margin=0, so no expansion")

            # ------------- store the fitted state of the column -------------
            self.col_global_min_[col] = float(global_min)
            self.col_global_max_[col] = float(global_max)
            self.col_has_expanded_min_[col] = expanded_min
            self.col_has_expanded_max_[col] = expanded_max

            if self.verbose:
                print(f"[FIT] {col}: Final limits = [{global_min}, {global_max}]")
                type_counts = pd.Series(types).value_counts()
                if len(type_counts) > 0:
                    print(f"[FIT] {col}: Type counts: {type_counts.to_dict()}")

        return self

    # ----------------------------------------------------------------------
    #
    #                              TRANSFORM
    #
    # ----------------------------------------------------------------------
    def transform(self, X):
        """
        Replace every configured column by its ``_min`` / ``_max`` columns
        (and ``_No_Gotea`` where applicable).

        Parameters
        ----------
        X : pandas.DataFrame
            Data to convert; it is not modified.

        Returns
        -------
        pandas.DataFrame
            The untouched columns of ``X`` followed by the generated columns,
            on the same index.

        Raises
        ------
        KeyError
            If a configured column was not seen during ``fit`` (including
            the case where ``fit`` was never called).
        """
        cols = self._resolve_columns(X)
        margin = float(self.margin)

        fitted_min = getattr(self, "col_global_min_", {})
        fitted_max = getattr(self, "col_global_max_", {})
        fitted_nogotea = getattr(self, "col_is_nogotea_", {})

        unseen = [c for c in cols if c not in fitted_min or c not in fitted_max]
        if unseen:
            raise KeyError(f"columns not seen during fit (call fit first): {unseen}")

        if self.verbose:
            print("\n" + "=" * 80)
            print("TRANSFORMING STAGE")
            print("=" * 80)

        out = {}
        pending_new_min = {}
        pending_new_max = {}
        n_rows = len(X)

        for col in cols:
            types, nums, range_mins, range_maxs = self._parse_series(X[col])

            global_min = fitted_min[col]
            global_max = fitted_max[col]
            is_nogotea_col = fitted_nogotea.get(col, False)

            # Rows that match none of the cases below stay NaN.
            col_min = np.full(n_rows, np.nan, dtype=float)
            col_max = np.full(n_rows, np.nan, dtype=float)
            col_ng = np.zeros(n_rows, dtype=int)

            # ---------------- No Gotea ----------------
            # Only honoured when the column had "No Gotea" during fit.
            if is_nogotea_col:
                ng_rows = types == "nogotea"
                col_min[ng_rows] = 0.0
                col_max[ng_rows] = 0.0
                col_ng[ng_rows] = 1

            # ---------------- Point values ----------------
            finite = np.isfinite(nums)
            num_rows = (types == "num") & finite
            col_min[num_rows] = nums[num_rows]
            col_max[num_rows] = nums[num_rows]

            # ---------------- Ranges ----------------
            range_rows = types == "range"
            col_min[range_rows] = range_mins[range_rows]
            col_max[range_rows] = range_maxs[range_rows]

            # ---------------- <x values ----------------
            lt_rows = (types == "lt") & finite
            if lt_rows.any():
                # NOTE(review): the row receives the whole fitted interval,
                # x itself is not used as its upper end - confirm intent.
                col_min[lt_rows] = global_min
                col_max[lt_rows] = global_max

                if margin > 0:
                    vals_lt = nums[lt_rows]
                    smallest = float(vals_lt.min())
                    proposed_min = smallest - self._margin_offset(smallest)
                    if proposed_min < global_min:
                        # Stored after the loop; affects later calls only.
                        pending_new_min[col] = proposed_min
                        reason = f"due to new < values (actual < {sorted(set(vals_lt.tolist()))}) in transform"
                        self._log_expansion("transform", col, "min", global_min, proposed_min, reason)

            # ---------------- >x values ----------------
            gt_rows = (types == "gt") & finite
            if gt_rows.any():
                # NOTE(review): same remark as for <x rows.
                col_min[gt_rows] = global_min
                col_max[gt_rows] = global_max

                if margin > 0:
                    vals_gt = nums[gt_rows]
                    largest = float(vals_gt.max())
                    proposed_max = largest + self._margin_offset(largest)
                    if proposed_max > global_max:
                        pending_new_max[col] = proposed_max
                        reason = f"due to new > values (actual > {sorted(set(vals_gt.tolist()))}) in transform"
                        self._log_expansion("transform", col, "max", global_max, proposed_max, reason)

            # ---------------- collect outputs ----------------
            out[f"{col}_min"] = col_min
            out[f"{col}_max"] = col_max
            if is_nogotea_col:
                out[f"{col}_No_Gotea"] = col_ng

        # ---------------- store limits widened by this batch ----------------
        for col, new_min in pending_new_min.items():
            self.col_global_min_[col] = float(new_min)
            self.col_has_expanded_min_[col] = True

        for col, new_max in pending_new_max.items():
            self.col_global_max_[col] = float(new_max)
            self.col_has_expanded_max_[col] = True

        # ---------------- assemble the full dataset ----------------
        untouched = X.drop(columns=cols)
        generated = pd.DataFrame(out, index=X.index)
        return pd.concat([untouched, generated], axis=1)

    # ----------------------------------------------------------------------
    #
    #                       EXPANSION SUMMARY PUBLIC
    #
    # ----------------------------------------------------------------------
    def get_expansion_summary(self):
        """Return a printable summary of every logged expansion."""
        if not self.expansion_log:
            return "No expansions occurred."

        lines = ["EXPANSION SUMMARY:", "=" * 80]
        for i, exp in enumerate(self.expansion_log, 1):
            lines.append(
                f"{i}. Stage: {exp['stage'].upper()}, "
                f"Column: {exp['column']}, "
                f"Limit: {exp['limit_type'].upper()}, "
                f"Old: {exp['old_value']} → New: {exp['new_value']}, "
                f"Margin: {exp['margin']}, "
                f"Reason: {exp['reason']}"
            )
        return "\n".join(lines) + "\n"

In [4]:

# Load ../data/datos_grasas_Tec.csv into df2, decoding the file as Latin-1.
df2 = pd.read_csv("../data/datos_grasas_Tec.csv", encoding="latin1")

In [7]:
# Show the distinct raw values stored in the "Punto de Gota, °C" column.
df2["Punto de Gota, °C"].unique()

array(['304', '300', '260', '280', '290', '295', 'No Gotea', '235', '110',
       '255', '270', '140', '190', '230', '250', '200', '> 230'],
      dtype=object)

In [8]:

import pandas as pd
import numpy as np
import os
os.chdir('..')

from sklearn.preprocessing import StandardScaler, OneHotEncoder,MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics.pairwise import cosine_similarity

import streamlit as st
import plotly.express as px

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scripts.Recomendador_coseno import reccoseno
from scripts.transformers_JuanPablo import CheckColumnNames,UnknownToZero,FixRanges
from scripts.transformer_Alfredo import FillNaNsWithCeros
from scripts.transformers_Demian import OneHotCodificador
from scripts.transformer_Gonzalo import VectorizarTexto

In [9]:
# Print the column names, non-null counts and dtypes of df2.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 41 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   idDatosGrasas                             51 non-null     int64  
 1   codigoGrasa                               51 non-null     object 
 2   Aceite Base                               47 non-null     object 
 3   Espesante                                 45 non-null     object 
 4   Grado NLGI Consistencia                   51 non-null     float64
 5   Viscosidad del Aceite Base a 40°C. cSt    50 non-null     float64
 6   Penetración de Cono a 25°C, 0.1mm         51 non-null     object 
 7   Punto de Gota, °C                         51 non-null     object 
 8   Estabilidad Mecánica, %                   48 non-null     object 
 9   Punto de Soldadura Cuatro Bolas, kgf      50 non-null     float64
 10  Desgaste Cuatro Bolas, mm               

In [10]:
# Columns handed to DefineLimits in the pipeline below.
range_columns = [
    "Punto de Gota, °C",
    "Estabilidad Mecánica, %",
    "Carga Timken Ok, lb",
    "Resistencia al Lavado por Agua a 80°C, %"
]


# Columns handed to OneHotCodificador in the pipeline below.
categorical_columns = ["Aceite Base","Espesante","Clasificacion ISO 6743-9","color","textura"]

# Preprocessing pipeline; the steps run in the order listed.
preprocessor = Pipeline(steps=[
    ("To have columns names needed", CheckColumnNames()),
    ("To change unkown data to zeros", UnknownToZero("Grado NLGI Consistencia")),
    ("To fix ranges and single values", FixRanges("Penetración de Cono a 25°C, 0.1mm")),

    # NEW TRANSFORMER HERE
    ("DefineLimits", DefineLimits(columns=range_columns, margin=0.0, verbose = True)),

    ("OneHot_categoricals", OneHotCodificador(columns=categorical_columns, drop_original=True, dtype=int)),
    ("To fill NaNs with zeros", FillNaNsWithCeros()),
    ("Vectorizar subtitulo", VectorizarTexto("subtitulo")),
    ("Vectorizar descripcion", VectorizarTexto("descripcion")),
    ("Vectorizar beneficios", VectorizarTexto("beneficios")),
    ("Vectorizar aplicaciones", VectorizarTexto("aplicaciones")),
    # MinMaxScaler is applied to the columns selected by slice(1, None), i.e.
    # every column except the first; no `remainder` is given to ColumnTransformer.
    ('MinMax', ColumnTransformer(transformers=[('MinMax', MinMaxScaler(), slice(1,None))]))
])


In [11]:
# Display the pipeline and the parameters of each of its steps.
preprocessor

0,1,2
,steps,"[('To have columns names needed', ...), ('To change unkown data to zeros', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,column,'Grado NLGI Consistencia'

0,1,2
,column,"'Penetración de Cono a 25°C, 0.1mm'"

0,1,2
,columns,"['Punto de Gota, °C', 'Estabilidad Mecánica, %', ...]"
,margin,0.0
,verbose,True

0,1,2
,columns,"['Aceite Base', 'Espesante', ...]"
,drop_original,True
,dtype,<class 'int'>
,prefix_sep,'_'

0,1,2
,column,'subtitulo'

0,1,2
,column,'descripcion'

0,1,2
,column,'beneficios'

0,1,2
,column,'aplicaciones'

0,1,2
,transformers,"[('MinMax', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [12]:
# Standalone DefineLimits instance over range_columns with margin=1 and verbose output.
DL = DefineLimits(columns=range_columns, margin=1, verbose=True)


In [13]:
# Flag the rows where at least one of the range columns does not parse as a
# number (unparseable entries and missing values both become NaN once coerced).
mask = (
    df2[range_columns]
    .apply(pd.to_numeric, errors='coerce')
    .isna()
    .any(axis=1)
)

# Keep only the flagged rows.
df_non_numeric = df2.loc[mask]

In [14]:
# Display the rows selected by the mask built above.
df_non_numeric

Unnamed: 0,idDatosGrasas,codigoGrasa,Aceite Base,Espesante,Grado NLGI Consistencia,Viscosidad del Aceite Base a 40°C. cSt,"Penetración de Cono a 25°C, 0.1mm","Punto de Gota, °C","Estabilidad Mecánica, %","Punto de Soldadura Cuatro Bolas, kgf",...,Factor de Velocidad,"Temperatura de Servicio °C, min","Temperatura de Servicio °C, max",categoria,subtitulo,descripcion,beneficios,aplicaciones,color,textura
2,3,Grasa_3,Mineral HT,Complejo Sulfonato de Calcio,2.0,460.0,265 - 295,300,10,500.0,...,,-10,150,Grasa,Grasa para lubricaciÃ³n de equipo pesado.,El producto es una grasa lubricante de gran ad...,Extremo soporte de carga.\n@Alto contenido de ...,LubricaciÃ³n centralizada de equipo pesado.\n@...,Negro,Adherente
3,4,Grasa_4,Mineral HT,Complejo Sulfonato de Calcio,2.0,220.0,266 - 295,300,5,500.0,...,,-15,150,Grasa,Grasa lubricante para condiciones extremas.,El producto es una grasa lubricante elaborada ...,Alta estabilidad tÃ©rmica y anti-oxidante.\n@E...,Rodamientos expuestos a alta temperatura y car...,Ambar,Suave
6,7,Grasa_7,Mineral HT,Complejo de Aluminio,1.0,3800.0,320,280,,500.0,...,,-5,120,Grasa,Grasa lubricante especial para cables.,El producto es una grasa especial para engrana...,Excelentes caracterÃ­sticas tÃ©rmicas.\n@Excel...,Se recomienda en la lubricaciÃ³n de engranajes...,Negro,Adherente
7,8,Grasa_8,Mineral HT,Complejo Sulfonato de Calcio,0.0,800.0,370,304,1.6,900.0,...,300000.0,-30,150,Grasa,Grasa para lubricaciÃ³n de equipo pesado.,El producto es una grasa lubricante de gran ad...,Extremo soporte de carga.\n@Alto contenido de ...,LubricaciÃ³n centralizada de equipo pesado.\n@...,Negro,Adherente
14,15,Grasa_15,Mineral,Complejo Sulfonato de Calcio,2.0,700.0,266 - 295,290,< 10,315.0,...,,-15,150,Grasa,Grasa lubricante para condiciones extremas.,El producto es una grasa lubricante elaborada ...,Alta estabilidad tÃ©rmica y anti-oxidante.\n@E...,Rodamientos expuestos a alta temperatura y car...,Beige,Adherente
15,16,Grasa_16,,,2.0,,270,295,5,,...,150000.0,-10,150,Grasa,Grasa lubricante sellante para vÃ¡lvulas de se...,El producto es una grasa de efecto sellante co...,Resistencia a gas natural y gas L.P.\n@Gran ca...,"Por su acciÃ³n deslizante, lubricante y sellan...",Ambar,Suave Adhesiva
17,18,Grasa_18,,Silica,2.0,680.0,280,No Gotea,8,800.0,...,,5,210,Grasa,Grasa lubricante para rodamientos de peletizad...,El producto es una grasa lubricante que posee ...,Alta estabilidad tÃ©rmica y antioxidante.\n@Ex...,Especialmente diseÃ±ada para la lubricaciÃ³n d...,Beige,Adherente
19,20,Grasa_20,Mineral HT,Complejo de Litio,1.2,400.0,290 - 310,235,,250.0,...,,-25,150,Grasa,Grasa adherente para mecanismos automotrices.,El producto Grasa Lubricante con gran adheren...,AdhesiÃ³n extrema a elementos mecÃ¡nicos.\n@Al...,Mecanismos expuestos a medio ambiente.\n@Rodam...,Beige Claro,Adherente
20,21,Grasa_21,Mineral,Complejo de Aluminio,0.0,220.0,460,110,,315.0,...,300000.0,-5,160,Grasa,Grasa especial para la industria alimenticia y...,El producto es una grasa lubricante para aplic...,El producto tiene excelente desempeÃ±o en la l...,El producto estÃ¡ especialmente diseÃ±ada para...,Blanco,Suave
28,29,Grasa_28,Mineral HT,Complejo Sulfonato de Calcio,0.0,3500.0,365,140,1.6,620.0,...,,-5,150,Grasa,Lubricante especial para engranes de palas mec...,El producto es un lubricante especial para sis...,Alta polaridad a la superficie de los engrane....,Es recomendado para la lubricaciÃ³n de engrane...,Negro,Viscosa adherente


In [15]:
# Fit DL on the subset of rows with non-numeric entries.
DL.fit(df_non_numeric)

FITTING STAGE
[FIT] Punto de Gota, °C: MAX EXPANDED 304.0 → 460.0 (margin=1.0) due to > values (actual > [230.0])
[FIT] Punto de Gota, °C: Final limits = [110.0, 460.0]
[FIT] Punto de Gota, °C: Type counts: {'num': 11, 'gt': 3, 'nogotea': 1}
[FIT] Estabilidad Mecánica, %: MIN EXPANDED 1.6 → 0.0 (margin=1.0) due to < values (actual < [10.0, 15.0])
[FIT] Estabilidad Mecánica, %: Final limits = [0.0, 15.0]
[FIT] Estabilidad Mecánica, %: Type counts: {'num': 8, 'lt': 4, 'invalid': 3}
[FIT] Carga Timken Ok, lb: MAX EXPANDED 70.0 → 80.0 (margin=1.0) due to > values (actual > [40.0])
[FIT] Carga Timken Ok, lb: Final limits = [40.0, 80.0]
[FIT] Carga Timken Ok, lb: Type counts: {'num': 7, 'invalid': 5, 'gt': 3}
[FIT] Resistencia al Lavado por Agua a 80°C, %: MIN EXPANDED 1.0 → 0.0 (margin=1.0) due to < values (actual < [3.0, 5.0])
[FIT] Resistencia al Lavado por Agua a 80°C, %: Final limits = [0.0, 5.0]
[FIT] Resistencia al Lavado por Agua a 80°C, %: Type counts: {'invalid': 7, 'num': 4, 'lt':

0,1,2
,columns,"['Punto de Gota, °C', 'Estabilidad Mecánica, %', ...]"
,margin,1.0
,verbose,True


In [16]:
# Transform the same subset with the limits learned by DL.fit.
final = DL.transform(df_non_numeric)


TRANSFORMING STAGE


In [20]:
# Print the expansions DL has logged so far.
summary = DL.get_expansion_summary()
print(summary)


EXPANSION SUMMARY:
1. Stage: FIT, Column: Punto de Gota, °C, Limit: MAX, Old: 304.0 → New: 460.0, Margin: 1.0, Reason: due to > values (actual > [230.0])
2. Stage: FIT, Column: Estabilidad Mecánica, %, Limit: MIN, Old: 1.6 → New: 0.0, Margin: 1.0, Reason: due to < values (actual < [10.0, 15.0])
3. Stage: FIT, Column: Carga Timken Ok, lb, Limit: MAX, Old: 70.0 → New: 80.0, Margin: 1.0, Reason: due to > values (actual > [40.0])
4. Stage: FIT, Column: Resistencia al Lavado por Agua a 80°C, %, Limit: MIN, Old: 1.0 → New: 0.0, Margin: 1.0, Reason: due to < values (actual < [3.0, 5.0])



In [None]:
# Fit the same DL instance again, this time on the full dataframe.
DL.fit(df2)

In [None]:
# Transform the full dataframe with DL.
df_transformed = DL.transform(df2)

In [None]:
# Show the last rows of the transformed dataframe.
df_transformed.tail()

In [None]:
# Minimal example: two one-column frames mixing "<x", ">x" and plain values.
import pandas as pd

# Frame used for fitting.
dfA = pd.DataFrame({
    "Punto": ["<10", ">50", "30"]
})

# Second frame with different one-sided values; it is not used in this cell.
dfB = pd.DataFrame({
    "Punto": [">60", "<5", "40"]
})

# Rebinds DL to a new instance with margin=0.0.
DL = DefineLimits(columns=["Punto"], margin=0.0, verbose=True)

DL.fit(dfA)

In [None]:

# Rebuild the preprocessing pipeline with new step instances; the steps run
# in the order listed.
preprocessor = Pipeline(steps=[
    ("To have columns names needed", CheckColumnNames()),
    ("To change unkown data to zeros", UnknownToZero("Grado NLGI Consistencia")),
    ("To fix ranges and single values", FixRanges("Penetración de Cono a 25°C, 0.1mm")),

    # NEW TRANSFORMER HERE
    ("DefineLimits", DefineLimits(columns=range_columns, margin=0.0, verbose = True)),

    ("OneHot_categoricals", OneHotCodificador(columns=categorical_columns, drop_original=True, dtype=int)),
    ("To fill NaNs with zeros", FillNaNsWithCeros()),
    ("Vectorizar subtitulo", VectorizarTexto("subtitulo")),
    ("Vectorizar descripcion", VectorizarTexto("descripcion")),
    ("Vectorizar beneficios", VectorizarTexto("beneficios")),
    ("Vectorizar aplicaciones", VectorizarTexto("aplicaciones")),
    # MinMaxScaler is applied to the columns selected by slice(1, None), i.e.
    # every column except the first; no `remainder` is given to ColumnTransformer.
    ('MinMax', ColumnTransformer(transformers=[('MinMax', MinMaxScaler(), slice(1,None))]))
])

In [None]:

# Run the pipeline one step at a time to find out which step fails.
# Each step must receive the output of the previous one, exactly as
# Pipeline.fit_transform chains them; feeding the raw dataframe to every
# step would exercise the later steps on data they never see in the pipeline.
X = df2
for name, step in preprocessor.steps:
    print(f"Running step: {name}")
    try:
        X = step.fit_transform(X)
    except Exception:
        print(f"\n❌ Error occurred in step: {name}")
        # Bare raise re-raises the active exception unchanged.
        raise

In [None]:
# Show the (rows, columns) shape of dfA.
dfA.shape