In [1]:
import os

# The imports below (`scripts.*`) and the CSV paths (`data/...`) are resolved
# relative to the current working directory, so the kernel has to sit in the
# directory that contains `scripts/` (presumably the project root, one level
# above this notebook -- TODO confirm the repository layout).
#
# Only step up when `scripts/` is not already visible: an unconditional
# `os.chdir('..')` climbs one more level every time this cell is re-run (or
# when the kernel was started from the project root), which silently breaks
# every relative import and path that follows.
if not os.path.isdir("scripts"):
    os.chdir("..")

In [2]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from scripts.transformers_JuanPablo import CheckColumnNames,UnknownToZero,FixRanges
from scripts.transformer_Alfredo import FillNaNsWithCeros
from scripts.transformers_Demian import OneHotCodificador
from scripts.transformer_Gonzalo import VectorizarTexto

In [13]:
# Load the two raw grease datasets from CSV (paths are relative to the current
# working directory). `data` is used later to fit the preprocessing pipeline;
# `data_pdfs` is only transformed with the fitted pipeline.
#
# NOTE(review): the rendered outputs of the transform cells show mojibake such
# as "versiã³n" and "DeloÂ®", which suggests that at least the free-text fields
# are UTF-8 bytes being decoded as latin1. Confirm the real encoding of both
# files before changing it -- the column headers currently render correctly.
data = pd.read_csv(
    filepath_or_buffer="data/datos_grasas_Tec.csv",
    encoding="latin1",
)
data_pdfs = pd.read_csv(
    filepath_or_buffer="data/datos_grasas_pdfs.csv",
    encoding="latin1",
)

In [14]:
# Names of the categorical columns. The preprocessing pipeline passes this list
# to its one-hot encoding step, which is configured to drop the originals.
categorical_columns = [
    "Aceite Base", "Espesante",
    "Clasificacion ISO 6743-9",
    "color", "textura",
]

In [15]:
# Preprocessing pipeline. Steps run in the order listed; each step name is the
# key under which the step is registered, so the names are kept verbatim.
# The descriptions below paraphrase the step labels -- the transformers are
# project-local and their implementations are not visible in this notebook.
process = Pipeline(
    steps=[
        # Column-name check/normalisation (per its label).
        ("To have columns names needed", CheckColumnNames()),
        # Unknown values in the NLGI grade column are turned into zeros (per its label).
        ("To change unkown data to zeros", UnknownToZero("Grado NLGI Consistencia")),
        # Range / single-value clean-up of the cone penetration column (per its label).
        ("To fix ranges and single values", FixRanges("Penetración de Cono a 25°C, 0.1mm")),
        # One-hot encode the categorical columns and drop the source columns.
        (
            "OneHot_categoricals",
            OneHotCodificador(columns=categorical_columns, drop_original=True, dtype=int),
        ),
        # Replace remaining NaNs with zeros (per its label).
        ("To fill NaNs with zeros", FillNaNsWithCeros()),
        # One text-vectorisation step per free-text column, in this order.
        *(
            (f"Vectorizar {text_column}", VectorizarTexto(text_column))
            for text_column in ("subtitulo", "descripcion", "beneficios", "aplicaciones")
        ),
    ]
)

In [None]:
# Fit every pipeline step on the main dataset and transform it in one pass,
# then preview the first rows of the result.
X = process.fit_transform(data)
X.head()

Unnamed: 0,idDatosGrasas,codigoGrasa,Grado NLGI Consistencia,Viscosidad del Aceite Base a 40°C. cSt,"Punto de Gota, °C","Estabilidad Mecánica, %","Punto de Soldadura Cuatro Bolas, kgf","Desgaste Cuatro Bolas, mm",Indice de Carga-Desgaste,"Carga Timken Ok, lb",...,aplicaciones_tfidf_universal,aplicaciones_tfidf_usarse,aplicaciones_tfidf_uso,aplicaciones_tfidf_velocidad,aplicaciones_tfidf_velocidades,aplicaciones_tfidf_versiã³n,aplicaciones_tfidf_vibraciones,aplicaciones_tfidf_vibraciã³n,aplicaciones_tfidf_vã,aplicaciones_tfidf_ã³ptima
0,1,Grasa_1,2.0,680.0,304,1.07,500.0,0.0,0.0,60,...,0.0,0.186297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222519
1,2,Grasa_2,1.5,460.0,304,2.6,900.0,0.5,166.0,70,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.220561,0.0,0.0
2,3,Grasa_3,2.0,460.0,300,10.0,500.0,0.48,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.220561,0.0,0.0
3,4,Grasa_4,2.0,220.0,300,5.0,500.0,0.45,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Grasa_5,1.5,680.0,260,6.5,620.0,0.48,0.0,50,...,0.0,0.0,0.0,0.0,0.198876,0.0,0.0,0.131806,0.0,0.0


In [17]:
# Apply the pipeline to the PDF-derived dataset with `transform` only (no
# refit), so it reuses whatever was learned when `process` was fitted on `data`.
# NOTE(review): `Y` holds a transformed feature table here, not a target vector.
Y = process.transform(data_pdfs)
Y.head()

Unnamed: 0,idDatosGrasas,codigoGrasa,Grado NLGI Consistencia,Viscosidad del Aceite Base a 40°C. cSt,"Punto de Gota, °C","Estabilidad Mecánica, %","Punto de Soldadura Cuatro Bolas, kgf","Desgaste Cuatro Bolas, mm",Indice de Carga-Desgaste,"Carga Timken Ok, lb",...,aplicaciones_tfidf_universal,aplicaciones_tfidf_usarse,aplicaciones_tfidf_uso,aplicaciones_tfidf_velocidad,aplicaciones_tfidf_velocidades,aplicaciones_tfidf_versiã³n,aplicaciones_tfidf_vibraciones,aplicaciones_tfidf_vibraciã³n,aplicaciones_tfidf_vã,aplicaciones_tfidf_ã³ptima
0,52,ANDEROL FGCS-2,2.0,95.0,318,0.0,407.8,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,53,DeloÂ® Grease EP 00 235212,0.0,226.0,0,0.0,315.0,0.45,0.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,54,DeloÂ® Grease EP 0 235211,0.0,226.0,235,0.0,315.0,0.45,0.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,55,DeloÂ® Grease EP 1 235209,1.0,226.0,245,0.0,315.0,0.45,0.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,56,DeloÂ® Grease EP 2 235208,2.0,226.0,255,0.0,315.0,0.45,0.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
