In [98]:
import pandas as pd
import numpy as np
from rich import print
import dill

In [99]:
# Load the full structured dataset into `data`; the relative path is resolved
# against the notebook's current working directory.
data = pd.read_csv('../../data/structured/all_data.csv')
# train = pd.read_csv('../../data/structured/carbon_train.csv')

In [100]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,level,context,message,hora,minuto,segundo
0,ERROR,API_LOGGER._OpenService_,Error while building message. Error while buil...,19,37,10
1,ERROR,API_LOGGER._OpenService_,Error while building message. Error while buil...,19,37,12
2,ERROR,API_LOGGER._OpenService_,Error while building message. Error while buil...,20,12,12
3,ERROR,API_LOGGER._OpenService_,Error while building message. Error while buil...,19,46,13
4,ERROR,java.lang.Class,Access Denied. Failed authorization attempt to...,12,27,22


In [101]:
# Inspect column dtypes; the column grouping in the next cell is driven by them.
data.dtypes

level      object
context    object
message    object
hora        int64
minuto      int64
segundo     int64
dtype: object

In [102]:
# Split the columns of `data` into the three groups the transformer handles.
text_columns = ['message']
numerics_columns = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
# Every object-dtype column that is not a text column is treated as categorical.
# Filtering against `text_columns` (instead of calling .remove('message')) keeps
# the text column name in a single place and does not raise ValueError when that
# column is not among the object-dtype columns.
categorical_columns = [
    column
    for column in data.select_dtypes(include=['object']).columns
    if column not in text_columns
]

In [103]:
# Show each column group next to its label (`print` is the one imported from rich).
for _label, _columns in (
    ("Numerics columns:", numerics_columns),
    ("Text columns:", text_columns),
    ("Categorical columns:", categorical_columns),
):
    print(_label, _columns)

In [104]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

In [105]:
class TransformerData(BaseEstimator, TransformerMixin):
    """Turn a mixed-type DataFrame into a single dense numeric feature frame.

    Three column groups are processed independently and concatenated
    column-wise, in this order:

    1. numeric columns      -> ``MinMaxScaler``
    2. one text column      -> ``TfidfVectorizer`` followed by ``PCA``
    3. categorical columns  -> ``OneHotEncoder``

    All sub-transformers, including the PCA, are fitted in ``fit`` only.
    ``transform`` never re-fits anything, so every dataset passed to it is
    projected into the same feature space.

    Parameters
    ----------
    numerics_columns : list of str
        Columns scaled with ``MinMaxScaler``.
    text_column : str or list containing one str
        Column holding the free text. A plain string or a one-element list
        are both accepted.
    categorical_columns : list of str
        Columns one-hot encoded.
    n_components : int, default=100
        Number of PCA components kept from the TF-IDF matrix.
    """

    def __init__(self, numerics_columns, text_column, categorical_columns, n_components=100):
        self.n_components = n_components
        self.numerics_columns = numerics_columns
        self.text_column = text_column  # a string (e.g. 'message') or a one-element list
        self.categorical_columns = categorical_columns
        self.numerics_transformer = MinMaxScaler()
        self.text_transformer = TfidfVectorizer()
        self.categorical_transformer = OneHotEncoder(handle_unknown='infrequent_if_exist', sparse_output=False)
        self.pca_transformer = PCA(n_components=n_components, random_state=42)

    def _text_series(self, X):
        """Return the text column of ``X`` as a Series of ``str``.

        Unlike ``.squeeze()``, this keeps a Series even when ``X`` has a
        single row (``squeeze`` would collapse a 1x1 frame to a scalar).

        Raises
        ------
        ValueError
            If ``text_column`` selects anything other than exactly one column.
        """
        text = X[self.text_column]
        if isinstance(text, pd.DataFrame):
            if text.shape[1] != 1:
                raise ValueError(
                    "text_column must select exactly one column, got %d" % text.shape[1]
                )
            text = text.iloc[:, 0]
        return text.astype(str)

    def fit(self, X, y=None):
        """Fit every sub-transformer on ``X`` and return ``self``.

        ``y`` is accepted for scikit-learn API compatibility and ignored.
        """
        self.numerics_transformer.fit(X[self.numerics_columns])
        # Learn the TF-IDF vocabulary and, from the resulting matrix, the PCA
        # projection. Fitting the PCA here (not in transform) is what keeps
        # the projection identical across every dataset transformed later.
        tfidf = self.text_transformer.fit_transform(self._text_series(X))
        self.pca_transformer.fit(tfidf)
        self.categorical_transformer.fit(X[self.categorical_columns])
        return self

    def transform(self, X):
        """Apply the fitted sub-transformers to ``X``.

        Returns
        -------
        pandas.DataFrame
            Dense frame indexed like ``X`` whose columns are, in order, the
            scaled numerics, the PCA components of the TF-IDF text and the
            one-hot encoded categoricals.
        """
        numerics_transformed = self.numerics_transformer.transform(X[self.numerics_columns])
        text_transformed = self.text_transformer.transform(self._text_series(X))
        # Project only: the PCA was fitted in fit() and must not be re-fitted here.
        transformed_pca = self.pca_transformer.transform(text_transformed)
        categorical_transformed = self.categorical_transformer.transform(X[self.categorical_columns])

        # All three parts are dense at this point, so they can be stacked column-wise.
        transformed_data = np.hstack((
            numerics_transformed,
            transformed_pca,
            categorical_transformed
        ))

        return pd.DataFrame(transformed_data, index=X.index)


In [106]:
# Build the transformer from the column groups selected earlier and fit it on
# the full dataset.
transform = TransformerData(n_components=100,numerics_columns=numerics_columns, text_column=text_columns, categorical_columns=categorical_columns)
transform.fit(data)
# Transform the full dataset once; `trns` is not used again within this cell.
trns = transform.transform(data)
# Serialize the transformer object to disk with dill.
with open('../../data/files/transformer.dill', 'wb') as f:
    dill.dump(transform, f)
# Transform the training CSV and write it out. `to_csv` is called with its
# default index handling here, so the DataFrame index becomes the first column.
data_trains = pd.read_csv('../../data/structured/carbon_train.csv')
data_trains =  transform.transform(data_trains)
data_trains.to_csv('../../data/processed/train.csv')
# Transform the test CSV and write it out without the index.
# NOTE(review): train.csv keeps the index column while test.csv drops it —
# confirm this asymmetry is intended by whatever reads the two files.
data_test = pd.read_csv('../../data/test/carbon_test.csv')
data_test = transform.transform(data_test)
data_test.to_csv('../../data/processed/test.csv', index=False)