In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# v 1.1 - Cross-Validation = 34.11 / Previsão Final = 40.20

# Load the two CSV inputs in order: the training set first, then the set to
# predict on. Both names are bound at module level for the pipeline below.
data_treino, data_prev = (
    pd.read_csv(caminho_csv)
    for caminho_csv in (
        '/home/caio/github/Aurora/config/csv_create/train.csv',
        '/home/caio/github/Aurora/config/csv_create/test.csv',
    )
)

class Aurora:
    """Price-regression pipeline: split, encode, train, predict and save.

    Instantiating the class runs the whole pipeline as a side effect: the
    training frame is split into target/features, categorical columns are
    encoded with the fixed lookup tables below, missing numeric values are
    filled with 0, an ``XGBRegressor`` is cross-validated and fitted, and the
    predictions for the second frame are written to a CSV file.
    """

    # Default destination of the CSV written by ``impressao``.
    CAMINHO_SAIDA = '/home/caio/github/Aurora/config/csv_previsao/aurora_v1_2.csv'

    # Fitted estimator. Declared at class level so ``previsao`` can detect an
    # untrained instance instead of failing with AttributeError.
    model = None

    # Nominal (unordered) features: category -> indicator vector.
    # A missing value (NaN) maps to the all-zero vector.
    brand_remap = {
        'Jansport': [0, 0, 0, 0, 1],
        'Under Armour': [0, 0, 0, 1, 0],
        'Nike': [0, 0, 1, 0, 0],
        'Adidas': [0, 1, 0, 0, 0],
        'Puma': [1, 0, 0, 0, 0],
        np.nan: [0, 0, 0, 0, 0],
    }

    material_remap = {
        'Leather': [0, 0, 0, 1],
        'Canvas': [0, 0, 1, 0],
        'Nylon': [0, 1, 0, 0],
        'Polyester': [1, 0, 0, 0],
        np.nan: [0, 0, 0, 0],
    }

    style_remap = {
        'Tote': [0, 0, 1],
        'Messenger': [0, 1, 0],
        'Backpack': [1, 0, 0],
        np.nan: [0, 0, 0],
    }

    color_remap = {
        'Black': [0, 0, 0, 0, 0, 1],
        'Green': [0, 0, 0, 0, 1, 0],
        'Red': [0, 0, 0, 1, 0, 0],
        'Blue': [0, 0, 1, 0, 0, 0],
        'Gray': [0, 1, 0, 0, 0, 0],
        'Pink': [1, 0, 0, 0, 0, 0],
        np.nan: [0, 0, 0, 0, 0, 0],
    }

    # Features encoded as a single integer code; a missing value maps to 0.
    size_remap = {
        'Small': 1,
        'Medium': 2,
        'Large': 3,
        np.nan: 0,
    }

    laptop_remap = {
        'Yes': 2,
        'No': 1,
        np.nan: 0,
    }

    waterproof_remap = {
        'Yes': 2,
        'No': 1,
        np.nan: 0,
    }

    def __init__(self, data_treino, data_previsao):
        """Run the full pipeline on the given frames.

        Args:
            data_treino: Training DataFrame; must contain a ``Price`` column
                plus the categorical columns that ``remap`` expects.
            data_previsao: DataFrame to predict on; must contain an ``id``
                column (used by ``impressao``) and the same feature columns.
        """
        self.data_treino = data_treino
        self.data_previsao = data_previsao
        (
            self.target,
            self.feature_categorical,
            self.feature_numerical,
            self.previsao_categorical,
            self.previsao_numerical,
        ) = self.splitter(data_treino, data_previsao)

        # Categorical handling (``one_hot`` is available as an alternative
        # encoder, but the fixed lookup tables are the ones in use).
        self.feature_categorical_encoded = self.remap(self.feature_categorical)
        self.previsao_categorical_encoded = self.remap(self.previsao_categorical)

        # Numeric handling.
        self.feature_numerical_encoded = self.missing_numerical(self.feature_numerical)
        self.previsao_numerical_encoded = self.missing_numerical(self.previsao_numerical)

        # Join encoded categorical data with the numeric data.
        self.feature_completed = self.fusion(self.feature_categorical_encoded, self.feature_numerical_encoded)
        self.previsao_completed = self.fusion(self.previsao_categorical_encoded, self.previsao_numerical_encoded)

        # Train the model.
        self.treino_final = self.treino(self.feature_completed, self.target)

        # Predict.
        self.prev = self.previsao(self.previsao_completed)

        # Save the predictions to disk.
        self.impressao(self.prev)

        print('Aurora Finalizada Com Sucesso!')

    def splitter(self, data_treino, data_previsao):
        """Split both frames into categorical and numeric parts.

        Args:
            data_treino: Training DataFrame containing a ``Price`` column.
            data_previsao: DataFrame to predict on.

        Returns:
            Tuple ``(target, feature_categorical, feature_numerical,
            previsao_categorical, previsao_numerical)`` where ``target`` is
            the ``Price`` column and the others are dtype-based column
            selections (``object``/``category`` vs ``int64``/``float64``).

        Raises:
            KeyError: If ``data_treino`` has no ``Price`` column.
        """
        feature = data_treino.drop(columns=['Price'])
        target = data_treino['Price']
        feature_categorical = feature.select_dtypes(include=['object', 'category'])
        feature_numerical = feature.select_dtypes(include=['int64', 'float64'])
        previsao_categorical = data_previsao.select_dtypes(include=['object', 'category'])
        previsao_numerical = data_previsao.select_dtypes(include=['int64', 'float64'])

        return target, feature_categorical, feature_numerical, previsao_categorical, previsao_numerical

    def missing_numerical(self, data):
        """Return a copy of ``data`` with every missing value replaced by 0."""
        return data.fillna(0)

    def remap(self, data):
        """Encode the categorical columns with the class lookup tables.

        ``Brand``, ``Material``, ``Style`` and ``Color`` are expanded into
        indicator columns named ``<Column>_<k>`` (k starting at 1) and the
        source columns are removed. ``Size`` and ``Laptop Compartment`` are
        replaced in place by their integer codes. Categories absent from a
        lookup table become NaN.

        The input frame is not modified; a new DataFrame is returned.

        Args:
            data: DataFrame holding the seven categorical columns named
                above plus ``Waterproof``.

        Returns:
            A new, encoded DataFrame.

        Raises:
            KeyError: If one of the expected columns is missing.
        """
        # Work on a copy so the caller's frame stays intact and the method
        # can be called again on the same input.
        data = data.copy()

        # Nominal features: one indicator column per vector position.
        one_hot_groups = (
            ('Brand', self.brand_remap),
            ('Material', self.material_remap),
            ('Style', self.style_remap),
            ('Color', self.color_remap),
        )
        for source, mapping in one_hot_groups:
            width = len(next(iter(mapping.values())))
            for position in range(width):
                # A scalar lookup per output column avoids building one
                # pandas Series per row and also works on empty frames.
                position_map = {
                    category: vector[position]
                    for category, vector in mapping.items()
                }
                data[f'{source}_{position + 1}'] = data[source].map(position_map)

        # Features kept as a single integer-coded column.
        data['Size'] = data['Size'].map(self.size_remap)
        data['Laptop Compartment'] = data['Laptop Compartment'].map(self.laptop_remap)

        # Drop the source columns of the indicator groups.
        # NOTE(review): 'Waterproof' is dropped as well, so it never reaches
        # the model even though ``waterproof_remap`` exists. Kept as-is to
        # preserve the current feature set -- confirm whether this is an
        # intentional ablation or an oversight.
        data = data.drop(columns=['Brand', 'Material', 'Style', 'Color', 'Waterproof'])

        return data

    def one_hot(self, data):
        """Return ``data`` one-hot encoded with ``pandas.get_dummies``."""
        return pd.get_dummies(data)

    def fusion(self, data1, data2):
        """Left-join ``data2`` onto ``data1`` by index and return the result."""
        return data1.join(data2, how='left')

    def treino(self, feature, target):
        """Cross-validate and fit an ``XGBRegressor``.

        Prints the mean absolute error averaged over 5 cross-validation
        folds, fits the model on the full data and stores it in
        ``self.model``.

        Args:
            feature: Feature matrix used for training.
            target: Target values aligned with ``feature``.

        Returns:
            The fitted model (same object as ``self.model``).
        """
        self.model = XGBRegressor()
        # The scorer returns negated MAE; flip the sign to report plain MAE.
        scores = -1 * cross_val_score(
            self.model,
            feature,
            target,
            cv=5,
            scoring='neg_mean_absolute_error',
        )

        self.model.fit(feature, target)

        print("Average MAE score:")
        print(scores.mean())

        return self.model

    def previsao(self, data):
        """Predict with the trained model.

        Args:
            data: Feature matrix with the same columns used in training.

        Returns:
            The predictions produced by ``self.model.predict``.

        Raises:
            ValueError: If no model has been trained yet.
        """
        if self.model is None:
            raise ValueError("O Modelo Não Foi Treinado!")

        return self.model.predict(data)

    def impressao(self, previsao, caminho=None):
        """Write the predictions next to their ids as a CSV file.

        Args:
            previsao: Predicted values, one per row of ``self.data_previsao``.
            caminho: Destination path or writable text buffer. Defaults to
                ``CAMINHO_SAIDA`` when omitted.
        """
        if caminho is None:
            caminho = self.CAMINHO_SAIDA

        resultado = pd.DataFrame({'id': self.data_previsao['id'], 'Price_Prediction': previsao})
        resultado.to_csv(caminho, index=False)
        print("Previsão salva com sucesso!")


# Build the Aurora pipeline; construction itself runs encoding, training,
# prediction and the CSV export.
teste = Aurora(
    data_treino=data_treino,
    data_previsao=data_prev,
)


Average MAE score:
34.102365226920334
Previsão salva com sucesso!
Aurora Finalizada Com Sucesso!
