In [10]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

# Preparando os dados

In [122]:
columns = ['defect_type','defect_location', 'severity', 'inspection_method', 'repair_cost']
df = pd.read_csv(r'..\dataset\defects_data.csv')
df_selected = df[columns].copy()

In [123]:
encoders = {}

for column in columns:
    encoder = LabelEncoder()
    dtype = df_selected[column].dtype
    if dtype == 'O':
        df_selected[column] = encoder.fit_transform(df_selected[column])
        encoders[column] = encoder

mms = MinMaxScaler()
df_processed = mms.fit_transform(df_selected)
encoders['MinMaxScaler'] = mms

# Realizando treino e avaliando os modelos

In [143]:
X = df_processed[0::,0:4]
y = df_processed[0::, -1]

model_metrics = {}
models = [
    ExtraTreesRegressor(),
    RandomForestRegressor(),
    MLPRegressor(),
    LinearRegression()
]

for model in models:
    n_splits = 4
    kfold = KFold(n_splits=n_splits)

    for i, (train_index, test_index) in enumerate(kfold.split(X)):
        model_name = str(model)[0:-2:1]
        X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        test = encoders['MinMaxScaler'].inverse_transform(np.hstack([X_test, y_test.reshape(-1, 1)]))
        pred =  encoders['MinMaxScaler'].inverse_transform(np.hstack([X_test, y_pred.reshape(-1, 1)]))
        mse = mean_squared_error(test[::, -1], pred[::, -1])
        rmse = (mean_squared_error(test[::, -1], pred[::, -1]))**(1/2)
        model_metrics[model_name] = (mse, rmse)
    print(model_name)


for model, metrics in model_metrics.items():
    print('*-*'*5)
    print(f'{model}')
    print(f'MSE: {metrics[0]}')
    print(f'RMSE {metrics[1]}')

ExtraTreesRegressor
RandomForestRegressor
MLPRegressor
LinearRegression
*-**-**-**-**-*
ExtraTreesRegressor
MSE: 100756.92261934452
RMSE 317.422309580383
*-**-**-**-**-*
RandomForestRegressor
MSE: 100839.27937705551
RMSE 317.55201050702783
*-**-**-**-**-*
MLPRegressor
MSE: 91545.95190990674
RMSE 302.56561587514653
*-**-**-**-**-*
LinearRegression
MSE: 89035.995545486
RMSE 298.3890003761633


# Conclusão
Como ja era de se esperar todos os modelos obtiveram erros semelhantes e relativamente altos, isso é devido a caracterista aleatória dos dados, por mais que eles tenham um certo limite em que o custo de reparo esteja, ainda sim é difícil de se prever.
