In [64]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default styling to the plots produced later in the notebook.
sns.set()

In [65]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, PowerTransformer
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

In [66]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor

In [67]:
# Read the raw diamonds table and cast the regression target to float in one chained step.
# NOTE(review): the commented-out cast below suggests the CSV also carries an
# "Unnamed: 0" index column -- confirm whether it should be loaded as the index instead.
df_diamonds = pd.read_csv('datasets/diamonds.csv').astype({"price": float})
#df_diamonds["Unnamed: 0"] = df_diamonds["Unnamed: 0"].astype(float)

In [68]:
#outliers
# Optional outlier filtering, currently disabled. If re-enabled, the first three lines
# would keep only rows with y < 11 and 1 < z < 11; the last one would drop the
# x, y and z columns altogether.
#df_diamonds = df_diamonds[df_diamonds.y < 11]
#df_diamonds = df_diamonds[df_diamonds.z < 11]
#df_diamonds = df_diamonds[df_diamonds.z > 1]
#df_diamonds.drop(['x','y','z'], axis=1, inplace=True)

In [69]:
# Separate the regression target ("price") from the predictor columns.
y = df_diamonds['price']
X = df_diamonds.drop(columns='price')

In [70]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [71]:
# Grade scale for each categorical attribute, keyed by column name.
# NOTE(review): nothing in the visible cells reads `order`, and each scale here runs in
# the opposite direction to the lists in `categorias` that feed the OrdinalEncoder --
# confirm whether this dict is still needed.
order = dict(
    cut=["Ideal", "Premium", "Very Good", "Good", "Fair"],
    color=list("DEFGHIJ"),
    clarity=["IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1"],
)

In [72]:
# Numeric features: only float columns are selected. Any column that is neither float
# nor object is not listed in the ColumnTransformer below and is therefore removed by
# its default remainder='drop'.
atributos_numericos = X_train.select_dtypes('float').columns.tolist()
transformer_numerico = StandardScaler()
#transformer_numerico = MinMaxScaler()

# Categorical features: every object-dtype column.
atributos_categoricos = X_train.select_dtypes('O').columns.tolist()
#transformer_categorico = OneHotEncoder(handle_unknown='ignore')

# Grade scale per column, in the order the encoder should assign the codes 0..n-1.
ordem_categorias = {
    "cut": ["Fair", "Good", "Very Good", "Premium", "Ideal"],
    "color": ["J", "I", "H", "G", "F", "E", "D"],
    "clarity": ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"],
}
# OrdinalEncoder pairs categories[i] with the i-th column it receives, so the list is
# built by column name rather than relying on the DataFrame's column order matching a
# positional literal. An object column without a declared scale fails here with a
# KeyError instead of being encoded against the wrong scale.
categorias = [ordem_categorias[coluna] for coluna in atributos_categoricos]

transformer_categorico = OrdinalEncoder(categories=categorias)

#selecao_atributos = SelectFromModel(DecisionTreeRegressor(random_state=42),threshold=0.001, max_features=7)

# Scale the numeric columns and ordinal-encode the categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer_numerico, atributos_numericos),
        ('cat', transformer_categorico, atributos_categoricos)])


pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
#pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('selecao_atributos', selecao_atributos)])

In [73]:
# Fit the preprocessing steps (scaler statistics, category encodings) on the training split only.
pipeline = pipeline.fit(X_train, y_train)

In [74]:
# Shape of the raw training features, before preprocessing.
X_train.shape

(43152, 10)

In [75]:
# Overwrites X_train with its transformed version, so this cell is not idempotent:
# running it a second time would feed already-transformed data back into the pipeline.
# Re-run the train/test split cell before executing it again.
X_train = pipeline.transform(X_train)

In [76]:
# Shape after preprocessing. Columns that were not listed in the ColumnTransformer are
# dropped by it, so the column count can be smaller than in the raw frame.
X_train.shape

(43152, 9)

In [77]:
# Apply the transformation fitted on the training split to the held-out rows (no refit).
# Like the X_train cell, this overwrites its input and must not be run twice in a row.
X_test = pipeline.transform(X_test)

In [78]:
# Positional bundle consumed by run_model:
# index 0 = train features, 1 = test features, 2 = train target, 3 = test target, 4 = label.
dataset_1 = (X_train, X_test, y_train, y_test, 'dataset_1')

In [79]:
# Eight independent, initially empty result lists, one per recorded quantity; they are
# the targets of the append calls at the end of run_model.
(model_name, model_, cv_score_test, cv_score_train,
 mse_, mae_, rmse_, r2_) = ([] for _ in range(8))

In [80]:
# Metrics handed to cross_validate. Each key becomes a "test_<key>" / "train_<key>" entry
# in the returned scores; the "neg_" scorers are negated errors (higher is better).
scoring = dict(
    r_2='r2',
    mae='neg_mean_absolute_error',
    rmse='neg_root_mean_squared_error',
)

In [81]:
def run_model(model, dataset, modelname):
    """Cross-validate ``model``, score it on the hold-out split and record the results.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``. It is refit in place on the whole
        training split after cross-validation, and that fitted object is stored.
    dataset : sequence
        Positional bundle ``(X_train, X_test, y_train, y_test, ...)``; only the first
        four entries are read.
    modelname : str
        Label used in the printed report and stored alongside the metrics.

    Returns
    -------
    int
        Always ``0`` (kept so existing callers see the same return value). The results
        are appended to the module-level lists ``model_name``, ``model_``,
        ``cv_score_test``, ``cv_score_train``, ``mse_``, ``mae_``, ``rmse_`` and ``r2_``.

    Notes
    -----
    Previously an early ``return 0`` made the bookkeeping unreachable (and that dead code
    referenced an undefined name), so the result lists were never filled.
    """
    X_fit, X_holdout = dataset[0], dataset[1]
    y_fit, y_holdout = dataset[2], dataset[3]

    #model = TransformedTargetRegressor(regressor=model, func=np.log, inverse_func=np.exp)

    # The per-fold fitted estimators are never used, so they are not requested back
    # from the workers (return_estimator is left at its default).
    scores = cross_validate(estimator=model,
                            X=X_fit,
                            y=y_fit,
                            scoring=scoring,
                            return_train_score=True,
                            cv=10, verbose=1, n_jobs=-1)

    cv_r2_test = np.mean(scores['test_r_2'])
    cv_r2_train = np.mean(scores['train_r_2'])

    print(f"#### {modelname} ####")
    print('')
    print('CV-R2    : %0.4f' % cv_r2_test)
    # The error scorers are negated by scikit-learn; flip the sign for reporting.
    print('CV-MAE   : %0.4f' % np.mean(-scores['test_mae']))
    print('CV-RMSE  : %0.4f' % np.mean(-scores['test_rmse']))

    # Raw per-fold values (the error metrics are still negated here).
    print(scores['test_r_2'])
    print(scores['test_mae'])
    print(scores['test_rmse'])

    # Refit on the full training split, then evaluate once on the hold-out split.
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_holdout)

    mse = mean_squared_error(y_holdout, y_pred)
    mae = mean_absolute_error(y_holdout, y_pred)
    rmse = mse ** 0.5  # reuse the MSE instead of computing it a second time
    r2 = r2_score(y_holdout, y_pred)

    print('')
    #print('MSE     : %0.4f ' % mse)
    print('TEST-R2    : %0.4f ' % r2)
    print('TEST-MAE   : %0.4f ' % mae)
    print('TEST-RMSE  : %0.4f ' % rmse)

    # Record the run: mean cross-validated R2 (test and train folds) plus hold-out metrics.
    model_name.append(modelname)
    model_.append(model)
    cv_score_test.append(cv_r2_test)
    cv_score_train.append(cv_r2_train)
    mse_.append(mse)
    mae_.append(mae)
    rmse_.append(rmse)
    r2_.append(r2)

    return 0

In [87]:
from xgboost import XGBRegressor
# Candidate regressors keyed by display name; commented entries are currently disabled.
# The active estimators are seeded (same seed as the train/test split) so that a
# Restart & Run All produces the same metrics.
# NOTE(review): the disabled Lasso/Ridge entries pass `normalize=True`; check that the
# installed scikit-learn version still accepts that argument before re-enabling them.
model_dict={#'LinearRegression': LinearRegression(), 
            #'LassoRegression': Lasso(normalize=True),
            #'RidgeRegression': Ridge(normalize=True),
            #'DecisionTreeRegressor': DecisionTreeRegressor(),
            #'GradientBoostingRegressor': GradientBoostingRegressor(),
            'XGBRegressor': XGBRegressor(random_state=1),
            'RandomForestRegressor': RandomForestRegressor(random_state=1),
            #'KNeighborsRegressor': KNeighborsRegressor(),
            #'MLPRegressor': MLPRegressor(hidden_layer_sizes=(200, 100, 50), batch_size=50, learning_rate_init=0.001, learning_rate="adaptive", max_iter=700, verbose=True)
           }

In [88]:
# Evaluate every enabled estimator on the same prepared dataset.
for nome_modelo, estimador in model_dict.items():
    run_model(estimador, dataset_1, nome_modelo)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  6.1min finished


#### XGBRegressor ####

CV-R2    : 0.9811
CV-MAE   : 279.6658
CV-RMSE  : 549.1038
[0.98042584 0.98242163 0.98082933 0.97979402 0.98115284 0.98179986
 0.98002647 0.98131422 0.98306997 0.98031772]
[-266.64376848 -273.9706958  -280.83687528 -291.62142789 -283.44069314
 -287.85279388 -269.4999746  -278.13463623 -282.54513816 -282.11233354]
[-537.85023014 -523.78653688 -548.00430722 -578.49354318 -558.77592848
 -559.05594881 -538.55330309 -550.28389683 -534.13434324 -562.10010151]

TEST-R2    : 0.9824 
TEST-MAE   : 272.1734 
TEST-RMSE  : 523.0895 


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.4min finished


#### RandomForestRegressor ####

CV-R2    : 0.9811
CV-MAE   : 271.2239
CV-RMSE  : 549.4987
[0.98077504 0.98128271 0.98113321 0.97984357 0.98299078 0.98151405
 0.97958925 0.98079019 0.98317416 0.97969455]
[-256.12455064 -270.99881764 -271.31563677 -285.56930063 -267.8658816
 -280.26934277 -259.0494674  -269.80821472 -272.94261246 -278.29544704]
[-533.03098956 -540.48863139 -543.64364103 -577.78384636 -530.83169818
 -563.42845535 -544.41588817 -557.94668703 -532.48826244 -570.92919362]

TEST-R2    : 0.9817 
TEST-MAE   : 263.7529 
TEST-RMSE  : 532.6543 


### NECESSÁRIO ENTENDER SE FAZ SENTIDO USAR LOG