In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

In [3]:
df_diamonds = pd.read_csv('datasets/diamonds.csv')
df_diamonds.drop(columns={"Unnamed: 0"}, inplace=True)
df_diamonds["price"] = df_diamonds["price"].astype(float)

In [4]:
X = df_diamonds.drop('price', axis=1)
X['carat'] = np.log2(X['carat'])
y = np.log2(df_diamonds['price'])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X , y , test_size=0.2, random_state=1)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [7]:
order = {
    "cut": ["Ideal","Premium", "Very Good", "Good", "Fair"],
    "color": ["D","E","F","G","H","I","J"],
    "clarity": ["IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1"]
}

In [8]:
atributos_numericos = X_train.select_dtypes('float').columns.tolist()
transformer_numerico = StandardScaler()

atributos_categoricos = X_train.select_dtypes('O').columns.tolist()
#transformer_categorico = OneHotEncoder(handle_unknown='ignore')
categorias = [["Fair","Good", "Very Good", "Premium", "Ideal"],
              ["J","I","H","G","F","E","D"],
              ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]]

transformer_categorico = OrdinalEncoder()

selecao_atributos = SelectFromModel(DecisionTreeRegressor(random_state=42),threshold=0.001, max_features=9)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer_numerico, atributos_numericos),
        ('cat', transformer_categorico, atributos_categoricos)])


pipeline = Pipeline(steps=[('preprocessor', preprocessor)])#, ('selecao_atributos', selecao_atributos)])

In [9]:
seletor = pipeline.fit(X_train, y_train)

In [10]:
X_train = pipeline.transform(X_train)

In [11]:
X_test = pipeline.transform(X_test)

In [12]:
dataset_1 = (X_train, X_test, y_train, y_test, 'dataset_1')

In [13]:
model_name=[]
model_=[]
cv_score_test=[]
cv_score_train=[]
mse_=[]
mae_=[]
rmse_=[]
r2_=[]

In [14]:
def run_model(model, dataset, modelname):
    model.fit(dataset[0], dataset[2])
    accuracies=cross_val_score(estimator=model, X=dataset[0], y=dataset[2], cv=5, verbose=1, n_jobs=-1)
    y_pred=model.predict(dataset[1])
    print('')
    score_1=model.score(dataset[1], dataset[3])
    print(f'#### {modelname} ####')
    print("score :%.4f" %score_1)
    print(accuracies)
    
    
    mse=mean_squared_error(dataset[3], y_pred)
    mae=mean_absolute_error(dataset[3], y_pred)
    rmse=mean_squared_error(dataset[3], y_pred)**0.5
    r2=r2_score(dataset[3], y_pred)
    
    print('')
    print('MSE    : %0.2f ' % mse)
    print('MAE    : %0.2f ' % mae)
    print('RMSE   : %0.2f ' % rmse)
    print('R2     : %0.2f ' % r2)
    
    ## appending to the lists
    
    model_name.append(modelname)
    model_.append(model)
    cv_score_test.append(score_1)
    cv_score_train.append(np.mean(accuracies))
    mse_.append(mse)
    mae_.append(mae)
    rmse_.append(rmse)
    r2_.append(r2)

In [15]:
model_dict={'DecisionTreeRegressor': DecisionTreeRegressor(),
            'AdaBoostRegressor': AdaBoostRegressor(n_estimators=1000),
            'GradientBoostingRegressor': GradientBoostingRegressor(),
            'RandomForestRegressor': RandomForestRegressor(),
            'KNeighborsRegressor': KNeighborsRegressor(),
            'MLPRegressor': MLPRegressor(hidden_layer_sizes=(200, 100,), batch_size=100, learning_rate_init=0.004, learning_rate="adaptive", max_iter=700, verbose=True)
           }

In [16]:
for models in model_dict:
    run_model(model_dict[models], dataset_1, models)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.9s finished



#### DecisionTreeRegressor ####
score :0.9850
[0.98458885 0.98312925 0.98488688 0.98324379 0.98382069]

MSE    : 0.03 
MAE    : 0.12 
RMSE   : 0.18 
R2     : 0.99 


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.6min finished



#### AdaBoostRegressor ####
score :0.9322
[0.93350074 0.9329071  0.93594907 0.93793159 0.93226008]

MSE    : 0.14 
MAE    : 0.30 
RMSE   : 0.38 
R2     : 0.93 


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   12.4s finished



#### GradientBoostingRegressor ####
score :0.9875
[0.98680034 0.98664751 0.98831835 0.98622824 0.98686759]

MSE    : 0.03 
MAE    : 0.13 
RMSE   : 0.16 
R2     : 0.99 


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   42.9s finished



#### RandomForestRegressor ####
score :0.9921
[0.99168294 0.99148562 0.99249872 0.99055615 0.99120019]

MSE    : 0.02 
MAE    : 0.09 
RMSE   : 0.13 
R2     : 0.99 


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.7s finished



#### KNeighborsRegressor ####
score :0.9801
[0.97941677 0.97867443 0.97940184 0.97766658 0.97829633]

MSE    : 0.04 
MAE    : 0.15 
RMSE   : 0.20 
R2     : 0.98 
Iteration 1, loss = 0.93449911
Iteration 2, loss = 0.03715466
Iteration 3, loss = 0.03457328
Iteration 4, loss = 0.03217330
Iteration 5, loss = 0.02781067
Iteration 6, loss = 0.02781307
Iteration 7, loss = 0.02679945
Iteration 8, loss = 0.02753616
Iteration 9, loss = 0.02786726
Iteration 10, loss = 0.02830597
Iteration 11, loss = 0.04626015
Iteration 12, loss = 0.02747418
Iteration 13, loss = 0.02420441
Iteration 14, loss = 0.02234041
Iteration 15, loss = 0.02183025
Iteration 16, loss = 0.02255245
Iteration 17, loss = 0.02156146
Iteration 18, loss = 0.02190263
Iteration 19, loss = 0.02268984
Iteration 20, loss = 0.02900792
Iteration 21, loss = 0.02048845
Iteration 22, loss = 0.02210503
Iteration 23, loss = 0.02033056
Iteration 24, loss = 0.02200875
Iteration 25, loss = 0.02724946
Iteration 26, loss = 0.02032974
Iteration 27, 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.9min finished



#### MLPRegressor ####
score :0.9891
[0.98741043 0.98682037 0.98899378 0.98912065 0.98179349]

MSE    : 0.02 
MAE    : 0.11 
RMSE   : 0.15 
R2     : 0.99 


### NECESSÁRIO ENTENDER SE FAZ SENTIDO USAR LOG