## Imports

In [4]:
# Hide warnings
import warnings
warnings.filterwarnings("ignore")

# Imports
import itertools
import math
import time

import duckdb
import numpy as np
import pandas as pd
from IPython.display import display, Javascript

# Plot
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_regression

# Data process
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.cluster import DBSCAN

# Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import PowerTransformer

# Train
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

# Models
import umap
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
#from catboost import CatBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Metrics
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.metrics import r2_score

# Install sqlite as a extension of duckdb
#duckdb.install_extension('sqlite')




## Functions

#### Functions to correct errors

In [5]:
# Function to drop zeros
def drop_zeros(df):
    """Return `df` without the rows whose 'x', 'y' or 'z' value equals 0.

    The three columns are processed one after another; rows are removed by
    index label, and the input frame itself is not modified.
    """
    for dimension in ('x', 'y', 'z'):
        zero_labels = df.loc[df[dimension] == 0].index
        df = df.drop(zero_labels)
    return df

"""
# Function to remove outliers
def remove_outliers(df):
    df = df[(df['x'] < 30)]
    df = df[(df['y'] < 30)]
    df = df[(df['z'] < 7.5) & (df['z'] > 2)]
    df = df[(df['table'] < 80) & (df['table'] > 40)]
    df = df[(df['depth'] < 75) & (df['depth'] > 45)]
    return df
"""

def remove_outliers(df):
    """Keep only the rows whose measurements fall strictly inside fixed bounds.

    Each bound is applied only when its column is present in `df`:
    x < 20, y < 20, 2 < z < 7.5, 20 < table < 80 and 45 < depth < 75.
    Returns the filtered frame; `df` itself is left untouched.
    """
    # (column, exclusive lower bound or None, exclusive upper bound)
    limits = (
        ('x', None, 20),
        ('y', None, 20),
        ('z', 2, 7.5),
        ('table', 20, 80),
        ('depth', 45, 75),
    )
    for column, lower, upper in limits:
        if column not in df.columns:
            continue
        keep = df[column] < upper
        if lower is not None:
            keep = keep & (df[column] > lower)
        df = df[keep]
    return df

# Function to remove duplicates
def remove_duplicates(df):
    """Return a copy of `df` with fully duplicated rows removed (first occurrence kept)."""
    return df.drop_duplicates()

# Function to impute values:
def imputation(df):
    # Calculate the median of each column
    median_x = df.loc[df['x'] != 0, 'x'].median()
    median_y = df.loc[df['y'] != 0, 'y'].median()
    median_z = df.loc[df['z'] != 0, 'z'].median()

    # Replace values equal to 0 by the corresponding median.
    df['x'] = df['x'].replace(0, median_x)
    df['y'] = df['y'].replace(0, median_y)
    df['z'] = df['z'].replace(0, median_z)
    return df

#### Functions to encode

In [6]:
def encoder(df):
    """Return a copy of `df` in which every object-dtype column is label-encoded.

    A fresh LabelEncoder is fitted on each object column at every call, so the
    integer codes depend on the values present in the frame that is passed in.
    """
    encoded = df.copy()
    text_columns = [name for name in df.columns if df[name].dtype == 'object']
    for name in text_columns:
        encoded[name] = LabelEncoder().fit_transform(df[name])
    return encoded

#### Feature engineering functions

In [7]:
def feature_ing(df_features):
    """Add derived geometric features computed from x, y, z, table and depth.

    Parameters
    ----------
    df_features : pandas.DataFrame
        Frame containing the columns 'x', 'y', 'z', 'table' and 'depth'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns 'depth_mm', 'avg_girdle',
        'table_mm', 'table_depth' and 'xyz'. The input frame is not modified.
    """
    # Copy first so the caller's frame (often a filtered slice) is not altered.
    features = df_features.copy()
    # Depth ratio recomputed from the dimensions: 2*z / (x + y)
    features['depth_mm'] = (features['z']*2)/(features['x'] + features['y'])
    # Average girdle diameter: z divided by the recomputed depth ratio
    features['avg_girdle'] = (features['z'])/(features['depth_mm'])
    # Table scaled by the average girdle diameter (table is divided by 100 here)
    features['table_mm'] = (features['avg_girdle'])*(features['table'])/100
    # Ratio table / depth
    features['table_depth'] = (features['table'])/(features['depth'])
    # Product x * y * z
    features['xyz'] = (features['x'])*(features['y'])*(features['z'])
    return features

# Function to calculate log
def calculate_log(df, name):
    """Add the natural logarithm of column `name` as a new column `<name>_log`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column `name`.
    name : str
        Column to transform.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra column; the input frame is not modified.

    Notes
    -----
    The previous implementation always took the logarithm of 'carat', whatever
    `name` was, so 'x_log', 'y_log' and 'z_log' were all copies of log(carat).
    The logarithm is vectorised with numpy: a zero yields -inf and a negative
    value yields NaN instead of raising.
    """
    result = df.copy()
    result[name + '_log'] = np.log(result[name])
    return result

# Funtion to classify diamond shape
def classify_shape(df):
    """Label each row with a shape category derived from 'table' and 'depth'.

    The rules are evaluated in order and the first match wins:

    * 'Round'    : 54 < table < 57 and 61 < depth < 62.5
    * 'Oval'     : 52 < table < 60 and 60 < depth < 68
    * 'Princess' : 63 < table < 69 and 69 < depth < 76
    * 'Cushion'  : 58 < table < 63 and 58 < depth < 66
    * 'others'   : everything else

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'table' and 'depth'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra column 'shape'; the input is not modified.
    """
    table = df['table'].to_numpy()
    depth = df['depth'].to_numpy()

    # Positional boolean masks replace the former per-row `df['table'][i]`
    # lookup, which failed when the index held repeated labels.
    conditions = [
        (table > 54) & (table < 57) & (depth > 61) & (depth < 62.5),
        (table > 52) & (table < 60) & (depth > 60) & (depth < 68),
        (table > 63) & (table < 69) & (depth > 69) & (depth < 76),
        (table > 58) & (table < 63) & (depth > 58) & (depth < 66),
    ]
    labels = ['Round', 'Oval', 'Princess', 'Cushion']

    result = df.copy()
    # np.select picks the label of the first true condition for each row.
    result['shape'] = np.select(conditions, labels, default='others')
    return result

#### Functions to remove uncorrelated features

In [8]:
# Function to delete features without correlation with price (train data)
def delete_features_train(df):
    """Drop the numeric features whose absolute correlation with 'price' is at most 0.1.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame; must contain a numeric 'price' column.

    Returns
    -------
    tuple
        ``(df_correct, to_drop)``: a new frame without the weakly correlated
        columns, and the index of dropped column names, which can be passed to
        ``delete_features_test``. The input frame is not modified.
    """
    # Absolute correlation matrix, rounded to two decimals
    corr_matrix = df.corr(numeric_only=True).abs().round(2)

    # Features with (almost) no linear relation to the target
    to_drop = corr_matrix.columns[corr_matrix['price'] <= 0.1]

    # Return a new frame instead of dropping in place on the caller's object
    df_correct = df.drop(columns=to_drop)

    return df_correct, to_drop

# Function to delete features without correlation with price (test data)
def delete_features_test(df, to_drop):
    """Drop from `df` the columns that were removed from the training data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune (typically the test data).
    to_drop : list-like of str
        Column names to remove, e.g. the second value returned by
        ``delete_features_train``.

    Returns
    -------
    pandas.DataFrame
        A new frame without the listed columns; the input is not modified.
    """
    # No correlation matrix is needed here: the columns to remove were already
    # decided on the training data.
    return df.drop(columns=to_drop)

#### Scaling function

In [9]:
def stardard_scale_test(df):
    """Standard-scale every column of `df` except 'price'.

    A new StandardScaler is fitted on the frame that is passed in, and the
    scaled values are returned as the array produced by `fit_transform`.
    """
    features = df.drop(columns='price')
    return StandardScaler().fit_transform(features)

#### Pipeline function

In [10]:
# PIPELINE 1
# One scaler + regressor pipeline per model family.

def _scaled_regressor(position, tag, estimator):
    """Build a two-step pipeline: StandardScaler followed by `estimator`.

    Step names follow the pattern 'scaler_<position>' and '<tag>_classifier'.
    """
    return Pipeline([
        (f'scaler_{position}', StandardScaler()),
        (f'{tag}_classifier', estimator),
    ])


# Linear Regression
lr_pipeline = _scaled_regressor(1, 'lr', LinearRegression())
# KNN
knn_pipline = _scaled_regressor(2, 'knn', KNeighborsRegressor())
# XGB
xgb_pipeline = _scaled_regressor(3, 'xgb', XGBRegressor())
# Decision Tree
dt_pipeline = _scaled_regressor(4, 'dt', DecisionTreeRegressor())
# Random Forest
rf_pipeline = _scaled_regressor(5, 'rf', RandomForestRegressor(random_state=42, n_jobs=-1))

# Full candidate list, kept for reference:
#   pipelines = [lr_pipeline, knn_pipline, dt_pipeline, rf_pipeline]   (optionally xgb_pipeline)
#   models = ['Linear Regression', 'KNN', 'Decision Tree', 'Random Forest']   (optionally 'XGB')

# Pipelines currently evaluated, with their display names in the same order
pipelines = [rf_pipeline, xgb_pipeline]
models = ['Random Forest', 'XGB']


# PIPELINE 2
# scikit-learn pipeline steps must be estimator objects exposing fit/transform.
# The original definitions passed the bare cleaning functions (and gave
# ColumnTransformer 2-tuples instead of (name, transformer, columns)), so
# neither object could be fitted. Each function is now wrapped in a
# FunctionTransformer, which simply calls it on the frame it receives.
# NOTE: several steps remove rows, so these pipelines are meant to be applied
# to a full DataFrame (features and target together), not to X with a separate y.

def _build_cleaning_pipeline():
    """Return a Pipeline applying the cleaning and feature-engineering functions in order."""
    return Pipeline([
        ('Drop zeros', FunctionTransformer(drop_zeros)),
        ('Remove outliers', FunctionTransformer(remove_outliers)),
        ('Remove duplicates', FunctionTransformer(remove_duplicates)),
        ('Feature ingeniering', FunctionTransformer(feature_ing)),
    ])


# The cleaning functions act on rows of the whole frame, which a
# ColumnTransformer (per-column-subset transformers) cannot express, so both
# names are now row-wise Pipelines with identical steps.
features_pipeline = _build_cleaning_pipeline()

preprocessing_pipeline = _build_cleaning_pipeline()

#### Automation process

In [11]:
def version1_without_scaler2(df, drop_zeros_var, imputation_var, remove_outliers_var, remove_duplicates_var,
                             feature_ing_var, delete_features_var, encoder_var=1):
    """Apply the selected preprocessing steps to `df` without any scaling.

    Each ``*_var`` argument switches one step on when it equals 1. The steps
    run in a fixed order: drop zeros, imputation, outlier removal, duplicate
    removal, encoding, feature engineering and finally removal of features
    weakly correlated with the price. The resulting frame is returned.
    """
    def _enabled(flag):
        # A step runs only when its switch is exactly 1.
        return flag == 1

    if _enabled(drop_zeros_var):
        df = drop_zeros(df)

    if _enabled(imputation_var):
        df = imputation(df)

    if _enabled(remove_outliers_var):
        df = remove_outliers(df)

    if _enabled(remove_duplicates_var):
        df = remove_duplicates(df)

    if _enabled(encoder_var):
        df = encoder(df)

    if _enabled(feature_ing_var):
        df = feature_ing(df)

    if _enabled(delete_features_var):
        # The list of dropped columns is not needed by this function.
        df, _dropped = delete_features_train(df)

    return df

#### Test different models automatically

In [None]:
# Columns of the results table: model, scores, preprocessing switches and one
# 0/1 flag per candidate feature.
column_names = ['model', 'cv_score', 'prediction', 'drop_zeros_var', 'imputation_var', 'remove_outliers_var', 
                'remove_duplicates_var', 'encoder_var', 'feature_ing_var', 'delete_features_var', 'cut', 'color', 'clarity', 
                'city', 'carat', 'depth', 'table', 'x', 'y', 'z', 'depth_mm', 'avg_girdle', 'table_mm', 'table_depth', 'xyz']

# Start from an empty frame and give it the columns above.
result_df = pd.DataFrame().reindex(columns=column_names)

In [None]:
def automate_features_choice(df, pipelines, models, result_df):
    """Train and score every pipeline on each enabled feature combination.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw training data, including the 'price' target.
    pipelines : list
        Estimators/pipelines to evaluate; each one is fitted in place.
    models : list of str
        Display names of the pipelines. Accepted for interface compatibility;
        it is not used when building the result rows.
    result_df : pandas.DataFrame
        Results table to extend. One row is appended per
        (feature combination, pipeline) pair: the fitted pipeline, mean
        cross-validation RMSE, hold-out RMSE, the preprocessing switches and
        one 0/1 flag per feature.

    Returns
    -------
    pandas.DataFrame
        The same `result_df` object with the new rows appended.
    """
    # Candidate values (0 = leave out, 1 = use) for each feature. The keys
    # follow the order of the feature columns of `result_df`.
    feature_switches = {
        'cut':         [1],
        'color':       [1],   # [0, 1]
        'clarity':     [1],
        'city':        [1],
        'carat':       [1],
        'depth':       [1],
        'table':       [1],
        'x':           [1],
        'y':           [1],
        'z':           [1],
        'depth_mm':    [0],
        'avg_girdle':  [0],
        'table_mm':    [0],
        'table_depth': [0],
        'xyz':         [0],
    }

    # Preprocessing switches, listed in the order of the result columns. The
    # same mapping drives the transformation and the logged values, so the two
    # cannot drift apart.
    transform_flags = {
        'drop_zeros_var': 1,
        'imputation_var': 0,
        'remove_outliers_var': 1,
        'remove_duplicates_var': 1,
        'encoder_var': 1,
        'feature_ing_var': 1,
        'delete_features_var': 0,
    }
    flag_values = list(transform_flags.values())

    # The transformation does not depend on the feature combination, so it is
    # computed once instead of once per combination.
    df_transform = version1_without_scaler2(df, **transform_flags)
    target = df_transform['price']

    # Generate every possible combination of the feature switches
    for comb in itertools.product(*feature_switches.values()):
        # Names of the features switched on in this combination
        features_to_train = [name for name, switch in zip(feature_switches, comb) if switch == 1]

        # A combination with every feature switched off cannot be trained.
        if not features_to_train:
            continue

        # Price is the target; the selected columns are the features
        X = df_transform[features_to_train]
        X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.25, random_state=42)

        for model in pipelines:
            # Fit on the training split for the hold-out prediction
            model.fit(X_train, y_train)

            # Cross-validation on the training split. The mean is taken over
            # this model's folds only (it was previously averaged over the
            # folds of every model evaluated so far).
            cv_score = cross_val_score(model, X_train, y_train, scoring="neg_root_mean_squared_error", cv=5)
            cv_score_mean = abs(np.mean(cv_score))

            # Hold-out RMSE
            pred = model.predict(X_test)
            rmse = np.sqrt(metrics.mean_squared_error(y_test, pred))

            # Store the results in the results dataframe
            registros = [model, cv_score_mean, rmse] + flag_values + list(comb)
            result_df.loc[len(result_df.index)] = registros
            print(model, rmse, comb)

    return result_df

In [None]:
%%time
# NOTE(review): this cell reads `diamond_train_df`, which the visible notebook only assigns in later
# cells (section "Extraction data to process"); on a fresh kernel those cells have to run first.
# Rebinds `pipelines` / `models` from the pipeline cell: only the Random Forest pipeline is evaluated here.
pipelines = [rf_pipeline]   #[lr_pipeline, knn_pipline, dt_pipeline, rf_pipeline, xgb_pipeline]
models = ['Random Forest']   #['Linear Regression', 'KNN', 'Decision Tree', 'Random Forest', 'XGB']

# Run the feature search; rows are appended to the existing `result_df`
result_df = automate_features_choice(diamond_train_df, pipelines, models, result_df)
#play_sound()

# Show every column and list the runs sorted by the 'prediction' column, ascending
pd.set_option('display.max_columns', None)
result_df_sorted = result_df.sort_values(by='prediction')
result_df_sorted.head(20)

#### Play sound

In [None]:
def play_sound():
    """Play a short beep in the browser by displaying a JavaScript snippet."""
    beep_script = 'new Audio("https://www.soundjay.com/button/beep-07.wav").play()'
    display(Javascript(beep_script))

## Extraction data

#### Extraction data to process

In [None]:
# Load the training data from a CSV file.
# NOTE(review): a later cell assigns `diamond_train_df` again from the DuckDB query, so whichever
# of the two cells ran last decides the frame used downstream.
diamond_train_df = pd.read_csv("../data/train/diamond_train_df_Nearest_all_features_3knn.csv")
diamond_train_df.head()

In [None]:
diamond_train_df.shape

In [12]:
# Open a DuckDB connection to the training database file
# NOTE(review): the connection is not closed in the visible cells — confirm whether `con` is reused later.
con = duckdb.connect("../data/train/diamonds_train.db")

# Query joining the properties table with its lookup tables (cut, color, clarity),
# the transactional table (price, carat, city) and the dimensions table
query_full = """
SELECT
    --tra.index_id,
    cut.cut,
    col.color,
    cla.clarity,
    tra.price,
    cit.city,
    tra.carat,
    dim.depth,
    dim.table,
    dim.x,
    dim.y,
    dim.z
FROM diamonds_properties AS pro
JOIN diamonds_cut AS cut ON pro.cut_id = cut.cut_id
JOIN diamonds_color AS col ON pro.color_id = col.color_id
JOIN diamonds_clarity AS cla ON pro.clarity_id = cla.clarity_id
JOIN diamonds_transactional as tra ON pro.index_id = tra.index_id
JOIN diamonds_city AS cit ON tra.city_id = cit.city_id
JOIN diamonds_dimensions AS dim ON pro.index_id = dim.index_id
"""

# Run the query and fetch the result as a pandas DataFrame
diamond_train_df = con.execute(query_full).df()
diamond_train_df.head()

Unnamed: 0,cut,color,clarity,price,city,carat,depth,table,x,y,z
0,Premium,J,VS2,4268,Dubai,1.21,62.4,58.0,6.83,6.79,4.25
1,Very Good,H,VS2,505,Kimberly,0.32,63.0,57.0,4.35,4.38,2.75
2,Fair,G,VS1,2686,Las Vegas,0.71,65.5,55.0,5.62,5.53,3.65
3,Good,D,SI1,738,Kimberly,0.41,63.8,56.0,4.68,4.72,3.0
4,Ideal,G,SI1,4882,Dubai,1.02,60.5,59.0,6.55,6.51,3.95


In [13]:
# Load the test set for which prices have to be predicted
diamond_test_df = pd.read_csv("../data/test/diamonds_test.csv")
diamond_test_df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.2,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.9,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.9,Kimberly
4,4,0.5,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam


#### Extract the dataframe with the parameters of all tested models and configurations

In [14]:
# Load the log of previous experiments; the stacking cell appends a row to it and writes it back
parameters_df = pd.read_csv('./parameters_training/best_parameters_prediction_models.csv')
parameters_df.head()

Unnamed: 0,Model,cv_score,rmse,Submission,Features,Transformations,Estimators,Hyperparameters
0,Voting model,534.5,529.8,538,"['cut', 'color', 'clarity', 'city', 'carat', '...","['encoding', 'drop_zeros', 'remove_outliers', ...","['lgbm', 'xgb', 'extrees', 'rf']",{}
1,RF,563.6,541.8,597,"['cut', 'color', 'clarity', 'city', 'depth', '...","['encoding', 'drop_zeros', 'remove_outliers', ...",[None],"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."
2,RF,563.1,541.9,597,"['cut', 'color', 'clarity', 'city', 'depth', '...","['encoding', 'drop_zeros', 'remove_outliers', ...",[None],"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."
3,RF,568.2,545.7,0,"['cut', 'color', 'clarity', 'city', 'depth', '...","['encoding', 'drop_zeros', 'remove_outliers', ...",[None],"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."
4,RF,578.5,552.8,552,"['cut', 'color', 'clarity', 'carat_log', 'dept...","['encoding', 'imputation', 'remove_outliers', ...",[None],"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."


## Train and test models

#### Non-linear relation between features

In [None]:
# Observe the non-linear relation between features
# NOTE: relies on `transformed_df`, which is assigned in the "Transform data" cell further down.
X = transformed_df.copy()
y = X.pop("price")

def make_mi_scores(X, y, random_state=0):
    """Return the mutual information between each feature and the target.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature matrix.
    y : array-like
        Target values.
    random_state : int or None, default 0
        Seed forwarded to ``mutual_info_regression`` so that repeated runs
        give the same scores. Pass None to leave the estimate unseeded.

    Returns
    -------
    pandas.Series
        Scores named "MI Scores", indexed by feature name and sorted from the
        most to the least informative feature.
    """
    mi_scores = mutual_info_regression(X, y, random_state=random_state)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index = X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

mi_scores = make_mi_scores(X, y)
mi_scores

### Transform data

In [15]:
# All features
# selection_features = ['cut', 'color', 'clarity', 'city', 'carat', 'depth', 'shape', 
#                       'x_log', 'y_log', 'z_log', 'carat_log', 'ratio_length_width', 'ratio_length_width_depth', 
#                       'volume', 'density', 'price']

# Transform
def transformation_data(df, type_data):
    """Build the model input frame for training or test data.

    The frame goes through shape classification, label encoding and feature
    engineering; logarithms, ratios, volume and density are computed as well.
    Only the columns listed in the returned `selection_features` are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw diamonds data.
    type_data : str
        'train_data' keeps the 'price' target in the selection; 'test_data'
        leaves it out.

    Returns
    -------
    tuple
        ``(trans_df_2, selection_features)``: the frame restricted to the
        selected columns, and the list of those column names.

    Raises
    ------
    ValueError
        If `type_data` is neither 'train_data' nor 'test_data'.
    """
    # Features fed to the models; add the derived columns computed below
    # (e.g. 'table', 'carat_log', 'volume') to experiment with them.
    feature_columns = ['cut', 'color', 'clarity', 'city', 'carat', 'depth', 'shape', 'x', 'y', 'z']

    # Validate up front so a typo fails immediately with a clear message
    # rather than with an unbound local variable after all the work is done.
    if type_data == 'train_data':
        selection_features = feature_columns + ['price']
    elif type_data == 'test_data':
        selection_features = list(feature_columns)
    else:
        raise ValueError(
            "type_data must be 'train_data' or 'test_data', got {!r}".format(type_data)
        )

    trans_df = classify_shape(df)
    trans_df = encoder(trans_df)
    trans_df = feature_ing(trans_df)

    # Calculate other features
    trans_df = calculate_log(trans_df, 'carat')
    trans_df = calculate_log(trans_df, 'x')
    trans_df = calculate_log(trans_df, 'y')
    trans_df = calculate_log(trans_df, 'z')
    trans_df['ratio_length_width'] = trans_df['x']/trans_df['y']
    trans_df['ratio_length_width_depth'] = trans_df['x']/trans_df['y']/trans_df['z']
    trans_df['volume'] = trans_df['x']*trans_df['y']*trans_df['z']
    trans_df['density'] = trans_df['carat']/trans_df['volume']

    # Keep only the selected features
    trans_df_2 = trans_df[selection_features]

    return trans_df_2, selection_features

# Transformations recorded in the experiment log
# NOTE(review): 'drop_zeros' is listed here although transformation_data does not call drop_zeros,
# so the logged list may not match what is actually applied — confirm before saving it.
transformations = ['encoding', 'drop_zeros'] #, 'remove_outliers', 'remove_duplicates']

# Build the training frame and split it into features (X) and target (y)
transformed_df, selection_features = transformation_data(diamond_train_df, 'train_data')
X = transformed_df.drop('price',axis = 1)
y = transformed_df['price']
X.head()

Unnamed: 0,cut,color,clarity,city,carat,depth,shape,x,y,z
0,3,6,5,2,1.21,62.4,1,6.83,6.79,4.25
1,4,4,5,3,0.32,63.0,1,4.35,4.38,2.75
2,0,3,4,4,0.71,65.5,1,5.62,5.53,3.65
3,1,0,2,3,0.41,63.8,1,4.68,4.72,3.0
4,2,3,2,2,1.02,60.5,1,6.55,6.51,3.95


In [16]:
#X.to_csv('../data/processed/x_data.csv', index=False)
#y.to_csv('../data/processed/y_data.csv', index=False)

In [None]:
# Hold-out split (75 % train / 25 % test, fixed seed).
# The model cells below are fitted on the complete X, y (for the Kaggle upload), so they would have to be
# changed to use X_train / X_test before this split can serve as a check.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=42)

### Fine-Tuning models

#### XGBoost

In [None]:
# Hyperparameter grid for XGBoost (a single candidate per parameter)
param_grid_xgb = {
    'n_estimators': [950],        # number of boosted trees
    'max_depth': [5],             # maximum depth of each tree
    'subsample': [1],
    'colsample_bytree': [0.8],
    'lambda': [0.7],
    'gamma': [0.05],
    'learning_rate': [0.035],
}

xgb_reg = XGBRegressor(random_state=0)

# 5-fold grid search scored with negative RMSE, fitted on the complete data
xgb_grid_search = GridSearchCV(
    xgb_reg,
    param_grid_xgb,
    cv=5,
    scoring='neg_root_mean_squared_error',
    return_train_score=True,
    verbose=1,
    n_jobs=-1,
)
xgb_grid_search.fit(X, y)

print('\n')
print('Best hyperparameters: ', xgb_grid_search.best_params_, '\n')
print('Best score: ', xgb_grid_search.best_score_, '\n')

#### LGBM Regressor

In [None]:
# Hyperparameter grid for LightGBM (a single candidate per parameter)
param_grid_lgbm = {'num_leaves': [30], 
                   'learning_rate': [0.02] ,
                   'n_estimators': [1300],
                   'max_depth': [-1]
                  }

lgbm_reg = LGBMRegressor(random_state=0)

# GridSearchCV takes the estimator first and its grid second. The previous call
# passed the grid dictionary as the estimator and `param_grid`, a name that is
# only defined in the ExtraTrees cell, as the grid.
lgbm_grid_search = GridSearchCV(lgbm_reg, param_grid_lgbm, cv=5, scoring='neg_root_mean_squared_error', return_train_score=True, verbose=1, n_jobs=-1)

lgbm_grid_search.fit(X, y)

print('\n')
print('Best hyperparameters: ', lgbm_grid_search.best_params_, '\n')
print('Best score: ', lgbm_grid_search.best_score_, '\n')

#### ExtraTreesRegressor

In [None]:
# Hyperparameter grid for ExtraTrees
param_grid = {
    'n_estimators': [70, 100, 150],
    'max_depth': [None],
    'criterion': ['squared_error'],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 2],
    'min_samples_split': [3, 4],
}

extrees_reg = ExtraTreesRegressor(random_state=0)

# 5-fold grid search scored with negative RMSE, fitted on the complete data
extrees_grid_search = GridSearchCV(
    extrees_reg,
    param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)
extrees_grid_search.fit(X, y)

print('\n')
print('Best hyperparameters: ', extrees_grid_search.best_params_, '\n')
print('Best score: ', extrees_grid_search.best_score_, '\n')

#### RandomForestRegressor

In [None]:
%%time
param_grid_rf = {'n_estimators': [1400],  # Number of trees in the forest.
                 'max_depth': [None],  # Maximum depth of the trees.
                 'min_samples_split': [7],  # Minimum number of samples required to split an internal node.
                 'min_samples_leaf': [1],  # Minimum number of samples required to be at a leaf node.
                 'max_features': [None]  # Number of features to consider when looking for the best split.
                }

rf_reg = RandomForestRegressor(random_state=0)

rf_grid_search = GridSearchCV(rf_reg, param_grid_rf, cv=5, scoring='neg_root_mean_squared_error', verbose=1, n_jobs=-1)

rf_grid_search.fit(X, y)

print('\n')
print('Best hyperparameters: ', rf_grid_search.best_params_, '\n')
print('Best score: ', rf_grid_search.best_score_, '\n')

#### MLPRegressor

In [None]:
%%time
# Define the model
mlp = MLPRegressor(max_iter=100)

# Set up the parameter grid
param_grid_mlp = {'hidden_layer_sizes': [(50,100,50)],
                  'activation': ['relu'],
                  'solver': ['adam'],
                  'alpha': [0.05],
                 }

# Instantiate GridSearchCV
mlp_grid_search = GridSearchCV(mlp, param_grid_mlp, cv=5)

# Fit the GridSearchCV object
mlp_grid_search.fit(X, y)

# Access the results
print('\n')
print('Best hyperparameters: ', mlp_grid_search.best_params_, '\n')
print('Best score: ', mlp_grid_search.best_score_, '\n')

### Train model

#### Estimators

In [None]:
# Final hyperparameters with scalar values; the stacking cell unpacks them as keyword arguments.
# NOTE: these rebind the names used for the grid-search dictionaries (lists of candidates) in the
# fine-tuning cells, so this cell must be the last one run before building the final models.
param_grid_xgb = {'n_estimators': 950,  # Number of boosted trees.
                  'max_depth': 5,  # Maximum depth of the trees.
                  'subsample': 1,
                  'colsample_bytree': 0.8,
                  'lambda': 0.7,
                  'gamma': 0.05,
                  'learning_rate': 0.035
                 }
param_grid_lgbm = {'num_leaves': 30, 
                   'learning_rate': 0.02 ,
                   'n_estimators': 1300,
                   'max_depth': -1
                  }
param_grid_rf = {'n_estimators': 1400,  # Number of trees in the forest.
                 'max_depth': None,  # Maximum depth of the trees.
                 'min_samples_split': 7,  # Minimum number of samples required to split an internal node.
                 'min_samples_leaf': 1,  # Minimum number of samples required to be at a leaf node.
                 'max_features': None  # Number of features to consider when looking for the best split.
                }
param_grid_mlp = {'hidden_layer_sizes': (50,100,50),
                  'activation': 'relu',
                  'solver': 'adam',
                  'alpha': 0.05,
                 }

#### Stacking model

In [None]:
%%time
xgb_model = XGBRegressor(**param_grid_xgb)
lgb_model = LGBMRegressor(**param_grid_lgbm)
rf_model = RandomForestRegressor(**param_grid_rf)
mlp_model = MLPRegressor(**param_grid_mlp)

estimators = [('xgb1', xgb_model),
              ('lgbm1', lgb_model),
              ('rf', rf_model),
              ('mlp', mlp_model)]
              #('extrees', extrees_model),]

# Train the model
stack_model = StackingRegressor(estimators=estimators, cv=5, n_jobs=-1, verbose=True, passthrough=True)   #cv=None

# Execute cross validation
cv_results = []
cv_score = cross_val_score(stack_model, X, y, scoring="neg_root_mean_squared_error", cv=5)
cv_results.append(cv_score)
cv_score_mean = abs(np.mean(cv_results))

# Fit the model
stack_model.fit(X,y)

# Print hyperparameters and cv_score
hyperparameters = stack_model.get_params()
cv_score_mean = abs(np.mean(cv_results))
print('Hyperparameters: ', hyperparameters, ' | cv_score_mean:', cv_score_mean)

# Save the parameters of the training performed and the score in the dataframe to keep track of all tests performed 
registers = [model_type, cv_score_mean, rmse, 0, selection_features, transformations, estimators, hyperparameters]
parameters_df.loc[len(parameters_df.index)] = registers
parameters_df.to_csv('./parameters_training/best_parameters_prediction_models.csv', index=False)

### Transform test data and obtain the prediction to upload in kaggle

In [17]:
# Run the test set through the same transformation as the training data and save the result
X_test_all, selection_features2 = transformation_data(diamond_test_df, 'test_data')
X_test_all.to_csv('../data/processed/x_test.csv', index=False)

In [18]:
%%time
# Predict the prices of the transformed test set
# NOTE(review): `model_all` is not assigned in any visible cell of this notebook; it must hold a
# fitted model whose feature columns match X_test_all — confirm which model is intended.
y_pred_all = model_all.predict(X_test_all)
len(y_pred_all)

CPU times: total: 562 ms
Wall time: 87.1 ms


13485

In [19]:
# Build the submission table: the row position becomes 'id' and the prediction 'price',
# then write it to the submissions folder for the Kaggle upload
y_pred_df = pd.DataFrame(y_pred_all).reset_index()
y_pred_df.columns = ['id', 'price']
y_pred_df.to_csv('../data/submisions/XGB_all_data.csv', index=False)