# PIPELINE AND PRODUCTION CODE
# Imports

In [26]:
# Hide warnings
import warnings
warnings.filterwarnings("ignore")

# Imports
import itertools
import duckdb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
import umap

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.metrics import r2_score

# Install sqlite as a extension of duckdb
#duckdb.install_extension('sqlite')

# Functions
### Functions to correct errors

In [3]:
# Function to drop zeros
def drop_zeros(df):
    df = df.drop(df[df['x'] == 0].index)
    df = df.drop(df[df['y'] == 0].index)
    df = df.drop(df[df['z'] == 0].index)
    return df

"""
# Function to remove outliers
def remove_outliers(df):
    df = df[(df['x'] < 30)]
    df = df[(df['y'] < 30)]
    df = df[(df['z'] < 7.5) & (df['z'] > 2)]
    df = df[(df['table'] < 80) & (df['table'] > 40)]
    df = df[(df['depth'] < 75) & (df['depth'] > 45)]
    return df
"""

def remove_outliers(df):
    if 'x' in df.columns:
        df = df[df['x'] < 30]
    if 'y' in df.columns:
        df = df[df['y'] < 30]
    if 'z' in df.columns:
        df = df[(df['z'] < 7.5) & (df['z'] > 2)]
    if 'table' in df.columns:
        df = df[(df['table'] < 80) & (df['table'] > 40)]
    if 'depth' in df.columns:
        df = df[(df['depth'] < 75) & (df['depth'] > 45)]
    return df

# Function to remove duplicates
def remove_duplicates(df):
    df = df.drop_duplicates()
    return df

# Function to impute values:
def imputation(df):
    # Calculate the median of each column
    median_x = df.loc[df['x'] != 0, 'x'].median()
    median_y = df.loc[df['y'] != 0, 'y'].median()
    median_z = df.loc[df['z'] != 0, 'z'].median()

    # Replace values equal to 0 by the corresponding median.
    df['x'] = df['x'].replace(0, median_x)
    df['y'] = df['y'].replace(0, median_y)
    df['z'] = df['z'].replace(0, median_z)
    return df

### Functions to encode

In [4]:
def encoder(df):
    df_enc = df.copy()

    # Obtain the dataframe encoded
    for column in df.columns:
        if df[column].dtype == 'object':
            enc_label = LabelEncoder()
            df_enc[column] = enc_label.fit_transform(df[column])
    return df_enc

### Functions to features ingeniering

In [5]:
def feature_ing(df_features):
    #print('Dataframe features: ',df_features.head())
    # Test the depth calculate
    df_features['depth_mm'] = (df_features['z']*2)/(df_features['x'] + df_features['y'])
    # Obtain the average girdle diameter
    df_features['avg_girdle'] = (df_features['z'])/(df_features['depth_mm'])
    # Obtain table in mm
    df_features['table_mm'] = (df_features['avg_girdle'])*(df_features['table'])/100
    # Obtain table*depth
    df_features['table_depth'] = (df_features['table'])/(df_features['depth'])
    # Obtain x, y, z
    df_features['xyz'] = (df_features['x'])*(df_features['y'])*(df_features['z'])
    return df_features

### Functions to remove uncorrelated features

In [6]:
# Function to delete features without correlation with price (train data)
def delete_features_train(df):
    # Calculate correlation matrix, round with two decimmals
    corr_matrix = round(df.corr(numeric_only=True).abs(),2)
    
    # Plot the heatmap
    #sns.set (rc = {'figure.figsize':(16, 6)})
    #sns.heatmap(corr_matrix, center=0, cmap='BrBG', annot=True)

    # Find features with correlation greater than 0.90
    to_drop = corr_matrix.columns[corr_matrix['price'] <= 0.1]
    #print(to_drop)

    # Drop features
    df_correct = df
    df_correct.drop(to_drop, axis=1, inplace=True)
    
    return df_correct,to_drop

# Function to delete features without correlation with price (test data)
def delete_features_test(df, to_drop):
    # Calculate correlation matrix, round with two decimmals
    corr_matrix = round(df.corr(numeric_only=True).abs(),2)
    
    # Plot the heatmap
    #sns.set (rc = {'figure.figsize':(16, 6)})
    #sns.heatmap(corr_matrix, center=0, cmap='BrBG', annot=True)

    # Drop features
    df_correct = df
    df_correct.drop(to_drop, axis=1, inplace=True)
    
    return df_correct

### Functions to scale data

In [7]:
# Scaling function
def stardard_scale_train(df):
    # Split features and target
    X = df.drop(columns=['price'])  # Features
    y = df['price']  # Target
    columns = X.columns
    # Scaler
    scaler = StandardScaler()
    # Scale X
    X_scaled = scaler.fit_transform(X)
    # Scale y. With reshape function, we are converting y_v1 into a single column matrix, i.e., one column and as many rows 
    # as it originally had elements. With flatten, this flattens the resulting array.
    y_scaled = scaler.fit_transform(y.values.reshape(-1, 1)).flatten()
    # Return
    return X_scaled,y_scaled,columns

# Scaling function
def stardard_scale_test(df):
    X = df
    columns = X.columns
    # Scaler
    scaler = StandardScaler()
    # Scale X
    X_scaled = scaler.fit_transform(X)
    return X_scaled,columns

# Invert the scaling
def inverse_scaler(df):
    return scal.inverse_transform(df.reshape(-1, 1)).flatten()

# Imports
### Train data
The query use this terminology:
- pro = diamonds_properties 
- cut = diamonds_cut 
- col = diamonds_color
- cla = diamonds_clarity
- tra = diamonds_transactional
- cit = diamonds_city
- dim = diamonds_dimensions

In [8]:
# create a connection to a file called 'file.db'
con = duckdb.connect("../data/train/diamonds_train.db")

# Query to extract data from database
query_full = """
SELECT
    --tra.index_id,
    cut.cut,
    col.color,
    cla.clarity,
    tra.price,
    cit.city,
    tra.carat,
    dim.depth,
    dim.table,
    dim.x,
    dim.y,
    dim.z
FROM diamonds_properties AS pro
JOIN diamonds_cut AS cut ON pro.cut_id = cut.cut_id
JOIN diamonds_color AS col ON pro.color_id = col.color_id
JOIN diamonds_clarity AS cla ON pro.clarity_id = cla.clarity_id
JOIN diamonds_transactional as tra ON pro.index_id = tra.index_id
JOIN diamonds_city AS cit ON tra.city_id = cit.city_id
JOIN diamonds_dimensions AS dim ON pro.index_id = dim.index_id
"""

diamond_train_df = con.execute(query_full).df()
diamond_train_df.head()

Unnamed: 0,cut,color,clarity,price,city,carat,depth,table,x,y,z
0,Premium,J,VS2,4268,Dubai,1.21,62.4,58.0,6.83,6.79,4.25
1,Very Good,H,VS2,505,Kimberly,0.32,63.0,57.0,4.35,4.38,2.75
2,Fair,G,VS1,2686,Las Vegas,0.71,65.5,55.0,5.62,5.53,3.65
3,Good,D,SI1,738,Kimberly,0.41,63.8,56.0,4.68,4.72,3.0
4,Ideal,G,SI1,4882,Dubai,1.02,60.5,59.0,6.55,6.51,3.95


In [9]:
diamond_test_df = pd.read_csv("../data/test/diamonds_test.csv")
diamond_test_df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.2,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.9,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.9,Kimberly
4,4,0.5,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam


# Transformation: version 1

In [10]:
def version1(df, data_type, to_drop=[]):
    # Transformations
    df = drop_zeros(df)   # Drop zeros
    df = remove_outliers(df)   # Remove outliers
    df = remove_duplicates(df)   # Remove duplicates
    df = encoder(df)   # Encoding
    df = feature_ing(df)   # Feature ingeniering
    
    # In case of test data, the function needs drops_columns to remonve the same data
    if data_type == 'train':
        # Drop features
        df, to_drop = delete_features_train(df)
        # Scale   
        X_scaled, y_scaled, columns = stardard_scale_train(df)
        # Numpy array to dataframe
        X_scaled_df = pd.DataFrame(X_scaled, columns=columns)
        y_scaled_df = pd.DataFrame({'price': y_scaled})
        # return
        return X_scaled_df, y_scaled_df, to_drop
    
    elif data_type == 'test':
        # Drop features
        df = delete_features_test(df, to_drop)
        # Scale
        X_scaled, columns = stardard_scale_test(df)
        # Numpy array to dataframe
        X_scaled_df = pd.DataFrame(X_scaled, columns=columns)
        # return
        return X_scaled_df

# Transformation: version 1 (without scaler)

In [11]:
def version1_without_scaler(df, data_type, to_drop=[]):
    # Transformations
    df = drop_zeros(df)   # Drop zeros
    df = remove_outliers(df)   # Remove outliers
    df = remove_duplicates(df)   # Remove duplicates
    df = encoder(df)   # Encoding
    df = feature_ing(df)   # Feature ingeniering
    
    # In case of test data, the function needs drops_columns to remonve the same data
    if data_type == 'train':
        # Drop features
        df, to_drop = delete_features_train(df)
        # return
        return df, to_drop
    
    elif data_type == 'test':
        # Drop features
        df = delete_features_test(df, to_drop)
        # return
        return df

# Transformation: version 2 (without scaler)

In [12]:
def version1_without_scaler(df, 
                            drop_zeros_var=0, 
                            imputation_var=0, 
                            remove_outliers_var=0, 
                            remove_duplicates_var=0,
                            encoder_var=1,
                            feature_ing_var=0,
                            delete_features_var=0):
    # Transformations
    if drop_zeros_var == 1:
        df = drop_zeros(df)   # Drop zeros
    if imputation_var ==1:
        df = imputation(df)
    if remove_outliers_var == 1:
        df = remove_outliers(df)   # Remove outliers
    if remove_duplicates_var == 1:
        df = remove_duplicates(df)   # Remove duplicates
    if encoder_var == 1:
        df = encoder(df)   # Encoding
    if feature_ing_var == 1:
        df = feature_ing(df)   # Feature ingeniering
    if delete_features_var == 1:
        df, to_drop = delete_features_train(df) # Drop features
        print(to_drop)

    return df

In [None]:
# Transform train data
df_transform = version1_without_scaler(diamond_train_df)
print('Train dataframe shape: ', diamond_train_df.shape, ' | X train shpape: ', df_transform.shape, ' | Difference: ', 
     diamond_train_df.shape[0]-df_transform.shape[0])

# Pipeline

### Use only the complete dataset where I know the price column (with scaling in this step)

In [13]:
# Building the Pipelines

#Linear Regression
lr_pipeline = Pipeline([
    ('scaler_1',StandardScaler()),
    ('lr_classifier',LinearRegression())
])
# knn
knn_pipline =Pipeline([
    ('scaler_2' ,StandardScaler()),
    ('knn_classifier',KNeighborsRegressor())
])

#XGB
xgb_pipeline = Pipeline([
    ('scaler_3', StandardScaler()),
    ('xgb_classifier', XGBRegressor())
])


#Decision Tree
dt_pipeline = Pipeline([
    ('scaler_4', StandardScaler()),
    ('dt_classifier', DecisionTreeRegressor())
])

#Random Forest
rf_pipeline = Pipeline([
    ('scaler_5', StandardScaler()),
    ('rf_classifier', RandomForestRegressor())
])

#pipelines = [lr_pipeline,knn_pipline,dt_pipeline,rf_pipeline]   #,xgb_pipeline
#models = ['Linear Regression', 'KNN', 'Decision Tree', 'Random Forest']   #, 'XGB'

pipelines = [rf_pipeline, xgb_pipeline]
models = ['Random Forest', 'XGB']

# Automate the process

In [14]:
def version1_without_scaler2(df, drop_zeros_var, imputation_var, remove_outliers_var, remove_duplicates_var,
                             feature_ing_var, delete_features_var, encoder_var=1):
    # Transformations
    if drop_zeros_var == 1:
        df = drop_zeros(df)   # Drop zeros
        
    if imputation_var ==1:
        df = imputation(df)   # Imputation data
        
    if remove_outliers_var == 1:
        df = remove_outliers(df)   # Remove outliers
        
    if remove_duplicates_var == 1:
        df = remove_duplicates(df)   # Remove duplicates
        
    if encoder_var == 1:
        df = encoder(df)   # Encoding
        
    if feature_ing_var == 1:
        df = feature_ing(df)   # Feature ingeniering
        
    if delete_features_var == 1:
        df, to_drop = delete_features_train(df)   # Drop features
        #print(to_drop)
        
    return df

### Diferent processing combination

In [15]:
# Lista de valores para las variables excepto df
drop_zeros_var_vals       = [0, 1]   # [0, 1]
imputation_var_vals       = [0, 1]
remove_outliers_var_vals  = [0, 1]
remove_duplicates_var_vals= [0, 1]
feature_ing_var_vals      = [0]
delete_features_var_vals  = [0]

# Generar todas las combinaciones posibles
combinations = itertools.product(drop_zeros_var_vals,
                                  imputation_var_vals,
                                  remove_outliers_var_vals,
                                  remove_duplicates_var_vals,
                                  feature_ing_var_vals,
                                  delete_features_var_vals)

In [None]:
def automate_process_choice(df, combinations, pipelines, models):
    results = []
    for comb in combinations:
        # Combinations
        drop_zeros_var, imputation_var, remove_outliers_var, remove_duplicates_var, feature_ing_var, delete_features_var = comb
        
        # Transform train data
        df_transform = version1_without_scaler2(df,
                                                drop_zeros_var,
                                                imputation_var,
                                                remove_outliers_var,
                                                remove_duplicates_var,
                                                feature_ing_var,
                                                delete_features_var)

        # splitting the dataset in test and train data . The prece will be the Target and the other columns the features
        X = df_transform.drop('price',axis = 1)
        y = df_transform['price']
        X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=42)
        
        # Fit our models to the training data
        for i in pipelines :
            i.fit(X_train , y_train)
        
        encoder_var = 1
        cv_results = []
        for i, model in enumerate(pipelines):
            # Cross validation
            cv_score = cross_val_score(model, X_train, y_train, scoring="neg_root_mean_squared_error", cv=5)
            cv_results.append(cv_score)

            # Test the result
            pred = model.predict(X_test)
            
            # Store the results in the results list
            results.append({'model': models[i],
                            'cv_score': cv_score.mean(),
                            'prediction': np.sqrt(metrics.mean_squared_error(y_test, pred)),
                            'drop_zeros_var': drop_zeros_var, 
                            'imputation_var': imputation_var, 
                            'remove_outliers_var': remove_outliers_var, 
                            'remove_duplicates_var': remove_duplicates_var, 
                            'encoder_var': encoder_var, 
                            'feature_ing_var': feature_ing_var, 
                            'delete_features_var': delete_features_var})
            
            print(comb, model, np.sqrt(metrics.mean_squared_error(y_test, pred)))

    results_df = pd.DataFrame(results)
    return results_df

In [None]:
%%time

# Parameters
pipelines = [rf_pipeline]   #[lr_pipeline, knn_pipline, dt_pipeline, rf_pipeline, xgb_pipeline]
models = ['Random Forest']   #['Linear Regression', 'KNN', 'Decision Tree', 'Random Forest', 'XGB']

# Execute functions and sort data
result_df = automate_process_choice(diamond_train_df, combinations, pipelines, models)
result_df_sorted = result_df.sort_values(by='prediction')
result_df_sorted.head(20)

In [None]:
prediction_result1 = result_df_sorted.head(20)
prediction_result1.to_csv('prediction_12-02-24')
#prediction_result1

### Diferent features combination

In [20]:
def automate_features_choice(df, pipelines, models):
    # Features options
    cut         = [0]
    color       = [0]   # [0, 1]
    clarity     = [0]
    city        = [0]
    carat       = [1]
    depth       = [0]
    table       = [1]
    x           = [1]
    y           = [1]
    z           = [1]
    depth_mm    = [1]
    avg_girdle  = [1]
    table_mm    = [1]
    table_depth = [1]
    xyz         = [0]
    

    # Generar todas las combinaciones posibles
    combinations_features = itertools.product(cut, color, clarity, city, carat, depth, table, x, y, z, depth_mm, 
                                              avg_girdle, table_mm, table_depth, xyz)
    
    results = []
    for comb in combinations_features:
        # Transform train data
        df_transform = version1_without_scaler2(df,
                                                drop_zeros_var=0,
                                                imputation_var=0,
                                                remove_outliers_var=1,
                                                remove_duplicates_var=1,
                                                feature_ing_var=1,
                                                delete_features_var=0)
        
        # Obtain the features to train the model
        cut, color, clarity, city, carat, depth, table, x, y, z, depth_mm,\
            avg_girdle, table_mm, table_depth, xyz = comb
        
        variables_con_valor_1 = []
        if cut == 1:
            variables_con_valor_1.append('cut')
        if color == 1:
            variables_con_valor_1.append('color')
        if clarity == 1:
            variables_con_valor_1.append('clarity')
        if city == 1:
            variables_con_valor_1.append('city')
        if carat == 1:
            variables_con_valor_1.append('carat')
        if depth == 1:
            variables_con_valor_1.append('depth')
        if table == 1:
            variables_con_valor_1.append('table')
        if x == 1:
            variables_con_valor_1.append('x')
        if y == 1:
            variables_con_valor_1.append('y')
        if z == 1:
            variables_con_valor_1.append('z')
        if depth_mm == 1:
            variables_con_valor_1.append('depth_mm')
        if avg_girdle == 1:
            variables_con_valor_1.append('avg_girdle')
        if table_mm == 1:
            variables_con_valor_1.append('table_mm')
        if table_depth == 1:
            variables_con_valor_1.append('table_depth')
        if xyz == 1:
            variables_con_valor_1.append('xyz')
            
        # Obtain the price column
        price = df_transform['price']
        
        # Select the features
        selected_f_df = df_transform[variables_con_valor_1]
        
        # Fix the features dataframe and price colunm
        new_df = pd.concat([selected_f_df, price], axis=1)

        # splitting the dataset in test and train data . The prece will be the Target and the other columns the features
        X = new_df.drop('price',axis = 1)
        y = new_df['price']
        X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=42)
        
        # There are combinations in which all values are zero, in that case it isn't necessary train model
        if new_df.shape[1] > 1:
            # Fit our models to the training data
            for i in pipelines :
                i.fit(X_train , y_train)

            cv_results = []
            for i, model in enumerate(pipelines):
                # Cross validation
                cv_score = cross_val_score(model, X_train, y_train, scoring="neg_root_mean_squared_error", cv=5)
                cv_results.append(cv_score)

                # Test the result
                pred = model.predict(X_test)

                # Store the results in the results list
                results.append({'model': models[i],
                                'cv_score': cv_score.mean(),
                                'prediction': np.sqrt(metrics.mean_squared_error(y_test, pred)),
                                'drop_zeros_var': 0, 
                                'imputation_var': 0, 
                                'remove_outliers_var': 1, 
                                'remove_duplicates_var': 1, 
                                'encoder_var': 1, 
                                'feature_ing_var': 1,   # This is necessary to test which feature combination is the best
                                'delete_features_var': 0,
                                'color':color,
                                'carat':carat,
                                'table':table,
                                'x':x,
                                'y':y,
                                'z':z,
                                'depth_mm':depth_mm,
                                'avg_girdle':avg_girdle,
                                'table_mm':table_mm,
                                'table_depth':table_depth,
                                'xyz':xyz})
                print(comb, model, np.sqrt(metrics.mean_squared_error(y_test, pred)))

    results_df = pd.DataFrame(results)
    return results_df

In [21]:
# Parameters
pipelines = [rf_pipeline]   #[lr_pipeline, knn_pipline, dt_pipeline, rf_pipeline, xgb_pipeline]
models = ['Random Forest']   #['Linear Regression', 'KNN', 'Decision Tree', 'Random Forest', 'XGB']

# Execute functions and sort data
result_df_features = automate_features_choice(diamond_train_df, pipelines, models)
result_df_features_sorted = result_df_features.sort_values(by='prediction')
result_df_features_sorted.head(10)

(0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0) Pipeline(steps=[('scaler_5', StandardScaler()),
                ('rf_classifier', RandomForestRegressor())]) 1360.6705881365433


Unnamed: 0,model,cv_score,prediction,drop_zeros_var,imputation_var,remove_outliers_var,remove_duplicates_var,encoder_var,feature_ing_var,delete_features_var,...,carat,table,x,y,z,depth_mm,avg_girdle,table_mm,table_depth,xyz
0,Random Forest,-1381.38826,1360.670588,0,0,1,1,1,1,0,...,1,1,1,0 4268 1 505 2 2686 3 ...,1,1,1,1,1,0


In [27]:
pd.set_option('display.max_columns', None)
result_df_features_sorted

Unnamed: 0,model,cv_score,prediction,drop_zeros_var,imputation_var,remove_outliers_var,remove_duplicates_var,encoder_var,feature_ing_var,delete_features_var,color,carat,table,x,y,z,depth_mm,avg_girdle,table_mm,table_depth,xyz
0,Random Forest,-1381.38826,1360.670588,0,0,1,1,1,1,0,0,1,1,1,0 4268 1 505 2 2686 3 ...,1,1,1,1,1,0


In [25]:
cut         = [1]
color       = [1]   # [0, 1]
clarity     = [1]
city        = [1]
carat       = [1]
depth       = [1]
table       = [1]
x           = [1]
y           = [1]
z           = [1]
depth_mm    = [0, 1]
avg_girdle  = [0, 1]
table_mm    = [0, 1]
table_depth = [0, 1]
xyz         = [0, 1]
    

# Generar todas las combinaciones posibles
combinations_features = itertools.product(cut, color, clarity, city, carat, depth, table, x, y, z, depth_mm, 
                                          avg_girdle, table_mm, table_depth, xyz)
features_list = ['cut', 'color', 'clarity', 'city', 'carat', 'depth', 'table', 'x', 'y', 'z', 'depth_mm', 'avg_girdle', 
                 'table_mm', 'table_depth', 'xyz']

for comb in combinations_features:
    print(comb)

(0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0)


# Optimaze the model
### Obtain the data with the best combination

In [None]:
df_transform = version1_without_scaler2(diamond_train_df,
                                        drop_zeros_var=1,
                                        imputation_var=0,
                                        remove_outliers_var=1,
                                        remove_duplicates_var=1,
                                        feature_ing_var=0,
                                        delete_features_var=0)

# splitting the dataset in test and train data . The prece will be the Target and the other columns the features
X = df_transform.drop('price',axis = 1)
y = df_transform['price']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=42)

In [None]:
df_transform.shape

### Obtain the best parameters of the model
#### Gradient busting regresor

In [None]:
param_grid = {'n_estimators': [100, 250, 500, 750, 1000],  # Number of boosting stages to be run.
              'learning_rate': [0.01, 0.05, 0.1],  # Rate at which the contribution of each tree is shrunk.
              'max_depth': [None, 3, 6, 10],  # Maximum depth of the individual regression estimators.
              'min_samples_split': [2, 10],  # Minimum number of samples required to split an internal node.
              'min_samples_leaf': [1, 4],  # Minimum number of samples required to be at a leaf node.
              'max_features': [None, 'sqrt', 'log2']  # The number of features to consider when looking for the best split.
              }

In [None]:
# Gradient Boosting Regressor
model = GradientBoostingRegressor(random_state = 42)

param_grid = {'n_estimators': [400, 600],  # Number of boosting stages to be run.
              'learning_rate': [0.01],  # Rate at which the contribution of each tree is shrunk.
              'max_depth': [None],  # Maximum depth of the individual regression estimators.
              'min_samples_split': [2],  # Minimum number of samples required to split an internal node.
              'min_samples_leaf': [4, 6],  # Minimum number of samples required to be at a leaf node.
              'max_features': [None, 'sqrt', 'log2']  # The number of features to consider when looking for the best split.
              }

grid_search = GridSearchCV(model,
                           param_grid,
                           cv=5,
                           verbose=3,
                           scoring='neg_root_mean_squared_error',
                           n_jobs=-1)

In [None]:
%%time

# Obtain the best parameters
grid_search.fit(X_train, y_train)

print('\n')
print('Best hyperparameters: ', grid_search.best_params_, '\n')
print('Best score: ', -grid_search.best_score_, '\n')

In [None]:
# Random Forest Regressor
param_grid = {'n_estimators': [100, 200, 300],  # Number of trees in the forest.
              'max_depth': [None, 3, 10],  # Maximum depth of the trees.
              'min_samples_split': [2, 10],  # Minimum number of samples required to split an internal node.
              'min_samples_leaf': [1, 4],  # Minimum number of samples required to be at a leaf node.
              'max_features': [None, 'sqrt', 'log2']  # Number of features to consider when looking for the best split.
              }

In [None]:
%%time

# Obtain the best parameters
grid_search.fit(X_train, y_train)

print('\n')
print('Best hyperparameters: ', grid_search.best_params_, '\n')
print('Best score: ', -grid_search.best_score_, '\n')

# Final process

In [None]:
results = []
drop_zeros_var=1,
imputation_var=0,
remove_outliers_var=1,
remove_duplicates_var=1,
feature_ing_var=0,
delete_features_var=0
encoder_var = 1

df_transform = version1_without_scaler2(diamond_train_df,
                                        drop_zeros_var,
                                        imputation_var,
                                        remove_outliers_var,
                                        remove_duplicates_var,
                                        feature_ing_var,
                                        delete_features_var)

In [None]:
# Configuration Pipeline
pipelines = [rf_pipeline]
models = ['Random Forest']

# splitting the dataset in test and train data . The prece will be the Target and the other columns the features
X = df_transform.drop('price',axis = 1)
y = df_transform['price']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=42)
#print(X_train.head())
        
# Fit our models to the training data
for i in pipelines :
    i.fit(X_train , y_train)
#print(X_train.head())
        
cv_results = []
for i, model in enumerate(pipelines):
    # Cross validation
    cv_score = cross_val_score(model, X_train, y_train, scoring="neg_root_mean_squared_error", cv=5)
    cv_results.append(cv_score)
    #print("%s: %f " % (models[i], cv_score.mean()))

    # Test the result
    pred = model.predict(X_test)
    #print("RMSE:",np.sqrt(metrics.mean_squared_error(y_test, pred)))

    # Store the results in the results list
    results.append({'drop_zeros_var': drop_zeros_var, 
                    'imputation_var': imputation_var, 
                    'remove_outliers_var': remove_outliers_var, 
                    'remove_duplicates_var': remove_duplicates_var, 
                    'encoder_var': encoder_var, 
                    'feature_ing_var': feature_ing_var, 
                    'delete_features_var': delete_features_var,
                    f'{models[i]}': cv_score.mean(),
                    'prediction': np.sqrt(metrics.mean_squared_error(y_test, pred))})
results_df = pd.DataFrame(results)

In [None]:
results_df

# Comment

In [None]:
"""
names = ["Linear Regression", "Ridge Regression", "Lasso Regression",
         "Decision Tree Regressor", "Random Forest Regressor", "Gradient Boosting Regressor",
         "Adaboost Regressor", "BaggingRegressor", "ExtraTreesRegressor","XGBRegressor", "XGBRFRegressor"]
models = [LinearRegression(), Ridge(), Lasso(), DecisionTreeRegressor(),
          RandomForestRegressor(), GradientBoostingRegressor(), 
          AdaBoostRegressor(), BaggingRegressor(), ExtraTreesRegressor(),XGBRegressor(), XGBRFRegressor()]
          
names = ["ExtraTreesRegressor", "Random Forest Regressor"]
models = [ExtraTreesRegressor(), RandomForestRegressor()]
"""