## Imports

In [1]:
# Hide warnings
import warnings
warnings.filterwarnings("ignore")

# Imports
import time
from IPython.display import display, Javascript
import itertools
import duckdb
import pandas as pd
import numpy as np
import math

# Plot
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_regression

# Data process
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

# Pipeline
from sklearn.pipeline import Pipeline

# Train
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

# Models
import umap
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Metrics
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.metrics import r2_score

# Install sqlite as a extension of duckdb
#duckdb.install_extension('sqlite')

## Functions

### Functions to correct errors

In [2]:
# Function to drop zeros
def drop_zeros(df):
    df = df.drop(df[df['x'] == 0].index)
    df = df.drop(df[df['y'] == 0].index)
    df = df.drop(df[df['z'] == 0].index)
    return df

"""
# Function to remove outliers
def remove_outliers(df):
    df = df[(df['x'] < 30)]
    df = df[(df['y'] < 30)]
    df = df[(df['z'] < 7.5) & (df['z'] > 2)]
    df = df[(df['table'] < 80) & (df['table'] > 40)]
    df = df[(df['depth'] < 75) & (df['depth'] > 45)]
    return df
"""

def remove_outliers(df):
    if 'x' in df.columns:
        df = df[df['x'] < 20]
    if 'y' in df.columns:
        df = df[df['y'] < 20]
    if 'z' in df.columns:
        df = df[(df['z'] < 7.5) & (df['z'] > 2)]
    if 'table' in df.columns:
        df = df[(df['table'] < 80) & (df['table'] > 40)]
    if 'depth' in df.columns:
        df = df[(df['depth'] < 75) & (df['depth'] > 45)]
    return df

# Function to remove duplicates
def remove_duplicates(df):
    df = df.drop_duplicates()
    return df

# Function to impute values:
def imputation(df):
    # Calculate the median of each column
    median_x = df.loc[df['x'] != 0, 'x'].median()
    median_y = df.loc[df['y'] != 0, 'y'].median()
    median_z = df.loc[df['z'] != 0, 'z'].median()

    # Replace values equal to 0 by the corresponding median.
    df['x'] = df['x'].replace(0, median_x)
    df['y'] = df['y'].replace(0, median_y)
    df['z'] = df['z'].replace(0, median_z)
    return df

### Functions to encode

In [3]:
def encoder(df):
    df_enc = df.copy()

    # Obtain the dataframe encoded
    for column in df.columns:
        if df[column].dtype == 'object':
            enc_label = LabelEncoder()
            df_enc[column] = enc_label.fit_transform(df[column])
    return df_enc

### Functions to features ingeniering

In [4]:
def feature_ing(df_features):
    #print('Dataframe features: ',df_features.head())
    # Test the depth calculate
    df_features['depth_mm'] = (df_features['z']*2)/(df_features['x'] + df_features['y'])
    # Obtain the average girdle diameter
    df_features['avg_girdle'] = (df_features['z'])/(df_features['depth_mm'])
    # Obtain table in mm
    df_features['table_mm'] = (df_features['avg_girdle'])*(df_features['table'])/100
    # Obtain table*depth
    df_features['table_depth'] = (df_features['table'])/(df_features['depth'])
    # Obtain x, y, z
    df_features['xyz'] = (df_features['x'])*(df_features['y'])*(df_features['z'])
    return df_features

### Functions to remove uncorrelated features

In [5]:
# Function to delete features without correlation with price (train data)
def delete_features_train(df):
    # Calculate correlation matrix, round with two decimmals
    corr_matrix = round(df.corr(numeric_only=True).abs(),2)
    
    # Plot the heatmap
    #sns.set (rc = {'figure.figsize':(16, 6)})
    #sns.heatmap(corr_matrix, center=0, cmap='BrBG', annot=True)

    # Find features with correlation greater than 0.90
    to_drop = corr_matrix.columns[corr_matrix['price'] <= 0.1]
    #print(to_drop)

    # Drop features
    df_correct = df
    df_correct.drop(to_drop, axis=1, inplace=True)
    
    return df_correct,to_drop

# Function to delete features without correlation with price (test data)
def delete_features_test(df, to_drop):
    # Calculate correlation matrix, round with two decimmals
    corr_matrix = round(df.corr(numeric_only=True).abs(),2)
    
    # Plot the heatmap
    #sns.set (rc = {'figure.figsize':(16, 6)})
    #sns.heatmap(corr_matrix, center=0, cmap='BrBG', annot=True)

    # Drop features
    df_correct = df
    df_correct.drop(to_drop, axis=1, inplace=True)
    
    return df_correct

### Scaling function

In [6]:
def stardard_scale_test(df):
    X = df.drop('price',axis = 1)
    columns = X.columns
    # Scaler
    scaler = StandardScaler()
    # Scale X
    X_scaled = scaler.fit_transform(X)
    return X_scaled

### Pipeline function

In [7]:
# Building the Pipelines

#Linear Regression
lr_pipeline = Pipeline([
    ('scaler_1',StandardScaler()),
    ('lr_classifier',LinearRegression())
])
# knn
knn_pipline =Pipeline([
    ('scaler_2' ,StandardScaler()),
    ('knn_classifier',KNeighborsRegressor())
])

#XGB
xgb_pipeline = Pipeline([
    ('scaler_3', StandardScaler()),
    ('xgb_classifier', XGBRegressor())
])


#Decision Tree
dt_pipeline = Pipeline([
    ('scaler_4', StandardScaler()),
    ('dt_classifier', DecisionTreeRegressor())
])

#Random Forest
rf_pipeline = Pipeline([
    ('scaler_5', StandardScaler()),
    ('rf_classifier', RandomForestRegressor(random_state = 42,
                                            n_jobs=-1))
])

#pipelines = [lr_pipeline,knn_pipline,dt_pipeline,rf_pipeline]   #,xgb_pipeline
#models = ['Linear Regression', 'KNN', 'Decision Tree', 'Random Forest']   #, 'XGB'

pipelines = [rf_pipeline, xgb_pipeline]
models = ['Random Forest', 'XGB']

### Automation process

In [8]:
def version1_without_scaler2(df, drop_zeros_var, imputation_var, remove_outliers_var, remove_duplicates_var,
                             feature_ing_var, delete_features_var, encoder_var=1):
    # Transformations
    if drop_zeros_var == 1:
        df = drop_zeros(df)   # Drop zeros
        
    if imputation_var ==1:
        df = imputation(df)   # Imputation data
        
    if remove_outliers_var == 1:
        df = remove_outliers(df)   # Remove outliers
        
    if remove_duplicates_var == 1:
        df = remove_duplicates(df)   # Remove duplicates
        
    if encoder_var == 1:
        df = encoder(df)   # Encoding
        
    if feature_ing_var == 1:
        df = feature_ing(df)   # Feature ingeniering
        
    if delete_features_var == 1:
        df, to_drop = delete_features_train(df)   # Drop features
        #print(to_drop)
        
    return df

### Play sound

In [65]:
def play_sound():
    display(Javascript('new Audio("https://www.soundjay.com/button/beep-07.wav").play()'))

## Extraction data

In [34]:
diamond_train_df = pd.read_csv("../data/train/diamond_train_df_Nearest_all_features_1knn_umap.csv")
diamond_train_df.head()

Unnamed: 0,cut,color,clarity,city,carat,depth,table,x,y,z,price
0,4,1,3,1,1.01,63.1,61.0,6.34,6.27,3.98,4118
1,2,6,4,9,1.21,61.8,56.2,6.78,6.86,4.2,5604
2,3,4,2,2,1.7,61.8,61.0,7.59,7.52,4.67,11848
3,4,3,3,1,0.9,63.1,55.0,6.16,6.13,3.88,3452
4,4,0,4,4,0.62,63.4,55.0,5.46,5.43,3.45,2310


In [35]:
diamond_train_df.shape

(13485, 11)

In [19]:
diamond_test_df = pd.read_csv("../data/test/diamonds_test.csv")
diamond_test_df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.2,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.9,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.9,Kimberly
4,4,0.5,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam


In [251]:
diamond_train_df.shape

(40455, 12)

## Pipeline

In [11]:
result_df = pd.DataFrame()

# Definir las columnas con los nombres deseados
column_names = ['model', 'cv_score', 'prediction', 'drop_zeros_var', 'imputation_var', 'remove_outliers_var', 
                'remove_duplicates_var', 'encoder_var', 'feature_ing_var', 'delete_features_var', 'cut', 'color', 'clarity', 
                'city', 'carat', 'depth', 'table', 'x', 'y', 'z', 'depth_mm', 'avg_girdle', 'table_mm', 'table_depth', 'xyz']

# Asignar las columnas al DataFrame
result_df = result_df.reindex(columns=column_names)

In [12]:
def automate_features_choice(df, pipelines, models, result_df):
    # Combinations
    cut         = [1]
    color       = [1]   # [0, 1]
    clarity     = [1]
    city        = [1]
    carat       = [1]
    depth       = [1]
    table       = [1]
    x           = [1]
    y           = [1]
    z           = [1]
    depth_mm    = [0]
    avg_girdle  = [0]
    table_mm    = [0]
    table_depth = [0]
    xyz         = [0]
    

    # Generar todas las combinaciones posibles
    combinations_features = itertools.product(cut, color, clarity, city, carat, depth, table, x, y, z, depth_mm, 
                                              avg_girdle, table_mm, table_depth, xyz)

    #results = []
    for comb in combinations_features:
        #start_time = time.time()
        
        # Transform train data
        df_transform = version1_without_scaler2(df,
                                                drop_zeros_var=1,
                                                imputation_var=0,
                                                remove_outliers_var=1,
                                                remove_duplicates_var=1,
                                                feature_ing_var=1,
                                                delete_features_var=0)
        
        # Obtain the features to train the model
        cut, color, clarity, city, carat, depth, table, x, y, z, depth_mm,\
            avg_girdle, table_mm, table_depth, xyz = comb
        
        # List of all possible feature names
        features_to_train = []
        all_features = ['cut', 'color', 'clarity', 'city', 'carat', 'depth', 'table', 'x', 'y', 'z', 'depth_mm', 
                       'avg_girdle', 'table_mm', 'table_depth', 'xyz']
        
        # Iterate over features and add them to variables_with_value_1 if they have a value of 1
        for feature in all_features:
            if locals()[feature] == 1:
                features_to_train.append(feature)

        # splitting the dataset in test and train data . The prece will be the Target and the other columns the features
        X = df_transform[features_to_train]
        y = df_transform['price']
        
        # There are combinations in which all values are zero, in that case it isn't necessary train model
        if df_transform.shape[1] >= 1:
            # Split data
            X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=42)
        
            # Fit our models to the training data
            for i in pipelines :
                i.fit(X_train , y_train)

            cv_results = []
            for i, model in enumerate(pipelines):
                # Cross validation
                cv_score = cross_val_score(model, X_train, y_train, scoring="neg_root_mean_squared_error", cv=5)
                cv_results.append(cv_score)

                # Test the result
                pred = model.predict(X_test)

                # Create list with featers used
                features_value_list = list(comb)
                
                # Create list with process parameters
                cv_score_mean = abs(np.mean(cv_results))
                rmse = np.sqrt(metrics.mean_squared_error(y_test, pred))
                process_list = [model, cv_score_mean, rmse, 1, 0, 1, 1, 1, 1, 0]
                
                # Create complete list
                registros = process_list + features_value_list
                
                # Store the results in the results dataframe
                result_df.loc[len(result_df.index)] = registros
                print(model, rmse, comb)
                
            #end_time = time.time()
            #print("Execution time: ", end_time - start_time)
    
    return result_df

In [72]:
%%time
pipelines = [rf_pipeline]   #[lr_pipeline, knn_pipline, dt_pipeline, rf_pipeline, xgb_pipeline]
models = ['Random Forest']   #['Linear Regression', 'KNN', 'Decision Tree', 'Random Forest', 'XGB']

# Execute functions and sort data
result_df = automate_features_choice(diamond_train_df, pipelines, models, result_df)
#play_sound()

pd.set_option('display.max_columns', None)
result_df_sorted = result_df.sort_values(by='prediction')
result_df_sorted.head(20)

Pipeline(steps=[('scaler_5', StandardScaler()),
                ('rf_classifier', RandomForestRegressor())]) 561.9874434455998 (1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1)
CPU times: total: 2min 26s
Wall time: 2min 27s


Unnamed: 0,model,cv_score,prediction,drop_zeros_var,imputation_var,remove_outliers_var,remove_duplicates_var,encoder_var,feature_ing_var,delete_features_var,cut,color,clarity,city,carat,depth,table,x,y,z,depth_mm,avg_girdle,table_mm,table_depth,xyz
2,"(StandardScaler(), (DecisionTreeRegressor(max_...",567.496672,555.614935,0,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0
3,"(StandardScaler(), (DecisionTreeRegressor(max_...",572.976981,557.117982,0,0,1,1,1,1,0,0,1,1,0,1,0,1,1,1,1,0,0,0,0,0
4,"(StandardScaler(), (DecisionTreeRegressor(max_...",572.404492,557.922636,0,0,1,1,1,1,0,0,1,1,0,1,0,0,1,1,1,0,0,1,0,0
7,"(StandardScaler(), (DecisionTreeRegressor(max_...",568.553369,558.481132,0,0,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,0,0,1,0,0
6,"(StandardScaler(), (DecisionTreeRegressor(max_...",569.813371,558.987327,0,0,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,0,1,1,0,0
5,"(StandardScaler(), (DecisionTreeRegressor(max_...",571.544427,559.83955,0,0,1,1,1,1,0,0,1,1,0,1,0,0,1,1,1,0,1,1,0,0
8,"(StandardScaler(), (DecisionTreeRegressor(max_...",567.880659,561.987443,1,0,1,1,1,1,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,1
1,"(StandardScaler(), (DecisionTreeRegressor(max_...",1380.499696,1358.302937,0,0,1,1,1,1,0,0,0,0,0,1,0,1,1,1,1,1,1,1,1,0
0,Random Forest,-1381.38826,1360.670588,0,0,1,1,1,1,0,0,0,0,0,1,0,1,1,1,1,1,1,1,1,0


# Test

In [139]:
# Observe the non-linear relation between features
X = transformed_df.copy()
y = X.pop("price")

def make_mi_scores(X, y):
    mi_scores = mutual_info_regression(X, y)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index = X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

mi_scores = make_mi_scores(X, y)
mi_scores

carat_log      1.920812
carat          1.915878
x              1.471532
y              1.470174
xyz            1.454501
z              1.425507
avg_girdle     1.421155
table_mm       1.102496
clarity        0.349953
color          0.276316
cut            0.099420
table          0.053149
depth_mm       0.044567
table_depth    0.039373
depth          0.028235
city           0.003853
Name: MI Scores, dtype: float64

In [14]:
# Function to calculate log
def calculate_log(df, name):
    list_log= []
    for i in df['carat']:
        list_log.append(math.log(i))
    
    new_name = name + '_log'
    df[new_name] = list_log
    return df

# Funtion to classify diamond shape
def classify_shape(df):
    shape = []
    for i in df['table'].index:
        if 54<df['table'][i]<57 and 61<df['depth'][i]<62.5:
            shape.append('Round')
        elif 52<df['table'][i]<60 and 60<df['depth'][i]<68:
            shape.append('Oval')
        elif 63<df['table'][i]<69 and 69<df['depth'][i]<76:
            shape.append('Princess')
        elif 58<df['table'][i]<63 and 58<df['depth'][i]<66:
            shape.append('Cushion')
        else:
            shape.append('others')
            
    df['shape'] = shape
    return df

## Transform and train diamond_train

In [29]:
# Transform
transformed_df = classify_shape(diamond_train_df)
transformed_df = encoder(transformed_df)
transformed_df = drop_zeros(transformed_df)
#transformed_df = imputation(transformed_df)
transformed_df = remove_outliers(transformed_df)
transformed_df = remove_duplicates(transformed_df)
transformed_df = feature_ing(transformed_df)

# default = ['cut', 'color', 'clarity', 'city', 'carat', 'depth', 'table', 'x', 'y', 'z', 'price']

all_features = ['cut', 'color', 'clarity', 'city', 'depth','carat', 'depth', 'table', 'x', 'y', 'z', 'depth_mm', 
                'avg_girdle', 'table_mm', 'table_depth', 'xyz', 'price', 'carat_log', 'x_log', 'y_log',
                'z_log', 'ratio_length_width', 'ratio_length_width_depth', 'volume', 'density']
selection_features = ['cut', 'color', 'clarity', 'city', 'carat', 'depth', 'table', 'x', 'y', 'z', 'price', 'shape']


transformed_df = calculate_log(transformed_df, 'carat')
transformed_df = calculate_log(transformed_df, 'x')
transformed_df = calculate_log(transformed_df, 'y')
transformed_df = calculate_log(transformed_df, 'z')

transformed_df['ratio_length_width'] = transformed_df['x']/transformed_df['y']
transformed_df['ratio_length_width_depth'] = transformed_df['x']/transformed_df['y']/transformed_df['z']
transformed_df['volume'] = transformed_df['x']*transformed_df['y']*transformed_df['z']
transformed_df['density'] = transformed_df['carat']/transformed_df['volume']

#Doesn't make sense to have diamonds with width and depth higher than 20:
#transformed_df = transformed_df.loc[~(transformed_df['carat_log'] == 0 )]

transformed_df_2 = transformed_df[selection_features]
transformed_df_2.head()

Unnamed: 0,cut,color,clarity,city,carat,depth,table,x,y,z,price,shape
0,4,2,2,0,0.9,62.6,60.0,6.1,6.14,3.83,3950,0
1,4,2,2,1,0.81,63.1,59.0,5.85,5.79,3.67,2809,1
2,4,3,2,0,0.81,62.5,60.0,5.89,5.94,3.69,2806,0
3,2,6,4,10,1.14,61.5,57.0,6.7,6.73,4.13,5392,1
4,2,6,4,10,1.33,61.3,57.0,7.11,7.08,4.35,6118,1


In [30]:
%%time
"""
# Transform
transformed_df = encoder(diamond_train_df)
transformed_df = drop_zeros(transformed_df)
transformed_df = remove_outliers(transformed_df)
transformed_df = remove_duplicates(transformed_df)
transformed_df = feature_ing(transformed_df)
#scaled_df = stardard_scale_test(transformed_df)

X = transformed_df.drop('price',axis = 1)
y = transformed_df['price']
"""
X = transformed_df_2.drop('price',axis = 1)
y = transformed_df_2['price']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=42)

# Model

model = RandomForestRegressor(random_state=42,
                              n_estimators=500,
                              #max_features='log2',
                              n_jobs=-1)
"""
model = GradientBoostingRegressor(random_state = 42,
                                  n_estimators=100)
"""

# Cross validation
cv_results = []
cv_score = cross_val_score(model, X_train, y_train, scoring="neg_root_mean_squared_error", cv=5)
cv_results.append(cv_score)

# Train
model.fit(X_train, y_train)
# Predict
y_pred = model.predict(X_test)
# Prints
hyperparameters = model.get_params()
cv_score_mean = abs(np.mean(cv_results))
rmse = mean_squared_error(y_test, y_pred)**0.5

print('Hyperparameters: ', hyperparameters, ' | cv_score_mean:', cv_score_mean, ' | rmse:', rmse)

Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  | cv_score_mean: 590.2481135772182  | rmse: 571.9045673014758
CPU times: total: 5min 41s
Wall time: 46.5 s


In [44]:
model_optimaze = model

In [None]:
# Features = ['cut', 'color', 'clarity', 'city', 'depth', 'table', 'carat_log', 'x_log', 'y_log', 'z_log',
#             'ratio_length_width', 'ratio_length_width_depth', 'volume', 'density', 'price']
# encoding, drop_zeros, remove_outliers y remove_duplicates | n_stimators=500 
# Submission = 597
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  
    | cv_score_mean: 563.6224504060208  | rmse: 541.8872683394235
CPU times: total: 17min 7s
Wall time: 2min 22s

# Features = ['cut', 'color', 'clarity', 'city', 'depth', 'table', 'carat_log', 
#             'ratio_length_width', 'ratio_length_width_depth', 'volume', 'density', 'price']
# encoding, drop_zeros, remove_outliers y remove_duplicates | n_stimators=500
# Submission = 597
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  
    | cv_score_mean: 563.1380099851094  | rmse: 541.9037907071685
CPU times: total: 14min 32s
Wall time: 1min 56s

# Features = ['cut', 'color', 'clarity', 'city', 'depth', 'table','x', 'y', 'z', 'carat_log', 
#             'ratio_length_width', 'ratio_length_width_depth', 'volume', 'density', 'price']
# encoding, drop_zeros, remove_outliers y remove_duplicates | n_stimators=500 
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  
    | cv_score_mean: 568.2830813092132  | rmse: 545.7160325766089
CPU times: total: 18min 18s
Wall time: 2min 32s

# Features = ['cut', 'color', 'clarity', 'carat_log', 'depth', 'table', 'price']
# encoding, imputation, remove_outliers y remove_duplicates | n_stimators=500
# Submission = 552
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  
    | cv_score_mean: 578.5952143324417  | rmse: 552.891156245962
CPU times: total: 4min 37s
Wall time: 40.2 s

# Features de serie, encoding, drop_zeros, remove_outliers y remove_duplicates | n_stimators=500
# Submission = 543---------------------------------------------------------------
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  
    | cv_score_mean: 570.0212938830194  | rmse: 552.9419513838835  | rmse_2: 552.9419513838835
CPU times: total: 9min 28s
Wall time: 1min 22s
    
# Features de serie, encoding, drop_zeros, remove_outliers y remove_duplicates | n_stimators=600
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 600, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  
    | cv_score_mean: 569.6888950649824  | rmse: 553.1880851479532  | rmse_2: 553.1880851479532
CPU times: total: 11min 53s
Wall time: 1min 40s

# Features de serie, encoding, drop_zeros, remove_outliers y remove_duplicates | n_stimators=400
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 300, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  
    | cv_score_mean: 570.392820611711  | rmse: 553.2600932696147  | rmse_2: 553.2600932696147
CPU times: total: 4min 35s
Wall time: 42.8 s

# Features = ['cut', 'color', 'clarity', 'city', 'depth', 'table', 'x', 'y', 'z', 'carat_log', 'price']
# encoding, drop_zeros, remove_outliers y remove_duplicates | n_stimators=500 
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  
    | cv_score_mean: 570.336771427553  | rmse: 553.3567161129744
CPU times: total: 9min 54s
Wall time: 1min 32s

# Features = ['cut', 'color', 'clarity', 'city', 'depth', 'table', 'x', 'y', 'z', 'carat_log', 'price']
# encoding, drop_zeros, remove_outliers y remove_duplicates | n_stimators=500 
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  
    | cv_score_mean: 570.2595671275265  | rmse: 553.4054287603929
CPU times: total: 9min 10s
Wall time: 1min 14s
    
# Features de serie, encoding, drop_zeros, remove_outliers y remove_duplicates | n_stimators=400
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  
    | cv_score_mean: 573.2463024252991  | rmse: 553.7944242193641  | rmse_2: 553.7944242193641
CPU times: total: 1min 27s
Wall time: 13.4 s
    
# Features de serie, encoding, drop_zeros, remove_outliers y remove_duplicates | n_stimators=400
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  
    | cv_score_mean: 574.2712643507491  | rmse: 554.488321315124  | rmse_2: 554.488321315124
CPU times: total: 1min 23s
Wall time: 13.1 s

# Features = ['cut', 'color', 'clarity', 'city', 'carat', 'depth', 'table', 'x', 'y', 'z', 'price', 'shape']
# encoding, drop_zeros, remove_outliers y remove_duplicates | n_stimators=500
# Submission = 542 ----------------------------------------------------------
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  
    | cv_score_mean: 570.1513571076711  | rmse: 554.7242488195719
CPU times: total: 10min 2s
Wall time: 1min 25s
    
# Features de serie, encoding, drop_zeros, remove_outliers y remove_duplicates | n_stimators=400
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 400, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  
    | cv_score_mean: 568.2230549678686  | rmse: 555.2393808873512  | rmse_2: 555.2393808873512
CPU times: total: 15min 42s
Wall time: 2min 12s
    
# Features = ['cut', 'color', 'clarity', 'city', 'carat', 'table','x', 'y', 'z', 'carat_log', 'price'] 
# encoding, drop_zeros, remove_outliers y remove_duplicates | n_stimators=500 
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  
    | cv_score_mean: 571.09845582581  | rmse: 555.2875203635209  | rmse_2: 555.2875203635209
CPU times: total: 8min 40s
Wall time: 1min 11s

# Features = ['cut', 'color', 'clarity', 'city', 'carat_log', 'table','x', 'y', 'z', 'price']
# encoding, drop_zeros, remove_outliers y remove_duplicates | n_stimators=50
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  
    | cv_score_mean: 570.9756633240581  | rmse: 555.6053281565353  | rmse_2: 555.6053281565353
CPU times: total: 8min 43s
Wall time: 1min 15s
    
# Features = ['cut', 'color', 'clarity', 'city', 'carat', 'depth', 'x', 'y', 'z','avg_girdle', 'table_mm', 'xyz', 'price'] 
# encoding, drop_zeros, remove_outliers y remove_duplicates | n_stimators=500   
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  
    | cv_score_mean: 567.5032514631432  | rmse: 555.7750839709606  | rmse_2: 555.7750839709606
CPU times: total: 14min 5s
Wall time: 1min 53s

# Features = ['cut', 'color', 'clarity', 'carat_log', 'table','x', 'y', 'z', 'price']
# encoding, drop_zeros, remove_outliers y remove_duplicates | n_stimators=50
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  | cv_score_mean: 569.945980050026  | rmse: 556.4799516795906  | rmse_2: 556.4799516795906
CPU times: total: 6min 35s
Wall time: 55 s

# Features = ['cut', 'color', 'clarity', 'city', 'carat', 'depth', 'x', 'y', 'z', 'table_mm', 'price']
# encoding, drop_zeros, remove_outliers y remove_duplicates | n_stimators=50
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  
    | cv_score_mean: 570.4899544500876  | rmse: 556.0081586081567  | rmse_2: 556.0081586081567
CPU times: total: 10min 40s
Wall time: 1min 30s

# Features = ['cut', 'color', 'clarity', 'city', 'carat', 'depth', 'x', 'y', 'z', 'avg_girdle', 'table_mm', 'price']
# encoding, drop_zeros, remove_outliers y remove_duplicates | n_stimators=50
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  
    | cv_score_mean: 569.6400678426654  | rmse: 556.1522654109002  | rmse_2: 556.1522654109002
CPU times: total: 12min 2s
Wall time: 1min 44s

# Features = [['cut', 'color', 'clarity', 'city', 'carat', 'depth', 'avg_girdle', 'table_mm', 'xyz', 'price']] 
# encoding, drop_zeros, remove_outliers y remove_duplicates | n_stimators=500 
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  
    | cv_score_mean: 571.0132381609183  | rmse: 562.8086785378399  | rmse_2: 562.8086785378399
CPU times: total: 10min 44s
Wall time: 1min 26s

# Features = ['cut', 'color', 'clarity', 'city', 'carat_log', 'depth', 'table','x_log', 'y_log', 'z_log', 'price']
# encoding, drop_zeros, remove_outliers y remove_duplicates | n_stimators=500 
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  
    | cv_score_mean: 578.4665781312609  | rmse: 563.1005194121811
CPU times: total: 7min 48s
Wall time: 1min 7s
    
# Features de serie y solo encoding | n_stimators=400
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  
    | cv_score_mean: 568.7680257934356  | rmse: 587.9591159724442  | rmse_2: 587.9591159724442
CPU times: total: 1min 2s
Wall time: 1min 3s
    
# Features = ['color', 'carat', 'table', 'x', 'y', 'z', 'price'] 
# encoding, drop_zeros, remove_outliers y remove_duplicates | n_stimators=500
Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  
    | cv_score_mean: 1193.7263388731408  | rmse: 1176.9075341911703  | rmse_2: 1176.9075341911703
CPU times: total: 6min 9s
Wall time: 53.4 s

## Transform test data and obtain the prediction to upload in kaggle

In [36]:
# Transform
def transformation_data(df, type_data):
    trans_df = classify_shape(df)
    trans_df = encoder(trans_df)
    trans_df = imputation(trans_df)

    if type_data == 'train_data':
        #trans_df = drop_zeros(trans_df)
        trans_df = remove_outliers(trans_df)
        trans_df = remove_duplicates(trans_df)
        selection_features = ['cut', 'color', 'clarity', 'city', 'carat', 'depth', 'table', 'x', 'y', 'z', 'shape', 'price']
        
        #selection_features = ['cut', 'color', 'clarity', 'city', 'carat', 'depth', 'table','x', 'y', 'z', 'price']
    if type_data == 'test_data':
        selection_features = ['cut', 'color', 'clarity', 'city', 'carat', 'depth', 'table', 'x', 'y', 'z', 'shape']
        #selection_features = ['cut', 'color', 'clarity', 'city', 'carat', 'depth', 'table', 'x', 'y', 'z']
    trans_df = feature_ing(trans_df)

    trans_df = calculate_log(trans_df, 'carat')
    trans_df = calculate_log(trans_df, 'x')
    trans_df = calculate_log(trans_df, 'y')
    trans_df = calculate_log(trans_df, 'z')


    trans_df['ratio_length_width'] = trans_df['x']/trans_df['y']
    trans_df['ratio_length_width_depth'] = trans_df['x']/trans_df['y']/trans_df['z']
    trans_df['volume'] = trans_df['x']*trans_df['y']*trans_df['z']
    trans_df['density'] = trans_df['carat']/trans_df['volume']
    
    trans_df_2 = trans_df[selection_features]
    trans_df_2.head()
    
    return trans_df_2

In [38]:
diamond_train_df.shape

(13485, 12)

In [37]:
%%time

# Transform diamond_train_df
train_df = transformation_data(diamond_train_df, 'train_data')
X_train = train_df.drop('price',axis = 1)
y_train = train_df['price']

# Transform diamond_test_df
X_test = transformation_data(diamond_test_df, 'test_data')

# Model
model = RandomForestRegressor(random_state=42,
                              n_estimators=500,
                              #max_features='log2',
                              n_jobs=-1)

# Cross validation
cv_results = []
cv_score = cross_val_score(model, X_train, y_train, scoring="neg_root_mean_squared_error", cv=5)
cv_results.append(cv_score)

# Train
model.fit(X_train, y_train)
# Predict
y_pred = model.predict(X_test)
# Prints
hyperparameters = model.get_params()
cv_score_mean = abs(np.mean(cv_results))
#rmse = mean_squared_error(y_test, y_pred)**0.5

print('Hyperparameters: ', hyperparameters, ' | cv_score_mean:', cv_score_mean)   #, ' | rmse:', rmse)

Hyperparameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}  | cv_score_mean: 610.2152060900632
CPU times: total: 3min 8s
Wall time: 26.7 s


In [39]:
len(y_pred)

13485

In [40]:
y_pred_df = pd.DataFrame(y_pred)
y_pred_df.reset_index(inplace=True)
y_pred_df.columns = ['id', 'price']
y_pred_df.to_csv('submission.csv', index=False)

# Optimize the model

In [45]:
param_grid = {'n_estimators': [500],  # Number of trees in the forest.
              'max_depth': [None, 3, 10],  # Maximum depth of the trees.
              'min_samples_split': [2, 10],  # Minimum number of samples required to split an internal node.
              'min_samples_leaf': [1, 4],  # Minimum number of samples required to be at a leaf node.
              'max_features': [None, 'sqrt', 'log2']  # Number of features to consider when looking for the best split.
              }

grid_search = GridSearchCV(model_optimaze,
                           param_grid,
                           cv=5,
                           verbose=3,
                           scoring='neg_root_mean_squared_error',
                           n_jobs=-1)

In [46]:
%%time

grid_search.fit(X,y)

print('\n')
print('Best hyperparameters: ', grid_search.best_params_, '\n')
print('Best score: ', -grid_search.best_score_, '\n')

Fitting 5 folds for each of 36 candidates, totalling 180 fits


Best hyperparameters:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500} 

Best score:  574.9872926157677 

CPU times: total: 1min 42s
Wall time: 7min 57s
