## Imports

In [2]:
# Hide warnings
import warnings
warnings.filterwarnings("ignore")

# Imports
import time
from IPython.display import display, Javascript
import itertools
import duckdb
import pandas as pd
import numpy as np
import math

# Plot
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_regression

# Data process
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

# Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import PowerTransformer

# Train
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

# Models
import umap
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
#from catboost import CatBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Metrics
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.metrics import r2_score

# Install sqlite as an extension of duckdb
#duckdb.install_extension('sqlite')

## Functions

In [3]:
def encoder(df):
    """Return a copy of ``df`` with every text-like column label-encoded.

    A column is treated as text-like when its dtype is ``object``, a pandas
    string dtype, or ``category``. Each such column is replaced by the integer
    codes produced by a fresh ``LabelEncoder`` fitted on that column alone.
    All other columns are copied through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the same columns and index, where text-like
        columns hold integer codes.

    Notes
    -----
    A new encoder is fitted on every call, so the code given to a label
    depends on the labels present in that call. Two frames only share a
    mapping when the column holds the same set of distinct labels in both.
    """
    df_enc = df.copy()

    for column in df.columns:
        dtype = df[column].dtype
        # Checking only `dtype == 'object'` misses categorical columns and
        # pandas' dedicated string dtypes, which would then be left as text.
        is_text_like = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if is_text_like:
            df_enc[column] = LabelEncoder().fit_transform(df[column])
    return df_enc

def drop_zeros(df, columns=('x', 'y', 'z')):
    """Return the rows of ``df`` in which none of ``columns`` equals zero.

    Rows are selected with a boolean mask rather than dropped by index label,
    so a frame with repeated index labels only loses the rows that actually
    contain a zero. Missing values do not compare equal to zero and are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    columns : iterable of str, optional
        Columns to check for zeros. Defaults to ``('x', 'y', 'z')``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows, with their original index.

    Raises
    ------
    KeyError
        If any of ``columns`` is not present in ``df``.
    """
    non_zero = (df[list(columns)] != 0).all(axis=1)
    return df.loc[non_zero].copy()

## Import data

In [21]:
# Open the training database read-only: nothing below writes to it, and a
# read-only connection raises on a wrong path instead of silently creating a
# new, empty database file.
con = duckdb.connect("../data/train/diamonds_train.db", read_only=True)

# Query to extract data from database: one row per diamond, joining the
# properties table to its lookup tables (cut, color, clarity, city) and to
# the transactional and dimensions tables.
query_full = """
SELECT
    --tra.index_id,
    cut.cut,
    col.color,
    cla.clarity,
    tra.price,
    cit.city,
    tra.carat,
    dim.depth,
    dim.table,
    dim.x,
    dim.y,
    dim.z
FROM diamonds_properties AS pro
JOIN diamonds_cut AS cut ON pro.cut_id = cut.cut_id
JOIN diamonds_color AS col ON pro.color_id = col.color_id
JOIN diamonds_clarity AS cla ON pro.clarity_id = cla.clarity_id
JOIN diamonds_transactional as tra ON pro.index_id = tra.index_id
JOIN diamonds_city AS cit ON tra.city_id = cit.city_id
JOIN diamonds_dimensions AS dim ON pro.index_id = dim.index_id
"""

# Materialise the result as a pandas DataFrame, then release the database
# file even if the query fails.
try:
    diamond_train_df = con.execute(query_full).df()
finally:
    con.close()
diamond_train_df.head()

Unnamed: 0,cut,color,clarity,price,city,carat,depth,table,x,y,z
0,Premium,J,VS2,4268,Dubai,1.21,62.4,58.0,6.83,6.79,4.25
1,Very Good,H,VS2,505,Kimberly,0.32,63.0,57.0,4.35,4.38,2.75
2,Fair,G,VS1,2686,Las Vegas,0.71,65.5,55.0,5.62,5.53,3.65
3,Good,D,SI1,738,Kimberly,0.41,63.8,56.0,4.68,4.72,3.0
4,Ideal,G,SI1,4882,Dubai,1.02,60.5,59.0,6.55,6.51,3.95


In [5]:
# Read the test-set CSV and preview its first rows.
diamond_test_df = pd.read_csv(filepath_or_buffer="../data/test/diamonds_test.csv")
diamond_test_df.head(n=5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.2,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.9,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.9,Kimberly
4,4,0.5,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam


In [6]:
# Read the CSV of previously recorded model parameters and scores, and preview it.
parameters_df = pd.read_csv(filepath_or_buffer='./parameters_training/best_parameters_prediction_models.csv')
parameters_df.head(n=5)

Unnamed: 0,Model,cv_score,rmse,Submission,Features,Transformations,Estimators,Hyperparameters
0,Voting model,534.5,529.8,538,"['cut', 'color', 'clarity', 'city', 'carat', '...","['encoding', 'drop_zeros', 'remove_outliers', ...","['lgbm', 'xgb', 'extrees', 'rf']",{}
1,RF,563.6,541.8,597,"['cut', 'color', 'clarity', 'city', 'depth', '...","['encoding', 'drop_zeros', 'remove_outliers', ...",[None],"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."
2,RF,563.1,541.9,597,"['cut', 'color', 'clarity', 'city', 'depth', '...","['encoding', 'drop_zeros', 'remove_outliers', ...",[None],"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."
3,RF,568.2,545.7,0,"['cut', 'color', 'clarity', 'city', 'depth', '...","['encoding', 'drop_zeros', 'remove_outliers', ...",[None],"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."
4,RF,578.5,552.8,552,"['cut', 'color', 'clarity', 'carat_log', 'dept...","['encoding', 'imputation', 'remove_outliers', ...",[None],"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."


## Train

In [7]:
# Hyperparameters for the XGBoost regressor used in the rest of this section.
param = dict(
    colsample_bytree=0.95,
    gamma=0.14,
    learning_rate=0.012,
    max_depth=7,
    missing=np.inf,
    n_estimators=1130,
    subsample=0.8,
)
# `model` is the name the training and prediction cells use; it refers to the
# same estimator object as `xgb_model`.
model = xgb_model = XGBRegressor(**param)

In [22]:
# Reorder the training columns (features first, target `price` last), then
# label-encode the text columns and remove rows with a zero x, y or z.
diamond_train_df = diamond_train_df[
    ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'city', 'price']
]
transformed_df = drop_zeros(encoder(diamond_train_df))

# Split into feature matrix and target vector.
X = transformed_df.drop(columns='price')
y = transformed_df.loc[:, 'price']

In [26]:
# Remove the identifier column from the test set and label-encode what is left.
# NOTE(review): encoder() fits new encoders on this frame, separately from the
# training frame - confirm both frames contain the same labels per column.
transformed_df_test = diamond_test_df.drop(columns=['id'])
X_test = encoder(transformed_df_test)
X_test.shape

(13485, 10)

In [24]:
%%time
# Hold out 25% of the training rows for validation. The hold-out is named
# X_val / y_val so that it does not overwrite X_test, the encoded test set
# that the prediction cell further down passes to model.predict().
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

# Cross validation: 5-fold scores on the training split. The scorer returns
# negated RMSE values, hence the abs() when the mean is reported below.
cv_results = []
cv_score = cross_val_score(model, X_train, y_train, scoring="neg_root_mean_squared_error", cv=5)
cv_results.append(cv_score)

# Train on the training split
model.fit(X_train, y_train)
# Predict on the validation hold-out
y_val_pred = model.predict(X_val)
# Prints
hyperparameters = model.get_params()
cv_score_mean = abs(np.mean(cv_results))
rmse = mean_squared_error(y_val, y_val_pred)**0.5

print('Hyperparameters: ', hyperparameters, ' | cv_score_mean:', cv_score_mean, ' | rmse:', rmse)

Hyperparameters:  {'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.95, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': 0.14, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.012, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 7, 'max_leaves': None, 'min_child_weight': None, 'missing': inf, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 1130, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': 0.8, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}  | cv_score_mean: 544.8587207086467  | rmse: 530.5178452981997
CPU times: total: 3mi

In [27]:
# Run the fitted model on X_test and show how many predictions it returned.
y_pred = model.predict(X_test)
y_pred.shape[0]

13485

In [29]:
# Create and store the dataframe to upload to kaggle.
# The `id` values come from the test set itself rather than from the position
# of each prediction, so each price is written next to the id of its own row.
# pandas raises a ValueError here if the number of predictions does not match
# the number of test rows.
y_pred_df = pd.DataFrame({'id': diamond_test_df['id'].to_numpy(), 'price': y_pred})
y_pred_df.to_csv('../data/submisions/XGB_only_dropzero_2.csv', index=False)

In [None]:
# Rename the dimension columns of `df` in place: x -> length, y -> width,
# z -> depth, table -> table_width, and the existing depth column -> depth%.
# NOTE(review): `df` is not assigned in any cell visible in this notebook and
# this cell has no execution count - confirm which frame it is meant to act on
# before running it.
df.rename(columns={'x': 'length', 'y': 'width', 
                   'depth': 'depth%', 'z':'depth',
                   'table':'table_width'}, inplace=True)

# Display the renamed frame.
df