# Imports

In [2]:
# Hide warnings
import warnings
warnings.filterwarnings("ignore")

# Imports
import itertools
import duckdb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler, PolynomialFeatures
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.cluster import DBSCAN
import umap

import sklearn
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.metrics import r2_score

# Install sqlite as a extension of duckdb
#duckdb.install_extension('sqlite')

## Imports data

In [1]:
# create a connection to a file called 'file.db'
con = duckdb.connect("../data/train/diamonds_train.db")

# Query to extract data from database
query_full = """
SELECT
    --tra.index_id,
    cut.cut,
    col.color,
    cla.clarity,
    tra.price,
    cit.city,
    tra.carat,
    dim.depth,
    dim.table,
    dim.x,
    dim.y,
    dim.z
FROM diamonds_properties AS pro
JOIN diamonds_cut AS cut ON pro.cut_id = cut.cut_id
JOIN diamonds_color AS col ON pro.color_id = col.color_id
JOIN diamonds_clarity AS cla ON pro.clarity_id = cla.clarity_id
JOIN diamonds_transactional as tra ON pro.index_id = tra.index_id
JOIN diamonds_city AS cit ON tra.city_id = cit.city_id
JOIN diamonds_dimensions AS dim ON pro.index_id = dim.index_id
"""

df_train = con.execute(query_full).df()
df_train.head()

NameError: name 'duckdb' is not defined

In [4]:
df_test = pd.read_csv("../data/test/diamonds_test.csv")
df_test.drop(columns='id', inplace=True)
df_test.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,city
0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1.2,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,0.9,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.9,Kimberly
4,0.5,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam


# Transform data

In [27]:
#type_trans = 'train' or 'test'

def transformations(df, type_trans):
    if type_trans == 'train':
        df.drop_duplicates(inplace=True)
    
    # Rename columns
    #df.rename(columns={'x': 'length', 'y': 'width', 'depth': 'depth','table':'table_width'}, inplace=True)
    
    # Encode
    cut_categories = df['cut'].unique()
    color_categories = df['color'].unique()
    clarity_categories = df['clarity'].unique()
    city_categories = df['city'].unique()
    
    # Change to categorical
    df['cut'] = pd.Categorical(df['cut'], categories= cut_categories, ordered=True)
    df['color'] = pd.Categorical(df['color'], categories= color_categories, ordered=True)
    df['clarity'] = pd.Categorical(df['clarity'], categories= clarity_categories, ordered=True)
    df['city'] = pd.Categorical(df['city'], categories= city_categories, ordered=True)
    
    cat_cols = ['cut','color','clarity', 'city']
    cat_orders = [cut_categories, color_categories, clarity_categories, city_categories]
    encoder = OrdinalEncoder(categories=cat_orders)
    cats_encoded = pd.DataFrame(encoder.fit_transform(df[cat_cols]), columns = ['cut_encoded','color_encoded',
                                                                                'clarity_encoded','city_encoded'])
    
    # Store encoded columns
    df_encoded = df.drop(columns=['cut','color','clarity','city']).copy()
    df_encoded['cut_encoded'] = df['cut'].cat.codes
    df_encoded['color_encoded'] = df['color'].cat.codes
    df_encoded['clarity_encoded'] = df['clarity'].cat.codes
    df_encoded['city_encoded'] = df['city'].cat.codes
    
    # Calculate volume
    df['volume'] = df['z'] * df['y'] * df['z']
    df_encoded['volume'] = df['volume']
    
    clean_df = df_encoded.copy()
    if type_trans == 'train':
        outliers_cols = ['x','y','z','table','depth','volume']

        # setting values that above or lower than the whiskers in the box plot to NaNs
        for col in outliers_cols:

            data = clean_df[col]

            Q1 = data.quantile(0.25)
            Q3 = data.quantile(0.75)

            IQR = Q3 - Q1

            min = Q1 - (1.5 * IQR)
            max = Q3 + (1.5 * IQR)

            outliers = ( (data < min) | (data > max) )

            clean_df.loc[outliers, col] = np.nan

        clean_df.isna().sum()
    
    # Remove NaN
    imputer = IterativeImputer(max_iter=50)
    clean_df = pd.DataFrame(imputer.fit_transform(clean_df), columns=clean_df.columns, index=clean_df.index)
    
    # Change cut to cut_encoded...
    df_all = clean_df.copy()
    df_all[['cut','color','clarity','city']] = df[['cut','color','clarity','city']] 
    df = df_all.drop(columns=['cut_encoded','color_encoded','clarity_encoded','city_encoded'])
    
    # Transform dataframe
    Skewed_Cols = clean_df[['carat','volume']].columns
    trans_df = clean_df.copy()
    for col in Skewed_Cols:
        trans_df[col] = np.log(1 + trans_df[col])
    
    # POLYNOMICAL FEATURES
    if type_trans == 'train':
        price_column = trans_df['price']
        df_ploy = trans_df.drop(columns=['price']).copy()
    else:
        df_ploy = trans_df
        
    poly = PolynomialFeatures(2)
    ploy_data = poly.fit_transform(df_ploy)
    df_ploy = pd.DataFrame(ploy_data, columns=poly.get_feature_names_out())
    # Scale dataframe with polynomical
    df_scaled = df_ploy.copy()
    Scaler = StandardScaler()
    scaled_data = Scaler.fit_transform(df_scaled)
    df_scaled = pd.DataFrame(scaled_data, columns=df_scaled.columns)
    
    # WITHOUT POLYNOMICAL FEATURES
    if type_trans == 'train':
        price_column = trans_df['price']
        org_df_scaled = trans_df.drop(columns=['price']).copy()
    else:
        org_df_scaled = trans_df
        
    Scaler = StandardScaler()
    scaled_data = Scaler.fit_transform(org_df_scaled)
    org_df_scaled = pd.DataFrame(scaled_data, columns=org_df_scaled.columns)
    
    if type_trans == 'train':
        return df_scaled, org_df_scaled, price_column 
    else:
        return df_scaled, org_df_scaled
    

In [28]:
#type_trans = 'train' or 'test'
df_train_trans, df_train_trans_no_ploy, price_column = transformations(df_train, 'train')
df_test_trans, df_test_trans_no_ploy = transformations(df_test, 'test') 

# Fine-Tuning

In [29]:
X = df_train_trans_no_ploy
y = price_column
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

In [38]:
#param_Rodri = {'colsample_bytree': 0.95, 'gamma': 0.14, 'learning_rate': 0.012, 'max_depth': 7, 'missing': np.inf, 
#               'n_estimators': 1130, 'subsample': 0.8}

param_grid = {'n_estimators': [1300],  # Number of trees in the forest.
              'max_depth': [5],  # Maximum depth of the trees.
              'subsample': [1],
              'colsample_bytree': [0.75],
              'lambda': [0.12],
              'gamma': [0.12],
              'learning_rate': [0.02]
              }

xgb_reg = XGBRegressor(random_state=0)

xgb_grid_search = GridSearchCV(xgb_reg, param_grid, cv=3, scoring='neg_root_mean_squared_error', return_train_score=True, verbose=1, n_jobs=-1)

xgb_grid_search.fit(X, y)

print('\n')
print('Best hyperparameters: ', xgb_grid_search.best_params_, '\n')
print('Best score: ', xgb_grid_search.best_score_, '\n')

Fitting 3 folds for each of 1 candidates, totalling 3 fits
-551.979028568908


{'colsample_bytree': 0.75,
 'gamma': 0.12,
 'lambda': 0.12,
 'learning_rate': 0.02,
 'max_depth': 5,
 'n_estimators': 1300,
 'subsample': 1}

In [48]:
param_grid = [
 {'num_leaves': [20], 'learning_rate' : [0.01] ,
  'n_estimators' : [2650]},]

lgbm_reg = LGBMRegressor(random_state=0)

lgbm_grid_search = GridSearchCV(lgbm_reg, param_grid, cv=3, scoring='neg_root_mean_squared_error', return_train_score=True, verbose=1, n_jobs=-1)

lgbm_grid_search.fit(X, y)

print('\n')
print('Best hyperparameters: ', lgbm_grid_search.best_params_, '\n')
print('Best score: ', lgbm_grid_search.best_score_, '\n')

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000550 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1803
[LightGBM] [Info] Number of data points in the train set: 40445, number of used features: 11
[LightGBM] [Info] Start training from score 3928.215923


Best hyperparameters:  {'learning_rate': 0.01, 'n_estimators': 2650, 'num_leaves': 20} 

Best score:  -544.9910524913057 



In [50]:
param_grid = [{ 'n_estimators': [500], 'max_depth' : [None],
                'max_features' : ['sqrt']}]

extrees_reg = ExtraTreesRegressor(random_state=0)

extrees_grid_search = GridSearchCV(extrees_reg, param_grid, cv=3, scoring='neg_root_mean_squared_error', verbose=1, n_jobs=-1)

extrees_grid_search.fit(X, y)

print('\n')
print('Best hyperparameters: ', extrees_grid_search.best_params_, '\n')
print('Best score: ', extrees_grid_search.best_score_, '\n')

Fitting 3 folds for each of 3 candidates, totalling 9 fits


Best hyperparameters:  {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 500} 

Best score:  -696.7412070404639 



# Train model

### Parameters

In [65]:
# XGBRegressor
xgb_param_grid = {'n_estimators': 1300,  # Number of trees in the forest.
              'max_depth': 5,  # Maximum depth of the trees.
              'subsample': 1,
              'colsample_bytree': 0.75,
              'lambda': 0.12,
              'gamma': 0.12,
              'learning_rate': 0.02,
              'random_state':0
              }
xgb_model = XGBRegressor(**xgb_param_grid)
#xgb_model.fit(X_train,y_train)

# LGBMRegresor
lgb_param_grid = {'num_leaves': 20, 
                  'learning_rate' : 0.01,
                  'n_estimators' : 2650,
                  'random_state': 0
                 }
#lgb_model = LGBMRegressor(num_leaves=20, learning_rate=0.01,n_estimators=2650,random_state': 0)
lgb_model = LGBMRegressor(**lgb_param_grid)
#lgb_model.fit(X_train,y_train)

# Extrees
extrees_param_grid = {'n_estimators': 500, 
                      'max_depth' : None,
                      'max_features' : 'sqrt',
                      'random_state':0
                     }
#extrees_model = ExtraTreesRegressor(n_estimators=500, max_depth=None,max_features='sqrt',random_state=0)
extrees_model = ExtraTreesRegressor(**extrees_param_grid)
#extrees_model.fit(X_train,y_train)

### Train model - stacking

In [67]:
%%time
estimators = [('lgbm1', xgb_model),
              ('xgb1', lgb_model),
              ('extrees', extrees_model),]

final_estimator = XGBRegressor()  # Definir el estimador final

stack_model = StackingRegressor(estimators=estimators, #final_estimator=final_estimator,
                                cv=None, n_jobs=-1, verbose=True, passthrough=True)

cv_results = []
cv_score = cross_val_score(stack_model, X, y, scoring="neg_root_mean_squared_error", cv=5)
cv_results.append(cv_score)

hyperparameters = model.get_params()
cv_score_mean = abs(np.mean(cv_results))
print('Hyperparameters: ', hyperparameters, ' | cv_score_mean:', cv_score_mean)

stack_model.fit(X,y)
hyperparameters_all = stack_model.get_params()

NameError: name 'model' is not defined

### Train model - RGB

### Obtain prediction

In [None]:
y_pred = model_all.predict(df_test_trans_no_ploy)
len(y_pred)

In [None]:
# Create and store the dataframe to upload to kaggle
y_pred_df = pd.DataFrame(y_pred)
y_pred_df.reset_index(inplace=True)
y_pred_df.columns = ['id', 'price']
y_pred_df.to_csv('../data/submisions/XGB_all_data.csv', index=False)