In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dapt202011mad/sample_submission.csv
/kaggle/input/dapt202011mad/diamonds_test.csv
/kaggle/input/dapt202011mad/diamonds_train.csv


In [2]:
# Read the three competition files (training set, test set, sample
# submission) in that order and bind each to its own DataFrame.
train, test, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/dapt202011mad/diamonds_train.csv',
        '../input/dapt202011mad/diamonds_test.csv',
        '../input/dapt202011mad/sample_submission.csv',
    )
)

In [3]:
# Column roles: the regression target, plus the categorical and numeric
# predictor columns.
target = 'price'
cat_features = ['cut', 'color', 'clarity']
num_features = ['carat', 'depth', 'table', 'x', 'y', 'z']

# Cast the categorical columns to pandas' category dtype in both frames.
# Re-running this cell is harmless: casting a category column to category
# leaves it unchanged.
for frame in (train, test):
    for cat_feat in cat_features:
        frame[cat_feat] = frame[cat_feat].astype('category')

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder


In [5]:
# Numeric branch: fill missing values with the column median, then
# standardise each column.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [6]:
# Categorical branch: replace missing values with the literal 'missing',
# then one-hot encode. Categories unseen during fitting are ignored instead
# of raising an error.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [7]:
# Route each group of columns through its own sub-pipeline:
# numeric columns -> numeric_transformer, categorical -> categorical_transformer.
column_routes = [
    ('num', numeric_transformer, num_features),
    ('cat', categorical_transformer, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [8]:
# Model inputs: numeric columns first, then categorical ones; the target
# column is kept separately.
features = [*num_features, *cat_features]
X, y = train[features], train[target]

In [26]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# End-to-end estimator: preprocessing followed by gradient-boosted trees.
# random_state is pinned so that re-running the notebook on a fresh kernel
# reproduces the same fitted model (and therefore the same scores below).
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', GradientBoostingRegressor(random_state=42))])
model.fit(X=X, y=y)
print("model created!")

model created!


In [24]:
# Predict on the training rows themselves. Despite its name, y_train holds the
# model's in-sample predictions; the true training targets are in y.
y_train = model.predict(X)

# Metrics

In [11]:
from sklearn.metrics import mean_squared_error

In [27]:
# In-sample RMSE. The square root is taken explicitly instead of passing
# `squared=False`, which is deprecated and rejected by newer scikit-learn
# releases; the resulting value is the same.
train_rmse = np.sqrt(mean_squared_error(y_true=y, y_pred=y_train))
print(f"train error: {train_rmse}")

train error: 207.0260781966406


# Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 10-fold cross-validation of the whole pipeline. The scorer returns negated
# RMSE values, so the sign is flipped before averaging.
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    cv=10,
    scoring='neg_root_mean_squared_error',
    verbose=2,
)
(-scores).mean()

# Randomized hyperparameter search

In [29]:
from sklearn.model_selection import RandomizedSearchCV

In [42]:
# Search space for the randomized search. Keys address nested parameters as
# <pipeline step>__<sub-step>__<parameter>. Each key appears exactly once:
# a repeated key in a dict literal silently replaces the earlier entry.
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean'],
    'regressor__learning_rate': [0.15, 0.2, 0.25, 0.3],
    'regressor__n_estimators': [100, 125, 150, 175, 200],
    'regressor__max_depth': [4, 8, 16],
    'regressor__subsample': [1],
    'regressor__min_samples_leaf': [1],
}

# Sample 32 of the candidate combinations and score each with 5-fold CV.
# random_state fixes which combinations are drawn, so the search is
# reproducible across runs.
grid_search = RandomizedSearchCV(
    model,
    param_grid,
    cv=5,
    verbose=2,
    scoring='neg_root_mean_squared_error',
    # n_jobs=-1,  # uncomment to run the fits in parallel on all cores
    n_iter=32,
    random_state=42,
)

grid_search.fit(X, y)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END preprocessor__num__imputer__strategy=mean, regressor__learning_rate=0.3, regressor__max_depth=16, regressor__min_samples_leaf=1, regressor__n_estimators=150, regressor__subsample=1; total time=  33.5s
[CV] END preprocessor__num__imputer__strategy=mean, regressor__learning_rate=0.3, regressor__max_depth=16, regressor__min_samples_leaf=1, regressor__n_estimators=150, regressor__subsample=1; total time=  33.8s
[CV] END preprocessor__num__imputer__strategy=mean, regressor__learning_rate=0.3, regressor__max_depth=16, regressor__min_samples_leaf=1, regressor__n_estimators=150, regressor__subsample=1; total time=  33.3s
[CV] END preprocessor__num__imputer__strategy=mean, regressor__learning_rate=0.3, regressor__max_depth=16, regressor__min_samples_leaf=1, regressor__n_estimators=150, regressor__subsample=1; total time=  33.1s
[CV] END preprocessor__num__imputer__strategy=mean, regressor__learning_rate=0.3, regressor__max_d

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('scaler',
                                                                                                StandardScaler())]),
                                                                               ['carat',
                                                                                'depth',
                                                                                'table',
                                                                                'x',
              

In [43]:
# Hyperparameter combination that achieved the best mean cross-validated score.
grid_search.best_params_

{'regressor__subsample': 1,
 'regressor__n_estimators': 125,
 'regressor__min_samples_leaf': 1,
 'regressor__max_depth': 8,
 'regressor__learning_rate': 0.15,
 'preprocessor__num__imputer__strategy': 'mean'}

In [44]:
# The search was scored with 'neg_root_mean_squared_error', so negating the
# best score gives the best cross-validated RMSE as a positive number.
-grid_search.best_score_

549.977089446955

# **Submission**

In [45]:
# Predict prices for the test set with the search object and write the
# submission file.
y_pred = grid_search.predict(test[features])
submission_df = pd.DataFrame({'id': test['id'], 'price': y_pred})
submission_df.to_csv('submission_GBM_grid.csv', index=False)

# Keep the preview as the last expression so the notebook actually renders it;
# placed mid-cell, its result was silently discarded.
submission_df.head()