In [8]:
import pandas as pd
import numpy as np

In [9]:
diamonds = pd.read_csv('../data/raw/diamonds_train.csv')
diamonds_predict = pd.read_csv('../data/raw/diamonds_predict.csv')

In [10]:
diamonds.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [11]:
diamonds['volume'] = diamonds.x*diamonds.y*diamonds.z

In [12]:
diamonds['cut2']=diamonds['cut'].map({'Ideal':1,'Good':2,'Very Good':3,'Fair':4,'Premium':5})
diamonds['color2']=diamonds['color'].map({'E':1,'D':2,'F':3,'G':4,'H':5,'I':6,'J':7})
diamonds['clarity2']=diamonds['clarity'].map({'VVS1':1,'IF':2,'VVS2':3,'VS1':4,'I1':5,'VS2':6,'SI1':7,'SI2':8})

In [13]:
diamonds['cut/wt']=diamonds['cut2']/diamonds['carat']
diamonds['color/wt']=diamonds['color2']/diamonds['carat']
diamonds['clarity/wt']=diamonds['clarity2']/diamonds['carat']
diamonds = diamonds.drop(['cut2','color2','clarity2'], axis=1)

In [15]:
diamonds['carat_log'] = np.log(diamonds['carat'])

In [16]:
diamonds.shape

(40455, 15)

In [17]:
diamonds_predict.columns

Index(['id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y',
       'z'],
      dtype='object')

In [18]:
diamonds_predict['volume'] = diamonds_predict.x*diamonds_predict.y*diamonds_predict.z

In [19]:
diamonds_predict['cut2']=diamonds_predict['cut'].map({'Ideal':1,'Good':2,'Very Good':3,'Fair':4,'Premium':5})
diamonds_predict['color2']=diamonds_predict['color'].map({'E':1,'D':2,'F':3,'G':4,'H':5,'I':6,'J':7})
diamonds_predict['clarity2']=diamonds_predict['clarity'].map({'VVS1':1,'IF':2,'VVS2':3,'VS1':4,'I1':5,'VS2':6,'SI1':7,'SI2':8})

In [20]:
diamonds_predict['cut/wt']=diamonds_predict['cut2']/diamonds_predict['carat']
diamonds_predict['color/wt']=diamonds_predict['color2']/diamonds_predict['carat']
diamonds_predict['clarity/wt']=diamonds_predict['clarity2']/diamonds_predict['carat']
diamonds_predict = diamonds_predict.drop(['cut2','color2','clarity2'], axis=1)

In [21]:
diamonds_predict['carat_log'] = np.log(diamonds_predict['carat'])

In [22]:
diamonds_predict.shape

(13485, 15)

In [23]:
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume,cut/wt,color/wt,clarity/wt,carat_log
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25,197.096725,4.132231,5.785124,4.958678,0.19062
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75,52.39575,9.375,15.625,18.75,-1.139434
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65,113.43689,5.633803,5.633803,5.633803,-0.34249
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0,66.2688,4.878049,4.878049,17.073171,-0.891598
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95,168.429975,0.980392,3.921569,6.862745,0.019803


In [24]:
target = 'price'
cat_features = ['cut', 'color', 'clarity']
num_features = ['carat', 'depth', 'table', 'x', 'y', 'z', 'volume', 'cut/wt', 'color/wt', 'clarity/wt', 'carat_log']

In [25]:
# datos para entrenar
cat_df_train = pd.get_dummies(diamonds[cat_features])
num_df_train = diamonds[num_features]
X_train = pd.concat([num_df_train, cat_df_train], axis=1)

In [26]:
# datos para predecir
cat_df_predict = pd.get_dummies(diamonds_predict[cat_features])
num_df_predict = diamonds_predict[num_features]
X_predict = pd.concat([num_df_predict, cat_df_predict], axis=1)

In [27]:
# objetivo a entrenar
y_train = diamonds[target].values

* Validation before submission

In [28]:
from sklearn.preprocessing import RobustScaler

In [29]:
scaler = RobustScaler()

In [30]:
X_train_scaled = scaler.fit_transform(X_train)

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
X_diamondstrain, X_diamondstest, y_diamondstrain, y_diamondstest = train_test_split(X_train_scaled, y_train)

In [33]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb

In [34]:
#Gradient Boosting Regression 
model_GB = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

#Random Forest Regression 
model_RF = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=5,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

#XGBoost 
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)

#LightGBM :
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf =11)

In [35]:
model_GB.fit(X_diamondstrain,y_diamondstrain)
model_RF.fit(X_diamondstrain,y_diamondstrain)
model_xgb.fit(X_diamondstrain,y_diamondstrain)
model_lgb.fit(X_diamondstrain,y_diamondstrain)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, bagging_seed=9,
              feature_fraction=0.2319, feature_fraction_seed=9,
              learning_rate=0.05, max_bin=55, min_data_in_leaf=6,
              min_sum_hessian_in_leaf=11, n_estimators=720, num_leaves=5,
              objective='regression')

In [36]:
y_pred_model_GB = model_GB.predict(X_diamondstest)
y_pred_model_RF = model_RF.predict(X_diamondstest)
y_pred_model_xgb = model_xgb.predict(X_diamondstest)
y_pred_model_lgb = model_lgb.predict(X_diamondstest)

In [37]:
y_pred=(y_pred_model_GB+y_pred_model_RF+y_pred_model_xgb+y_pred_model_lgb)/4

In [38]:
pd.DataFrame({'predictions': y_pred,
             'reality': y_diamondstest})

Unnamed: 0,predictions,reality
0,642.349382,625
1,1583.752533,1554
2,8515.052239,7809
3,3549.016695,3774
4,5658.988116,6239
...,...,...
10109,1199.769628,1207
10110,4922.184954,4354
10111,1409.334064,1294
10112,2365.661382,2287


* RMSE metric error

In [39]:
from sklearn.metrics import mean_squared_error

In [40]:
mean_squared_error(y_true=y_diamondstest, y_pred=y_pred, squared=False)

547.4135605198566

* r2 metric error

In [41]:
from sklearn.metrics import r2_score

In [42]:
r2_score(y_true=y_diamondstest, y_pred=y_pred)

0.9813033210509563

Conclusion: It looks like a good model! We are going to train now with all the data available in the diamond dataset

In [43]:
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb

In [44]:
model_GB.fit(X_train_scaled,y_train)
model_RF.fit(X_train_scaled,y_train)
model_xgb.fit(X_train_scaled,y_train)
model_lgb.fit(X_train_scaled,y_train)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, bagging_seed=9,
              feature_fraction=0.2319, feature_fraction_seed=9,
              learning_rate=0.05, max_bin=55, min_data_in_leaf=6,
              min_sum_hessian_in_leaf=11, n_estimators=720, num_leaves=5,
              objective='regression')

In [45]:
X_predict_scaled = scaler.transform(X_predict)

In [46]:
y_pred_model_GB = model_GB.predict(X_predict_scaled)
y_pred_model_RF = model_RF.predict(X_predict_scaled)
y_pred_model_xgb = model_xgb.predict(X_predict_scaled)
y_pred_model_lgb = model_lgb.predict(X_predict_scaled)

In [47]:
y_pred=((y_pred_model_GB+y_pred_model_RF+y_pred_model_xgb+y_pred_model_lgb)/4).clip(300,19000)

In [48]:
submission = pd.DataFrame({'id': diamonds_predict.id, 
                           'price': y_pred})

In [49]:
submission

Unnamed: 0,id,price
0,0,2930.406073
1,1,5746.012305
2,2,9824.673089
3,3,3924.926093
4,4,1591.580205
...,...,...
13480,13480,1701.746120
13481,13481,2418.433441
13482,13482,3121.152794
13483,13483,2148.841316


In [50]:
submission['price'].describe()

count    13485.000000
mean      3954.113705
std       3943.908202
min        300.000000
25%        938.332541
50%       2456.048034
75%       5361.383333
max      18264.133830
Name: price, dtype: float64

In [51]:
submission.to_csv('submission_MetamodelGBR-RF-XGB-LGB_robustScaler & carat log.csv', index=False)