In [200]:
import pandas as pd

In [201]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as st
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import r2_score, mean_squared_error

In [202]:
# Load the training data and the rows that are scored for the submission.
# Both CSV files are read relative to the current working directory.
diamonds, diamonds_predict = (
    pd.read_csv(csv_path)
    for csv_path in ('diamonds_train.csv', 'diamonds_predict.csv')
)

In [203]:
# Disabled outlier filter: when enabled, it drops training rows whose 'y' or 'z' exceeds 20.
#diamonds = diamonds.loc[~((diamonds['y'] > 20) | (diamonds['z'] > 20))]

In [204]:
# Add 'ratio_length_width' (x divided by y) to both the training and scoring frames.
# NOTE: pandas does not raise on a zero 'y'; such rows come out as inf or NaN.
for frame in (diamonds, diamonds_predict):
    frame['ratio_length_width'] = frame['x'].div(frame['y'])

In [205]:
# Derived geometry columns, added identically to the training and scoring frames:
#   ratio_length_width_depth = (x / y) / z
#   volume                   = x * y * z
#   density                  = carat / volume   (so it must be computed after volume)
# NOTE: a zero in any dimension yields inf/NaN here rather than an exception.
# NOTE: none of these three columns is listed in NUM_FEATS as defined later in
# this notebook, so the model does not currently consume them.
for frame in (diamonds, diamonds_predict):
    frame['ratio_length_width_depth'] = frame['x'].div(frame['y']).div(frame['z'])
    frame['volume'] = frame['x'].mul(frame['y']).mul(frame['z'])
    frame['density'] = frame['carat'].div(frame['volume'])

In [206]:
# Give every training row a shape label derived from its 'table' and 'depth'.
# Rules are tried top to bottom and the first match wins. All bounds are
# exclusive; a row matching no rule (including one with NaN in either column,
# since comparisons with NaN are False) is labelled 'others'.
# Each rule is (label, (table_low, table_high), (depth_low, depth_high)).
shape_rules = (
    ('Round', (54, 57), (61, 62.5)),
    ('Oval', (52, 60), (60, 68)),
    ('Princess', (63, 69), (69, 76)),
    ('Cushion', (58, 63), (58, 66)),
)
shape = [
    next(
        (
            label
            for label, (table_lo, table_hi), (depth_lo, depth_hi) in shape_rules
            if table_lo < table_val < table_hi and depth_lo < depth_val < depth_hi
        ),
        'others',
    )
    for table_val, depth_val in zip(diamonds['table'], diamonds['depth'])
]

In [207]:
# Attach the labels built in the previous cell as a new 'shape' column
# (one label per row, in row order).
diamonds['shape'] = shape

In [208]:
# Give every scoring row a shape label, using the same 'table'/'depth' rules as
# for the training frame. Rules are tried top to bottom and the first match
# wins. All bounds are exclusive; a row matching no rule (including one with
# NaN in either column) is labelled 'others'.
# Each rule is (label, (table_low, table_high), (depth_low, depth_high)).
shape_rules = (
    ('Round', (54, 57), (61, 62.5)),
    ('Oval', (52, 60), (60, 68)),
    ('Princess', (63, 69), (69, 76)),
    ('Cushion', (58, 63), (58, 66)),
)
shape = [
    next(
        (
            label
            for label, (table_lo, table_hi), (depth_lo, depth_hi) in shape_rules
            if table_lo < table_val < table_hi and depth_lo < depth_val < depth_hi
        ),
        'others',
    )
    for table_val, depth_val in zip(diamonds_predict['table'], diamonds_predict['depth'])
]

In [209]:
# Attach the labels built in the previous cell as a new 'shape' column on the
# scoring frame (one label per row, in row order).
diamonds_predict['shape'] = shape

In [210]:
import math

# Add 'carat_log', the natural logarithm of 'carat', to both frames.
# math.log raises ValueError for a zero or negative carat.
carat_log = list(map(math.log, diamonds['carat']))
diamonds['carat_log'] = carat_log

# Same transformation for the scoring frame; `carat_log` is rebound to its values.
carat_log = list(map(math.log, diamonds_predict['carat']))
diamonds_predict['carat_log'] = carat_log

In [278]:
# Columns fed to the model, grouped by the preprocessing branch that handles them.
NUM_FEATS = [
    'carat', 'table', 'depth',
    'x', 'y', 'z',
    'ratio_length_width', 'carat_log',
]
CAT_FEATS = ['cut', 'color', 'clarity']
FEATS = [*NUM_FEATS, *CAT_FEATS]  # numeric columns first, then categorical
TARGET = 'price'  # column the regressor is fitted against

In [279]:
# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

In [280]:
# Categorical branch: replace missing values with the literal 'missing', then
# one-hot encode. handle_unknown='ignore' keeps transform from failing on a
# category that was not present when the encoder was fitted.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [281]:
# Route each column group to its own branch: NUM_FEATS through the numeric
# pipeline, CAT_FEATS through the categorical one.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, NUM_FEATS),
        ('cat', categorical_transformer, CAT_FEATS),
    ]
)

In [282]:
from sklearn.model_selection import train_test_split

In [283]:
# Hold out part of the training data for evaluation, using train_test_split's
# default split proportions. random_state pins the shuffle so the split, and
# every score computed from it, is the same after a kernel restart.
diamonds_train, diamonds_test = train_test_split(diamonds, random_state=42)

In [284]:
from lightgbm import LGBMRegressor

In [306]:
# End-to-end estimator: the column preprocessor followed by a LightGBM
# regressor constructed with no arguments (library defaults).
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LGBMRegressor()),
    ]
)


In [307]:
# Fit the whole pipeline (preprocessing + regressor) on the training split only.
# The trailing ';' suppresses the notebook's echo of the fitted estimator.
model.fit(diamonds_train[FEATS], diamonds_train[TARGET]);

In [308]:
from sklearn.metrics import r2_score

In [309]:
# Predict prices for the held-out split and then for the training split.
# NOTE: despite their names, y_test and y_train hold model *predictions*; the
# true values are read from diamonds_test[TARGET] / diamonds_train[TARGET].
y_test, y_train = (
    model.predict(split[FEATS])
    for split in (diamonds_test, diamonds_train)
)

In [294]:
# Coefficient of determination (R^2) on each split.
# NOTE: the printed labels say "error", but r2_score is a goodness-of-fit
# score where higher is better and 1.0 is a perfect fit.
r2_test = r2_score(y_true=diamonds_test[TARGET], y_pred=y_test)
r2_train = r2_score(y_true=diamonds_train[TARGET], y_pred=y_train)
print(f"test error: {r2_test}")
print(f"train error: {r2_train}")

test error: 0.9816750499702188
train error: 0.9864637128048624


In [295]:
# Root-mean-squared error on each split, in the units of the target column.
# The square root is taken explicitly instead of passing `squared=False` to
# mean_squared_error: newer scikit-learn releases deprecate and then remove that
# flag, whereas this form gives the same number on old and new versions.
rmse_test = math.sqrt(mean_squared_error(y_true=diamonds_test[TARGET], y_pred=y_test))
rmse_train = math.sqrt(mean_squared_error(y_true=diamonds_train[TARGET], y_pred=y_train))
print(f"test error: {rmse_test}")
print(f"train error: {rmse_train}")

test error: 542.6425881577434
train error: 463.85717415726253


In [296]:
from sklearn.model_selection import cross_val_score

In [297]:
# 5-fold cross-validation of the untuned pipeline over the full training frame.
# The scorer is *negated* RMSE (so that larger is better), which is why the
# values are sign-flipped before being averaged in the next cell.
# n_jobs=-1 runs the folds on all available cores.
scores = cross_val_score(
    estimator=model,
    X=diamonds[FEATS],
    y=diamonds[TARGET],
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

In [298]:
import numpy as np
# Flip the sign of the negated-RMSE scores and average them: mean RMSE across folds.
np.negative(scores).mean()

541.6507824541164

In [299]:
from sklearn.model_selection import RandomizedSearchCV

In [300]:
# Hyper-parameter space: the imputation strategy for the numeric columns plus
# two LightGBM settings, 2 * 6 * 3 = 36 combinations in total. Keys use the
# pipeline's '<step>__<param>' addressing.
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'regressor__n_estimators': [16, 32, 64, 128, 256, 512],
    'regressor__max_depth': [2, 4, 8],
}

# Randomized search: evaluate n_iter=32 of those combinations with 5-fold CV,
# scored by negated RMSE (larger is better, so best_score_ is negative).
# random_state pins which combinations are sampled, so best_params_ and
# best_score_ are repeatable between runs instead of depending on the draw.
# NOTE: this is a RandomizedSearchCV despite the `grid_search` name, which is
# kept because later cells refer to it.
grid_search = RandomizedSearchCV(
    model,
    param_grid,
    cv=5,
    verbose=10,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    n_iter=32,
    random_state=42,
)

# Search over the full training frame; left as the cell's last expression so
# the notebook displays the fitted search object.
grid_search.fit(diamonds[FEATS], diamonds[TARGET])

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:   15.0s finished




RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('scaler',
                                                                                                StandardScaler())]),
                                                                               ['carat',
                                                                                'table',
                                                                                'depth',
                                                                                'x',
              

In [301]:
# Show the parameter combination that achieved the best mean CV score.
grid_search.best_params_

{'regressor__n_estimators': 256,
 'regressor__max_depth': 8,
 'preprocessor__num__imputer__strategy': 'mean'}

In [302]:
# Show the best mean CV score; the scorer is negated RMSE, so the value is
# negative and its magnitude is the RMSE.
grid_search.best_score_

-539.1568902077146

In [303]:
# Predict prices for the scoring rows through the fitted search object.
# NOTE(review): `refit` is left at its default above, so this presumably uses
# the best pipeline refitted on the full training frame; confirm if changed.
y_pred = grid_search.predict(diamonds_predict[FEATS])

In [304]:
# Two-column submission table: the scoring rows' 'id' next to the predicted 'price'.
submission_df = diamonds_predict[['id']].assign(price=y_pred)

In [305]:
# Write the submission to the working directory; index=False keeps the
# DataFrame index out of the file so only 'id' and 'price' are written.
submission_df.to_csv('diamonds_lgbm_3.csv', index=False)