In [5]:
import pandas as pd

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as st
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import r2_score, mean_squared_error

In [7]:
diamonds = pd.read_csv('diamonds_train.csv')
diamonds_predict = pd.read_csv('diamonds_predict.csv')

In [591]:
#diamonds = diamonds.loc[(diamonds['x']>0) | (diamonds['y']>0) | (diamonds['z']>0)]

In [8]:
diamonds = diamonds.loc[~((diamonds['y'] > 20) | (diamonds['z'] > 20))]

In [9]:
diamonds['ratio_length_width'] = diamonds['x']/diamonds['y']
diamonds_predict['ratio_length_width'] = diamonds_predict['x']/diamonds_predict['y']

In [10]:
diamonds['ratio_length_width_depth'] = diamonds['x']/diamonds['y']/diamonds['z']
diamonds_predict['ratio_length_width_depth'] = diamonds_predict['x']/diamonds_predict['y']/diamonds_predict['z']

diamonds['volume'] = diamonds['x']*diamonds['y']*diamonds['z']
diamonds_predict['volume'] = diamonds_predict['x']*diamonds_predict['y']*diamonds_predict['z']

diamonds['density'] = diamonds['carat']/diamonds['volume']
diamonds_predict['density'] = diamonds_predict['carat']/diamonds_predict['volume']

In [11]:
shape = []
for i in diamonds['table'].index:
    if 54<diamonds['table'][i]<57 and 61<diamonds['depth'][i]<62.5:
        shape.append('Round')
    elif 52<diamonds['table'][i]<60 and 60<diamonds['depth'][i]<68:
        shape.append('Oval')
    elif 63<diamonds['table'][i]<69 and 69<diamonds['depth'][i]<76:
        shape.append('Princess')
    elif 58<diamonds['table'][i]<63 and 58<diamonds['depth'][i]<66:
        shape.append('Cushion')
    else:
        shape.append('others')

In [12]:
diamonds['shape'] = shape

In [13]:
shape = []
for i in diamonds_predict['table'].index:
    if 54<diamonds_predict['table'][i]<57 and 61<diamonds_predict['depth'][i]<62.5:
        shape.append('Round')
    elif 52<diamonds_predict['table'][i]<60 and 60<diamonds_predict['depth'][i]<68:
        shape.append('Oval')
    elif 63<diamonds_predict['table'][i]<69 and 69<diamonds_predict['depth'][i]<76:
        shape.append('Princess')
    elif 58<diamonds_predict['table'][i]<63 and 58<diamonds_predict['depth'][i]<66:
        shape.append('Cushion')
    else:
        shape.append('others')

In [14]:
diamonds_predict['shape'] = shape

In [15]:
import math

carat_log= []
for i in diamonds['carat']:
    carat_log.append(math.log(i))
diamonds['carat_log'] = carat_log

carat_log= []
for i in diamonds_predict['carat']:
    carat_log.append(math.log(i))
diamonds_predict['carat_log'] = carat_log

In [12]:
diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,ratio_length_width,ratio_length_width_depth,volume,density,shape,carat_log
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25,1.005891,0.236680,197.096725,0.006139,Oval,0.190620
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75,0.993151,0.361146,52.395750,0.006107,Oval,-1.139434
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65,1.016275,0.278431,113.436890,0.006259,Oval,-0.342490
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.00,0.991525,0.330508,66.268800,0.006187,Oval,-0.891598
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95,1.006144,0.254720,168.429975,0.006056,Oval,0.019803
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,1.34,Ideal,G,VS1,62.7,57.0,10070,7.10,7.04,4.43,1.008523,0.227658,221.429120,0.006052,Oval,0.292670
40451,2.02,Good,F,SI2,57.1,60.0,12615,8.31,8.25,4.73,1.007273,0.212954,324.276975,0.006229,others,0.703098
40452,1.01,Ideal,H,SI1,62.7,56.0,5457,6.37,6.42,4.01,0.992212,0.247434,163.990554,0.006159,Oval,0.009950
40453,0.33,Ideal,J,VS1,61.9,54.3,456,4.45,4.47,2.76,0.995526,0.360698,54.900540,0.006011,Round,-1.108663


In [16]:
NUM_FEATS = ['carat','table','x','y','z','ratio_length_width','carat_log']
CAT_FEATS = ['cut', 'color', 'clarity']
FEATS = NUM_FEATS + CAT_FEATS
TARGET = 'price'

In [17]:
numeric_transformer = \
Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), 
                ('scaler', StandardScaler())])

In [18]:
categorical_transformer = \
Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [19]:
preprocessor = \
ColumnTransformer(transformers=[('num', numeric_transformer, NUM_FEATS),
                                ('cat', categorical_transformer, CAT_FEATS)])

In [20]:
preprocessor

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['carat', 'table', 'x', 'y', 'z',
                                  'ratio_length_width', 'carat_log']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='constant')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['cut', 'color', 'clarity'])])

In [None]:
x_train,y_train,x_test,y_test

In [21]:
from sklearn.model_selection import train_test_split

In [26]:
diamonds_train, diamonds_test = train_test_split(diamonds)

In [28]:
diamonds_test[]

(30340, 16)

NameError: name 'diamonds_test' is not defined

In [575]:
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import ExtraTreesRegressor

model= Pipeline(steps=[('preprocessor', preprocessor),
                       ('regressor', ExtraTreesRegressor())])

In [20]:
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import ExtraTreesRegressor

model= Pipeline(steps=[('preprocessor', preprocessor),
                       ('regressor', ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0,
                                           criterion='mse', max_depth=50,
                                           max_features='auto',
                                           max_leaf_nodes=None,
                                           max_samples=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                           min_samples_split=10,
                                           min_weight_fraction_leaf=0.0,
                                           n_estimators=200, n_jobs=-1,
                                           oob_score=False, random_state=300,
                                           verbose=0, warm_start=False))])

In [362]:
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingRegressor

In [363]:
reg = GradientBoostingRegressor(n_estimators = 100, loss='huber', min_samples_leaf=6, max_depth=7,
                                max_leaf_nodes=250, min_samples_split=45
                                )

In [258]:
model= Pipeline(steps=[('preprocessor', preprocessor),
                       ('regressor', GradientBoostingRegressor(n_estimators = 100, loss='huber', min_samples_leaf=6, max_depth=7,
                                max_leaf_nodes=250, min_samples_split=45))])

In [21]:
model.fit(diamonds_train[FEATS], diamonds_train[TARGET]);

In [382]:
model.fit(diamonds_train[FEATS], diamonds_train[TARGET]);

In [22]:
from sklearn.metrics import r2_score

In [23]:
y_test = model.predict(diamonds_test[FEATS])
y_train = model.predict(diamonds_train[FEATS])

In [24]:
print(f"test error: {r2_score(y_pred=y_test, y_true=diamonds_test[TARGET])}")
print(f"train error: {r2_score(y_pred=y_train, y_true=diamonds_train[TARGET])}")

test error: 0.9816713331229223
train error: 0.9948538316968566


In [25]:
from sklearn.metrics import mean_squared_error

In [26]:
print(f"test error: {mean_squared_error(y_pred=y_test, y_true=diamonds_test[TARGET], squared=False)}")
print(f"train error: {mean_squared_error(y_pred=y_train, y_true=diamonds_train[TARGET], squared=False)}")

test error: 535.1595940037049
train error: 287.32001510558047


In [400]:
y_pred = model.predict(diamonds_predict[FEATS])

In [401]:
submission_df = pd.DataFrame({'id': diamonds_predict['id'], 'price': y_pred})

In [20]:
submission_df.describe()

Unnamed: 0,id,price
count,13485.0,13485.0
mean,6742.0,3955.722955
std,3892.928525,3959.282283
min,0.0,353.77
25%,3371.0,956.94
50%,6742.0,2441.94
75%,10113.0,5304.61
max,13484.0,18602.42


In [402]:
submission_df.to_csv('diamonds_ETR_Params_No_grid.csv', index=False)

In [2]:
from lightgbm import LGBMClassifier

OSError: dlopen(/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/lightgbm/lib_lightgbm.so, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib
  Referenced from: /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/lightgbm/lib_lightgbm.so
  Reason: image not found

In [341]:
from sklearn.model_selection import RandomizedSearchCV

In [349]:
from sklearn.model_selection import GridSearchCV

In [386]:
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'regressor__n_estimators': [16, 32, 64, 128, 256, 512],
    'regressor__max_depth': [2, 4, 8, 16],
}

grid_search = GridSearchCV(model, 
                           param_grid=param_grid,
                           cv=10, 
                           verbose=10, 
                           scoring='neg_root_mean_squared_error',
                          n_jobs=-1)

grid_search.fit(diamonds[FEATS], diamonds[TARGET])

Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.7s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   27.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   54.2s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 24.3min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 25

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['carat',
                                                                          'depth',
                                                                          'x',
                                                                          'y',
                                                                          'z',
    

In [632]:
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'regressor__n_estimators': [16, 32, 64, 128, 256, 512],
    'regressor__max_depth': [2, 4, 8, 16],
    
}

grid_search = RandomizedSearchCV(model, 
                                 param_grid, 
                                 cv=5, 
                                 verbose=10, 
                                 scoring='neg_root_mean_squared_error', 
                                 n_jobs=-1,
                                 n_iter=32)

grid_search.fit(diamonds[FEATS], diamonds[TARGET])

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   27.2s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   49.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:  4.5min finished


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('scaler',
                                                                                                StandardScaler())]),
                                                                               ['carat',
                                                                                'table',
                                                                                'x',
                                                                                'y',
                  

In [633]:
grid_search.best_params_

{'regressor__n_estimators': 512,
 'regressor__max_depth': 16,
 'preprocessor__num__imputer__strategy': 'mean'}

In [634]:
grid_search.best_score_

-529.919599759918

In [588]:
y_pred = grid_search.predict(diamonds_predict[FEATS])

In [589]:
submission_df = pd.DataFrame({'id': diamonds_predict['id'], 'price': y_pred})

In [391]:
submission_df.describe()

Unnamed: 0,id,price
count,13485.0,13485.0
mean,6742.0,3951.756265
std,3892.928525,3942.755072
min,0.0,374.720159
25%,3371.0,946.589281
50%,6742.0,2465.989175
75%,10113.0,5298.324719
max,13484.0,18301.531587


In [590]:
submission_df.to_csv('diamonds_ETR_LogCarat_2.csv', index=False)