In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# Read both CSV inputs: `diamonds` is the frame used for fitting and
# evaluation, `predict` is the frame scored for the submission file.
diamonds, predict = (
    pd.read_csv(csv_path)
    for csv_path in ('data/diamonds_train.csv', 'data/diamonds_test.csv')
)

In [3]:
# Column to predict.
TARGET = 'price'

# Input columns, grouped by the preprocessing branch that handles them.
NUM_FEATS = ['carat', 'depth', 'table', 'x', 'y', 'z']
CAT_FEATS = ['cut', 'color', 'clarity']

# Full model input: numeric columns first, then categorical ones.
FEATURES = [*NUM_FEATS, *CAT_FEATS]

In [4]:
# Numeric branch: fill missing values with the column mean, then apply
# MinMaxScaler. Step names ('imputer', 'scaler') are referenced by the
# hyper-parameter search keys, so they must not change.
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler()),
    ]
)

In [5]:
# Sanity check: fit the numeric branch on the numeric columns and display
# the transformed array.
numerical_transformer.fit_transform(X=diamonds[NUM_FEATS])

array([[0.23488372, 0.53888889, 0.28846154, 0.66764418, 0.11528014,
        0.52729529],
       [0.02790698, 0.55555556, 0.26923077, 0.42521994, 0.07436333,
        0.34119107],
       [0.11860465, 0.625     , 0.23076923, 0.54936461, 0.09388795,
        0.4528536 ],
       ...,
       [0.18837209, 0.54722222, 0.25      , 0.6226784 , 0.1089983 ,
        0.49751861],
       [0.03023256, 0.525     , 0.21730769, 0.43499511, 0.07589134,
        0.34243176],
       [0.24186047, 0.52777778, 0.28846154, 0.66764418, 0.11680815,
        0.52729529]])

In [6]:
# Categorical branch: replace missing values with the literal 'missing',
# then one-hot encode. handle_unknown='ignore' is passed so categories not
# seen during fit do not raise at transform time.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [7]:
# Sanity check: fit the categorical branch on the categorical columns and
# display the encoded result.
categorical_transformer.fit_transform(X=diamonds[CAT_FEATS])

<40455x20 sparse matrix of type '<class 'numpy.float64'>'
	with 121365 stored elements in Compressed Sparse Row format>

In [8]:
# Route each column group through its own branch. The transformer names
# are part of the parameter paths used by the hyper-parameter search
# (e.g. 'preprocessor__numerical_preprocessor__imputer__strategy').
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical_preprocessor', numerical_transformer, NUM_FEATS),
        ('categorical_preprocessor', categorical_transformer, CAT_FEATS),
    ]
)

In [9]:
# Sanity check: fit the combined preprocessor on all feature columns and
# show the resulting matrix as a DataFrame.
pd.DataFrame(preprocessor.fit_transform(X=diamonds[FEATURES]))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.234884,0.538889,0.288462,0.667644,0.115280,0.527295,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.027907,0.555556,0.269231,0.425220,0.074363,0.341191,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.118605,0.625000,0.230769,0.549365,0.093888,0.452854,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.048837,0.577778,0.250000,0.457478,0.080136,0.372208,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.190698,0.486111,0.307692,0.640274,0.110526,0.490074,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,0.265116,0.547222,0.269231,0.694037,0.119525,0.549628,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
40451,0.423256,0.391667,0.326923,0.812317,0.140068,0.586849,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
40452,0.188372,0.547222,0.250000,0.622678,0.108998,0.497519,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
40453,0.030233,0.525000,0.217308,0.434995,0.075891,0.342432,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [10]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor

In [11]:
# Hold out part of the labelled data for evaluation (default split size).
# A fixed random_state makes the split, and every metric computed from it,
# reproducible across "Restart & Run All".
diamonds_train, diamonds_test = train_test_split(diamonds, random_state=42)

In [12]:
# End-to-end estimator: preprocessing followed by a LightGBM regressor.
# The step names ('preprocessor', 'regressor') are used as prefixes in the
# hyper-parameter search keys.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LGBMRegressor(n_jobs=-1)),
    ]
)

In [13]:
# Features and target for the training split.
X_train, y_train = diamonds_train[FEATURES], diamonds_train[TARGET]

In [14]:
# Fit the full pipeline on the training split (trailing ';' hides the repr).
model.fit(X=X_train, y=y_train);

In [15]:
# In-sample predictions, used to compute the training error.
y_train_predict = model.predict(X=X_train)

In [16]:
from sklearn.metrics import mean_squared_error

In [17]:
# Training RMSE. The square root is taken explicitly instead of passing
# `squared=False`, which newer scikit-learn releases deprecate and then
# remove; this form gives the same value on every version.
mean_squared_error(y_true=y_train, y_pred=y_train_predict) ** 0.5

474.37593302356487

In [18]:
# Features and target for the held-out split.
X_test, y_test = diamonds_test[FEATURES], diamonds_test[TARGET]

In [19]:
# Out-of-sample predictions on the held-out split.
y_test_predict = model.predict(X=X_test)

In [20]:
# Held-out RMSE. The square root is taken explicitly instead of passing
# `squared=False`, which newer scikit-learn releases deprecate and then
# remove; this form gives the same value on every version.
mean_squared_error(y_true=y_test, y_pred=y_test_predict) ** 0.5

532.3822283554318

In [21]:
from sklearn.model_selection import cross_val_score

In [22]:
# Features and target for the complete labelled dataset (used by
# cross-validation and the hyper-parameter search).
X, y = diamonds[FEATURES], diamonds[TARGET]

In [23]:
# Mean 4-fold cross-validated score. The scorer is the *negated* RMSE, so
# the displayed value is negative and closer to zero is better.
cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_root_mean_squared_error',
    cv=4,
    n_jobs=-1,
).mean()

-548.4332669922659

In [24]:
# Baseline predictions for the submission frame from the un-tuned pipeline.
# NOTE(review): the last explicit `model.fit` call above used X_train only,
# and `y_submission` is not read by any later cell shown here - confirm
# whether this cell is still needed.
y_submission = model.predict(X=predict[FEATURES])

In [25]:
from sklearn.model_selection import RandomizedSearchCV

In [26]:
# Search space for the randomized search. Keys follow scikit-learn's
# '<step>__<param>' convention and address nested steps of `model`.
parameter_grid = {
    # LightGBM regressor settings
    'regressor__max_depth': [2, 4, 6, 8, 11, 16, 20, 24, 28, 32],
    'regressor__n_estimators': [64, 128, 256, 512, 1024],
    'regressor__learning_rate': [0.1, 0.01, 0.001],
    # Imputation strategy of the numeric preprocessing branch
    'preprocessor__numerical_preprocessor__imputer__strategy': ['mean', 'median'],
}

In [27]:
# Randomized hyper-parameter search: 20 candidates sampled from
# `parameter_grid`, each scored with 5-fold CV on negated RMSE.
# random_state pins which candidates are sampled so that re-running the
# notebook evaluates the same configurations.
grid_search = RandomizedSearchCV(
    model,
    parameter_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    n_iter=20,
    random_state=42,
)

In [28]:
# Run the search on the full labelled dataset (trailing ';' hides the repr).
grid_search.fit(X=X, y=y);



In [29]:
# Predict prices for the submission frame with the fitted search object.
submission = grid_search.predict(X=predict[FEATURES])

In [30]:
# Assemble the submission table: the row identifier from the input frame
# next to its predicted price.
submission_df = pd.DataFrame(
    data={
        'id': predict['id'],
        'price': submission,
    }
)

In [31]:
# Bound the predicted prices to the range [0, 20000].
# The clipped column is assigned back explicitly: calling an in-place
# method on `submission_df.price` is a chained operation that pandas warns
# about and that no longer updates the frame under copy-on-write.
submission_df['price'] = submission_df['price'].clip(lower=0, upper=20000)

In [32]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv(
    'submissions/submission_lightgbm_minmaxscaler_2.csv',
    index=False,
)