In [6]:
# Build model for watch price utilizing linear regression and lightgbm
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
import lightgbm as lgb
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline, Pipeline


# change file path to data
fileName = '../data/train.csv'
df = pd.read_csv(fileName)

def split_data(X, y, frac: float = 0.2) -> tuple:
    """Splits data so that it returns a train / test split with a given fraction"""

    # cutoff point for training / test split
    idx_cutoff = int(X.shape[0] * (1 - frac))

    X_train, X_test, y_train, y_test = X.iloc[:idx_cutoff], X.iloc[idx_cutoff:], y.iloc[:idx_cutoff], y.iloc[idx_cutoff:]

    return X_train, X_test, y_train, y_test

In [7]:
df = df.drop(['Unnamed: 0.1','Unnamed: 0'], axis = 1 )
df

Unnamed: 0,listing__statPrice,CaseSize,allDiamond,preciousStone,someDiamonds,braceletRubber,braceletLeather,braceletFabric,braceletMetal,caseGold,...,Productairking,Productgmtmaster,Productyachtmaster,Productsubmariner,Productcosmographdaytona,Productseadweller,Productskydweller,Productexplorer,Productmilgauss,Productother
0,12224.64000,41.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,29339.13600,42.0,0,0,0,1,0,0,1,1,...,0,0,1,0,0,0,0,0,0,0
2,9412.97280,41.0,0,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,12530.25600,41.0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,14879.21864,43.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841,16204.42500,44.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
842,24134.25000,40.0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
843,19446.36750,40.0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
844,20687.62500,40.0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0


In [8]:
col_list_train = list(df.columns)
col_list_train.remove('listing__statPrice')
# col_list_train.remove('id')


dfX_train = df[col_list_train]
dfy_train = df['listing__statPrice']

X_train, X_test, y_train, y_test = split_data(dfX_train, dfy_train, 0.2)

X_train.head()

Unnamed: 0,CaseSize,allDiamond,preciousStone,someDiamonds,braceletRubber,braceletLeather,braceletFabric,braceletMetal,caseGold,caseYellow,...,Productairking,Productgmtmaster,Productyachtmaster,Productsubmariner,Productcosmographdaytona,Productseadweller,Productskydweller,Productexplorer,Productmilgauss,Productother
0,41.0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,42.0,0,0,0,1,0,0,1,1,1,...,0,0,1,0,0,0,0,0,0,0
2,41.0,0,0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,41.0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,43.0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [9]:
stdScale = StandardScaler()
stdScale.fit_transform(X_train)
stdScale.transform(X_test)

#Set the minimum error arbitrarily large
min = 99999999999999999999999 
count = 0 #Used for keeping track of the iteration number
#How many runs to perform using randomly selected hyperparameters
iterations = 1000
for i in range(iterations):
    print('iteration number', count)
    count += 1 #increment count

    param = {} #initialize parameters
    param['learning_rate'] = np.random.uniform(0, 1)
    # param['num_iterations'] = np.random.randint(100,1000)
    param['boosting_type'] = np.random.choice(['gbdt'])
    # param['objective'] = 'binary'
    param['metric'] = 'mse'
    param['feature_fraction'] = np.random.uniform(0, 1)
    param['num_leaves'] = np.random.randint(5, 300)
    param['min_data_in_leaf'] = np.random.randint(5, 200)
    param['max_depth'] = np.random.randint(5, 300)
    param['early_stopping_round'] = 5
    iterations = np.random.randint(10, 10000)
    print(param, iterations)#Train using selected parameters
    lgbm = LGBMRegressor(**param)
    lgbm.fit(X_train,y_train, eval_set = [(X_test, y_test) , (X_train, y_train)], eval_metric = 'rmse')
    prediction = lgbm.predict(X_test)
    logloss = mean_squared_error(y_true = y_test, y_pred = prediction, squared = False)

    print('logloss:', logloss)
    if logloss < min:
        min = logloss
        pp = param

print("*" * 100)
print('Minimum is: ', min)
print('Used params', pp)

iteration number 0
{'learning_rate': 0.7920471197082649, 'boosting_type': 'gbdt', 'metric': 'mse', 'feature_fraction': 0.13914530127682934, 'num_leaves': 116, 'min_data_in_leaf': 85, 'max_depth': 230, 'early_stopping_round': 5} 7361
[1]	training's rmse: 17457.9	training's l2: 3.04778e+08	valid_0's rmse: 18178.1	valid_0's l2: 3.30443e+08
[2]	training's rmse: 17286.5	training's l2: 2.98821e+08	valid_0's rmse: 18174.6	valid_0's l2: 3.30315e+08
[3]	training's rmse: 17239.8	training's l2: 2.97211e+08	valid_0's rmse: 18282.3	valid_0's l2: 3.34241e+08
[4]	training's rmse: 17231.7	training's l2: 2.96932e+08	valid_0's rmse: 18248.2	valid_0's l2: 3.32998e+08
[5]	training's rmse: 17096.1	training's l2: 2.92277e+08	valid_0's rmse: 18319.1	valid_0's l2: 3.35588e+08
[6]	training's rmse: 16647.4	training's l2: 2.77137e+08	valid_0's rmse: 18214.3	valid_0's l2: 3.31761e+08
[7]	training's rmse: 16641	training's l2: 2.76922e+08	valid_0's rmse: 18181.6	valid_0's l2: 3.30569e+08
logloss: 18174.561719493176

KeyboardInterrupt: 

In [15]:
stdScale = StandardScaler()
stdScale.fit_transform(X_train)
stdScale.transform(X_test)

lgbm_train = LGBMRegressor(learning_rate = 0.12672753417697025, boosting_type = 'gbdt', metric = 'rmse', feature_fraction = 0.25975871059387023, num_leaves = 261, min_data_in_leaf = 11, max_depth = 137, early_stopping_round = 5)
lgbm_train.fit(X_train,y_train, eval_set = [(X_test, y_test) , (X_train, y_train)], eval_metric = 'rmse')
prediction = lgbm_train.predict(X_test)
print(mean_squared_error(y_true = y_test, y_pred = prediction, squared = False))
# lgb.plot_importance(lgbm)
# print('Training accuracy {:.4f}'.format(lgbm.score(X_train,y_train)))
# print('Testing accuracy {:.4f}'.format(lgbm.score(X_test,y_test)))
prediction

[1]	training's rmse: 19053.2	valid_0's rmse: 19209.1
[2]	training's rmse: 18628.9	valid_0's rmse: 18819.6
[3]	training's rmse: 18443.6	valid_0's rmse: 18744.8
[4]	training's rmse: 18314.4	valid_0's rmse: 18604.4
[5]	training's rmse: 17572.5	valid_0's rmse: 17880.9
[6]	training's rmse: 16735.2	valid_0's rmse: 17168.4
[7]	training's rmse: 16379.6	valid_0's rmse: 16883.2
[8]	training's rmse: 15832.4	valid_0's rmse: 16312.8
[9]	training's rmse: 15397.2	valid_0's rmse: 15908.3
[10]	training's rmse: 14933.1	valid_0's rmse: 15349.7
[11]	training's rmse: 14641.3	valid_0's rmse: 15158.8
[12]	training's rmse: 14439.5	valid_0's rmse: 14921
[13]	training's rmse: 14332.2	valid_0's rmse: 14838.4
[14]	training's rmse: 14173.6	valid_0's rmse: 14729.5
[15]	training's rmse: 14085.8	valid_0's rmse: 14659.5
[16]	training's rmse: 13945.2	valid_0's rmse: 14542.3
[17]	training's rmse: 13670.5	valid_0's rmse: 14161.1
[18]	training's rmse: 13533.6	valid_0's rmse: 14130.7
[19]	training's rmse: 13395.7	valid_0's

array([78808.51887303, 14675.1694352 , 11878.28407117, 15755.90122415,
       10119.39340988, 22555.09507796, 17394.14565545, 22275.30976402,
       11536.7117447 , 46046.06060578, 26137.83445732, 30317.38533026,
       56337.11612819, 23778.34227693, 15755.90122415, 20023.20662605,
       14235.15883304,  6135.55177319, 43553.19735014, 15070.79372562,
       30317.38533026, 10478.61358305, 13662.865009  , 51500.68772693,
       11906.57720815, 28425.09350402, 17182.97866719, 16193.2737767 ,
       10297.88257   , 44070.68339007, 14401.31999005, 44070.68339007,
       12771.77059372,  9926.03271328, 15513.00493411, 11160.41180384,
       13711.63318423, 15755.90122415, 24164.1769584 , 40173.84384539,
        8923.14251872, 41154.01706535, 14783.57388118, 14351.96280294,
       23073.02476474, 11312.94449821, 12771.77059372, 44070.68339007,
       14383.50611011, 46038.68732328, 14235.15883304, 11536.7117447 ,
       18165.46592811,  9926.03271328, 17182.97866719, 13016.7294932 ,
      

In [17]:
# Model for linear regression
mod_pipeline = Pipeline([('scaler', StandardScaler()), ('linearRegression', LinearRegression())])
mod_pipeline.fit(X_train, y_train)
test_score = mod_pipeline.score(X_test, y_test)
test_score

0.543308485096857

In [None]:
filename = 'watches_lgbm_model.pkl'
pickle.dump(lgbm, open(filename, 'wb'))

