In [20]:
# Build model for watch price utilizing linear regression and lightgbm
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
import lightgbm as lgb
import pickle
from sklearn.pipeline import make_pipeline, Pipeline
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from datetime import date
from tqdm import tqdm
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

In [21]:
# Input CSV locations, written as relative paths.
file, file2 = (
    'data/dataWithCurrencyVer002.csv',
    'data/reformatedAndOneHotEncodedDataVer011.csv',
)

In [22]:
# Load the listings, using the first CSV column as the index.
df = pd.read_csv(file, index_col=0)

# Reverse the row order (re-ordering to ascending order) and round the price to 2 decimals.
df = df.iloc[::-1]
df = df.assign(listing__statPrice=df['listing__statPrice'].round(2))

# Drop every index label that carries one of the two known-bad 'Age' strings.
df = df.drop(index=df.index[df['Age'].isin(['41mm', '2918'])])

# Take the first four characters of 'Age' as an integer and subtract it from the
# current calendar year, so the value depends on the date the cell is run.
df['Age'] = date.today().year - df['Age'].str[:4].astype('int')

In [None]:
# Mark missing values with the literal string 'Missing'. The frame is rebound rather
# than mutated in place so that re-running this cell always gives the same result.
#
# 'Age' is deliberately NOT recomputed here: the loading cell already converts it to
# `current year - year`. Applying `date.today().year - Age` a second time would turn
# the age back into the original year (and flip it again on every re-run).
df = df.replace(np.nan, 'Missing')

In [None]:
# Preview the first rows of df.
df.head()

### Data Transformations

In [None]:
# Target: the listing price.
y = df['listing__statPrice']

# Features: every remaining column except the target itself and three columns that are
# excluded from modelling. The target has to be dropped here; otherwise the label
# travels inside the feature matrix and can leak into any model that uses all columns.
X = df.drop(columns=['listing__statPrice', 'product-subtitle', 'Model', 'LOT'])

# shuffle=False keeps the row order (time-ordered data): the last 20% of rows become the
# test set, and the last 20% of the remaining rows become the validation set.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, shuffle=False, test_size=0.2)

In [None]:
# Columns routed to the categorical branch of the ColumnTransformer.
categorical_cols = [
    'language',
    'country',
    'rating',
]

In [None]:
# Categorical preprocessing: a single one-hot encoding step named "OHE".
#   - handle_unknown='error': transforming a category that was not seen during fit
#     raises; change it to 'ignore' if that error is raised.
#   - drop='first': the first category of each feature gets no column of its own.
categorical_preprocessor = Pipeline([
    ("OHE", OneHotEncoder(drop='first', handle_unknown='error')),
])

In [None]:
# Combine preprocessors.
# Only the categorical branch is registered. The text (TfidfVectorizer) branch was dropped
# because it does not help the model; its `text_preprocesser` object is not defined anywhere
# in the visible notebook, so keeping it in the transformer list raised a NameError.
preprocessor = ColumnTransformer(
    transformers=[
        ('category', categorical_preprocessor, categorical_cols),
    ])

In [None]:
# Location of the dataset used from here on; change this path to point at your data.
fileName = 'data/reformatedAndOneHotEncodedDataVer011.csv'

# Re-bind `df` to this file's contents (default integer index, rows in file order).
df = pd.read_csv(filepath_or_buffer=fileName)

def split_data(X, y, frac: float = 0.2) -> tuple:
    """Split X and y by position into a leading train part and a trailing test part.

    Args:
        X: Features; must expose ``.shape`` and ``.iloc`` (e.g. a DataFrame).
        y: Target; must expose ``.iloc`` and be aligned row-for-row with X.
        frac: Fraction of rows (taken from the end) that goes to the test part.

    Returns:
        ``(X_train, X_test, y_train, y_test)``; no shuffling is performed.
    """
    # Number of leading rows kept for training (truncated towards zero).
    n_train = int(X.shape[0] * (1 - frac))
    train_rows, test_rows = slice(None, n_train), slice(n_train, None)

    return X.iloc[train_rows], X.iloc[test_rows], y.iloc[train_rows], y.iloc[test_rows]

In [None]:
# Displays df re-indexed with its index labels in reverse order.
# NOTE(review): the result is not assigned, so df itself keeps its current row
# order — confirm that only the display is intended here.
df.reindex(index=df.index[::-1])

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Feature columns: every column of df except the target.
col_list_train = df.columns.tolist()
col_list_train.remove('listing__statPrice')

# Separate features and target, then hold out the trailing 20% of rows as the test set.
dfX_train, dfy_train = df[col_list_train], df['listing__statPrice']
X_train, X_test, y_train, y_test = split_data(dfX_train, dfy_train, frac=0.2)

X_train.head()

In [None]:
# This cell used to repeat the previous cell line for line (same column list, same
# split_data call), which recomputed an identical split. It now only validates the
# split that already exists and shows the same preview.
assert len(X_train) + len(X_test) == len(df), "train/test split must cover every row exactly once"
assert len(X_train) == len(y_train) and len(X_test) == len(y_test), "features and target are misaligned"
assert 'listing__statPrice' not in X_train.columns, "target column leaked into the features"

X_train.head()

In [None]:
# Random search over LightGBM hyperparameters.
#
# Changes relative to the earlier version of this cell:
#   * The StandardScaler calls were removed: their return values were discarded, so the
#     model was always trained on the unscaled frames anyway.
#   * The best score is tracked in `best_rmse` instead of a variable named `min`, which
#     shadowed the builtin for every later cell.
#   * The loop bound is a constant; previously `iterations` was overwritten inside the loop
#     with a random number that was printed but never passed to the model.
#   * The score is an RMSE (it was labelled "logloss"), computed with np.sqrt(MSE) like the
#     linear-regression cell, because `squared=False` is deprecated in scikit-learn.
#   * Sampling uses a seeded generator so the search is reproducible.
#
# NOTE(review): candidates are early-stopped and ranked on (X_test, y_test), so the winning
# RMSE is an optimistic estimate — consider a separate validation split.
N_SEARCH_ITERATIONS = 1000  # how many random configurations to try
SEARCH_SEED = 42

rng = np.random.default_rng(SEARCH_SEED)

best_rmse = float('inf')
best_params = None

for i in range(N_SEARCH_ITERATIONS):
    print('iteration number', i)

    # Draw one configuration. Plain Python scalars keep the parameter dict printable
    # and picklable; integer upper bounds are exclusive, as with np.random.randint.
    param = {
        # LightGBM requires learning_rate > 0 and 0 < feature_fraction <= 1,
        # so the lower bound is kept strictly positive.
        'learning_rate': float(rng.uniform(1e-3, 1.0)),
        'boosting_type': 'gbdt',
        'metric': 'mse',
        'feature_fraction': float(rng.uniform(1e-3, 1.0)),
        'num_leaves': int(rng.integers(5, 300)),
        'min_data_in_leaf': int(rng.integers(5, 200)),
        'max_depth': int(rng.integers(5, 300)),
        'early_stopping_round': 5,
    }
    print(param)

    # Train using the selected parameters.
    lgbm = LGBMRegressor(**param)
    lgbm.fit(X_train, y_train, eval_set=[(X_test, y_test), (X_train, y_train)], eval_metric='rmse')
    prediction = lgbm.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=prediction))

    print('RMSE:', rmse)
    if rmse < best_rmse:
        best_rmse = rmse
        best_params = param

# Kept under the old name as well, for any later cell that still reads `pp`.
pp = best_params

print("*" * 100)
print('Minimum is: ', best_rmse)
print('Used params', best_params)

In [None]:
# Final LightGBM model with fixed hyperparameters.
# (Presumably the best configuration found by the random search — confirm.)
#
# The StandardScaler calls that used to open this cell were removed: their outputs were
# never assigned, so the model was fitted on the unscaled frames regardless.
lgbm_train = LGBMRegressor(
    learning_rate=0.12672753417697025,
    boosting_type='gbdt',
    metric='rmse',
    feature_fraction=0.25975871059387023,
    num_leaves=261,
    min_data_in_leaf=11,
    max_depth=137,
    early_stopping_round=5,
)
lgbm_train.fit(X_train, y_train, eval_set=[(X_test, y_test), (X_train, y_train)], eval_metric='rmse')
prediction = lgbm_train.predict(X_test)

# Test RMSE. np.sqrt(MSE) replaces `squared=False`, which is deprecated in scikit-learn,
# and matches how the linear-regression cell computes RMSE.
print(np.sqrt(mean_squared_error(y_true=y_test, y_pred=prediction)))
prediction

In [None]:
# Baseline: standardise the features, then fit an ordinary linear regression.
mod_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('linearRegression', LinearRegression()),
])

# fit() returns the pipeline itself, so the test-set score can be chained onto it.
test_score = mod_pipeline.fit(X_train, y_train).score(X_test, y_test)
test_score

# Scores recorded from earlier runs:
#    0.543308485096857       with model training recent sales
#   -5.813872016504329e+21   data reversed using onehotencode data
#    0.5415915991663093

In [None]:
# Four pipelines that differ only in the final estimator:
#   preprocessor -> StandardScaler(with_mean=False) -> estimator
# Every pipeline gets its own scaler; the `preprocessor` object is the same instance in all four.
lr_pipe, lasso_pipe, ridge_pipe, rf_pipe = (
    make_pipeline(preprocessor, StandardScaler(with_mean=False), estimator)
    for estimator in (
        LinearRegression(),
        Lasso(),
        Ridge(),
        RandomForestRegressor(random_state=42),
    )
)

In [None]:
# Fit the linear-regression pipeline on the training split and report its score and RMSE.
# NOTE(review): X_val / y_val are only assigned by the train_test_split cell, while
# X_train / X_test are later reassigned by split_data on the re-loaded frame — confirm
# that all three splits come from the same data when the notebook is run top to bottom.
lr_pipe.fit(X_train, y_train)

val_pred, test_pred = lr_pipe.predict(X_val), lr_pipe.predict(X_test)
train_score, val_score, test_score = (
    lr_pipe.score(features, target)
    for features, target in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

print('Linear Regression Results')
print("Train score:", train_score)
print("Val score:", val_score)
print("Test score:", test_score)
print("Val RMSE:", np.sqrt(mean_squared_error(y_val, val_pred)))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, test_pred)))

In [None]:
# Persist the trained LightGBM model to disk.
# `lgbm_train` (the model fitted with the fixed hyperparameters) is saved rather than
# `lgbm`, which is only whichever candidate the random-search loop happened to fit last.
# The `with` block guarantees the file handle is flushed and closed.
filename = 'watches_lgbm_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(lgbm_train, model_file)

