In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler


In [None]:
# Load the pre-computed feature tables, one train/test pair per feature family.

# Category features; these frames also hold the 'price' column used as the target.
category_train = pd.read_csv('df_train_category.csv')
category_test = pd.read_csv('df_test_category.csv')

# TF-IDF text features.
tfidf = pd.read_csv('df_train_tfidf.csv')
tfidf_test = pd.read_csv('df_test_tfidf.csv')

# Word2vec text features.
# NOTE(review): the "150" in the file name is presumably the embedding size -- confirm.
w2v = pd.read_csv('df_train_w2v_150.csv')
w2v_test = pd.read_csv('df_test_w2v_150.csv')

# BERT embeddings, stored as JSON rather than CSV.
bert = pd.read_json('train_small_embeddings.json')
bert_test = pd.read_json('test_embeddings_new.json')

In [None]:
# Special treatment for the BERT training embeddings: discard the row labelled 0
# and renumber the remaining rows from 0.
# NOTE(review): why the first row is surplus is not visible in this notebook --
# confirm against how 'train_small_embeddings.json' was generated.
# reset_index(drop=True) discards the old index directly instead of first
# materialising it as an 'index' column and deleting it again. `bert` itself is
# never modified, so this cell is safe to re-run.
bert_final = bert.drop(0).reset_index(drop=True)

In [None]:
# Form Train and Test Data
# The regression target is read positionally from the last column of each
# category frame.
# NOTE(review): this assumes 'price' is the last column -- confirm; the column
# is removed by name just below. Re-running this cell after the deletion would
# pick up a different last column before failing on the missing 'price'.
price_train = category_train.iloc[:, -1]
price_test = category_test.iloc[:, -1]

# Take the target out of the feature frames.
for frame in (category_train, category_test):
    del frame['price']

# Strip the 'Unnamed: 0' column from every CSV-backed frame.
# NOTE(review): presumably the row index written out by DataFrame.to_csv -- confirm.
for frame in (category_train, category_test, w2v, w2v_test, tfidf, tfidf_test):
    del frame['Unnamed: 0']

In [None]:
# Attach each text representation to the category features, side by side.
# pd.concat aligns rows on the index, so both frames in a pair must share one.

# word2vec
w2v_train_total = pd.concat([category_train, w2v], axis="columns")
w2v_test_total = pd.concat([category_test, w2v_test], axis="columns")

# TF-IDF
tfidf_train_total = pd.concat([category_train, tfidf], axis="columns")
tfidf_test_total = pd.concat([category_test, tfidf_test], axis="columns")

# BERT (the training side uses the re-indexed `bert_final`)
bert_train_total = pd.concat([category_train, bert_final], axis="columns")
bert_test_total = pd.concat([category_test, bert_test], axis="columns")

In [None]:
# "Super" feature set: category features plus all three text representations,
# joined column-wise in the order category, word2vec, TF-IDF, BERT.
super_train = pd.concat(
    [category_train, w2v, tfidf, bert_final],
    axis="columns",
)
super_test = pd.concat(
    [category_test, w2v_test, tfidf_test, bert_test],
    axis="columns",
)

In [None]:
# Normalized Price
# The scaler is fitted on a 2-D array, so the price Series is turned into a
# single (n, 1) column first.
modified_price_train = price_train.to_numpy().reshape(-1, 1)

# The fitted scaler stays at notebook level so that predictions made on the
# normalised scale can be mapped back with scaler.inverse_transform.
scaler = MinMaxScaler()
normalized_price_train = scaler.fit_transform(modified_price_train)

In [None]:
# Log Price
# A price of 0 makes np.log return -inf and emit a "divide by zero"
# RuntimeWarning. That case is repaired just below, so the expected warning is
# silenced for this one call only.
with np.errstate(divide='ignore'):
    log_price_train = np.log(price_train)

# Every negative log-price (prices below 1, including -inf from a zero price)
# is replaced by the smallest strictly positive log-price.
positive_min = log_price_train[log_price_train > 0].min()
# A boolean mask selects the rows unambiguously; integer positions from
# np.where only match the Series labels when the index is the default 0..n-1.
log_price_train[log_price_train < 0] = positive_min

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [None]:
def Linear(train_data, test_data, data_type):
    """Fit a linear regression on the raw price and print its test MSLE.

    Reads the notebook-level ``price_train`` (fit target) and ``price_test``
    (ground truth for scoring). Nothing is returned; the score is printed.

    Args:
        train_data: Feature matrix the model is fitted on.
        test_data: Feature matrix the model predicts for.
        data_type: Name of the feature set, spliced into the printed label.
    """
    model = make_pipeline(LinearRegression())
    model.fit(train_data, price_train)
    predictions = model.predict(test_data)

    # Negative predictions are overwritten with the smallest positive
    # prediction before the log-based metric is computed.
    smallest_positive = predictions[predictions > 0].min()
    predictions[predictions < 0] = smallest_positive

    score = metrics.mean_squared_log_error(price_test, predictions)
    print("MSLE (Linear + " + data_type + "): ", score)


# Score the raw-price linear model on each feature set in turn.
for train_frame, test_frame, label in (
    (category_train, category_test, 'category'),
    (w2v_train_total, w2v_test_total, 'w2v'),
    (tfidf_train_total, tfidf_test_total, 'tfidf'),
    (bert_train_total, bert_test_total, 'bert'),
    (super_train, super_test, 'super'),
):
    Linear(train_frame, test_frame, label)

MSLE (Linear + category):  0.4396545188610682
MSLE (Linear + w2v):  0.5493669604531711
MSLE (Linear + tfidf):  0.5076700925010251




MSLE (Linear + bert):  0.5641962525805007




MSLE (Linear + super):  0.7351868262013119


In [None]:
def Linear_log(train_data, test_data, data_type):
    """Fit a linear regression on the log price and print its test MSLE.

    The model is trained on the notebook-level ``log_price_train``; its
    predictions are mapped back to the price scale with ``np.exp`` and scored
    against the notebook-level ``price_test``. Nothing is returned.

    Args:
        train_data: Feature matrix the model is fitted on.
        test_data: Feature matrix the model predicts for.
        data_type: Name of the feature set, spliced into the printed label.
    """
    pipe = make_pipeline(LinearRegression())
    pipe.fit(train_data, log_price_train)
    # np.exp never yields a negative number, so no clamping of negative
    # predictions is needed before the log-based metric.
    predict_return = np.exp(pipe.predict(test_data))
    # The label names the model actually used here (linear regression).
    print("MSLE (Linear + log Label + " + data_type + "):", metrics.mean_squared_log_error(price_test, predict_return))

# Score the log-price linear model on each feature set in turn.
# NOTE(review): the 'super' feature set is not evaluated here, unlike in the
# other model cells -- confirm whether that is intentional.
for train_frame, test_frame, label in (
    (category_train, category_test, 'category'),
    (w2v_train_total, w2v_test_total, 'w2v'),
    (tfidf_train_total, tfidf_test_total, 'tfidf'),
    (bert_train_total, bert_test_total, 'bert'),
):
    Linear_log(train_frame, test_frame, label)

MSLE (XGBoost + log Label + category): 0.3996333560367983
MSLE (XGBoost + log Label + w2v): 0.42270942857848504
MSLE (XGBoost + log Label + tfidf): 0.35645296532473925




MSLE (XGBoost + log Label + bert): 0.34709181450499055


In [None]:
def XG(train_data, test_data, data_type):
    """Fit an XGBoost regressor on the raw price and print its test MSLE.

    Reads the notebook-level ``price_train`` (fit target) and ``price_test``
    (ground truth for scoring). Nothing is returned; the score is printed.

    Args:
        train_data: Feature matrix the model is fitted on.
        test_data: Feature matrix the model predicts for.
        data_type: Name of the feature set, spliced into the printed label.
    """
    model = make_pipeline(XGBRegressor(random_state=42))
    model.fit(train_data, price_train)
    predictions = model.predict(test_data)

    # Negative predictions are overwritten with the smallest positive
    # prediction before the log-based metric is computed.
    smallest_positive = predictions[predictions > 0].min()
    predictions[predictions < 0] = smallest_positive

    score = metrics.mean_squared_log_error(price_test, predictions)
    print("MSLE (XGBoost + " + data_type + "):", score)



# Score the raw-price XGBoost model on each feature set in turn.
for train_frame, test_frame, label in (
    (category_train, category_test, 'category'),
    (w2v_train_total, w2v_test_total, 'w2v'),
    (tfidf_train_total, tfidf_test_total, 'tfidf'),
    (bert_train_total, bert_test_total, 'bert'),
    (super_train, super_test, 'super'),
):
    XG(train_frame, test_frame, label)

MSLE (XGBoost + category): 0.35638876531754404
MSLE (XGBoost + w2v): 0.4483697674694636
MSLE (XGBoost + tfidf): 0.35287508452320837
MSLE (XGBoost + bert): 0.3737473963848344
MSLE (XGBoost + super): 0.39602586545172186


In [None]:
def XG_Normalized(train_data, test_data, data_type):
    """Fit an XGBoost regressor on the min-max-normalised price and print its test MSLE.

    The model is trained on the notebook-level ``normalized_price_train``.
    Predictions are mapped back to prices with the notebook-level ``scaler``
    and scored against ``price_test``. Nothing is returned.

    Args:
        train_data: Feature matrix the model is fitted on.
        test_data: Feature matrix the model predicts for.
        data_type: Name of the feature set, spliced into the printed label.
    """
    model = make_pipeline(XGBRegressor(random_state=42))
    model.fit(train_data, normalized_price_train)

    # inverse_transform expects a 2-D array, hence the single-column reshape.
    scaled_predictions = model.predict(test_data).reshape(-1, 1)
    prices = scaler.inverse_transform(scaled_predictions)

    # Negative prices are overwritten with the smallest positive predicted
    # price before the log-based metric is computed.
    smallest_positive = prices[prices > 0].min()
    prices[prices < 0] = smallest_positive

    score = metrics.mean_squared_log_error(price_test, prices)
    print("MSLE (XGBoost + normalized Label + " + data_type + "):", score)

# Score the normalised-price XGBoost model on each feature set in turn.
for train_frame, test_frame, label in (
    (category_train, category_test, 'category'),
    (w2v_train_total, w2v_test_total, 'w2v'),
    (tfidf_train_total, tfidf_test_total, 'tfidf'),
    (bert_train_total, bert_test_total, 'bert'),
    (super_train, super_test, 'super'),
):
    XG_Normalized(train_frame, test_frame, label)

def XG_log(train_data, test_data, data_type):
    """Fit an XGBoost regressor on the log price and print its test MSLE.

    The model is trained on the notebook-level ``log_price_train``; its
    predictions are mapped back to the price scale with ``np.exp`` and scored
    against the notebook-level ``price_test``. Nothing is returned.

    Args:
        train_data: Feature matrix the model is fitted on.
        test_data: Feature matrix the model predicts for.
        data_type: Name of the feature set, spliced into the printed label.
    """
    pipe = make_pipeline(XGBRegressor(random_state=42))
    pipe.fit(train_data, log_price_train)
    # np.exp never yields a negative number, so no clamping of negative
    # predictions is needed before the log-based metric.
    predict_return = np.exp(pipe.predict(test_data))
    print("MSLE (XGBoost + log Label + " + data_type + "):", metrics.mean_squared_log_error(price_test, predict_return))

# Score the log-price XGBoost model on each feature set in turn.
for train_frame, test_frame, label in (
    (category_train, category_test, 'category'),
    (w2v_train_total, w2v_test_total, 'w2v'),
    (tfidf_train_total, tfidf_test_total, 'tfidf'),
    (bert_train_total, bert_test_total, 'bert'),
    (super_train, super_test, 'super'),
):
    XG_log(train_frame, test_frame, label)


MSLE (XGBoost + normalized Label + category): 0.3557016113412558
MSLE (XGBoost + normalized Label + w2v): 0.4469841751373628
MSLE (XGBoost + normalized Label + tfidf): 0.3536965761169831
MSLE (XGBoost + normalized Label + bert): 0.3770917192168319
MSLE (XGBoost + normalized Label + super): 0.41001295321384806
MSLE (XGBoost + log Label + category): 0.3087778117991437
MSLE (XGBoost + log Label + w2v): 0.36555334966865266
MSLE (XGBoost + log Label + tfidf): 0.29617681011086044
MSLE (XGBoost + log Label + bert): 0.30676940634788835
MSLE (XGBoost + log Label + super): 0.31098740549209364


In [None]:
def RandomForest(train_data, test_data, data_type):
    """Fit a depth-5 random forest on the raw price and print its test MSLE.

    Reads the notebook-level ``price_train`` (fit target) and ``price_test``
    (ground truth for scoring). Nothing is returned; the score is printed.

    Args:
        train_data: Feature matrix the model is fitted on.
        test_data: Feature matrix the model predicts for.
        data_type: Name of the feature set, spliced into the printed label.
    """
    forest = RandomForestRegressor(random_state=42, max_depth=5, n_jobs=-1)
    model = make_pipeline(forest)
    model.fit(train_data, price_train)
    predictions = model.predict(test_data)

    # Negative predictions are overwritten with the smallest positive
    # prediction before the log-based metric is computed.
    smallest_positive = predictions[predictions > 0].min()
    predictions[predictions < 0] = smallest_positive

    score = metrics.mean_squared_log_error(price_test, predictions)
    print("MSLE (Random Forest + " + data_type + "): ", score)

# Score the raw-price random forest on each feature set in turn.
for train_frame, test_frame, label in (
    (category_train, category_test, 'category'),
    (w2v_train_total, w2v_test_total, 'w2v'),
    (tfidf_train_total, tfidf_test_total, 'tfidf'),
    (bert_train_total, bert_test_total, 'bert'),
    (super_train, super_test, 'super'),
):
    RandomForest(train_frame, test_frame, label)

MSLE (Random Forest + category):  0.4136492437668892
MSLE (Random Forest + w2v):  0.41510469578984405
MSLE (Random Forest + tfidf):  0.41615259723602505




MSLE (Random Forest + bert):  0.41623465349406996




MSLE (Random Forest + super):  0.41860579041382767


In [None]:
def RandomForest_Normalized(train_data, test_data, data_type):
    """Fit a depth-5 random forest on the min-max-normalised price and print its test MSLE.

    The model is trained on the notebook-level ``normalized_price_train``.
    Predictions are mapped back to prices with the notebook-level ``scaler``
    and scored against ``price_test``. Nothing is returned.

    Args:
        train_data: Feature matrix the model is fitted on.
        test_data: Feature matrix the model predicts for.
        data_type: Name of the feature set, spliced into the printed label.
    """
    pipe = make_pipeline(RandomForestRegressor(random_state=42, max_depth=5, n_jobs=-1))
    # The normalised target is an (n, 1) column; scikit-learn warns on every
    # fit when a forest receives a column-vector y, so it is flattened to 1-D.
    pipe.fit(train_data, np.ravel(normalized_price_train))

    # inverse_transform expects a 2-D array, hence the single-column reshape.
    predict = pipe.predict(test_data).reshape(-1, 1)
    predict_return = scaler.inverse_transform(predict)

    # Negative prices are overwritten with the smallest positive predicted
    # price before the log-based metric is computed.
    positive_min = predict_return[predict_return > 0].min()
    predict_return[predict_return < 0] = positive_min
    print("MSLE (Random Forest + normalized Label + " + data_type + "): ", metrics.mean_squared_log_error(price_test, predict_return))

# Score the normalised-price random forest on each feature set in turn.
for train_frame, test_frame, label in (
    (category_train, category_test, 'category'),
    (w2v_train_total, w2v_test_total, 'w2v'),
    (tfidf_train_total, tfidf_test_total, 'tfidf'),
    (bert_train_total, bert_test_total, 'bert'),
    (super_train, super_test, 'super'),
):
    RandomForest_Normalized(train_frame, test_frame, label)

def RandomForest_log(train_data, test_data, data_type):
    """Fit a depth-5 random forest on the log price and print its test MSLE.

    The model is trained on the notebook-level ``log_price_train``; its
    predictions are mapped back to the price scale with ``np.exp`` and scored
    against the notebook-level ``price_test``. Nothing is returned.

    Args:
        train_data: Feature matrix the model is fitted on.
        test_data: Feature matrix the model predicts for.
        data_type: Name of the feature set, spliced into the printed label.
    """
    pipe = make_pipeline(RandomForestRegressor(random_state=42, max_depth=5, n_jobs=-1))
    pipe.fit(train_data, log_price_train)
    # np.exp never yields a negative number, so no clamping of negative
    # predictions is needed before the log-based metric.
    predict_return = np.exp(pipe.predict(test_data))
    print("MSLE (Random Forest + log Label + " + data_type + "):", metrics.mean_squared_log_error(price_test, predict_return))

# Score the log-price random forest on each feature set in turn.
for train_frame, test_frame, label in (
    (category_train, category_test, 'category'),
    (w2v_train_total, w2v_test_total, 'w2v'),
    (tfidf_train_total, tfidf_test_total, 'tfidf'),
    (bert_train_total, bert_test_total, 'bert'),
    (super_train, super_test, 'super'),
):
    RandomForest_log(train_frame, test_frame, label)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


MSLE (Random Forest + normalized Label + category):  0.41365021082300985


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


MSLE (Random Forest + normalized Label + w2v):  0.4151313378661671


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


MSLE (Random Forest + normalized Label + tfidf):  0.41614798016055854


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


MSLE (Random Forest + normalized Label + bert):  0.41627010865773556


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


MSLE (Random Forest + normalized Label + super):  0.41857295757882973
MSLE (Random Forest + log Label + category): 0.35572829207366435
MSLE (Random Forest + log Label + w2v): 0.35613996408353543
MSLE (Random Forest + log Label + tfidf): 0.35511534596717326




MSLE (Random Forest + log Label + bert): 0.35560370806980995




MSLE (Random Forest + log Label + super): 0.3551755422429357
