In [19]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils import check_random_state

try:
    # train_test_split lives in model_selection in current scikit-learn
    from sklearn.model_selection import train_test_split
except ImportError:
    # fall back to the legacy module on old scikit-learn installs
    from sklearn.cross_validation import train_test_split



In [23]:
# Load the dataset, discard the ASIN column and one-hot encode whatever
# categorical columns remain.
data = pd.get_dummies(
    pd.read_csv('./data/final_regression_dataset.csv').drop('ASIN', axis=1)
)

# Target: salesRank, kept as a one-column frame for now.
# NOTE: the target stays on the raw salesRank scale in this cell; the log
# transform (y.apply(np.log)) is left disabled.
y = data[['salesRank']]
# Features: every other column, unchanged.
x = data.drop('salesRank', axis=1)

# split 75% for training and 25% for testing
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=1)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# Flatten the one-column target frames into plain Python lists, only after
# their 2-D shapes have been printed above.
y_train, y_test = (part["salesRank"].tolist() for part in (y_train, y_test))

(15179, 404) (15179, 1) (5060, 404) (5060, 1)


In [25]:
# Baseline: a single decision tree with default hyper-parameters.
# random_state is pinned so that re-running this cell reproduces the same
# tree (scikit-learn breaks ties between equally good splits at random
# unless the estimator is seeded).
base_estimator = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)

# make predictions using DecisionTreeRegressor
y_pred = base_estimator.predict(X_test)
# RMSE is measured on the held-out test split.
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
# NOTE: the R2 printed below is computed on the *training* split
# (score(X_train, y_train)), so it is not comparable to the test RMSE.
print("RMSE in terms of sales rank = ",rmse, "R2 = ", base_estimator.score(X_train,y_train))

RMSE in terms of sales rank =  78442.4170835 R2 =  1.0


In [28]:
# Next try ensembling
# Seeded RandomState; it is re-created every time this cell runs, so the
# fit below is reproducible on re-execution.
rng = check_random_state(0)

# without bootstrap, all trees are perfect on the training set
model = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                            max_samples=1.0,
                            bootstrap=False,
                            random_state=rng).fit(X_train, y_train)
# make predictions using Bagging Regressor
y_pred = model.predict(X_test)
# RMSE is measured on the held-out test split.
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
# The target is the raw salesRank (no log transform is applied when the data
# is prepared), so the metric is labelled in sales-rank units.
# NOTE: the R2 printed below is computed on the *training* split.
print("RMSE in terms of sales rank = ",rmse, "R2 = ", model.score(X_train,y_train))

RMSE in terms of log(sales rank) =  73141.7753639 R2 =  1.0


In [31]:
# ensemble with bootstrapping and more estimators
# An integer seed is passed instead of the shared `rng` object: that
# RandomState has already been advanced by the previous fit, so reusing it
# would make this cell's result depend on how many times (and in what
# order) cells were executed.
model = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                            max_samples=1.0,
                            bootstrap=True,
                            random_state=0,
                            n_estimators=250
                        ).fit(X_train, y_train)
# make predictions using Bagging Regressor
y_pred = model.predict(X_test)
# RMSE is measured on the held-out test split.
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
# The target is the raw salesRank (no log transform is applied when the data
# is prepared), so the metric is labelled in sales-rank units.
# NOTE: the R2 printed below is computed on the *training* split.
print("RMSE in terms of sales rank = ",rmse, "R2 = ", model.score(X_train,y_train))

RMSE in terms of log(sales rank) =  56886.0274835 R2 =  0.860026387242


In [None]:
# try random forests