# Use data from the previous stages (Refer to file 001 and 002)
### Predicting log revenue with two algorithms: Random Forest and XGBoost

In [1]:
import pandas as pd 
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
import matplotlib.pyplot as plt
# Show every column when a DataFrame is displayed (None removes the column limit)
pd.options.display.max_columns = None

In [2]:
# Suppress the warnings
warnings.filterwarnings('ignore')

In [4]:
# Load the stage-2 output; the first CSV column is used as the row index
data = pd.read_csv(filepath_or_buffer="./data_pre.csv", index_col=0)

In [5]:
# Preview the first rows of the loaded frame
data.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,visitId,visitNumber,visitStartTime,device.browser,device.deviceCategory,device.isMobile,device.operatingSystem,geoNetwork.city,geoNetwork.continent,geoNetwork.country,geoNetwork.metro,geoNetwork.networkDomain,geoNetwork.region,geoNetwork.subContinent,totals.hits,totals.newVisits,totals.pageviews,totals.sessionQualityDim,totals.timeOnSite,totals.totalTransactionRevenue,totals.transactionRevenue,totals.transactions,trafficSource.adContent,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.campaignCode,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,0,2017-10-16,9549826748224805897,1508200705,4,1508200705,0,0,False,0,0,0,0,0,0,0,0,9,0.0,9,13.0,261.0,17190000,15190000,1,0,0,0,0,0.0,0,0,0.0,0,0,0,0,0
1,1,2017-10-16,3700714855829972615,1508192636,11,1508192636,0,0,False,1,1,0,0,1,0,1,0,15,0.0,12,38.0,285.0,13000000,8000000,1,0,0,0,0,0.0,0,0,0.0,1,1,1,1,1
2,0,2017-10-16,1572225825161580042,1508162218,6,1508162218,1,1,True,2,2,0,0,2,1,1,0,15,0.0,15,42.0,1044.0,65300000,57300000,1,0,0,0,0,0.0,0,0,0.0,0,0,0,0,0
3,1,2017-10-16,7187192533100162289,1508189401,17,1508189401,0,0,False,0,2,0,0,2,0,1,0,18,0.0,16,77.0,514.0,25230000,18230000,1,0,0,0,0,0.0,0,0,0.0,0,1,1,1,1
4,1,2017-10-16,7889233516776348524,1508190484,1,1508190484,0,0,False,0,0,0,0,0,0,0,0,21,1.0,20,62.0,487.0,16310000,12310000,1,0,0,0,0,0.0,0,0,0.0,1,1,1,1,1


In [6]:
# Regression target: natural log of (transaction revenue + 1).
# np.log1p(x) evaluates log(1 + x) directly and stays accurate when x is small.
data['logRevenue'] = np.log1p(data['totals.transactionRevenue'])

In [7]:
# Summary statistics of the target column
data['logRevenue'].describe()

count    18514.000000
mean        17.770575
std          1.186022
min          9.210440
25%         16.953935
50%         17.645455
75%         18.420681
max         23.864375
Name: logRevenue, dtype: float64

In [8]:
# Feature matrix: everything except the revenue columns (the target and the
# raw values it is derived from) and the date / id / timestamp columns.
# `data` itself is left untouched; drop() returns a new frame.
X = data.drop(
    columns=[
        'totals.transactionRevenue',
        'date',
        'logRevenue',
        'totals.totalTransactionRevenue',
        'fullVisitorId',
        'visitId',
        'visitStartTime',
    ]
)
# Target vector
y = data['logRevenue']

In [9]:
# Preview the feature matrix after dropping the non-feature columns
X.head()

Unnamed: 0,channelGrouping,visitNumber,device.browser,device.deviceCategory,device.isMobile,device.operatingSystem,geoNetwork.city,geoNetwork.continent,geoNetwork.country,geoNetwork.metro,geoNetwork.networkDomain,geoNetwork.region,geoNetwork.subContinent,totals.hits,totals.newVisits,totals.pageviews,totals.sessionQualityDim,totals.timeOnSite,totals.transactions,trafficSource.adContent,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.campaignCode,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,0,4,0,0,False,0,0,0,0,0,0,0,0,9,0.0,9,13.0,261.0,1,0,0,0,0,0.0,0,0,0.0,0,0,0,0,0
1,1,11,0,0,False,1,1,0,0,1,0,1,0,15,0.0,12,38.0,285.0,1,0,0,0,0,0.0,0,0,0.0,1,1,1,1,1
2,0,6,1,1,True,2,2,0,0,2,1,1,0,15,0.0,15,42.0,1044.0,1,0,0,0,0,0.0,0,0,0.0,0,0,0,0,0
3,1,17,0,0,False,0,2,0,0,2,0,1,0,18,0.0,16,77.0,514.0,1,0,0,0,0,0.0,0,0,0.0,0,1,1,1,1
4,1,1,0,0,False,0,0,0,0,0,0,0,0,21,1.0,20,62.0,487.0,1,0,0,0,0,0.0,0,0,0.0,1,1,1,1,1


In [10]:
# Hold out 30% of the rows for testing. The fixed seed makes the split, and
# therefore every score reported below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Using Random Forest

In [None]:
# Base random-forest regressor used by the grid search; n_jobs=-1 uses all cores.
# random_state pins the bootstrap / feature sampling so reruns give the same forest.
rf = RandomForestRegressor(n_jobs=-1, random_state=42)

In [None]:
# Hyper-parameter grid for the random forest: every max_depth value is
# combined with every n_estimators value.
param = dict(
    max_depth=[5, 10, 20],
    n_estimators=[20, 50, 100, 200],
)

In [None]:
# Grid search over `param` with 10-fold cross-validation, scored by negative
# mean squared error, then fitted on the training split.
rf_cv = GridSearchCV(
    estimator=rf,
    param_grid=param,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
    verbose=True,
)
rf_cv.fit(X_train, y_train)

In [None]:
# cv_results_ is a dict of parallel arrays (one entry per parameter combination);
# show it as a table, best-ranked combination first, instead of dumping the raw dict.
pd.DataFrame(rf_cv.cv_results_).sort_values('rank_test_score')

In [None]:
# Keep the estimator from the best-scoring parameter combination of the grid
# search and print it so the selected hyper-parameters are visible
best_model = rf_cv.best_estimator_
print(best_model)

In [None]:
# Predict on the training rows (reused for the training RMSE) and display the
# model's score on those same rows; for a scikit-learn regressor, score() is R^2
predicted_train = best_model.predict(X_train)
best_model.score(X_train,y_train)

In [None]:
# Root-mean-squared error of the random forest on the training split
rmse_train = np.sqrt(
    mean_squared_error(y_true=y_train, y_pred=predicted_train)
)
print("RMSE: %f" % rmse_train)

In [None]:
# Predictions of the selected random forest on the held-out test rows
predicted_test = best_model.predict(X_test)

In [None]:
# Root-mean-squared error of the random forest on the held-out test split
rmse_test = np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=predicted_test)
)
print("RMSE: %f" % rmse_test)

In [None]:
# Side-by-side table of true and predicted targets for the test rows,
# keeping the original row index of y_test
predicted_vs_actual = y_test.to_frame(name='Actual')
predicted_vs_actual['Predicted'] = np.array(predicted_test)
predicted_vs_actual.head()


In [None]:
# First 50 test rows in ascending index order, used for the comparison plot
predicted_vs_actual_plot = predicted_vs_actual.sort_index(ascending=True).iloc[:50]

In [None]:
# Actual vs predicted target for the selected test rows, drawn on an explicit
# figure/axes pair and labelled so the saved image can be read on its own.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(predicted_vs_actual_plot['Actual'], color='r', marker='o', label='Actual')
ax.plot(predicted_vs_actual_plot['Predicted'], color='b', marker='x', label='Predicted')
ax.set_xlabel("Row index (test rows)")
ax.set_ylabel("log(transactionRevenue + 1)")
ax.set_title("Random Forest: actual vs predicted")
ax.legend(loc="upper left")

fig.savefig("Actual_vs_Predicted.png")

In [None]:
# Importance of each feature in the selected forest, sorted ascending so the
# most important features come last
feature_importances = (
    pd.Series(best_model.feature_importances_, index=X_train.columns, name='importance')
    .to_frame()
    .sort_values(by='importance', ascending=True)
)
# The last five rows are the five most important features
feature_importances_plot = feature_importances.iloc[-5:]

In [None]:
# Horizontal bar chart of the five most important features.
# `sort_columns` is no longer passed: it was removed from DataFrame.plot in
# pandas 2.0 (its default was already False), so passing it now raises.
# The placeholder title "V comp" is dropped because it was overwritten immediately.
ax = feature_importances_plot.plot(kind='barh', figsize=(10,5), legend=True, fontsize=12, color='red')
ax.set_title("Top 5 Features by Weight")
ax.figure.savefig("Top 5 Features.png")

# Using XGBoost

In [None]:
from xgboost.sklearn import XGBRegressor

In [None]:
# Base XGBoost regressor for the grid search.
# 'reg:squarederror' is the current name of the squared-error objective;
# 'reg:linear' is its deprecated alias.
xg_reg = XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1)

In [None]:
# Hyper-parameter grid for XGBoost. This rebinds `param`, replacing the
# random-forest grid defined earlier.
param = dict(
    n_estimators=[20, 50, 100],
    max_depth=[5, 10, 15, 20],
)

In [None]:
# Grid search over `param` for the XGBoost regressor with 10-fold
# cross-validation, scored by negative mean squared error, fitted on the training split
xgb_cv = GridSearchCV(
    estimator=xg_reg,
    param_grid=param,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
    verbose=True,
)
xgb_cv.fit(X_train, y_train)

In [None]:
# Predict on the held-out test rows with the fitted grid search
# (GridSearchCV.predict delegates to the refitted best estimator)
preds = xgb_cv.predict(X_test)

In [None]:
# Root-mean-squared error of the grid-searched XGBoost model on the test split
rmse = np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=preds)
)
print("RMSE: %f" % rmse)

In [None]:
# Rebind `xg_reg` to a model with hand-picked hyper-parameters (independent of
# the grid search above), adding L1 regularisation.
# 'reg:squarederror' replaces the deprecated 'reg:linear' objective name, and
# `reg_alpha` is the scikit-learn-API name of the L1 term (alias of `alpha`).
xg_reg = XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1,
                max_depth=10, reg_alpha=10, n_estimators=100)

In [None]:
# Train the fixed-parameter XGBoost model and predict on the held-out rows
# (fit() returns the fitted estimator, so the two calls can be chained)
preds = xg_reg.fit(X_train, y_train).predict(X_test)

In [None]:
# Root-mean-squared error of the fixed-parameter XGBoost model on the test split
rmse = np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=preds)
)
print("RMSE: %f" % rmse)