## Google Analytics - Customer Revenue Prediction

###### Note: Visualization for the data done in a separate notebook

## 1. Getting the data

In [1]:
#### Loading the Required Packages ############
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import gc
import sys

from pandas.io.json import json_normalize
from datetime import datetime
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

#### Function to convert XML to dataframes 

In [2]:
def load_df(csv_path, nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    
    df = pd.read_csv(csv_path, 
                     converters={column: json.loads for column in JSON_COLUMNS}, 
                     dtype={'fullVisitorId': 'str'}, # Important!!
                     nrows=nrows)
    
    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [str(column)+"."+str(subcolumn) for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print("Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

#### Calling the input train and test xml files to convert to dataframe

In [3]:
train = load_df(os.getcwd()+'\\train.csv')
test = load_df(os.getcwd()+'\\test.csv')

#### Saving a CSV copy of generated dataframes

In [218]:
#train.to_csv('train_extracted.csv',encoding='utf8')
#test.to_csv('test_extracted.csv', encoding = 'utf8')

## Reading saved the data
#train = pd.read_csv('train_extracted.csv',encoding='utf8',dtype={'fullVisitorId': 'str'})
#test = pd.read_csv('test_extracted.csv', encoding = 'utf8',dtype={'fullVisitorId': 'str'})

#### Combining train and test data

For ease in computation of data transformation and other changes, we combine the train and test

In [5]:
train['data_source'] = 'train'
test['data_source'] = 'test'

all_data = pd.concat([train,test], axis = 0)

In [6]:
all_data.shape

(1708337, 56)

## 2. Analyzing the Data

#### Emtpy values 

Checking the ratio of NULL/empty values

In [7]:
100 * float(train['totals_transactionRevenue'].isnull().sum())/len(train['totals_transactionRevenue'])

98.72572768529513

#### Data Snapshot


In [10]:
#all_data.drop([u'Unnamed: 0'], axis = 1, inplace = True)
all_data.head()

Unnamed: 0,channelGrouping,data_source,date,device_browser,device_browserSize,device_browserVersion,device_deviceCategory,device_flashVersion,device_isMobile,device_language,...,trafficSource_campaign,trafficSource_campaignCode,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source,visitId,visitNumber,visitStartTime
0,Organic Search,train,20160902,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,...,(not set),,,(not provided),organic,,google,1472830385,1,1472830385
1,Organic Search,train,20160902,Firefox,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,...,(not set),,,(not provided),organic,,google,1472880147,1,1472880147
2,Organic Search,train,20160902,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,...,(not set),,,(not provided),organic,,google,1472865386,1,1472865386
3,Organic Search,train,20160902,UC Browser,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,...,(not set),,,google + online,organic,,google,1472881213,1,1472881213
4,Organic Search,train,20160902,Chrome,not available in demo dataset,not available in demo dataset,mobile,not available in demo dataset,True,not available in demo dataset,...,(not set),,True,(not provided),organic,,google,1472822600,2,1472822600


In [11]:
#### Distribution of the 'visitNumber' variable
all_data.visitNumber.describe()

count    1.708337e+06
mean     2.335170e+00
std      9.354034e+00
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      4.570000e+02
Name: visitNumber, dtype: float64

#### Response variable

In [12]:
all_data['totals_transactionRevenue'] = pd.to_numeric(all_data['totals_transactionRevenue'])
all_data['totals_transactionRevenue'].describe()

count    1.151500e+04
mean     1.337448e+08
std      4.482852e+08
min      1.000000e+04
25%      2.493000e+07
50%      4.945000e+07
75%      1.076550e+08
max      2.312950e+10
Name: totals_transactionRevenue, dtype: float64

#### Distribution of other columns apart from Response variable

In [13]:
for each in all_data.columns:
    print(each)
    print(all_data[each].value_counts().head())
    print("\n")

channelGrouping
Organic Search    738963
Social            354971
Direct            273134
Referral          211307
Display            51283
Name: channelGrouping, dtype: int64


data_source
train    903653
test     804684
Name: data_source, dtype: int64


date
20171212    9234
20171213    9131
20171004    5122
20170920    4880
20161128    4807
Name: date, dtype: int64


device_browser
Chrome               1173056
Safari                312165
Firefox                63845
Internet Explorer      35474
Android Webview        34266
Name: device_browser, dtype: int64


device_browserSize
not available in demo dataset    1708337
Name: device_browserSize, dtype: int64


device_browserVersion
not available in demo dataset    1708337
Name: device_browserVersion, dtype: int64


device_deviceCategory
desktop    1171579
mobile      471336
tablet       65422
Name: device_deviceCategory, dtype: int64


device_flashVersion
not available in demo dataset    1708337
Name: device_flashVersion, dtype: int

1513124981    28
1513125098    28
1513124949    26
1513125008    24
1513124997    24
Name: visitId, dtype: int64


visitNumber
1    1307430
2     182542
3      70962
4      37886
5      23314
Name: visitNumber, dtype: int64


visitStartTime
1513124981    28
1513125098    28
1513124949    26
1513124997    24
1513125008    24
Name: visitStartTime, dtype: int64




## 3. Data Pre-processing

#### Finding columns with just one value in it

Finding the columns with  <br>
i) If the number of unique values is just one <br>
ii) More than 50% of NA values <br><br>

Dropping such columns



In [14]:
### Selecting columns with just one unique value and those with NULL values greater than 50% of the dataset
drop_columns = [each for each in all_data.columns if len(train[each].unique()) <= 2 or all_data[each].isnull().sum() > 0.5 * all_data.shape[0]]
drop_columns = [each for each in drop_columns if each not in ['totals_transactionRevenue','data_source']]

In [15]:
all_data_upd = all_data.drop(drop_columns, axis = 1)

#### Feature Engineering: Creating date variables

In [16]:
all_data_upd['year'] = all_data_upd.date.apply(lambda x: int(str(x)[0:4]))
all_data_upd['month'] = all_data_upd.date.apply(lambda x: int(str(x)[4:6]))
all_data_upd['day'] = all_data_upd.date.apply(lambda x: int(str(x)[6:8]))

#### Cleaning and imputing train and test separately - since we do not want to use the test data to impute the training data

In [17]:
train_upd = all_data_upd[all_data_upd.data_source == 'train']
test_upd = all_data_upd[all_data_upd.data_source == 'test']

print(train_upd.shape)
print(test_upd.shape)

(903653, 27)
(804684, 27)


Viewing Training dataset

In [18]:
train_upd.head()

Unnamed: 0,channelGrouping,data_source,date,device_browser,device_deviceCategory,device_operatingSystem,fullVisitorId,geoNetwork_city,geoNetwork_continent,geoNetwork_country,...,totals_transactionRevenue,trafficSource_campaign,trafficSource_medium,trafficSource_source,visitId,visitNumber,visitStartTime,year,month,day
0,Organic Search,train,20160902,Chrome,desktop,Windows,1131660440785968503,Izmir,Asia,Turkey,...,,(not set),organic,google,1472830385,1,1472830385,2016,9,2
1,Organic Search,train,20160902,Firefox,desktop,Macintosh,377306020877927890,not available in demo dataset,Oceania,Australia,...,,(not set),organic,google,1472880147,1,1472880147,2016,9,2
2,Organic Search,train,20160902,Chrome,desktop,Windows,3895546263509774583,Madrid,Europe,Spain,...,,(not set),organic,google,1472865386,1,1472865386,2016,9,2
3,Organic Search,train,20160902,UC Browser,desktop,Linux,4763447161404445595,not available in demo dataset,Asia,Indonesia,...,,(not set),organic,google,1472881213,1,1472881213,2016,9,2
4,Organic Search,train,20160902,Chrome,mobile,Android,27294437909732085,not available in demo dataset,Europe,United Kingdom,...,,(not set),organic,google,1472822600,2,1472822600,2016,9,2


#### Imputation for Trainining dataset

Imputation has been done for Categorical and Numeric variables <br>
1. Numerical:<br>
    The NAs has been replaced with the mean values<br>
    
2. Categorical:<br>
    Categorical values with frequency less than 10 has been identified and grouped into a new category called 'Others'

In [19]:
## Cleaning the character columns in the data
float_cols = []


###Looping through each of the columns in the data 
for each in train_upd.columns:
    
    ### Skipping the column 'data_source'
    if(each == 'data_source'):
        continue
    
    if(train_upd[each].dtype == 'O' and len(train_upd[each].unique()) < 500000):
        print(each)
        ### Skipping the values are actually numeric or float
        
        
        ### CONDITION FOR NUMBERIC/FLOAT VALUES ######
        if( not (len(pd.to_numeric(train_upd[each],errors = 'coerce').unique()) < 0.5 * len(train_upd[each].unique())) ):
            
            train_upd.loc[:,each] = train_upd[each].fillna(train_upd[each].mean())
            float_cols.append(each)
        
        
        ### CONDITION FOR CHARACTER COLUMNS ######    
        else:


            print(len(train_upd[each].unique()))
            train_upd.loc[:,each] = train_upd[each].apply(lambda x: x.lower().strip())
            extreme_values = train_upd[each].value_counts()[train_upd[each].value_counts() < 10].index

            ## Replacing the outliers (extreme) values with 'Others'

            train_upd.loc[:,each] = train_upd[each].apply(lambda x: np.where(x in extreme_values, 'Others',x))

channelGrouping
8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


device_browser
54
device_deviceCategory
3
device_operatingSystem
20
geoNetwork_city
649
geoNetwork_continent
6
geoNetwork_country
222
geoNetwork_metro
94
geoNetwork_networkDomain
28064
geoNetwork_region
376
geoNetwork_subContinent
23
trafficSource_campaign
10
trafficSource_medium
7
trafficSource_source
380


#### Imputation for Test dataset

In [22]:
## Cleaning the character columns in the test data
## Similar method like train dataset is followed

for each in test_upd.columns:
    
    
    if(each == 'data_source'):
        continue
    
    if(test_upd[each].dtype == 'O' and len(test_upd[each].unique()) < 500000):
        print(each)    
        
        ### Identifying the columns as numeric/float as per the training data
        if(each in float_cols):
            
            test_upd[each] = test_upd[each].fillna(test_upd[each].mean())
        
        ### CONDITION FOR CHARACTER COLUMNS ######    
        else:
            print(len(test_upd[each].unique()))
            test_upd[each] = test_upd[each].apply(lambda x: x.lower().strip())
            extreme_values = test_upd[each].value_counts()[test_upd[each].value_counts() < 10].index

            ## Replacing the outliers (extreme) values with 'Others'

            test_upd[each] = test_upd[each].apply(lambda x: np.where(x in extreme_values, 'Others',x))

channelGrouping
8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


device_browser
109
device_deviceCategory
3
device_operatingSystem
22
geoNetwork_city
732
geoNetwork_continent
6
geoNetwork_country
219
geoNetwork_metro
109
geoNetwork_networkDomain
25750
geoNetwork_region
376
geoNetwork_subContinent
23
trafficSource_campaign
31
trafficSource_medium
7
trafficSource_source
324


#### Combining the data again

In [24]:
all_data_upd = pd.concat([train_upd,test_upd], axis = 0)

#### Creating Label Encodings for certain columns

In [25]:
### Getting encoder function for python
### Columns for transforming
## device_browser','geoNetwork_metro','geoNetwork_networkDomain','geoNetwork_region',
## 'trafficSource_source','geoNetwork_city','geoNetwork_country'


from sklearn.preprocessing import LabelEncoder
###################################################
device_browser = LabelEncoder()
device_browser.fit(all_data_upd.device_browser)

all_data_upd['device_browser_enc'] = device_browser.transform(all_data_upd.device_browser)
###############################################
geoNetwork_metro = LabelEncoder()
geoNetwork_metro.fit(all_data_upd.geoNetwork_metro)

all_data_upd['geoNetwork_metro_enc'] = geoNetwork_metro.transform(all_data_upd.geoNetwork_metro)
###############################################
geoNetwork_networkDomain = LabelEncoder()
geoNetwork_networkDomain.fit(all_data_upd.geoNetwork_networkDomain)

all_data_upd['geoNetwork_networkDomain_enc'] = geoNetwork_networkDomain.transform(all_data_upd.geoNetwork_networkDomain)
###############################################
geoNetwork_region = LabelEncoder()
geoNetwork_region.fit(all_data_upd.geoNetwork_region)

all_data_upd['geoNetwork_region_enc'] = geoNetwork_region.transform(all_data_upd.geoNetwork_region)
###############################################
trafficSource_source = LabelEncoder()
trafficSource_source.fit(all_data_upd.trafficSource_source)

all_data_upd['trafficSource_source_enc'] = trafficSource_source.transform(all_data_upd.trafficSource_source)
###############################################
geoNetwork_city = LabelEncoder()
geoNetwork_city.fit(all_data_upd.geoNetwork_city)

all_data_upd['geoNetwork_city_enc'] = geoNetwork_city.transform(all_data_upd.geoNetwork_city)
###############################################
geoNetwork_country = LabelEncoder()
geoNetwork_country.fit(all_data_upd.geoNetwork_country)

all_data_upd['geoNetwork_country_enc'] = geoNetwork_country.transform(all_data_upd.geoNetwork_country)
##############################################

In [26]:
## Columns that has more than ~200 distinct objects will create multiple columns while creating one hot vectors
### So, instead of dropping them, we encode those variables into numerical values

## Dropping such columns
all_data_upd = all_data_upd.drop(['device_browser','geoNetwork_metro','geoNetwork_networkDomain','geoNetwork_region',
                                  'trafficSource_source','geoNetwork_city','geoNetwork_country'], axis = 1)

## Getting the datashape
all_data_upd.shape

(1708337, 27)

In [27]:
all_data_upd['totals_transactionRevenue'] = all_data_upd['totals_transactionRevenue'].fillna(0)
all_data_upd = all_data_upd.fillna(0)

In [28]:
### Removing id columns
id_cols = ['sessionId','fullVisitorId','visitId','date']
all_data_dummies = pd.get_dummies(all_data_upd.drop(id_cols, axis = 1), dummy_na = True)

In [29]:
all_data_dummies.shape

(1708337, 121)

#### Saving the transformed train data

In [417]:
#all_data_dummies.to_csv('all_data_dummies.csv', index = False, encoding = 'utf8')
all_data_dummies = pd.read_csv("all_data_dummies.csv", encoding = 'utf8')

In [30]:
all_data_dummies.shape

(1708337, 121)

#### Dividing the data into train and test

In [31]:
train_dummies = all_data_dummies[all_data_dummies.data_source_train == 1]
test_dummies = all_data_dummies[all_data_dummies.data_source_test == 1]

# train_dummies = all_data_upd[all_data_upd.data_source == 'train']
# test_dummies = all_data_upd[all_data_upd.data_source == 'test']

### Creating the dataset that is model-ready

In [32]:
train_dummies = train_dummies.drop(['data_source_train','data_source_test','data_source_nan'], axis = 1)
test_dummies = test_dummies.drop(['data_source_train','data_source_test','data_source_nan','totals_transactionRevenue'], axis = 1)

#train_dummies = train_dummies.drop(['data_source'], axis = 1)
#test_dummies = test_dummies.drop(['data_source','totals.transactionRevenue'], axis = 1)

#### Creating Validation Dataset (Holdout data)

Note: The response variable is changed to log values of the totals_transactionRevenue for better accuracy. Normal transaction was compared agains log values and log values resulted in better scores

In [33]:
X_train, X_test, y_train, y_test = train_test_split(train_dummies.drop(['totals_transactionRevenue'], axis = 1), \
                  np.log(train_dummies['totals_transactionRevenue']+1),test_size = 0.2, random_state = 9)

In [34]:
X_train.head()

Unnamed: 0,totals_hits,totals_pageviews,visitNumber,visitStartTime,year,month,day,device_browser_enc,geoNetwork_metro_enc,geoNetwork_networkDomain_enc,...,trafficSource_campaign_value shoppers affinity,trafficSource_campaign_nan,trafficSource_medium_(none),trafficSource_medium_(not set),trafficSource_medium_affiliate,trafficSource_medium_cpc,trafficSource_medium_cpm,trafficSource_medium_organic,trafficSource_medium_referral,trafficSource_medium_nan
40485,4,4.0,1,1498884588,2017,6,30,23,72,1228,...,0,0,0,0,0,0,0,0,1,0
784765,1,1.0,1,1501200752,2017,7,27,23,72,0,...,0,0,0,0,0,0,0,1,0,0
103499,1,1.0,1,1499169217,2017,7,4,8,72,3414,...,0,0,0,0,0,0,0,1,0,0
363033,1,1.0,4,1482550584,2016,12,23,5,90,272,...,0,0,1,0,0,0,0,0,0,0
886064,3,2.0,5,1500310627,2017,7,17,5,90,0,...,0,0,0,0,0,0,0,1,0,0


In [35]:
print(X_train.shape)
print(y_train.shape)

(722922, 117)
(722922,)


In [36]:
#### Response variable for logistic regression
y_train_logistic = np.where(y_train != 0,1,0)

## 4. Model Building

There will be two types of models<br><br>
1) Logistic Regression to predict where we have to predict if the value is zero or non-zero <br>
2) Linear Regression to predict the actual Revenue <br>
    &emsp; 2.1) OLS <br>
    &emsp; 2.2) Random forest <br>
    &emsp; 2.3) Splines <br>
3) Predicting the actual Test data

### 4.1 Logistic Regression

 Model to find where we have 0 revenue and 1 (non-zero) revenue

In [37]:
### Model building - logistic regression
log_model = LogisticRegression(C = 0.1,solver = 'newton-cg')
log_model.fit(X_train,y_train_logistic)



LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False)

##### Model Prediction - train and test

In [38]:
#### Predicting the values - Obtaining prediction probability to manually specify a threshold value

threshold = 0.02
pred_vals = log_model.predict_proba(X_train)
pred_vals_test = log_model.predict_proba(X_test)

##Getting prediction 
pred_outs_train = np.array(list(map(lambda x: np.where(x[1]>threshold,1,0).sum(),pred_vals)))
pred_outs_test = np.array(list(map(lambda x: np.where(x[1]>threshold,1,0).sum(),pred_vals_test)))

### 4.2 Regression Models
### 4.2.1 OLS Models

In [39]:
lr_model = LinearRegression(n_jobs = -1)

In [40]:
lr_model.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [41]:
predict = np.exp(lr_model.predict(X_train))
predict_test = np.exp(lr_model.predict(X_test))


##### Making the predictions zeros where-ever logistic regression predicted zero

predict[pred_outs_train == 0] = 0
predict_test[pred_outs_test == 0] = 0

#### Calculating prediction Error

#### Training data - LMSE

In [43]:
lmse = np.sum(np.log((np.exp(y_train.values) -  predict) ** 2 + 1))/len(y_train.values)
lmse

1.3217055363833488

#### Validation data - LMSE

In [44]:
lmse = np.sum(np.log((np.exp(y_test.values) - predict_test)**2 + 1))/len(y_test.values)
lmse

1.3236877386718608

### 4.2.2 Random forest

In [46]:
from sklearn.ensemble import RandomForestRegressor

In [47]:
#### The arguments of Random Forest were changed constantly based on the error on validation dataset
#### 15 was found to be better (After which the values saturated)

rf_regression = RandomForestRegressor(n_estimators= 15)

### Fitting the random forest model
rf_regression.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [48]:
#### Predictions for the train and validation dataste

#Prediction for training data
rf_predictions = np.exp(rf_regression.predict(X_train))

#Prediction for validation data
rf_predictions_test = np.exp(rf_regression.predict(X_test))

In [49]:
### Using the predictions from logistic regression - to make the values zeros of those predicted as zeros from logistic regression
rf_predictions[pred_outs_train == 0] = 0
rf_predictions_test[pred_outs_test == 0] = 0


lmse = np.sum(np.log((np.exp(y_train.values) -  rf_predictions) ** 2 + 1))/len(y_train.values)
print("Train Error: ",lmse)

lmse = np.sum(np.log((np.exp(y_test.values) -  rf_predictions_test) ** 2 + 1))/len(y_train.values)
print("Validation Error: ",lmse)

#### The validation error dropped sig

Train Error:  1.1763764427402958
Validation Error:  0.342087113016737


#### Selecting top features based on the feature_importances_ from random forest results

In [124]:
#rf_regression.feature_importances_

In [194]:
#### Top 14 columns based on the threshold 0.01
rf_top_feats = X_train.columns[rf_regression.feature_importances_ > 0.01]
rf_top_feats

Index(['totals_hits', 'totals_pageviews', 'visitNumber', 'visitStartTime',
       'month', 'day', 'device_browser_enc', 'geoNetwork_metro_enc',
       'geoNetwork_networkDomain_enc', 'geoNetwork_region_enc',
       'trafficSource_source_enc', 'geoNetwork_city_enc',
       'geoNetwork_country_enc', 'device_operatingSystem_macintosh'],
      dtype='object')

### 4.2.3 Splines

In [195]:
from patsy import dmatrix
import statsmodels.api as sm

In [201]:
# Specifying 3 knots
X_train_df = pd.DataFrame()
X_val_df =  pd.DataFrame()
X_test_df =  pd.DataFrame()

for each in rf_top_feats:
    
    #### Removing Visit Start Time 
    if(each == "visitStartTime"):
        continue
    
    #print(each)
    k_val = X_train[each].quantile((0.25,0.5,0.75)).values.tolist()
    
    ################ Creating Spline variables for training data ######################
    transformed_x1 = dmatrix("bs(X_train[each], knots="+str(k_val)+", degree=1, include_intercept=False)",
                            {"X_train[each]": X_train[each]}, return_type='dataframe')
    
    X_train_df = pd.concat([X_train_df,transformed_x1],axis = 1)
    
    
    ################ Creating Spline variables for validation data ######################
    
    transformed_x2 = dmatrix("bs(X_test[each], knots="+str(k_val)+", degree=1, include_intercept=False)",
                            {"X_test[each]": X_test[each]}, return_type='dataframe')
    
    X_val_df = pd.concat([X_val_df,transformed_x2],axis = 1)
    
    
    ################ Creating Spline variables for test data ######################
    
    transformed_x3 = dmatrix("bs(test_dummies[each], knots="+str(k_val)+", degree=1, include_intercept=False)",
                            {"test_dummies[each]": test_dummies[each]}, return_type='dataframe')
    
    X_test_df = pd.concat([X_test_df,transformed_x3],axis = 1)

totals_hits
totals_pageviews
visitNumber
month
day
device_browser_enc
geoNetwork_metro_enc
geoNetwork_networkDomain_enc
geoNetwork_region_enc
trafficSource_source_enc
geoNetwork_city_enc
geoNetwork_country_enc
device_operatingSystem_macintosh


In [202]:
# Build a regular linear model from the splines
spline_model = sm.GLM(y_train, X_train_df).fit()

In [203]:
#### Predictions for the train and validation dataset

#Prediction for training data
spline_pred_train = np.exp(spline_model.predict(X_train_df))
spline_pred_val = np.exp(spline_model.predict(X_val_df))


##### Making the predictions zeros where-ever logistic regression predicted zero
spline_pred_train[pred_outs_train == 0] = 0
spline_pred_val[pred_outs_test == 0] = 0

#### Calculating prediction Error

#### Training data - LMSE

In [204]:
lmse = np.sum(np.log((np.exp(y_train.values) -  spline_pred_train) ** 2 + 1))/len(y_train.values)
lmse

1.3071476301698348

#### Validation data - LMSE

In [205]:
lmse = np.sum(np.log((np.exp(y_test.values) - spline_pred_val)**2 + 1))/len(y_test.values)
lmse

1.312285480125481

### Model Results - Inference:

Based on the train and validation results for Linear Regression, Random Forest and Regression Splines we see the performance of Random Forest to be significantly better than the other two. So, it could be inferred that the decision boundary is non-linear as obtained in the Random Forest

### 4.3 Predicting the actual test class<br>

Step 1: Run the logistic Regression Model <br>
Step 2: Run the Regression Model and make non-zero predictions only for those which had non-zero prediction o/p from logistic regression

#### Step1. Logistic Prediction

In [207]:
pred_vals_actual_test = log_model.predict_proba(test_dummies)

##Getting prediction 
pred_outs_actual_test = np.array(list(map(lambda x: np.where(x[1]>threshold,1,0).sum(),pred_vals_actual_test)))

#### Step2. Regression Prediction

In [209]:
#### LINEAR REGRESSION PREDICTION ###
predict_actual_test_lr = lr_model.predict(test_dummies)
predict_actual_test_lr[pred_outs_actual_test == 0] = 0

#### RANDOM FOREST PREDICTION ###
predict_actual_test_rf = rf_regression.predict(test_dummies)
predict_actual_test_rf[pred_outs_actual_test == 0] = 0

#### REGRESSION SPLINES PREDICTION ###
predict_actual_test_sp = spline_model.predict(X_test_df)
predict_actual_test_sp[pred_outs_actual_test == 0] = 0

#### Preparing the submission files for Kaggle

##### Linear Regression Submission

In [217]:
submission = pd.concat([test.fullVisitorId,pd.DataFrame(predict_actual_test_lr)], axis = 1)
submission.columns.values[1]= 'PredictedLogRevenue'

submission_out = submission.groupby('fullVisitorId',as_index = False).sum()
#submission_out.to_csv("kaggle_submission_lr.csv", index = False)

##### Random Forest Submission

In [216]:
submission = pd.concat([test.fullVisitorId,pd.DataFrame(predict_actual_test_rf)], axis = 1)
submission.columns.values[1]= 'PredictedLogRevenue'

submission_out = submission.groupby('fullVisitorId',as_index = False).sum()
#submission_out.to_csv("kaggle_submission_rf.csv", index = False)

##### Regression Splines Submission

In [215]:
submission = pd.concat([test.fullVisitorId,pd.DataFrame(predict_actual_test_sp)], axis = 1)
submission.columns.values[1]= 'PredictedLogRevenue'

submission_out = submission.groupby('fullVisitorId',as_index = False).sum()
#submission_out.to_csv("kaggle_submission_sp.csv", index = False)