## Google Analytics - Customer Revenue Prediction

## 1. Getting the data

In [1]:
#### Loading the Required Packages ############
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import gc
import sys

from pandas.io.json import json_normalize
from datetime import datetime
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

#### Function to convert XML to dataframes 

In [3]:
def load_df(csv_path, nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    
    df = pd.read_csv(csv_path, 
                     converters={column: json.loads for column in JSON_COLUMNS}, 
                     dtype={'fullVisitorId': 'str'}, # Important!!
                     nrows=nrows)
    
    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [str(column)+"."+str(subcolumn) for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print("Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

#### Calling the input train and test xml files to convert to dataframe

In [4]:
#train = load_df(os.getcwd()+'\\train.csv')
#test = load_df(os.getcwd()+'/test.csv')

#### Saving a CSV copy of generated dataframes

In [5]:
#train.to_csv('train_extracted.csv',encoding='utf8')
#test.to_csv('test_extracted.csv', encoding = 'utf8')

## Reading saved the data
train = pd.read_csv('train_extracted.csv',encoding='utf8',dtype={'fullVisitorId': 'str'})
test = pd.read_csv('test_extracted.csv', encoding = 'utf8',dtype={'fullVisitorId': 'str'})

  interactivity=interactivity, compiler=compiler, result=result)


#### Combining train and test data

For ease in computation of data transformation and other changes, we combine the train and test

In [398]:
train['data_source'] = 'train'
test['data_source'] = 'test'

all_data = pd.concat([train,test], axis = 0)

In [399]:
all_data.shape

(1708337, 56)

## 2. Analyzing the Data

#### Emtpy values 

Checking the ratio of NULL/empty values

In [400]:
100 * float(train['totals_transactionRevenue'].isnull().sum())/len(train['totals_transactionRevenue'])

98.72572768529513

#### Data Snapshot


In [401]:
#all_data.drop([u'Unnamed: 0'], axis = 1, inplace = True)
all_data.head()

Unnamed: 0,channelGrouping,data_source,date,device_browser,device_browserSize,device_browserVersion,device_deviceCategory,device_flashVersion,device_isMobile,device_language,...,trafficSource_campaign,trafficSource_campaignCode,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source,visitId,visitNumber,visitStartTime
0,Organic Search,train,20160902,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,...,(not set),,,(not provided),organic,,google,1472830385,1,1472830385
1,Organic Search,train,20160902,Firefox,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,...,(not set),,,(not provided),organic,,google,1472880147,1,1472880147
2,Organic Search,train,20160902,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,...,(not set),,,(not provided),organic,,google,1472865386,1,1472865386
3,Organic Search,train,20160902,UC Browser,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,...,(not set),,,google + online,organic,,google,1472881213,1,1472881213
4,Organic Search,train,20160902,Chrome,not available in demo dataset,not available in demo dataset,mobile,not available in demo dataset,True,not available in demo dataset,...,(not set),,True,(not provided),organic,,google,1472822600,2,1472822600


In [402]:
#### Distribution of the 'visitNumber' variable
all_data.visitNumber.describe()

count    1.708337e+06
mean     2.335170e+00
std      9.354034e+00
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      4.570000e+02
Name: visitNumber, dtype: float64

#### Response variable

In [403]:
all_data['totals_transactionRevenue'] = pd.to_numeric(all_data['totals_transactionRevenue'])
all_data['totals_transactionRevenue'].describe()

count    1.151500e+04
mean     1.337448e+08
std      4.482852e+08
min      1.000000e+04
25%      2.493000e+07
50%      4.945000e+07
75%      1.076550e+08
max      2.312950e+10
Name: totals_transactionRevenue, dtype: float64

#### Distribution of other columns apart from Response variable

In [378]:
for each in all_data.columns:
    print(each)
    print(all_data[each].value_counts())
    print("\n")

channelGrouping
Organic Search    738963
Social            354971
Direct            273134
Referral          211307
Display            51283
Paid Search        45627
Affiliates         32915
(Other)              137
Name: channelGrouping, dtype: int64


data_source
train    903653
test     804684
Name: data_source, dtype: int64


date
20171212    9234
20171213    9131
20171004    5122
20170920    4880
20161128    4807
20170921    4715
20161115    4685
20171005    4679
20161114    4466
20161130    4435
20180321    4434
20161026    4375
20161129    4337
20161116    4334
20161004    4322
20161205    4265
20171017    4258
20180327    4227
20170426    4224
20161201    4200
20171127    4194
20161027    4162
20161121    4143
20171108    4136
20161117    4074
20161024    4063
20161110    4055
20171006    4033
20161103    4014
20180226    3989
            ... 
20170430    1594
20161001    1589
20161226    1586
20170115    1576
20170429    1566
20170611    1555
20170204    1549
20170423    1548


not available in demo dataset    1708337
Name: geoNetwork_cityId, dtype: int64


geoNetwork_continent
Americas     877403
Asia         396719
Europe       368037
Africa        35481
Oceania       28180
(not set)      2517
Name: geoNetwork_continent, dtype: int64


geoNetwork_country
United States               717217
India                       105317
United Kingdom               73341
Canada                       51057
Germany                      38516
Japan                        36637
Brazil                       35432
Vietnam                      34869
France                       32289
Thailand                     29859
Turkey                       29166
Taiwan                       26560
Mexico                       25270
Australia                    23660
Spain                        23431
Netherlands                  22146
Italy                        20694
Russia                       19641
Indonesia                    16675
Poland                       15874
Philippines     

Northern America      768345
Southeast Asia        121634
Southern Asia         121062
Western Europe        115153
Northern Europe       111693
Eastern Asia           91072
South America          75112
Eastern Europe         74007
Southern Europe        67184
Western Asia           60966
Central America        29564
Australasia            27867
Northern Africa        17255
Western Africa          7086
Southern Africa         6019
Caribbean               4382
Eastern Africa          4194
(not set)               2517
Central Asia            1985
Middle Africa            927
Melanesia                133
Micronesian Region       128
Polynesia                 52
Name: geoNetwork_subContinent, dtype: int64


sessionId
3369895875786214183_1512028689    2
3429979432955165482_1495781717    2
7632664392581440337_1524293985    2
4853124786988084328_1488527765    2
938343955605842538_1514361502     2
3668154658566154575_1474872860    2
1420483413974308951_1479715163    2
0210271036253197119_15158

not available in demo dataset    1708337
Name: trafficSource_adwordsClickInfo.criteriaParameters, dtype: int64


trafficSource_adwordsClickInfo.gclId
CN_Whvvc_9UCFd6LswodGTgKCQ                                                                      74
Cj0KEQjwmIrJBRCRmJ_x7KDo-9oBEiQAuUPKMufMpuG3ZdwYO8GTsjiBFd5MPHStZa9y_9NCrI8X97oaAglc8P8HAQ      70
COT1-vPT4tYCFZWNswodcwsHxg                                                                      60
CN3fusbjvtYCFQsmhgodIEQO-g                                                                      51
Cj0KEQjw1ee_BRD3hK6x993YzeoBEiQA5RH_BEA562M9tvl_mtnAFvtDnDqOQRp1RvxMMgwjcX1LAfwaAj4o8P8HAQ      41
CI2Ap4L32tYCFY-1wAodeHEHPA                                                                      41
CPSo2PeD3tYCFYk_GwodJrEEUg                                                                      38
CKKrvpbe2dYCFdYSaAodSHAOGw                                                                      37
CP273vOzr9YCFQEHaQoduBgHSA                                

11251kjhkvahf    1
Name: trafficSource_campaignCode, dtype: int64


trafficSource_isTrueDirect
True    534518
Name: trafficSource_isTrueDirect, dtype: int64


trafficSource_keyword
(not provided)                                                                         712754
(User vertical targeting)                                                               26250
(automatic matching)                                                                    18781
6qEhsCssdK0z36ri                                                                        17724
(Remarketing/Content targeting)                                                          7238
1hZbAqLCbjwfgOH7                                                                         3996
google merchandise store                                                                 2987
Google Merchandise                                                                       2128
google store                                                       

google                                               817892
youtube.com                                          329450
(direct)                                             273152
mall.googleplex.com                                  120541
analytics.google.com                                  37436
Partners                                              32931
sites.google.com                                       9072
google.com                                             8341
gdeals.googleplex.com                                  8298
m.facebook.com                                         6737
dfa                                                    6704
baidu                                                  4971
reddit.com                                             4092
facebook.com                                           3584
googleads.g.doubleclick.net                            3539
qiita.com                                              3394
bing                                    

## 3. Data Pre-processing

#### Finding columns with just one value in it

Finding the columns with  <br>
i) If the number of unique values is just one <br>
ii) More than 50% of NA values <br><br>

Dropping such columns



In [404]:
### Selecting columns with just one unique value and those with NULL values greater than 50% of the dataset
drop_columns = [each for each in all_data.columns if len(train[each].unique()) <= 2 or all_data[each].isnull().sum() > 0.5 * all_data.shape[0]]
drop_columns = [each for each in drop_columns if each not in ['totals_transactionRevenue','data_source']]

In [405]:
all_data_upd = all_data.drop(drop_columns, axis = 1)

#### Feature Engineering: Creating date variables

In [406]:
all_data_upd['year'] = all_data_upd.date.apply(lambda x: int(str(x)[0:4]))
all_data_upd['month'] = all_data_upd.date.apply(lambda x: int(str(x)[4:6]))
all_data_upd['day'] = all_data_upd.date.apply(lambda x: int(str(x)[6:8]))

#### Cleaning and imputing train and test separately - since we do not want to use the test data to impute the training data

In [407]:
train_upd = all_data_upd[all_data_upd.data_source == 'train']
test_upd = all_data_upd[all_data_upd.data_source == 'test']

print(train_upd.shape)
print(test_upd.shape)

(903653, 27)
(804684, 27)


Viewing Training dataset

In [408]:
train_upd.head()

Unnamed: 0,channelGrouping,data_source,date,device_browser,device_deviceCategory,device_operatingSystem,fullVisitorId,geoNetwork_city,geoNetwork_continent,geoNetwork_country,...,totals_transactionRevenue,trafficSource_campaign,trafficSource_medium,trafficSource_source,visitId,visitNumber,visitStartTime,year,month,day
0,Organic Search,train,20160902,Chrome,desktop,Windows,1131660440785968503,Izmir,Asia,Turkey,...,,(not set),organic,google,1472830385,1,1472830385,2016,9,2
1,Organic Search,train,20160902,Firefox,desktop,Macintosh,377306020877927890,not available in demo dataset,Oceania,Australia,...,,(not set),organic,google,1472880147,1,1472880147,2016,9,2
2,Organic Search,train,20160902,Chrome,desktop,Windows,3895546263509774583,Madrid,Europe,Spain,...,,(not set),organic,google,1472865386,1,1472865386,2016,9,2
3,Organic Search,train,20160902,UC Browser,desktop,Linux,4763447161404445595,not available in demo dataset,Asia,Indonesia,...,,(not set),organic,google,1472881213,1,1472881213,2016,9,2
4,Organic Search,train,20160902,Chrome,mobile,Android,27294437909732085,not available in demo dataset,Europe,United Kingdom,...,,(not set),organic,google,1472822600,2,1472822600,2016,9,2


#### Imputation for Trainining dataset

In [409]:
## Cleaning the character columns in the data

#Float columns
float_cols = []


for each in train_upd.columns:
    
    if(each == 'data_source'):
        continue
        
    if(train_upd[each].dtype == 'O' and len(train_upd[each].unique()) < 500000):
        print(each)
        ### Skipping the values are actually numeric or float
        
        if( not (len(pd.to_numeric(train_upd[each],errors = 'coerce').unique()) < 0.5 * len(train_upd[each].unique())) ):
            
            train_upd.loc[:,each] = train_upd[each].fillna(train_upd[each].mean())
            float_cols.append(each)
            
        else:


            print(len(train_upd[each].unique()))
            train_upd.loc[:,each] = train_upd[each].apply(lambda x: x.lower().strip())
            extreme_values = train_upd[each].value_counts()[train_upd[each].value_counts() < 10].index

            ## Replacing the outliers (extreme) values with 'Others'

            train_upd.loc[:,each] = train_upd[each].apply(lambda x: np.where(x in extreme_values, 'Others',x))

channelGrouping
8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


device_browser
54
device_deviceCategory
3
device_operatingSystem
20
geoNetwork_city
649
geoNetwork_continent
6
geoNetwork_country
222
geoNetwork_metro
94
geoNetwork_networkDomain
28064
geoNetwork_region
376
geoNetwork_subContinent
23
trafficSource_campaign
10
trafficSource_medium
7
trafficSource_source
380


#### Imputation for Test dataset

In [410]:
## Cleaning the character columns in the data

for each in test_upd.columns:
    
    
    if(each == 'data_source'):
        continue
    
    print(each)
    if(test_upd[each].dtype == 'O' and len(test_upd[each].unique()) < 500000):
            
        ### Skipping the values are actually numeric or float
        
        ### Identifying the columns as numeric/float as per the training data
        if(each in float_cols):
            
            train_upd[each] = train_upd[each].fillna(train_upd[each].mean())
        else:


            print(len(train_upd[each].unique()))
            train_upd[each] = train_upd[each].apply(lambda x: x.lower().strip())
            extreme_values = train_upd[each].value_counts()[train_upd[each].value_counts() < 10].index

            ## Replacing the outliers (extreme) values with 'Others'

            train_upd[each] = train_upd[each].apply(lambda x: np.where(x in extreme_values, 'Others',x))

channelGrouping
8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


date
device_browser
27
device_deviceCategory
3
device_operatingSystem
15
fullVisitorId
geoNetwork_city
482
geoNetwork_continent
6
geoNetwork_country
188
geoNetwork_metro
79
geoNetwork_networkDomain
3017
geoNetwork_region
318
geoNetwork_subContinent
23
sessionId
totals_hits
totals_pageviews
totals_transactionRevenue
trafficSource_campaign
9
trafficSource_medium
7
trafficSource_source
124
visitId
visitNumber
visitStartTime
year
month
day


#### Combining the data again

In [411]:
all_data_upd = pd.concat([train_upd,test_upd], axis = 0)

#### Creating Label Encodings for certain columns

In [412]:
### Getting encoder function for python
### Columns for transforming
## device_browser','geoNetwork_metro','geoNetwork_networkDomain','geoNetwork_region',
## 'trafficSource_source','geoNetwork_city','geoNetwork_country'


from sklearn.preprocessing import LabelEncoder
###################################################
device_browser = LabelEncoder()
device_browser.fit(all_data_upd.device_browser)

all_data_upd['device_browser_enc'] = device_browser.transform(all_data_upd.device_browser)
###############################################
geoNetwork_metro = LabelEncoder()
geoNetwork_metro.fit(all_data_upd.geoNetwork_metro)

all_data_upd['geoNetwork_metro_enc'] = geoNetwork_metro.transform(all_data_upd.geoNetwork_metro)
###############################################
geoNetwork_networkDomain = LabelEncoder()
geoNetwork_networkDomain.fit(all_data_upd.geoNetwork_networkDomain)

all_data_upd['geoNetwork_networkDomain_enc'] = geoNetwork_networkDomain.transform(all_data_upd.geoNetwork_networkDomain)
###############################################
geoNetwork_region = LabelEncoder()
geoNetwork_region.fit(all_data_upd.geoNetwork_region)

all_data_upd['geoNetwork_region_enc'] = geoNetwork_region.transform(all_data_upd.geoNetwork_region)
###############################################
trafficSource_source = LabelEncoder()
trafficSource_source.fit(all_data_upd.trafficSource_source)

all_data_upd['trafficSource_source_enc'] = trafficSource_source.transform(all_data_upd.trafficSource_source)
###############################################
geoNetwork_city = LabelEncoder()
geoNetwork_city.fit(all_data_upd.geoNetwork_city)

all_data_upd['geoNetwork_city_enc'] = geoNetwork_city.transform(all_data_upd.geoNetwork_city)
###############################################
geoNetwork_country = LabelEncoder()
geoNetwork_country.fit(all_data_upd.geoNetwork_country)

all_data_upd['geoNetwork_country_enc'] = geoNetwork_country.transform(all_data_upd.geoNetwork_country)
##############################################

In [413]:
## Columns that has more than ~200 distinct objects will create multiple columns while creating one hot vectors
### So, instead of dropping them, we encode those variables into numerical values

## Dropping such columns
all_data_upd = all_data_upd.drop(['device_browser','geoNetwork_metro','geoNetwork_networkDomain','geoNetwork_region',
                                  'trafficSource_source','geoNetwork_city','geoNetwork_country'], axis = 1)

## Getting the datashape
all_data_upd.shape

(1708337, 27)

In [414]:
all_data_upd['totals_transactionRevenue'] = all_data_upd['totals_transactionRevenue'].fillna(0)
all_data_upd = all_data_upd.fillna(0)

In [415]:
### Removing id columns
id_cols = ['sessionId','fullVisitorId','visitId','date']
all_data_dummies = pd.get_dummies(all_data_upd.drop(id_cols, axis = 1), dummy_na = True)

In [416]:
all_data_dummies.shape

(1708337, 182)

#### Saving the transformed train data

In [417]:
#all_data_dummies.to_csv('all_data_dummies.csv', index = False, encoding = 'utf8')
all_data_dummies = pd.read_csv("all_data_dummies.csv", encoding = 'utf8')

In [418]:
all_data_dummies.shape

(1708337, 182)

#### Dividing the data into train and test

In [419]:
train_dummies = all_data_dummies[all_data_dummies.data_source_train == 1]
test_dummies = all_data_dummies[all_data_dummies.data_source_test == 1]

# train_dummies = all_data_upd[all_data_upd.data_source == 'train']
# test_dummies = all_data_upd[all_data_upd.data_source == 'test']

### Creating the dataset that is model-ready

In [420]:
train_dummies = train_dummies.drop(['data_source_train','data_source_test','data_source_nan'], axis = 1)
test_dummies = test_dummies.drop(['data_source_train','data_source_test','data_source_nan','totals_transactionRevenue'], axis = 1)

#train_dummies = train_dummies.drop(['data_source'], axis = 1)
#test_dummies = test_dummies.drop(['data_source','totals.transactionRevenue'], axis = 1)

#### Creating Validation Dataset (Holdout data)

Note: The response variable is changed to log values of the totals_transactionRevenue for better accuracy. Normal transaction was compared agains log values and log values resulted in better scores

In [421]:
X_train, X_test, y_train, y_test = train_test_split(train_dummies.drop(['totals_transactionRevenue'], axis = 1), \
                  np.log(train_dummies['totals_transactionRevenue']+1),test_size = 0.2, random_state = 9)

In [2]:
X_train.head()

NameError: name 'X_train' is not defined

In [423]:
print(X_train.shape)
print(y_train.shape)

(722922, 178)
(722922,)


In [424]:
#### Response variable for logistic regression
y_train_logistic = np.where(y_train != 0,1,0)

## 4. Model Building

There will be two types of models<br><br>
1) Logistic Regression to predict where we have to predict if the value is zero or non-zero <br>
2) Linear Regression to predict the actual Revenue <br>
    &emsp; 2.1) OLS <br>
    &emsp; 2.2) Random forest <br>
    &emsp; 2.3) Splines <br>

### 4.1 Logistic Regression

 Model to find where we have 0 revenue and 1 (non-zero) revenue

In [269]:
### Model building - logistic regression
log_model = LogisticRegression(C = 0.1,solver = 'newton-cg')
log_model.fit(X_train,y_train_logistic)



722921.9999999999

##### Model Prediction - train and test

In [292]:
#### Predicting the values - Obtaining prediction probability to manually specify a threshold value

threshold = 0.02
pred_vals = log_model.predict_proba(X_train)
pred_vals_test = log_model.predict_proba(X_test)

##Getting prediction 
pred_outs_train = np.array(list(map(lambda x: np.where(x[1]>threshold,1,0).sum(),pred_vals)))
pred_outs_test = np.array(list(map(lambda x: np.where(x[1]>threshold,1,0).sum(),pred_vals_test)))

### 4.2 Regression Models
#### 4.2.1 OLS Models

In [318]:
lr_model = LinearRegression(n_jobs = -1)

In [319]:
lr_model.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [322]:
predict = np.exp(lr_model.predict(X_train))
predict_test = np.exp(lr_model.predict(X_test))


##### Making the predictions zeros where-ever logistic regression predicted zero

predict[pred_outs_train == 0] = 0
predict_test[pred_outs_test == 0] = 0

#### Calculating prediction Error

#### Training data - LMSE

In [348]:
lmse = np.sum(np.log((np.exp(y_train.values) -  pred_outs) ** 2 + 1))/len(y_train.values)
lmse

1.107022330147064

#### Validation data - LMSE

In [349]:
lmse = np.sum(np.log((np.exp(y_test.values) - predict_test)**2 + 1))/len(y_test.values)
lmse

1.3141603875638257

### Predicting the actual test class<br>

Step 1: Run the logistic Regression Model <br>
Step 2: Run the linear Regression Model and make non-zero predictions only for those which had non-zero prediction o/p from logistic regression

#### Step1. Logistic Prediction

In [325]:
pred_vals_actual_test = log_model.predict_proba(test_dummies)

##Getting prediction 
pred_outs_actual_test = np.array(list(map(lambda x: np.where(x[1]>threshold,1,0).sum(),pred_vals_actual_test)))

#### Step2. Linear Regression Prediction

In [328]:
predict_actual_test = lr_model.predict(test_dummies)

predict_actual_test[pred_outs_actual_test == 0] = 0

In [332]:
submission = pd.concat([test.fullVisitorId,pd.DataFrame(pred_outs_actual_test)], axis = 1)
submission.columns.values[1]= 'PredictedLogRevenue'

In [333]:
submission_out = submission.groupby('fullVisitorId',as_index = False).sum()

In [334]:
submission_out.to_csv("kaggle_submission.csv", index = False)

#### 4.2.2 Random forest

In [336]:
from sklearn.ensemble import RandomForestRegressor

In [350]:
#### The arguments of Random Forest were changed constantly based on the error on validation dataset
rf_regression = RandomForestRegressor(n_estimators= 15)

### Fitting the random forest model
rf_regression.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [353]:
#### Predictions for the train and validation dataste

#Prediction for training data
rf_predictions = np.exp(rf_regression.predict(X_train))

#Prediction for validation data
rf_predictions_test = np.exp(rf_regression.predict(X_test))

In [354]:
### Using the predictions from logistic regression - to make the values zeros of those predicted as zeros from logistic regression
rf_predictions[pred_outs_train == 0] = 0
rf_predictions_test[pred_outs_test == 0] = 0


lmse = np.sum(np.log((np.exp(y_train.values) -  rf_predictions) ** 2 + 1))/len(y_train.values)
print("Train Error: ",lmse)

lmse = np.sum(np.log((np.exp(y_test.values) -  rf_predictions_test) ** 2 + 1))/len(y_train.values)
print("Validation Error: ",lmse)

#### The validation error dropped sig

Train Error:  1.1861137129396992
Validation Error:  0.33729268912108146


In [None]:
#### To do:

## For visualiation: https://www.kaggle.com/pavansanagapati/simple-exploration-lgbm-model-lb-1-4187 
## Implement the globe

## For categorical features - implement the Encoders and proper imputation
## For columns with webties - extract the major website name (google.com/google.in have the same major content google)
## Numerical features - better imputation and try to create more variables


## Futher, implement a logistic regression and then perfrom a non-linear model on top
