# Google Analytics Customer Revenue Prediction

## 1. Import Required Libraries

In [4]:
import json

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

try:
    from pandas.io.json import json_normalize
except ImportError:
    # Recent pandas releases only expose json_normalize at the top level.
    from pandas import json_normalize
%matplotlib inline

## 2. Data Loading 

In [5]:
def load_df(csv_path='../data/train.csv', nrows=None):
    """Read a sessions CSV and expand its JSON-encoded columns into flat columns.

    Parameters
    ----------
    csv_path : str or file-like, default '../data/train.csv'
        Anything ``pd.read_csv`` accepts.
    nrows : int or None, default None
        Forwarded to ``pd.read_csv``; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The non-JSON columns in file order, followed by one column per JSON
        field named ``"<json column>.<field>"`` (nested fields are joined
        with further dots by ``json_normalize``).
    """
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

    # Prefer the top-level pandas.json_normalize; the pandas.io.json alias is
    # deprecated and not importable on recent pandas. Fall back for old pandas.
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        from pandas.io.json import json_normalize as normalize

    df = pd.read_csv(csv_path,
                     parse_dates=['date'],
                     converters={column: json.loads for column in JSON_COLUMNS},
                     dtype={'fullVisitorId': 'str'},  # Important!! keep IDs as text, not numbers
                     nrows=nrows)

    flattened = []
    for column in JSON_COLUMNS:
        # A plain list of dicts is the input form every pandas version accepts.
        column_as_df = normalize(df[column].tolist())
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        # Align explicitly on the source rows so the concat below is positional.
        column_as_df.index = df.index
        flattened.append(column_as_df)

    # One concat instead of one merge per JSON column: same columns, same order.
    return pd.concat([df.drop(columns=JSON_COLUMNS)] + flattened, axis=1)

In [6]:
%%time
# Load both splits: train from load_df's default path, test from an explicit path.
# The recorded run of this cell took roughly four minutes of wall time.
train = load_df()
test = load_df('../data/test.csv')

CPU times: user 3min 55s, sys: 12.3 s, total: 4min 7s
Wall time: 4min 5s


## 3. Feature Exploration

### Data Fields
* fullVisitorId - A unique identifier for each user of the Google Merchandise Store.
* channelGrouping - The channel via which the user came to the Store.
* date - The date on which the user visited the Store.
* device - The specifications for the device used to access the Store.
* geoNetwork - This section contains information about the geography of the user.
* sessionId - A unique identifier for this visit to the store.
* socialEngagementType - Engagement type, either "Socially Engaged" or "Not Socially Engaged".
* totals - This section contains aggregate values across the session.
* trafficSource - This section contains information about the Traffic Source from which the session originated.
* visitId - An identifier for this session. This is part of the value usually stored as the _utmb cookie. This is only unique to the user. For a completely unique ID, you should use a combination of fullVisitorId and visitId.
* visitNumber - The session number for this user. If this is the first session, then this is set to 1.
* visitStartTime - The timestamp (expressed as POSIX time).

In [7]:
# Preview the first rows of the flattened training frame.
train.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,device.browserSize,...,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.campaignCode,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,Organic Search,2016-09-02,1131660440785968503,1131660440785968503_1472830385,Not Socially Engaged,1472830385,1,1472830385,Chrome,not available in demo dataset,...,,,,(not set),,,(not provided),organic,,google
1,Organic Search,2016-09-02,377306020877927890,377306020877927890_1472880147,Not Socially Engaged,1472880147,1,1472880147,Firefox,not available in demo dataset,...,,,,(not set),,,(not provided),organic,,google
2,Organic Search,2016-09-02,3895546263509774583,3895546263509774583_1472865386,Not Socially Engaged,1472865386,1,1472865386,Chrome,not available in demo dataset,...,,,,(not set),,,(not provided),organic,,google
3,Organic Search,2016-09-02,4763447161404445595,4763447161404445595_1472881213,Not Socially Engaged,1472881213,1,1472881213,UC Browser,not available in demo dataset,...,,,,(not set),,,google + online,organic,,google
4,Organic Search,2016-09-02,27294437909732085,27294437909732085_1472822600,Not Socially Engaged,1472822600,2,1472822600,Chrome,not available in demo dataset,...,,,,(not set),,True,(not provided),organic,,google


In [8]:
# Preview the first rows of the flattened test frame.
test.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,device.browserSize,...,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,Organic Search,2017-10-16,6167871330617112363,6167871330617112363_1508151024,Not Socially Engaged,1508151024,2,1508151024,Chrome,not available in demo dataset,...,,,,,(not set),True,(not provided),organic,,google
1,Organic Search,2017-10-16,643697640977915618,0643697640977915618_1508175522,Not Socially Engaged,1508175522,1,1508175522,Chrome,not available in demo dataset,...,,,,,(not set),,(not provided),organic,,google
2,Organic Search,2017-10-16,6059383810968229466,6059383810968229466_1508143220,Not Socially Engaged,1508143220,1,1508143220,Chrome,not available in demo dataset,...,,,,,(not set),,(not provided),organic,,google
3,Organic Search,2017-10-16,2376720078563423631,2376720078563423631_1508193530,Not Socially Engaged,1508193530,1,1508193530,Safari,not available in demo dataset,...,,,,,(not set),,(not provided),organic,,google
4,Organic Search,2017-10-16,2314544520795440038,2314544520795440038_1508217442,Not Socially Engaged,1508217442,1,1508217442,Safari,not available in demo dataset,...,,,,,(not set),,(not provided),organic,,google


### 3-1 Missing Column
* Which columns of the train data are missing from the test data?

In [9]:
# Keep each frame's column Index for the train-vs-test comparison.
train_col = train.columns
test_col = test.columns

In [10]:
# Boolean mask over train's columns: True where the same name exists in test.
train_col.isin(test_col)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True])

In [11]:
# Names of the columns that exist in train but not in test, selected with the
# membership mask instead of hard-coded positions so the result stays correct
# if the column layout changes.
tuple(train_col[~train_col.isin(test_col)])

('totals.transactionRevenue', 'trafficSource.campaignCode')

In [12]:
# Drop 'trafficSource.campaignCode', which exists in train but not in test.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
train = train.drop(columns=['trafficSource.campaignCode'], errors='ignore')

In [13]:
# Compare (rows, columns) of both frames; train still carries one extra column.
train.shape, test.shape

((903653, 54), (804684, 53))

### 3-2 Feature Engineering
* drop useless features
* classifying features by dtype

In [14]:
# Column groups for the flattened frames. Names use the '<json column>.<field>'
# layout produced by load_df.

# Columns grouped as numbers (presumably to be cast to numeric dtypes - confirm).
Number = [
    'visitNumber',
    'totals.bounces',
    'totals.hits',
    'totals.newVisits',
    'totals.pageviews',
    'totals.transactionRevenue',
]

# Columns removed from both train and test in a later cell.
useless = [
    # session-level
    'sessionId',
    'socialEngagementType',
    # device.*
    'device.browser',
    'device.browserSize',
    'device.browserVersion',
    'device.flashVersion',
    'device.language',
    'device.mobileDeviceBranding',
    'device.mobileDeviceInfo',
    'device.mobileDeviceMarketingName',
    'device.mobileDeviceModel',
    'device.mobileInputSelector',
    'device.operatingSystemVersion',
    'device.screenColors',
    'device.screenResolution',
    # geoNetwork.*
    'geoNetwork.cityId',
    'geoNetwork.latitude',
    'geoNetwork.longitude',
    'geoNetwork.networkDomain',
    'geoNetwork.networkLocation',
    # trafficSource.*
    'trafficSource.adContent',
    'trafficSource.adwordsClickInfo.criteriaParameters',
    'trafficSource.adwordsClickInfo.gclId',
    'trafficSource.adwordsClickInfo.isVideoAd',
    'trafficSource.adwordsClickInfo.page',
    'trafficSource.adwordsClickInfo.slot',
    'trafficSource.referralPath',
    'trafficSource.source',
]

# Single-column groups by role.
ID = ['fullVisitorId']
date = ['date']
timestamp = ['visitStartTime']

# Columns not yet assigned a treatment; note 'sessionId' is also listed in `useless`.
ambiguous = [
    'sessionId',
    'visitId',
    'device.operatingSystem',
    'totals.visits',
]

# Boolean-valued column(s); the variable name keeps its original spelling.
blooen = ['device.isMobile']

In [15]:
# List train's current column names to cross-check the groups defined above.
train.columns

Index(['channelGrouping', 'date', 'fullVisitorId', 'sessionId',
       'socialEngagementType', 'visitId', 'visitNumber', 'visitStartTime',
       'device.browser', 'device.browserSize', 'device.browserVersion',
       'device.deviceCategory', 'device.flashVersion', 'device.isMobile',
       'device.language', 'device.mobileDeviceBranding',
       'device.mobileDeviceInfo', 'device.mobileDeviceMarketingName',
       'device.mobileDeviceModel', 'device.mobileInputSelector',
       'device.operatingSystem', 'device.operatingSystemVersion',
       'device.screenColors', 'device.screenResolution', 'geoNetwork.city',
       'geoNetwork.cityId', 'geoNetwork.continent', 'geoNetwork.country',
       'geoNetwork.latitude', 'geoNetwork.longitude', 'geoNetwork.metro',
       'geoNetwork.networkDomain', 'geoNetwork.networkLocation',
       'geoNetwork.region', 'geoNetwork.subContinent', 'totals.bounces',
       'totals.hits', 'totals.newVisits', 'totals.pageviews',
       'totals.transactionRevenue

In [16]:
# Inspect 'trafficSource.source' in train: distinct values, their count,
# and the fraction of rows where the value is null.
print(train['trafficSource.source'].unique())
print(train['trafficSource.source'].nunique())
train['trafficSource.source'].isnull().mean()

['google' 'baidu' 'googleweblight.com' 'sites.google.com' 'bing'
 'siliconvalley.about.com' 'analytics.google.com' 'Partners' '(direct)'
 'mall.googleplex.com' 'google.com' 'google.co.in' 'qiita.com' 'google.pl'
 'mail.google.com' 'hangouts.google.com' 'dealspotr.com'
 'optimize.google.com' 'arstechnica.com' 'dfa' 'docs.google.com'
 'sashihara.jp' 'google.gatewaycdi.com' 'phandroid.com' 'images.google'
 'youtube.com' 'l.facebook.com' 'facebook.com' 'quora.com'
 'm.facebook.com' 'reddit.com' 'google.co.jp' 'aol' 'moma.corp.google.com'
 'blog.golang.org' 'yahoo' 't.co' 'nordic.businessinsider.com'
 'duckduckgo.com' 'm.baidu.com' 'tpc.googlesyndication.com'
 'googleads.g.doubleclick.net'
 '0.shared.bow.cat2.ads-bow.vu.borg.google.com:9856' 'online-metrics.com'
 'cv.il3.ub.edu' 'gophergala.com' 'plus.google.com' 's0.2mdn.net'
 'keep.google.com' 'svirkar.mtv.corp.google.com:8888'
 'connect.googleforwork.com' 'google.com.pk' 'support.google.com'
 'gdeals.googleplex.com' 'google.com.au' 'int.

0.0

In [17]:
# Inspect 'trafficSource.source' in test: distinct values, their count,
# and the fraction of rows where the value is null.
print(test['trafficSource.source'].unique())
print(test['trafficSource.source'].nunique())
test['trafficSource.source'].isnull().mean()

['google' '(direct)' 'sites.google.com' 'bing' 'mall.googleplex.com'
 'analytics.google.com' 'gdeals.googleplex.com' 'productforums.google.com'
 'youtube.com' 'google.com' 'blog.golang.org' 'l.facebook.com'
 'reddit.com' 'Partners' 'google.co.jp' 'groups.google.com' 'google.co.uk'
 'quora.com' 'sashihara.jp' 'qiita.com' 'baidu' 'docs.google.com' 'dfa'
 '0.shared.bow.cat2.ads-bow.yw.borg.google.com:9860' 'm.facebook.com'
 'tpc.googlesyndication.com' 'optimize.google.com' 'lunametrics.com'
 'yahoo' 'mail.google.com' 'google.com.au' 'm.youtube.com' 'pinterest.com'
 'allo.google.com' 'ask' 'support.google.com' 'int.search.tb.ask.com'
 'search.mysearch.com' 'adwords.google.com' 'blackboard.vsu.edu'
 'web.whatsapp.com' 's0.2mdn.net' 'my.uclaextension.edu'
 'googleads.g.doubleclick.net' 'yandex' 'linkedin.com' 'facebook.com'
 'phandroid.com' 'grow.googleplex.com' 'google.co.th'
 'moma.corp.google.com' 'canopy.uc.edu' 'googleux.perksplus.com'
 'lm.facebook.com' 'away.vk.com' 'plus.url.google.c

0.0

In [18]:
# Remove every column listed in `useless` from both frames.
train = train.drop(columns=useless)
test = test.drop(columns=useless)

In [19]:
# Column counts of train and test after the drop.
train.shape[1], test.shape[1]

(26, 25)

In [20]:
# Write the trimmed frames to CSV without the row index
# (presumably so the slow JSON load need not be repeated - confirm).
train.to_csv("../data/train_0929.csv", index=False)
test.to_csv("../data/test_0929.csv", index=False)

In [21]:
# Reload the trimmed snapshots. fullVisitorId is read back as text, matching
# load_df; without the explicit dtype pandas infers a numeric type and the
# IDs lose leading zeros.
# NOTE(review): 'date' is not re-parsed here and comes back as plain strings -
# confirm whether downstream cells expect datetimes.
train = pd.read_csv('../data/train_0929.csv', dtype={'fullVisitorId': 'str'})
test = pd.read_csv('../data/test_0929.csv', dtype={'fullVisitorId': 'str'})

  interactivity=interactivity, compiler=compiler, result=result)


### 3-3 Feature Preprocessing
* EDA
* missing values
* dtypes
* feature encoding

In [22]:
# Preview the reloaded, trimmed training frame.
train.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,visitId,visitNumber,visitStartTime,device.deviceCategory,device.isMobile,device.operatingSystem,geoNetwork.city,...,totals.hits,totals.newVisits,totals.pageviews,totals.transactionRevenue,totals.visits,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium
0,Organic Search,2016-09-02,1131660440785968503,1472830385,1,1472830385,desktop,False,Windows,Izmir,...,1,1.0,1.0,,1,,(not set),,(not provided),organic
1,Organic Search,2016-09-02,377306020877927890,1472880147,1,1472880147,desktop,False,Macintosh,not available in demo dataset,...,1,1.0,1.0,,1,,(not set),,(not provided),organic
2,Organic Search,2016-09-02,3895546263509774583,1472865386,1,1472865386,desktop,False,Windows,Madrid,...,1,1.0,1.0,,1,,(not set),,(not provided),organic
3,Organic Search,2016-09-02,4763447161404445595,1472881213,1,1472881213,desktop,False,Linux,not available in demo dataset,...,1,1.0,1.0,,1,,(not set),,google + online,organic
4,Organic Search,2016-09-02,27294437909732085,1472822600,2,1472822600,mobile,True,Android,not available in demo dataset,...,1,,1.0,,1,,(not set),True,(not provided),organic


In [25]:
# List the columns that remain in train after trimming.
train.columns

Index(['channelGrouping', 'date', 'fullVisitorId', 'visitId', 'visitNumber',
       'visitStartTime', 'device.deviceCategory', 'device.isMobile',
       'device.operatingSystem', 'geoNetwork.city', 'geoNetwork.continent',
       'geoNetwork.country', 'geoNetwork.metro', 'geoNetwork.region',
       'geoNetwork.subContinent', 'totals.bounces', 'totals.hits',
       'totals.newVisits', 'totals.pageviews', 'totals.transactionRevenue',
       'totals.visits', 'trafficSource.adwordsClickInfo.adNetworkType',
       'trafficSource.campaign', 'trafficSource.isTrueDirect',
       'trafficSource.keyword', 'trafficSource.medium'],
      dtype='object')