In [1]:
# download the libraries
import pandas as pd
from pandas import Series,DataFrame
import warnings
from copy import deepcopy
import os
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import json
import random
from pandas.io.json import json_normalize
%matplotlib inline

In [2]:
# load both of the datasets
traindata = pd.read_pickle('train_flat_no_hits.pkl')
testdata = pd.read_pickle('test_flat_no_hits.pkl')


In [5]:
# add a marker so that you know which dataset is which
traindata['dataset'] = 'train'
testdata['dataset'] = 'test'


In [7]:
traindata.columns

Index(['channelGrouping', 'date', 'fullVisitorId', 'socialEngagementType',
       'visitId', 'visitNumber', 'visitStartTime', 'device.browser',
       'device.browserVersion', 'device.browserSize', 'device.operatingSystem',
       'device.operatingSystemVersion', 'device.isMobile',
       'device.mobileDeviceBranding', 'device.mobileDeviceModel',
       'device.mobileInputSelector', 'device.mobileDeviceInfo',
       'device.mobileDeviceMarketingName', 'device.flashVersion',
       'device.language', 'device.screenColors', 'device.screenResolution',
       'device.deviceCategory', 'geoNetwork.continent',
       'geoNetwork.subContinent', 'geoNetwork.country', 'geoNetwork.region',
       'geoNetwork.metro', 'geoNetwork.city', 'geoNetwork.cityId',
       'geoNetwork.networkDomain', 'geoNetwork.latitude',
       'geoNetwork.longitude', 'geoNetwork.networkLocation', 'totals.visits',
       'totals.hits', 'totals.pageviews', 'totals.bounces', 'totals.newVisits',
       'totals.sessionQuality

In [12]:
traindata.drop('trafficSource.campaignCode',axis=1,inplace= True) #the test doesnt have trafficSource.campaignCode column, 
#so i droped it from the train set

In [11]:
testdata.columns

Index(['channelGrouping', 'date', 'fullVisitorId', 'socialEngagementType',
       'visitId', 'visitNumber', 'visitStartTime', 'device.browser',
       'device.browserVersion', 'device.browserSize', 'device.operatingSystem',
       'device.operatingSystemVersion', 'device.isMobile',
       'device.mobileDeviceBranding', 'device.mobileDeviceModel',
       'device.mobileInputSelector', 'device.mobileDeviceInfo',
       'device.mobileDeviceMarketingName', 'device.flashVersion',
       'device.language', 'device.screenColors', 'device.screenResolution',
       'device.deviceCategory', 'geoNetwork.continent',
       'geoNetwork.subContinent', 'geoNetwork.country', 'geoNetwork.region',
       'geoNetwork.metro', 'geoNetwork.city', 'geoNetwork.cityId',
       'geoNetwork.networkDomain', 'geoNetwork.latitude',
       'geoNetwork.longitude', 'geoNetwork.networkLocation', 'totals.visits',
       'totals.hits', 'totals.pageviews', 'totals.timeOnSite',
       'totals.sessionQualityDim', 'totals.new

In [13]:
# make sure the columns are in the same order in both train and test.
# the deepcopy is just to have a copy of the original even after joining them
col_order = list(set(traindata.columns.tolist() + testdata.columns.tolist()))
traindata = deepcopy(traindata[col_order])
testdata = deepcopy(testdata[col_order])


In [53]:
# append the two together
all_data = traindata[col_order].append(testdata)

In [22]:
#all_data.head()

In [17]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2109926 entries, 0 to 401588
Data columns (total 60 columns):
 #   Column                                             Dtype 
---  ------                                             ----- 
 0   device.browser                                     object
 1   totals.transactionRevenue                          object
 2   trafficSource.adwordsClickInfo.gclId               object
 3   totals.hits                                        object
 4   totals.timeOnSite                                  object
 5   trafficSource.adwordsClickInfo.criteriaParameters  object
 6   trafficSource.medium                               object
 7   trafficSource.adwordsClickInfo.page                object
 8   device.language                                    object
 9   geoNetwork.metro                                   object
 10  totals.sessionQualityDim                           object
 11  device.mobileInputSelector                         object
 12  t

In [18]:
def values_missing(check):
    total = check.isnull().sum().sort_values(ascending = False) # getting the sum of null values and ordering
    percent = (check.isnull().sum() / check.isnull().count() * 100 ).sort_values(ascending = False) #getting the percent and order of null
    df_train = pd.concat([total, percent], axis=1, keys=['Total', 'Percent']) # Concatenating the total and percent
    print("Total and percent of missing values: ")
    print (df_train[~(df_train['Total'] == 0)]) # Returning values of nulls different of 0
    
    return

In [19]:
values_missing(all_data)

Total and percent of missing values: 
                                                Total    Percent
totals.totalTransactionRevenue                2086818  98.904796
totals.transactionRevenue                     2086818  98.904796
totals.transactions                           2085062  98.821570
trafficSource.adwordsClickInfo.slot           2024047  95.929762
trafficSource.adwordsClickInfo.isVideoAd      2024047  95.929762
trafficSource.adwordsClickInfo.adNetworkType  2024047  95.929762
trafficSource.adwordsClickInfo.page           2024047  95.929762
trafficSource.adwordsClickInfo.gclId          2023891  95.922369
trafficSource.adContent                       1643600  77.898467
trafficSource.isTrueDirect                    1426999  67.632656
trafficSource.referralPath                    1142073  54.128581
trafficSource.keyword                         1093006  51.803049
totals.timeOnSite                             1057980  50.142991
totals.bounces                                105567

In [20]:
def see_data(data, data_type=object, limit=5): #seting the function with data,show 5 uniques if available 
    n = data.select_dtypes(include=data_type) #selecting the desired data type
    for column in n.columns: #initializing the loop
        print("______________________________________________________________")
        print("Name of column ", column, ': \n', "Uniques: ", data[column].unique()[:limit], "\n",
              " | >> Total nulls: ", (round(data[column].isnull().sum() / len(data[column]) * 100,1)),
              " | >> Total unique values: ", all_data.nunique()[column]) #print the data and % of nulls)
        print("_________________________________________________________")

In [21]:
see_data(traindata)

______________________________________________________________
Name of column  device.browser : 
 Uniques:  ['Firefox' 'Chrome' 'Safari' 'UC Browser' 'Internet Explorer'] 
  | >> Total nulls:  0.0  | >> Total unique values:  161
_________________________________________________________
______________________________________________________________
Name of column  totals.transactionRevenue : 
 Uniques:  [nan '15190000' '8000000' '57300000' '18230000'] 
  | >> Total nulls:  98.9  | >> Total unique values:  8219
_________________________________________________________
______________________________________________________________
Name of column  trafficSource.adwordsClickInfo.gclId : 
 Uniques:  [nan
 'Cj0KCQjwsZHPBRClARIsAC-VMPBHdNF2oMOgh6Xp6YhjXWSk11sDu3eCo4De_u6l4xafqgrPVPSGYdkaArkiEALw_wcB'
 'CODVoMjJ9tYCFUIvgQod_dsKEA'
 'Cj0KCQjwsZHPBRClARIsAC-VMPA4CVJtDhu1lYkB0AR1hje1goUMHQZNJMrsSWD57pLnv1KE_L7pCEcaAunKEALw_wcB'
 'Cj0KCQjwsZHPBRClARIsAC-VMPDlLD6kS4tmqFGZjMUqyerwe0WDaULaAKNkj-gwuvPn

In [55]:
drop_col_list = ['socialEngagementType', 'device.browserVersion', 'device.browserSize', 'device.operatingSystemVersion',
                 'device.mobileDeviceBranding', 'device.mobileDeviceModel', 'device.mobileInputSelector', 
                 'device.mobileDeviceInfo', 'device.mobileDeviceMarketingName', 'device.flashVersion', 'device.language', 
                 'device.screenColors', 'device.screenResolution', 'geoNetwork.cityId', 'geoNetwork.latitude', 
                 'geoNetwork.longitude', 'geoNetwork.networkLocation', 'trafficSource.campaign', 'trafficSource.referralPath', 
                 'trafficSource.adwordsClickInfo.page','trafficSource.adwordsClickInfo.slot', 
                 'trafficSource.adwordsClickInfo.criteriaParameters', 'trafficSource.adContent', 'trafficSource.adwordsClickInfo.gclId', 
                 'customDimension.index','totals.visits','trafficSource.keyword']

In [56]:
all_data.drop(drop_col_list, axis=1, inplace=True)  #to drop them

In [57]:
all_data['totals.pageviews'].fillna(1, inplace=True) #filling NA's with 1(since least for a visit to the site means 1 page seen)
all_data['totals.newVisits'].fillna(0, inplace=True) #filling NA's with 0(0 is suitable for Nan here)
all_data['totals.bounces'].fillna(0, inplace=True)   #filling NA's with 0(0 is suitable for Nan here)
all_data["totals.transactionRevenue"] = all_data["totals.transactionRevenue"].fillna(0.0).astype(float) #filling NA with zero
all_data["totals.totalTransactionRevenue"] = all_data["totals.totalTransactionRevenue"].fillna(0.0).astype(float) #filling NA with zero
all_data["totals.transactionRevenue"] = all_data["totals.transactionRevenue"].fillna(0.0).astype(float) #filling NA with zero
all_data["totals.transactions"] = all_data["totals.transactions"].fillna(0).astype(int) #filling NA with zero
all_data['trafficSource.isTrueDirect'].fillna(False, inplace=True) # filling boolean with False is its NAn
all_data['trafficSource.adwordsClickInfo.isVideoAd'].fillna(True, inplace=True) # filling boolean is its NAn
all_data['customDimension.value'].fillna('(not set)', inplace=True) # filling NA with '(not set)'
all_data['trafficSource.adwordsClickInfo.adNetworkType'].fillna('(not set)', inplace=True) # filling NA with '(not set)'
all_data['totals.timeOnSite'].fillna(0, inplace=True) #filling NA's with 0(0 is suitable for Nan here)
all_data['totals.sessionQualityDim'].fillna(0, inplace=True) #filling NA's with 0(0 is suitable for Nan here)
#To convert
all_data['totals.pageviews'] = all_data['totals.pageviews'].astype(int) # setting numerical column as integer
all_data['totals.newVisits'] = all_data['totals.newVisits'].astype(int) # setting numerical column as integer
all_data['totals.bounces'] = all_data['totals.bounces'].astype(int)  # setting numerical column as integer
all_data["totals.hits"] = all_data["totals.hits"].astype(float) # setting numerical to float
all_data['totals.sessionQualityDim'] = all_data['totals.sessionQualityDim'].astype(int)
all_data['totals.timeOnSite'] = all_data['totals.timeOnSite'].astype(int)
all_data['date'] = pd.to_datetime(all_data["date"], format="%Y%m%d") #convert to datetime

In [58]:
all_data['geoNetwork.metro'].replace({"not available in demo dataset": "(not set)"}, inplace=True)

In [27]:
all_data.isnull().sum()

device.browser                                  0
totals.transactionRevenue                       0
totals.hits                                     0
totals.timeOnSite                               0
trafficSource.medium                            0
geoNetwork.metro                                0
totals.sessionQualityDim                        0
totals.pageviews                                0
customDimension.value                           0
totals.totalTransactionRevenue                  0
date                                            0
visitId                                         0
trafficSource.isTrueDirect                      0
visitNumber                                     0
geoNetwork.region                               0
device.deviceCategory                           0
geoNetwork.city                                 0
channelGrouping                                 0
trafficSource.adwordsClickInfo.adNetworkType    0
fullVisitorId                                   0


In [28]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2109926 entries, 0 to 401588
Data columns (total 33 columns):
 #   Column                                        Dtype         
---  ------                                        -----         
 0   device.browser                                object        
 1   totals.transactionRevenue                     float64       
 2   totals.hits                                   float64       
 3   totals.timeOnSite                             int32         
 4   trafficSource.medium                          object        
 5   geoNetwork.metro                              object        
 6   totals.sessionQualityDim                      int32         
 7   totals.pageviews                              int32         
 8   customDimension.value                         object        
 9   totals.totalTransactionRevenue                float64       
 10  date                                          datetime64[ns]
 11  visitId                  

In [29]:
all_data.columns

Index(['device.browser', 'totals.transactionRevenue', 'totals.hits',
       'totals.timeOnSite', 'trafficSource.medium', 'geoNetwork.metro',
       'totals.sessionQualityDim', 'totals.pageviews', 'customDimension.value',
       'totals.totalTransactionRevenue', 'date', 'visitId',
       'trafficSource.isTrueDirect', 'visitNumber', 'geoNetwork.region',
       'device.deviceCategory', 'geoNetwork.city', 'channelGrouping',
       'trafficSource.adwordsClickInfo.adNetworkType', 'fullVisitorId',
       'device.isMobile', 'geoNetwork.networkDomain', 'totals.newVisits',
       'geoNetwork.subContinent', 'trafficSource.adwordsClickInfo.isVideoAd',
       'visitStartTime', 'device.operatingSystem', 'geoNetwork.continent',
       'dataset', 'geoNetwork.country', 'totals.bounces',
       'totals.transactions', 'trafficSource.source'],
      dtype='object')

In [25]:
'''
columns to hotcode

channelGrouping                               object  unique values:  8
date                                          int64  - convert to date time
fullVisitorId                                 object - organisers insist to be left as str
device.browser                                object - unique values:  129
device.operatingSystem                        object - unique values:  24
device.isMobile                               bool   - unique values:  2
device.deviceCategory                         object - unique values:  3
geoNetwork.continent                          object - unique values:  6
geoNetwork.subContinent                       object - unique values:  23
geoNetwork.country                            object - unique values:  228
geoNetwork.region                             object - unique values:  284
geoNetwork.metro                              object - unique values:  123
geoNetwork.city                               object - unique values:  956
geoNetwork.networkDomain                      object - unique values:  41982
totals.sessionQualityDim                      object - convert to interger
totals.timeOnSite                             object - convert to interger
trafficSource.source                          object - unique values:  345
trafficSource.medium                          object - unique values:  7
trafficSource.isTrueDirect                    bool   - unique values:  2 
trafficSource.adwordsClickInfo.adNetworkType  object - unique values:  3
trafficSource.adwordsClickInfo.isVideoAd      bool   - unique values:  2 
customDimension.value                         object - unique values:  5
'''

'\ncolumns to hotcode\nchannelGrouping                               object \ndate                                          int64  - convert to date time\nfullVisitorId                                 object \ndevice.browser                                object \ndevice.operatingSystem                        object\ndevice.isMobile                               bool   \ndevice.deviceCategory                         object \ngeoNetwork.continent                          object \ngeoNetwork.subContinent                       object \ngeoNetwork.country                            object \ngeoNetwork.region                             object \ngeoNetwork.metro                              object \ngeoNetwork.city                               object \ngeoNetwork.networkDomain                      object \ntotals.sessionQualityDim                      object \ntotals.timeOnSite                             object \ntrafficSource.source                          object \ntrafficSource.medium 

In [59]:
cat_cols = ['device.browser', 'totals.hits','trafficSource.medium', 'geoNetwork.metro', 'customDimension.value',
            'visitId','trafficSource.isTrueDirect','geoNetwork.region','device.deviceCategory', 'geoNetwork.city', 
            'channelGrouping','trafficSource.adwordsClickInfo.adNetworkType', 'fullVisitorId',
            'device.isMobile', 'geoNetwork.networkDomain','geoNetwork.subContinent', 'trafficSource.adwordsClickInfo.isVideoAd',
            'device.operatingSystem', 'geoNetwork.continent','geoNetwork.country','trafficSource.source']
   

In [60]:
from sklearn.preprocessing import LabelEncoder
leb = LabelEncoder()

In [65]:
all_data

Unnamed: 0,device.browser,totals.transactionRevenue,totals.hits,totals.timeOnSite,trafficSource.medium,geoNetwork.metro,totals.sessionQualityDim,totals.pageviews,customDimension.value,totals.totalTransactionRevenue,...,geoNetwork.subContinent,trafficSource.adwordsClickInfo.isVideoAd,visitStartTime,device.operatingSystem,geoNetwork.continent,dataset,geoNetwork.country,totals.bounces,totals.transactions,trafficSource.source
0,Firefox,0.0,1.0,0,organic,(not set),1,1,EMEA,0.0,...,Western Europe,True,1508198450,Windows,Europe,train,Germany,1,0,google
1,Chrome,0.0,2.0,28,referral,San Francisco-Oakland-San Jose CA,2,2,North America,0.0,...,Northern America,True,1508176307,Chrome OS,Americas,train,United States,0,0,sites.google.com
2,Chrome,0.0,2.0,38,(none),(not set),1,2,North America,0.0,...,Northern America,True,1508201613,Android,Americas,train,United States,0,0,(direct)
3,Chrome,0.0,2.0,1,organic,(not set),1,2,EMEA,0.0,...,Western Asia,True,1508169851,Windows,Asia,train,Turkey,0,0,google
4,Chrome,0.0,2.0,52,organic,(not set),1,2,Central America,0.0,...,Central America,True,1508190552,Windows,Americas,train,Mexico,0,0,google
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401584,Chrome,0.0,3.0,111,organic,(not set),1,3,EMEA,0.0,...,Northern Europe,True,1536353803,Android,Europe,test,United Kingdom,0,0,google
401585,Chrome,0.0,3.0,10,organic,(not set),1,3,North America,0.0,...,Northern America,True,1536388075,Windows,Americas,test,United States,0,0,google
401586,Firefox,0.0,3.0,63,organic,(not set),1,3,(not set),0.0,...,Southern Europe,True,1536351791,Windows,Europe,test,Greece,0,0,google
401587,Chrome,0.0,3.0,64,organic,(not set),1,3,North America,0.0,...,Northern America,True,1536340217,Chrome OS,Americas,test,United States,0,0,google


In [68]:
test_2nd = deepcopy(all_data)

In [71]:
for col in cat_cols:
    test_2nd[col] = leb.fit_transform(test_2nd[col])

In [72]:
test_2nd

Unnamed: 0,device.browser,totals.transactionRevenue,totals.hits,totals.timeOnSite,trafficSource.medium,geoNetwork.metro,totals.sessionQualityDim,totals.pageviews,customDimension.value,totals.totalTransactionRevenue,...,geoNetwork.subContinent,trafficSource.adwordsClickInfo.isVideoAd,visitStartTime,device.operatingSystem,geoNetwork.continent,dataset,geoNetwork.country,totals.bounces,totals.transactions,trafficSource.source
0,64,0.0,0,0,5,0,1,1,3,0.0,...,22,1,1508198450,22,4,train,75,1,0,117
1,55,0.0,1,28,6,104,2,2,4,0.0,...,12,1,1508176307,3,2,train,219,0,0,339
2,55,0.0,1,38,0,0,1,2,4,0.0,...,12,1,1508201613,1,2,train,219,0,0,0
3,55,0.0,1,1,5,0,1,2,3,0.0,...,21,1,1508169851,22,3,train,211,0,0,117
4,55,0.0,1,52,5,0,1,2,2,0.0,...,3,1,1508190552,22,2,train,132,0,0,117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401584,55,0.0,2,111,5,0,1,3,3,0.0,...,13,1,1536353803,1,4,test,218,0,0,117
401585,55,0.0,2,10,5,0,1,3,4,0.0,...,12,1,1536388075,22,2,test,219,0,0,117
401586,64,0.0,2,63,5,0,1,3,0,0.0,...,19,1,1536351791,22,4,test,78,0,0,117
401587,55,0.0,2,64,5,0,1,3,4,0.0,...,12,1,1536340217,3,2,test,219,0,0,117


In [107]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score as accuracy
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score


In [108]:
test_1k = test_2nd.head(100000)

In [109]:
x_1k = test_1k.drop(['totals.transactionRevenue','date','dataset','totals.totalTransactionRevenue'],axis=1)
y_1k = test_1k['totals.transactionRevenue'].astype(int)

In [110]:
x_train, x_test, y_train, y_test = train_test_split(x_1k, y_1k, random_state=111)

model1k = RandomForestRegressor(
    n_estimators=50, # Hyperparameter 1
    max_depth=2,      # Hyperparameter 2
    random_state=0
)

trained_model1k = model1k.fit(x_train, y_train)

test_predictions1k = trained_model1k.predict(x_test)

r2_score(y_test, test_predictions1k)

0.255205587881809

In [111]:
x_train, x_test, y_train, y_test = train_test_split(x_1k, y_1k, random_state=40)

model = RandomForestClassifier(
    n_estimators=100, # Hyperparameter 1
    max_depth=2,      # Hyperparameter 2
    random_state=0
)

trained_model = model.fit(x_train, y_train)

test_predictions = trained_model.predict(x_test)

accuracy(y_test, test_predictions)

0.99