In [1]:
import pandas as pd
from pandas.io.json import json_normalize
import json
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV

pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999


In [2]:

def dataset(Path,Name):

    pathFile = '\\'.join([Path,Name])

    dict_cols = ['trafficSource','totals','geoNetwork','device'] 

    df = pd.read_csv(pathFile, dtype={'fullVisitorId': 'str'}, nrows=None)

    for column in dict_cols:
        df = df.join(pd.DataFrame(df.pop(column).apply(pd.io.json.loads).values.tolist(), index=df.index))

    try:
        df['Revenue']=df['transactionRevenue']
    except:    
        df['Revenue']=0

    cols = df.columns.tolist()

    print('--'*30)
    print(df.shape)

    liste_features = ['Revenue',
                        'fullVisitorId', 
                        'bounces', 
                        'hits', 
                        'newVisits', 
                        'pageviews', 
                        'visits', 
                        'subContinent', 
                        'deviceCategory', 
                        'isMobile']

    new_df = df[liste_features]

    col_dummies = [                     
                        'subContinent', 
                        'deviceCategory', 
                        'isMobile']

    for col in col_dummies:
        dummies = pd.get_dummies(new_df[col],prefix=col)
        new_df = pd.concat([new_df,dummies],axis=1)
        #new_df.drop(col)


    new_df = new_df.drop(col_dummies,axis=1)

    return new_df


In [3]:
Path = 'c:\\users\\monne\\Desktop\\Google Analytics Customers'
Name = 'train.csv'

train_features = dataset(Path,Name)

------------------------------------------------------------
(903653, 51)


In [4]:
train_features = train_features.fillna(0)
print(train_features.shape)

(903653, 35)


In [5]:
train_target = train_features.pop('Revenue')
train_id = train_features.pop('fullVisitorId')
print(train_target.shape)
print(train_id.shape)

(903653,)
(903653,)


In [6]:
train_targets = train_target.copy()
print(type(train_targets))

<class 'pandas.core.series.Series'>


In [7]:
train_targets = train_targets.fillna(0)
print(train_targets.isna().sum())
train_targets = train_targets.astype(np.float)
train_targets = train_targets + 1
print(train_targets.unique())
print(train_targets.max())
print(train_targets.min())

0
[1.00000000e+00 3.78600010e+07 3.06670001e+08 ... 3.35260001e+08
 3.07500010e+07 6.93900010e+07]
23129500001.0
1.0


In [8]:
train_targets = np.log(train_targets)
print(train_targets.unique())

[ 0.         17.44940573 19.54128281 ... 19.63041691 17.24140058
 18.05525334]


In [9]:
train_target_shape = train_targets.values.reshape(-1,1)
print(train_target_shape)

[[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]


In [10]:
train_targets.describe()

count    903653.000000
mean          0.227118
std           2.003710
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          23.864375
Name: Revenue, dtype: float64

In [11]:
X = train_features.as_matrix().astype(np.float)
y = train_targets.as_matrix().astype(np.float)

In [15]:
for col in train_features.columns:
    print('{} - {} '.format(col,type(col)))
    train_features[col] = train_features[col].astype(np.float)

bounces - <class 'str'> 
hits - <class 'str'> 
newVisits - <class 'str'> 
pageviews - <class 'str'> 
visits - <class 'str'> 
subContinent_(not set) - <class 'str'> 
subContinent_Australasia - <class 'str'> 
subContinent_Caribbean - <class 'str'> 
subContinent_Central America - <class 'str'> 
subContinent_Central Asia - <class 'str'> 
subContinent_Eastern Africa - <class 'str'> 
subContinent_Eastern Asia - <class 'str'> 
subContinent_Eastern Europe - <class 'str'> 
subContinent_Melanesia - <class 'str'> 
subContinent_Micronesian Region - <class 'str'> 
subContinent_Middle Africa - <class 'str'> 
subContinent_Northern Africa - <class 'str'> 
subContinent_Northern America - <class 'str'> 
subContinent_Northern Europe - <class 'str'> 
subContinent_Polynesia - <class 'str'> 
subContinent_South America - <class 'str'> 
subContinent_Southeast Asia - <class 'str'> 
subContinent_Southern Africa - <class 'str'> 
subContinent_Southern Asia - <class 'str'> 
subContinent_Southern Europe - <class 's

In [18]:
import xgboost as xgb

dtrain = xgb.DMatrix(train_features, label=train_targets)
dtest = dtrain

param = {'bst:max_depth':2, 
         'bst:eta':1, 
         'silent':1, 
         'objective':'reg:linear',
         'nthread':4,
         'eval_metric':'rmse'
        }

evallist = [(dtest,'eval'), (dtrain,'train')]

num_round = 10
bst = xgb.train( param, dtrain, num_round, evallist )

[0]	eval-rmse:1.86512	train-rmse:1.86512
[1]	eval-rmse:1.78141	train-rmse:1.78141
[2]	eval-rmse:1.7356	train-rmse:1.7356
[3]	eval-rmse:1.70941	train-rmse:1.70941
[4]	eval-rmse:1.69369	train-rmse:1.69369
[5]	eval-rmse:1.68358	train-rmse:1.68358
[6]	eval-rmse:1.67748	train-rmse:1.67748
[7]	eval-rmse:1.67202	train-rmse:1.67202
[8]	eval-rmse:1.66846	train-rmse:1.66846
[9]	eval-rmse:1.66591	train-rmse:1.66591


In [19]:
Name = 'test.csv'

test_features = dataset(Path,Name)

Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x000001DEDCE44208>>
Traceback (most recent call last):
  File "C:\Users\monne\Anaconda3\lib\site-packages\xgboost\core.py", line 366, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'


------------------------------------------------------------
(804684, 49)


In [20]:
test_features = test_features.fillna(0)
print(test_features.shape)

(804684, 35)


In [21]:
test_target = test_features.pop('Revenue')
test_id = test_features.pop('fullVisitorId')
print(test_target.shape)
print(test_id.shape)

(804684,)
(804684,)


In [22]:
test_targets = test_target.copy()
print(type(test_targets))

<class 'pandas.core.series.Series'>


In [23]:
test_targets = test_targets.fillna(0)
print(test_targets.isna().sum())
test_targets = test_targets.astype(np.float)
test_targets = test_targets + 1
print(test_targets.unique())
print(test_targets.max())
print(test_targets.min())

0
[1.]
1.0
1.0


In [24]:
test_targets = np.log(test_targets)
print(test_targets.unique())

[0.]


In [25]:
print(test_features.shape)
print(train_features.shape)
for col in train_features.columns.tolist():
    print(col)


(804684, 33)
(903653, 33)
bounces
hits
newVisits
pageviews
visits
subContinent_(not set)
subContinent_Australasia
subContinent_Caribbean
subContinent_Central America
subContinent_Central Asia
subContinent_Eastern Africa
subContinent_Eastern Asia
subContinent_Eastern Europe
subContinent_Melanesia
subContinent_Micronesian Region
subContinent_Middle Africa
subContinent_Northern Africa
subContinent_Northern America
subContinent_Northern Europe
subContinent_Polynesia
subContinent_South America
subContinent_Southeast Asia
subContinent_Southern Africa
subContinent_Southern Asia
subContinent_Southern Europe
subContinent_Western Africa
subContinent_Western Asia
subContinent_Western Europe
deviceCategory_desktop
deviceCategory_mobile
deviceCategory_tablet
isMobile_False
isMobile_True


In [30]:
for col in test_features.columns:
    print('{} - {} '.format(col,type(col)))
    test_features[col] = test_features[col].astype(np.float)


bounces - <class 'str'> 
hits - <class 'str'> 
newVisits - <class 'str'> 
pageviews - <class 'str'> 
visits - <class 'str'> 
subContinent_(not set) - <class 'str'> 
subContinent_Australasia - <class 'str'> 
subContinent_Caribbean - <class 'str'> 
subContinent_Central America - <class 'str'> 
subContinent_Central Asia - <class 'str'> 
subContinent_Eastern Africa - <class 'str'> 
subContinent_Eastern Asia - <class 'str'> 
subContinent_Eastern Europe - <class 'str'> 
subContinent_Melanesia - <class 'str'> 
subContinent_Micronesian Region - <class 'str'> 
subContinent_Middle Africa - <class 'str'> 
subContinent_Northern Africa - <class 'str'> 
subContinent_Northern America - <class 'str'> 
subContinent_Northern Europe - <class 'str'> 
subContinent_Polynesia - <class 'str'> 
subContinent_South America - <class 'str'> 
subContinent_Southeast Asia - <class 'str'> 
subContinent_Southern Africa - <class 'str'> 
subContinent_Southern Asia - <class 'str'> 
subContinent_Southern Europe - <class 's

In [31]:
datatest = xgb.DMatrix(test_features)

predictions_revenue = bst.predict(datatest)

In [32]:
mse = np.array((test_targets - predictions_revenue)**2).sum()
rmse = np.sqrt(mse)

print('--'*20)
print(f"Mean square error : {mse}")
print(f"Root mean square error : {rmse}")
print('--'*20)


----------------------------------------
Mean square error : 540868.9669724304
Root mean square error : 735.4379422986214
----------------------------------------


In [33]:
sub_df = pd.DataFrame({"fullVisitorId":test_id})
predictions_revenue[predictions_revenue < 0] = 0

In [34]:
sub_df["PredictedLogRevenue"] = np.expm1(predictions_revenue)
sub_df = sub_df.groupby("fullVisitorId")["PredictedLogRevenue"].sum().reset_index()
sub_df.columns = ["fullVisitorId", "PredictedLogRevenue"]

In [35]:
sub_df["PredictedLogRevenue"] = np.log1p(sub_df["PredictedLogRevenue"])

print('--'*20)
print(sub_df.head())
print('--'*20)


----------------------------------------
         fullVisitorId  PredictedLogRevenue
0  0000000259678714014             0.016037
1  0000049363351866189             0.048620
2  0000053049821714864             0.015813
3  0000059488412965267             0.048620
4  0000085840370633780             0.016037
----------------------------------------


In [36]:
Path = 'c:\\users\\monne\\Desktop'
Name = 'Submission 2018-10-30.csv'
pathFile = '\\'.join([Path,Name])
sub_df.to_csv(pathFile, index=False)
