In [1]:
import pandas as pd
from pandas.io.json import json_normalize
import json
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV

pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999


In [2]:

def dataset(Path,Name):

    pathFile = '\\'.join([Path,Name])

    dict_cols = ['trafficSource','totals','geoNetwork','device'] 

    df = pd.read_csv(pathFile, dtype={'fullVisitorId': 'str'}, nrows=None)

    for column in dict_cols:
        df = df.join(pd.DataFrame(df.pop(column).apply(pd.io.json.loads).values.tolist(), index=df.index))

    try:
        df['Revenue']=df['transactionRevenue']
    except:    
        df['Revenue']=0

    cols = df.columns.tolist()

    print('--'*30)
    print(df.shape)

    liste_features = ['Revenue',
                        'fullVisitorId', 
                        'bounces', 
                        'hits', 
                        'newVisits', 
                        'pageviews', 
                        'visits', 
                        'subContinent', 
                        'deviceCategory', 
                        'isMobile']

    new_df = df[liste_features]

    col_dummies = [                     
                        'subContinent', 
                        'deviceCategory', 
                        'isMobile']

    for col in col_dummies:
        dummies = pd.get_dummies(new_df[col],prefix=col)
        new_df = pd.concat([new_df,dummies],axis=1)
        #new_df.drop(col)


    new_df = new_df.drop(col_dummies,axis=1)

    return new_df


In [3]:
Path = 'c:\\users\\monne\\Desktop\\Google Analytics Customers'
Name = 'train.csv'

train_features = dataset(Path,Name)

------------------------------------------------------------
(903653, 51)


In [4]:
train_features = train_features.fillna(0)
print(train_features.shape)

(903653, 35)


In [5]:
train_target = train_features.pop('Revenue')
train_id = train_features.pop('fullVisitorId')
print(train_target.shape)
print(train_id.shape)

(903653,)
(903653,)


In [6]:
train_targets = train_target.copy()
print(type(train_targets))

<class 'pandas.core.series.Series'>


In [7]:
train_targets = train_targets.fillna(0)
print(train_targets.isna().sum())
train_targets = train_targets.astype(np.float)
train_targets = train_targets + 1
print(train_targets.unique())
print(train_targets.max())
print(train_targets.min())

0
[1.00000000e+00 3.78600010e+07 3.06670001e+08 ... 3.35260001e+08
 3.07500010e+07 6.93900010e+07]
23129500001.0
1.0


In [8]:
train_targets = np.log(train_targets)
print(train_targets.unique())

[ 0.         17.44940573 19.54128281 ... 19.63041691 17.24140058
 18.05525334]


In [9]:
train_target_shape = train_targets.values.reshape(-1,1)
print(train_target_shape)

[[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]


In [10]:
train_targets.describe()

count    903653.000000
mean          0.227118
std           2.003710
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          23.864375
Name: Revenue, dtype: float64

In [11]:
X = train_features.as_matrix().astype(np.float)
y = train_targets.as_matrix().astype(np.float)

In [12]:
for col in train_features.columns:
    print('{} - {} '.format(col,type(col)))
    train_features[col] = train_features[col].astype(np.float)

bounces - <class 'str'> 
hits - <class 'str'> 
newVisits - <class 'str'> 
pageviews - <class 'str'> 
visits - <class 'str'> 
subContinent_(not set) - <class 'str'> 
subContinent_Australasia - <class 'str'> 
subContinent_Caribbean - <class 'str'> 
subContinent_Central America - <class 'str'> 
subContinent_Central Asia - <class 'str'> 
subContinent_Eastern Africa - <class 'str'> 
subContinent_Eastern Asia - <class 'str'> 
subContinent_Eastern Europe - <class 'str'> 
subContinent_Melanesia - <class 'str'> 
subContinent_Micronesian Region - <class 'str'> 
subContinent_Middle Africa - <class 'str'> 
subContinent_Northern Africa - <class 'str'> 
subContinent_Northern America - <class 'str'> 
subContinent_Northern Europe - <class 'str'> 
subContinent_Polynesia - <class 'str'> 
subContinent_South America - <class 'str'> 
subContinent_Southeast Asia - <class 'str'> 
subContinent_Southern Africa - <class 'str'> 
subContinent_Southern Asia - <class 'str'> 
subContinent_Southern Europe - <class 's

In [15]:
dimension = train_features.shape[1]
dimension

33

In [36]:
from keras.models import Sequential
from keras.layers import Dense

# Define model
model = Sequential()
model.add(Dense(200, input_dim=dimension, activation= "relu"))
model.add(Dense(100, activation= "relu"))
model.add(Dense(1))
model.summary() #Print model Summary

# Compile model
model.compile(loss= "mean_squared_error" , optimizer="adam", metrics=["mean_squared_error"])

# Fit Model
model.fit(train_features, train_targets, epochs=20)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 200)               6800      
_________________________________________________________________
dense_8 (Dense)              (None, 100)               20100     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 101       
Total params: 27,001
Trainable params: 27,001
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x156a135d208>

In [17]:
Name = 'test.csv'

test_features = dataset(Path,Name)

------------------------------------------------------------
(804684, 49)


In [18]:
test_features = test_features.fillna(0)
print(test_features.shape)

(804684, 35)


In [19]:
test_target = test_features.pop('Revenue')
test_id = test_features.pop('fullVisitorId')
print(test_target.shape)
print(test_id.shape)

(804684,)
(804684,)


In [20]:
test_targets = test_target.copy()
print(type(test_targets))

<class 'pandas.core.series.Series'>


In [21]:
test_targets = test_targets.fillna(0)
print(test_targets.isna().sum())
test_targets = test_targets.astype(np.float)
test_targets = test_targets + 1
print(test_targets.unique())
print(test_targets.max())
print(test_targets.min())

0
[1.]
1.0
1.0


In [22]:
test_targets = np.log(test_targets)
print(test_targets.unique())

[0.]


In [23]:
print(test_features.shape)
print(train_features.shape)
for col in train_features.columns.tolist():
    print(col)


(804684, 33)
(903653, 33)
bounces
hits
newVisits
pageviews
visits
subContinent_(not set)
subContinent_Australasia
subContinent_Caribbean
subContinent_Central America
subContinent_Central Asia
subContinent_Eastern Africa
subContinent_Eastern Asia
subContinent_Eastern Europe
subContinent_Melanesia
subContinent_Micronesian Region
subContinent_Middle Africa
subContinent_Northern Africa
subContinent_Northern America
subContinent_Northern Europe
subContinent_Polynesia
subContinent_South America
subContinent_Southeast Asia
subContinent_Southern Africa
subContinent_Southern Asia
subContinent_Southern Europe
subContinent_Western Africa
subContinent_Western Asia
subContinent_Western Europe
deviceCategory_desktop
deviceCategory_mobile
deviceCategory_tablet
isMobile_False
isMobile_True


In [24]:
for col in test_features.columns:
    print('{} - {} '.format(col,type(col)))
    test_features[col] = test_features[col].astype(np.float)


bounces - <class 'str'> 
hits - <class 'str'> 
newVisits - <class 'str'> 
pageviews - <class 'str'> 
visits - <class 'str'> 
subContinent_(not set) - <class 'str'> 
subContinent_Australasia - <class 'str'> 
subContinent_Caribbean - <class 'str'> 
subContinent_Central America - <class 'str'> 
subContinent_Central Asia - <class 'str'> 
subContinent_Eastern Africa - <class 'str'> 
subContinent_Eastern Asia - <class 'str'> 
subContinent_Eastern Europe - <class 'str'> 
subContinent_Melanesia - <class 'str'> 
subContinent_Micronesian Region - <class 'str'> 
subContinent_Middle Africa - <class 'str'> 
subContinent_Northern Africa - <class 'str'> 
subContinent_Northern America - <class 'str'> 
subContinent_Northern Europe - <class 'str'> 
subContinent_Polynesia - <class 'str'> 
subContinent_South America - <class 'str'> 
subContinent_Southeast Asia - <class 'str'> 
subContinent_Southern Africa - <class 'str'> 
subContinent_Southern Asia - <class 'str'> 
subContinent_Southern Europe - <class 's

In [37]:
predictions_revenue = model.predict(test_features)

In [42]:
mse = np.array((test_targets - predictions_revenue)**2).sum()
rmse = np.sqrt(mse)

print('--'*20)
print(f"Mean square error : {mse}")
print(f"Root mean square error : {rmse}")
print('--'*20)


MemoryError: 

In [38]:
sub_df = pd.DataFrame({"fullVisitorId":test_id})
predictions_revenue[predictions_revenue < 0] = 0

In [39]:
sub_df["PredictedLogRevenue"] = np.expm1(predictions_revenue)
sub_df = sub_df.groupby("fullVisitorId")["PredictedLogRevenue"].sum().reset_index()
sub_df.columns = ["fullVisitorId", "PredictedLogRevenue"]

In [40]:
sub_df["PredictedLogRevenue"] = np.log1p(sub_df["PredictedLogRevenue"])

print('--'*20)
print(sub_df.head())
print('--'*20)


----------------------------------------
         fullVisitorId  PredictedLogRevenue
0  0000000259678714014             0.729345
1  0000049363351866189             0.038042
2  0000053049821714864             0.036615
3  0000059488412965267             0.049500
4  0000085840370633780             0.000000
----------------------------------------


In [41]:
Path = 'c:\\users\\monne\\Desktop'
Name = 'Submission 2018-11-02-Bis.csv'
pathFile = '\\'.join([Path,Name])
sub_df.to_csv(pathFile, index=False)
