In [1]:
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import os

import missingno as msno

%matplotlib inline
plt.style.use('ggplot')
from datetime import datetime, timedelta
from sklearn import model_selection, preprocessing, metrics

# function for load data

In [4]:
#https://github.com/abdkumar/Google-Analytics-Customer-Revenue-Prediction/blob/master/customer%20revenue%20prediction.ipynb
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = None

gc.enable()
features = ['channelGrouping', 'date', 'fullVisitorId', 'visitId',\
       'visitNumber', 'visitStartTime', 'device.browser',\
       'device.deviceCategory', 'device.isMobile', 'device.operatingSystem',\
       'geoNetwork.city', 'geoNetwork.continent', 'geoNetwork.country',\
       'geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.region',\
       'geoNetwork.subContinent', 'totals.bounces', 'totals.hits',\
       'totals.newVisits', 'totals.pageviews', 'totals.transactionRevenue',\
       'trafficSource.adContent', 'trafficSource.campaign',\
       'trafficSource.isTrueDirect', 'trafficSource.keyword',\
       'trafficSource.medium', 'trafficSource.referralPath',\
       'trafficSource.source']
def load_df(csv_path):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    ans = pd.DataFrame()
    dfs = pd.read_csv(csv_path, sep=',',
                     converters={column: json.loads for column in JSON_COLUMNS}, 
                     dtype={'fullVisitorId': 'str'}, # Important!!
                    chunksize = 50000)
    for df in dfs:
        df.reset_index(drop = True,inplace = True)
        for column in JSON_COLUMNS:
            column_as_df = json_normalize(df[column])
            column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
            df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)

        print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
        use_df = df[features]
        del df
        gc.collect()
        ans = pd.concat([ans, use_df], axis = 0).reset_index(drop = True)
        print(ans.shape)
    return ans

In [3]:
path="E:\\data science case studys\\"

In [8]:
%%time
train_df = load_df(path+"train_v2.csv")
test_df = load_df(path+"test_v2.csv")

In [None]:
#not usefull columns
col_drop=["totals.bounces","trafficSource.referralPath","trafficSource.keyword","trafficSource.campaign"]
train=train_df.drop(col_drop,axis=1)
test=test_df.drop(col_drop,axis=1)
train_test_data = pd.concat([train,test], axis=0).reset_index()#concat the train and test data

In [None]:
#change the data types
train_test_data['totals.hits'] = train_test_data['totals.hits'].astype(float)
train_test_data['totals.pageviews'] = train_test_data['totals.pageviews'].astype(float)
train_test_data['totals.newVisits'] = train_test_data['totals.newVisits'].astype(float)
train_test_data['device.isMobile'] = train_test_data['device.isMobile'].astype(bool)
train_test_data['trafficSource.isTrueDirect'] = train_test_data['trafficSource.isTrueDirect'].astype(bool)

In [None]:
#fill the missing values in transactionrevenue columns
train_test_data['totals.transactionRevenue'] = train_test_data['totals.transactionRevenue'].astype(float)
train_test_data['totals.transactionRevenue'].fillna(0, inplace=True)
target = train_test_data['totals.transactionRevenue']

In [9]:
#https://github.com/HuanZhang999/GoogleAnalyticsCustomerRevenuePrediction/blob/master/1-%20create_train.ipynb
def getTimeFramewithFeatures(tr, k=1):
    
    tf = train_test_data.loc[(train_test_data['date'] >= min(train_test_data['date']) + timedelta(days=168*(k-1))) 
              & (train_test_data['date'] < min(train_test_data['date']) + timedelta(days=168*k))]

    tf_fvid = set(train_test_data.loc[(train_test_data['date'] >= min(train_test_data['date']) + timedelta(days=168*k + 46 )) 
                       & (train_test_data['date'] < min(train_test_data['date']) + timedelta(days=168*k + 46 + 62))]['fullVisitorId'])

    tf_returned = tf[tf['fullVisitorId'].isin(tf_fvid)]
    
    tf_tst = train_test_data[train_test_data['fullVisitorId'].isin(set(tf_returned['fullVisitorId']))
             & (train_test_data['date'] >= min(train_test_data['date']) + timedelta(days=168*k + 46))
             & (train_test_data['date'] < min(train_test_data['date']) + timedelta(days=168*k + 46 + 62))]
    
    
    tf_target = tf_tst.groupby('fullVisitorId')[['totals.transactionRevenue']].sum().apply(np.log1p, axis=1).reset_index()
    tf_target['ret'] = 1
    tf_target.rename(columns={'totals.transactionRevenue': 'target'}, inplace=True)
    
    tf_nonret = pd.DataFrame()
    tf_nonret['fullVisitorId'] = list(set(tf['fullVisitorId']) - tf_fvid)    
    tf_nonret['target'] = 0
    tf_nonret['ret'] = 0
    
    tf_target = pd.concat([tf_target, tf_nonret], axis=0).reset_index(drop=True)
    
    tf_maxdate = max(tf['date'])
    tf_mindate = min(tf['date'])

    tf = tf.groupby('fullVisitorId').agg({
            'geoNetwork.networkDomain': {'networkDomain': lambda x: x.dropna().max()},
            'geoNetwork.city': {'city': lambda x: x.dropna().max()},
            'device.operatingSystem': {'operatingSystem': lambda x: x.dropna().max()},
            'geoNetwork.metro': {'metro': lambda x: x.dropna().max()},
            'geoNetwork.region': {'region': lambda x: x.dropna().max()},
            'channelGrouping': {'channelGrouping': lambda x: x.dropna().max()},
    
            'geoNetwork.country': {'country': lambda x: x.dropna().max()},
            'trafficSource.source': {'source': lambda x: x.dropna().max()},
            'trafficSource.medium': {'medium': lambda x: x.dropna().max()},
            'device.browser':  {'browser': lambda x: x.dropna().max()},
            'device.deviceCategory': {'deviceCategory': lambda x: x.dropna().max()},
            'geoNetwork.continent': {'continent': lambda x: x.dropna().max()},
            'totals.pageviews': {'pageviews_sum': lambda x: x.dropna().sum(),
                                 'pageviews_min': lambda x: x.dropna().min(), 
                                 'pageviews_max': lambda x: x.dropna().max(),
                                 'pageviews_mean': lambda x: x.dropna().mean()},
            'totals.hits': {'hits_sum': lambda x: x.dropna().sum(), 
                            'hits_min': lambda x: x.dropna().min(), 
                            'hits_max': lambda x: x.dropna().max(), 
                            'hits_mean': lambda x: x.dropna().mean()},
    
            'visitStartTime': {'visitStartTime_counts': lambda x: x.dropna().count()},
            'trafficSource.isTrueDirect': {'isTrueDirect': lambda x: x.dropna().max()},
            'totals.newVisits': {'newVisits_max': lambda x: x.dropna().max()},
            'device.isMobile': {'isMobile': lambda x: x.dropna().max()},
            'visitNumber': {'visitNumber_max' : lambda x: x.dropna().max()},
    
            'totals.transactionRevenue':  {'transactionRevenue_sum':  lambda x:x.dropna().sum()},
            
            'date': {'first_ses_from_the_period_start': lambda x: x.dropna().min() - tf_mindate,
                     
                     'last_ses_from_the_period_end': lambda x: tf_maxdate - x.dropna().max(),
                     
                     'interval_dates': lambda x: x.dropna().max() - x.dropna().min(),
                     
                     'unqiue_date_num': lambda x: len(set(x.dropna())) },            
                    })

    tf.columns = tf.columns.droplevel()

    tf = pd.merge(tf, tf_target, left_on='fullVisitorId', right_on='fullVisitorId')
    return tf

In [None]:
print('Get 1st train part...')
tr1 = getTimeFramewithFeatures(train_test_data, k=1)
tr1.to_pickle(path+"new_tr1_clean")#save the data into pickle

In [None]:
print('Get 1st train part...')
tr2 = getTimeFramewithFeatures(train_test_data, k=2)
tr2.to_pickle(path+"new_tr2_clean")#save the data into pickle

In [None]:
print('Get 1st train part...')
tr3 = getTimeFramewithFeatures(train_test_data, k=3)
tr3.to_pickle(path+"new_tr3_clean")#save the data into pickle

In [None]:
print('Get 1st train part...')
tr4 = getTimeFramewithFeatures(train_test_data, k=4)
tr4.to_pickle(path+"new_tr4_clean")#save the data into pickle

In [None]:
### Construction of the test-set (by analogy as train-set)
print('Get test')
tr5 = train_test_data[train_test_data['date'] >= pd.to_datetime(20180501, infer_datetime_format=True, format="%Y%m%d")]
tr5_maxdate = max(tr5['date'])
tr5_mindate = min(tr5['date'])

In [None]:
%%time
tr5 = tr5.groupby('fullVisitorId').agg({
            'geoNetwork.networkDomain': {'networkDomain': lambda x: x.dropna().max()},
            'geoNetwork.city': {'city': lambda x: x.dropna().max()},
            'device.operatingSystem': {'operatingSystem': lambda x: x.dropna().max()},
            'geoNetwork.metro': {'metro': lambda x: x.dropna().max()},
            'geoNetwork.region': {'region': lambda x: x.dropna().max()},
            'channelGrouping': {'channelGrouping': lambda x: x.dropna().max()},
    
            'geoNetwork.country': {'country': lambda x: x.dropna().max()},
            'trafficSource.source': {'source': lambda x: x.dropna().max()},
            'trafficSource.medium': {'medium': lambda x: x.dropna().max()},
            'device.browser':  {'browser': lambda x: x.dropna().max()},
            'device.deviceCategory': {'deviceCategory': lambda x: x.dropna().max()},
            'geoNetwork.continent': {'continent': lambda x: x.dropna().max()},
            'totals.pageviews': {'pageviews_sum': lambda x: x.dropna().sum(),
                                 'pageviews_min': lambda x: x.dropna().min(), 
                                 'pageviews_max': lambda x: x.dropna().max(),
                                 'pageviews_mean': lambda x: x.dropna().mean()},
            'totals.hits': {'hits_sum': lambda x: x.dropna().sum(), 
                            'hits_min': lambda x: x.dropna().min(), 
                            'hits_max': lambda x: x.dropna().max(), 
                            'hits_mean': lambda x: x.dropna().mean()},
    
            'visitStartTime': {'visitStartTime_counts': lambda x: x.dropna().count()},
            'trafficSource.isTrueDirect': {'isTrueDirect': lambda x: x.dropna().max()},
            'totals.newVisits': {'newVisits_max': lambda x: x.dropna().max()},
            'device.isMobile': {'isMobile': lambda x: x.dropna().max()},
            'visitNumber': {'visitNumber_max' : lambda x: x.dropna().max()},
    
            'totals.transactionRevenue':  {'transactionRevenue_sum':  lambda x:x.dropna().sum()},
            
            'date': {'first_ses_from_the_period_start': lambda x: x.dropna().min() - tf_mindate,
                     
                     'last_ses_from_the_period_end': lambda x: tf_maxdate - x.dropna().max(),
                     
                     'interval_dates': lambda x: x.dropna().max() - x.dropna().min(),
                     
                     'unqiue_date_num': lambda x: len(set(x.dropna())) },            
                    })
tr5.columns = tr5.columns.droplevel()
tr5['target'] = np.nan
tr5['ret'] = np.nan'''

In [None]:
#tr5.shape
tr5.to_pickle(path+"new_tr5_clean")

## load the pickle files

In [32]:
%%time
new_tr1 = pd.read_pickle(path+"new_tr1_clean")

Wall time: 3.58 s


In [33]:
new_tr2 = pd.read_pickle(path+"new_tr2_clean")

In [34]:
new_tr3 = pd.read_pickle(path+"new_tr3_clean")

In [35]:
new_tr4 = pd.read_pickle(path+"new_tr4_clean")

In [36]:
new_tr5 = pd.read_pickle(path+"new_tr5_clean")

## Concat all the train data set

In [37]:
train_all = pd.concat([new_tr1,new_tr2,new_tr3,new_tr4,new_tr5], axis=0, sort=False).reset_index(drop=True)
train_all['interval_dates'] = train_all['interval_dates'].dt.days#fatch the day form date 
train_all['first_ses_from_the_period_start'] = train_all['first_ses_from_the_period_start'].dt.days
train_all['last_ses_from_the_period_end'] = train_all['last_ses_from_the_period_end'].dt.days

In [38]:
train_all.shape

(1819164, 61)

In [40]:
fullVisitorId_new=new_tr5.index

In [41]:
def str_to_bool(s):
    if s == 'True':
         return True
    elif s == 'False':
         return False
    else:
         return False  #ValueError # evil ValueError that doesn't tell you what the wrong value was

In [42]:
train_all["isTrueDirect"]=train_all["isTrueDirect"].apply(lambda x : str_to_bool(x))
train_all["isMobile"]=train_all["isMobile"].apply(lambda x : str_to_bool(x))
train_all["device.isMobile"]=train_all["device.isMobile"].apply(lambda x : str_to_bool(x))
train_all["trafficSource.isTrueDirect"]=train_all["trafficSource.isTrueDirect"].apply(lambda x : str_to_bool(x))

In [44]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
def final_fun_1(train_all):
    #train_all.isnull().sum()
    train_data_y = train_all['target']#store the targets 
    train_data_ret = train_all['ret']
    #-----------------
    not_use=["target","ret","date"]
    #fatch only categorical data
    categorical_cols = list()
    for i in train_all.columns:
        if (train_all[i].dtype=='object' or train_all[i].dtype=='bool') :
            categorical_cols.append(i)
      #categorical_cols.remove("fullVisitorId")
    categorical_cols.remove("isTrueDirect")
    categorical_cols.remove("isMobile")
    categorical_cols.remove("device.isMobile")
    categorical_cols.remove("trafficSource.isTrueDirect")        
    #fatch only numerical data
    num_cols = list()
    for i in train_all.columns:
        if train_all[i].dtype not in ['object', 'bool']:
            num_cols.append(i)
    #num_cols        
    numerical_columns = [c for c in num_cols if c not in not_use]
    #-------------------------------
    # fillmissing 
    for col in categorical_cols:
        train_all[col].fillna('missing', inplace=True) 

    for col in numerical_columns:       
        train_all[col].fillna(0, inplace=True)
    #------------------------------------
    scaler = preprocessing.StandardScaler()
    # Fit your data on the scaler object
    scaled_df = scaler.fit_transform(train_all[numerical_columns])
    scaled_df = pd.DataFrame(scaled_df, columns=numerical_columns)
    train_all[numerical_columns] = scaled_df
    #------------------------------------------
    train = train_all[train_all['target'].notnull()]
    test = train_all[train_all['target'].isnull()]
    #------------------------------------------
    for col in categorical_cols:
    # Using whole data to form an exhaustive list of levels
        data=train[col].append(test[col])
        le.fit(data.values)
        train[col]=le.transform(train[col])
        test[col]=le.transform(test[col])  
    #----------------------------------------
    train_data_y = np.array(train_data_y)
    train_data_ret = np.array(train_data_ret)
    #--------------------------------
    #--------------------------------
    
    return train_data_y,train_data_ret,train,test

In [17]:
#train_all.columns

Index(['fullVisitorId', 'networkDomain', 'city', 'operatingSystem', 'metro',
       'region', 'channelGrouping', 'country', 'source', 'medium', 'browser',
       'deviceCategory', 'continent', 'pageviews_sum', 'pageviews_min',
       'pageviews_max', 'pageviews_mean', 'hits_sum', 'hits_min', 'hits_max',
       'hits_mean', 'visitStartTime_counts', 'isTrueDirect', 'newVisits_max',
       'isMobile', 'visitNumber_max', 'transactionRevenue_sum',
       'first_ses_from_the_period_start', 'last_ses_from_the_period_end',
       'interval_dates', 'unqiue_date_num', 'target', 'ret', 'index', 'date',
       'visitId', 'visitNumber', 'visitStartTime', 'device.browser',
       'device.deviceCategory', 'device.isMobile', 'device.operatingSystem',
       'geoNetwork.city', 'geoNetwork.continent', 'geoNetwork.country',
       'geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.region',
       'geoNetwork.subContinent', 'totals.hits', 'totals.newVisits',
       'totals.pageviews', 'totals.tra

In [45]:
train_data_y,train_data_ret,train,test=final_fun_1train_all)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [49]:
import lightgbm as lgb
def final_fun_2(train_data_y,train_data_ret,train,test,fullVisitorId_new):
    params_lgb2 = {
        "objective" : "regression",
        "metric" : "rmse", 
        "max_leaves": 256,
        "num_leaves" : 9,
        "min_child_samples" : 1,
        "learning_rate" : 0.01,
        "bagging_fraction" : 0.9,
        "feature_fraction" : 0.8,
        "bagging_frequency" : 1      
    }
    params_lgb1 = {
        "objective" : "binary",
        "metric" : "binary_logloss",
        "max_leaves": 256,
        "num_leaves" : 15,
        "min_child_samples" : 1,
        "learning_rate" : 0.01,
        "bagging_fraction" : 0.9,
        "feature_fraction" : 0.8,
        "bagging_frequency" : 1           
    }
    #---------------------------------------------------
    target_cols = ['target', 'ret', 'fullVisitorId',"date"]

    dtrain = lgb.Dataset(train.drop(target_cols, axis=1), label=train['ret'])

    dtrain_ret = lgb.Dataset(train.drop(target_cols, axis=1)[train['ret']==1], 
                         label=train['target'][train['ret']==1])
    pr_lgb_sum = 0
    #----------------------------------------------------
    print('Training and predictions')
    for i in range(10):
        print('Interation number ', i)
        lgb_model1 = lgb.train(params_lgb1, dtrain, num_boost_round=1200)
        pr_lgb = lgb_model1.predict(test.drop(target_cols, axis=1))

        lgb_model2 = lgb.train(params_lgb2, dtrain_ret, num_boost_round=368)
        pr_lgb_ret = lgb_model2.predict(test.drop(target_cols, axis=1))

        pr_lgb_sum = pr_lgb_sum + pr_lgb*pr_lgb_ret

    pr_final_lgb = pr_lgb_sum/10
    #-------------------------------
    new_data=pd.DataFrame()
    new_data["fullVisitorId"]=fullVisitorId_new
    new_data["PredictedLogRevenue"]=pr_final_lgb
    new_data.to_csv(path+"new1_baseline.csv", index=False)
    return new_data
    

In [50]:
final=final_fun_2(train_data_y,train_data_ret,train,test,fullVisitorId_new)

Training and predictions
Interation number  0
Interation number  1
Interation number  2
Interation number  3
Interation number  4
Interation number  5
Interation number  6
Interation number  7
Interation number  8
Interation number  9


Unnamed: 0,fullVisitorId,PredictedLogRevenue
0,1708337,0.000972
1,1708338,0.001002
2,1708339,0.000972
3,1708340,0.001002
4,1708341,0.000972
