In [33]:
import numpy as np 
import pandas as pd 
import json
from pandas.io.json import json_normalize
import functools
from multiprocessing import Pool
import logging
import gc
import matplotlib.pyplot as plt
import time
from scipy.stats import stats
from ast import literal_eval
import warnings
import os
from pandas.core.common import SettingWithCopyWarning
import ast
import pandas.io.json as pdjson
# Escalate pandas' SettingWithCopyWarning to an exception (filter action 'error'),
# so chained-assignment mistakes stop the cell instead of only warning.
warnings.simplefilter('error', SettingWithCopyWarning)
# Make sure automatic garbage collection is switched on.
gc.enable()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [60]:
# Flattened column names collected under `features`; JSON-derived entries follow the
# '<json column>_<key>' naming.
# NOTE(review): presumably the modelling feature set -- confirm where it is consumed.
features = [
    # session-level fields
    'channelGrouping', 'date', 'fullVisitorId', 'visitId',
    'visitNumber', 'visitStartTime',
    # device_*
    'device_browser', 'device_deviceCategory',
    'device_isMobile', 'device_operatingSystem',
    # geoNetwork_*
    'geoNetwork_city', 'geoNetwork_continent', 'geoNetwork_country',
    'geoNetwork_metro', 'geoNetwork_networkDomain', 'geoNetwork_region',
    'geoNetwork_subContinent',
    # totals_*
    'totals_bounces', 'totals_hits', 'totals_newVisits',
    'totals_pageviews', 'totals_transactionRevenue',
    # trafficSource_*
    'trafficSource_adContent', 'trafficSource_campaign',
    'trafficSource_isTrueDirect', 'trafficSource_keyword',
    'trafficSource_medium', 'trafficSource_referralPath',
    'trafficSource_source',
    'trafficSource_adwordsClickInfo.page',
    'trafficSource_adwordsClickInfo.adNetworkType',
    'trafficSource_adwordsClickInfo.isVideoAd',
    'trafficSource_adwordsClickInfo.slot',
]

In [10]:
gc.enable()
def load_df(csv_path='/home/baitong/pywork/RevenuePrediction/all (1)/train_v2.csv',
            chunksize=100000, max_chunks=1):
    """Read a CSV in chunks and flatten its JSON columns.

    The columns 'device', 'geoNetwork', 'totals' and 'trafficSource' are decoded
    with ``json.loads`` while reading; each one is then expanded with
    ``json_normalize`` into columns named ``<column>_<key>`` (nested keys are
    joined with '.', e.g. ``trafficSource_adwordsClickInfo.page``) and the
    original JSON column is dropped.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    chunksize : int, default 100000
        Number of rows read and flattened per chunk.
    max_chunks : int or None, default 1
        Stop after this many chunks. The default of 1 keeps the historical
        behaviour of loading only the first ``chunksize`` rows; pass ``None``
        to load the whole file.

    Returns
    -------
    pandas.DataFrame
        The flattened rows of all loaded chunks with a fresh 0..n-1 index
        (an empty frame if no chunk was read).

    Raises
    ------
    ValueError
        If ``max_chunks`` is given but smaller than 1.
    """
    if max_chunks is not None and max_chunks < 1:
        raise ValueError("max_chunks must be None or >= 1")

    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

    # pandas >= 1.0 exposes json_normalize at the top level and deprecates the
    # pandas.io.json location; older releases only ship the latter.
    try:
        normalize = pd.json_normalize
    except AttributeError:
        from pandas.io.json import json_normalize as normalize

    reader = pd.read_csv(csv_path, sep=',',
                         converters={column: json.loads for column in JSON_COLUMNS},
                         dtype={'fullVisitorId': 'str'},  # Important!! keep ids as text
                         chunksize=chunksize)
    frames = []
    rows_loaded = 0
    seen_columns = set()
    try:
        for df in reader:
            # Chunks keep the file's running row numbers; restart at 0 so the
            # index-based merge below lines up with json_normalize's output.
            df = df.reset_index(drop=True)
            for column in JSON_COLUMNS:
                column_as_df = normalize(df[column].tolist())
                column_as_df.columns = [f"{column}_{subcolumn}" for subcolumn in column_as_df.columns]
                df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)

            print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
            frames.append(df)
            # Report the cumulative shape without concatenating on every pass.
            rows_loaded += len(df)
            seen_columns.update(df.columns)
            print((rows_loaded, len(seen_columns)))
            if max_chunks is not None and len(frames) >= max_chunks:
                break
    finally:
        # Stopping early would otherwise leave the underlying file handle open.
        reader.close()

    if not frames:
        return pd.DataFrame()
    # One concat at the end instead of re-copying the accumulated frame per chunk.
    return pd.concat(frames, axis=0, sort=False).reset_index(drop=True)


In [38]:
# Load the training data using load_df's default arguments (default CSV path).
train = load_df()
# Last expression of the cell: displays the (rows, columns) of the loaded frame.
train.shape

Loaded train_v2.csv. Shape: (100000, 59)
(100000, 59)


(100000, 59)

In [39]:
# Collect the constant columns (a single distinct value, NaN counted as a value).
# A model cannot learn anything from constant data, so these are to be dropped
# during preprocessing.
const_cols = list(filter(lambda name: train[name].nunique(dropna=False) == 1, train.columns))
print(const_cols)

['socialEngagementType', 'device_browserSize', 'device_browserVersion', 'device_flashVersion', 'device_language', 'device_mobileDeviceBranding', 'device_mobileDeviceInfo', 'device_mobileDeviceMarketingName', 'device_mobileDeviceModel', 'device_mobileInputSelector', 'device_operatingSystemVersion', 'device_screenColors', 'device_screenResolution', 'geoNetwork_cityId', 'geoNetwork_latitude', 'geoNetwork_longitude', 'geoNetwork_networkLocation', 'totals_visits', 'trafficSource_adwordsClickInfo.criteriaParameters']


In [40]:
# Remove the constant columns gathered in `const_cols`.
train = train.drop(columns=const_cols)

In [41]:
def parseData(df):
    """Flatten the first entry of the 'hits' and 'customDimensions' columns.

    Each cell of those two columns is expected to hold the text form of a
    Python list of dicts. The text is parsed with ``ast.literal_eval``, the
    first dict of the list is expanded with ``json_normalize`` into columns
    named ``<column>_<key>``, and the original column is dropped. Rows whose
    list is empty (or whose cell is missing) contribute the placeholder
    ``{'index': nan, 'value': nan}``, so every input row is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'hits' and 'customDimensions' columns. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows, row order and index as ``df``, with
        the two columns replaced by their flattened counterparts.
    """
    JSON_COLUMNS = ['hits', 'customDimensions']

    # pandas >= 1.0 exposes json_normalize at the top level and deprecates the
    # pandas.io.json location; fall back for older releases.
    normalize = getattr(pd, 'json_normalize', None) or pdjson.json_normalize

    def _first_record(raw):
        """Return the first dict of a stringified list, or the NaN placeholder."""
        parsed = ast.literal_eval(raw) if isinstance(raw, str) else raw
        if isinstance(parsed, (list, tuple)) and len(parsed) > 0 and isinstance(parsed[0], dict):
            return parsed[0]
        # np.nan rather than np.NaN: the capitalised alias is gone in NumPy 2.0.
        return {'index': np.nan, 'value': np.nan}

    # Work on a re-indexed copy: the caller's frame is never written to, and the
    # index-based merge below cannot drop or duplicate rows when `df` carries a
    # filtered, shuffled or non-unique index.
    original_index = df.index
    flat = df.reset_index(drop=True)

    for column in JSON_COLUMNS:
        records = [_first_record(raw) for raw in flat[column]]
        column_as_df = normalize(records)
        column_as_df.columns = [f"{column}_{subcolumn}" for subcolumn in column_as_df.columns]
        flat = flat.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
        print("'parse' function to flatten JSON columns :",column)

    # Hand the rows back under the labels the caller passed in.
    flat.index = original_index
    return flat

In [42]:
# Replace the 'hits' and 'customDimensions' columns with their flattened
# 'hits_*' / 'customDimensions_*' columns.
train = parseData(train)

'parse' function to flatten JSON columns : hits
'parse' function to flatten JSON columns : customDimensions


In [63]:
def _first_non_null(values):
    """Return the first non-missing entry of a Series, or None if there is none."""
    present = values.dropna()
    return present.iloc[0] if len(present) else None


# Names of the columns that hold Python lists. Each column is judged by its first
# non-missing value rather than by row 1 alone, so a column whose second row is NaN
# is still detected and frames with fewer than two rows do not raise IndexError.
# Columns are addressed by position, so duplicated column names are handled too.
jsonlist = [
    name
    for position, name in enumerate(train.columns)
    if isinstance(_first_non_null(train.iloc[:, position]), list)
]
print(jsonlist)

['hits_customDimensions', 'hits_customMetrics', 'hits_customVariables', 'hits_experiment', 'hits_product', 'hits_promotion', 'hits_publisher_infos']


In [91]:
# Report how many distinct values (NaN included) every column holds.
print("Printout for each column's number of unique values (incl. nans)\n")
for col in train.columns:
    try:
        print(col, ':', train[col].nunique(dropna=False))
        continue
    except TypeError:
        # nunique raised TypeError for this column; fall through and count the
        # string form of its values instead (flagged as LIST in the output).
        pass
    a = train[col].astype('str')
    print( col, ':', a.nunique(dropna=False), ' >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> LIST')
# Clean workspace
del col

Printout for each column's number of unique values (incl. nans)

channelGrouping : 8
date : 38
fullVisitorId : 90097
visitId : 98072
visitNumber : 210
visitStartTime : 98062
device_browser : 35
device_deviceCategory : 3
device_isMobile : 2
device_operatingSystem : 18
geoNetwork_city : 318
geoNetwork_continent : 6
geoNetwork_country : 188
geoNetwork_metro : 54
geoNetwork_networkDomain : 7337
geoNetwork_region : 230
geoNetwork_subContinent : 23
totals_bounces : 2
totals_hits : 151
totals_newVisits : 2
totals_pageviews : 116
totals_sessionQualityDim : 97
totals_timeOnSite : 2411
totals_totalTransactionRevenue : 809
totals_transactionRevenue : 688
totals_transactions : 6
trafficSource_adContent : 55
trafficSource_adwordsClickInfo.adNetworkType : 3
trafficSource_adwordsClickInfo.isVideoAd : 2
trafficSource_adwordsClickInfo.page : 7
trafficSource_adwordsClickInfo.slot : 4
trafficSource_campaign : 27
trafficSource_isTrueDirect : 2
trafficSource_keyword : 349
trafficSource_medium : 7
trafficSo

In [65]:
# Delete, in place, every column that holds a single distinct value (NaN included).
print('Data shape before dropping constant columns:', train.shape)

print('\nColumns being dropped:')

for col in train.columns:
    try:
        if train[col].nunique(dropna=False) != 1:
            continue
    except TypeError:
        # nunique raised TypeError for this column; judge it by the string form
        # of its values instead.
        a = train[col].astype('str')
        if a.nunique(dropna=False) != 1:
            continue
    # Only constant columns reach this point.
    del train[col]
    print(col)
del col

print('\ndata shape is now:', train.shape)

Data shape before dropping constant columns: (100000, 110)

Columns being dropped:
hits_index
hits_value

data shape is now: (100000, 108)


In [67]:
# Drop the 'hits_product' and 'hits_promotion' columns in a single call.
train = train.drop(labels=['hits_product', 'hits_promotion'], axis=1)

In [72]:
# Summarise the frame: index range, column count, dtypes and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Columns: 106 entries, channelGrouping to customDimensions_value
dtypes: bool(1), int64(4), object(101)
memory usage: 80.2+ MB


In [96]:
# List the names of the columns currently in `train`.
# The previous header announced unique-value counts although only names were printed,
# and the try/except TypeError around print(col) could never trigger.
print("Columns currently in train:\n")
for col in train.columns:
    print(col)
# Clean workspace
del col

Printout for each column's number of unique values (incl. nans)

channelGrouping
date
fullVisitorId
visitId
visitNumber
visitStartTime
device_browser
device_deviceCategory
device_isMobile
device_operatingSystem
geoNetwork_city
geoNetwork_continent
geoNetwork_country
geoNetwork_metro
geoNetwork_networkDomain
geoNetwork_region
geoNetwork_subContinent
totals_bounces
totals_hits
totals_newVisits
totals_pageviews
totals_sessionQualityDim
totals_timeOnSite
totals_totalTransactionRevenue
totals_transactionRevenue
totals_transactions
trafficSource_adContent
trafficSource_adwordsClickInfo.adNetworkType
trafficSource_adwordsClickInfo.isVideoAd
trafficSource_adwordsClickInfo.page
trafficSource_adwordsClickInfo.slot
trafficSource_campaign
trafficSource_isTrueDirect
trafficSource_keyword
trafficSource_medium
trafficSource_referralPath
trafficSource_source
hits_appInfo.exitScreenName
hits_appInfo.landingScreenName
hits_appInfo.screenDepth
hits_appInfo.screenName
hits_dataSource
hits_eCommerceAction.

In [97]:
# Columns to discard before saving. Earlier revisions of this cell assigned these
# groups one at a time (the older assignments were left commented out), which only
# works when the cell and the drop that follows are re-run by hand after each edit.
# They are merged into one list so a single top-to-bottom run removes all of them.
useless_feature = [
    'trafficSource_adwordsClickInfo.gclId',
    # previous content groups and custom hit fields
    'hits_contentGroup.previousContentGroup1', 'hits_contentGroup.previousContentGroup2',
    'hits_contentGroup.previousContentGroup3', 'hits_contentGroup.previousContentGroup4',
    'hits_contentGroup.previousContentGroup5', 'hits_customDimensions',
    'hits_customMetrics', 'hits_customVariables',
    # latency tracking
    'hits_latencyTracking.domContentLoadedTime',
    'hits_latencyTracking.domInteractiveTime',
    'hits_latencyTracking.domLatencyMetricsSample',
    'hits_latencyTracking.domainLookupTime',
    'hits_latencyTracking.pageDownloadTime',
    'hits_latencyTracking.pageLoadSample',
    'hits_latencyTracking.pageLoadTime',
    'hits_latencyTracking.redirectionTime',
    'hits_latencyTracking.serverConnectionTime',
    'hits_latencyTracking.serverResponseTime',
    'hits_latencyTracking.speedMetricsSample',
    # publisher info and content groups
    'hits_publisher_infos', 'hits_contentGroup.contentGroup1',
    'hits_contentGroup.contentGroup2',
    'hits_contentGroup.contentGroup3',
    'hits_contentGroup.contentGroup4',
    'hits_contentGroup.contentGroup5',
    'hits_contentGroup.contentGroupUniqueViews1',
    'hits_contentGroup.contentGroupUniqueViews2',
    'hits_contentGroup.contentGroupUniqueViews3',
    # event info and experiments
    'hits_eventInfo.eventAction', 'hits_eventInfo.eventCategory', 'hits_eventInfo.eventLabel',
    'hits_experiment',
]

In [98]:
# Remove every column named in `useless_feature`.
train = train.drop(columns=useless_feature)

In [99]:
# Write the current frame to CSV; index=False leaves the row index out of the file.
# NOTE(review): the destination is an absolute, machine-specific path.
train.to_csv("/home/baitong/pywork/RevenuePrediction/train001.csv",index = False)

In [100]:
# Preview the first rows of the frame.
train.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,visitId,visitNumber,visitStartTime,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,...,hits_promotionActionInfo.promoIsView,hits_referer,hits_social.hasSocialSourceReferral,hits_social.socialInteractionNetworkAction,hits_social.socialNetwork,hits_time,hits_transaction.currencyCode,hits_type,customDimensions_index,customDimensions_value
0,Organic Search,20171016,3162355547410993243,1508198450,1,1508198450,Firefox,desktop,False,Windows,...,,https://www.google.co.uk/search?q=water+bottle...,No,:,(not set),0,USD,PAGE,4,EMEA
1,Referral,20171016,8934116514970143966,1508176307,6,1508176307,Chrome,desktop,False,Chrome OS,...,True,https://sites.google.com/a/google.com/transpor...,No,:,(not set),0,,PAGE,4,North America
2,Direct,20171016,7992466427990357681,1508201613,1,1508201613,Chrome,mobile,True,Android,...,True,https://www.googlemerchandisestore.com/,No,:,(not set),0,,PAGE,4,North America
3,Organic Search,20171016,9075655783635761930,1508169851,1,1508169851,Chrome,desktop,False,Windows,...,True,https://www.google.com.tr/,No,:,(not set),0,,PAGE,4,EMEA
4,Organic Search,20171016,6960673291025684308,1508190552,1,1508190552,Chrome,desktop,False,Windows,...,True,https://www.google.com.mx/,No,:,(not set),0,,PAGE,4,Central America


In [101]:
# Per-column non-null counts and dtypes of the frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 73 columns):
channelGrouping                                 100000 non-null object
date                                            100000 non-null int64
fullVisitorId                                   100000 non-null object
visitId                                         100000 non-null int64
visitNumber                                     100000 non-null int64
visitStartTime                                  100000 non-null int64
device_browser                                  100000 non-null object
device_deviceCategory                           100000 non-null object
device_isMobile                                 100000 non-null bool
device_operatingSystem                          100000 non-null object
geoNetwork_city                                 100000 non-null object
geoNetwork_continent                            100000 non-null object
geoNetwork_country                          