In [18]:
import pandas as pd
import numpy as np
import gc
import time
from pandas.core.common import SettingWithCopyWarning
import warnings
from pandas.io.json import json_normalize
import json
import os
# Raise an exception instead of emitting a warning whenever pandas issues
# SettingWithCopyWarning.
warnings.simplefilter('error', SettingWithCopyWarning)
# Make sure automatic garbage collection is switched on.
gc.enable()
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline

In [25]:
# Columns kept by load_df after the JSON columns have been flattened into
# "<jsonColumn>_<subfield>" names. Order matters: it is the column order of
# the frames load_df returns.
features = [
    # top-level session columns
    'channelGrouping',
    'date',
    'fullVisitorId',
    'visitId',
    'visitNumber',
    'visitStartTime',
    # flattened from the "device" JSON column
    'device_browser',
    'device_deviceCategory',
    'device_isMobile',
    'device_operatingSystem',
    # flattened from the "geoNetwork" JSON column
    'geoNetwork_city',
    'geoNetwork_continent',
    'geoNetwork_country',
    'geoNetwork_metro',
    'geoNetwork_networkDomain',
    'geoNetwork_region',
    'geoNetwork_subContinent',
    # flattened from the "totals" JSON column
    'totals_bounces',
    'totals_hits',
    'totals_newVisits',
    'totals_pageviews',
    'totals_transactionRevenue',
    # flattened from the "trafficSource" JSON column
    'trafficSource_adContent',
    'trafficSource_campaign',
    'trafficSource_isTrueDirect',
    'trafficSource_keyword',
    'trafficSource_medium',
    'trafficSource_referralPath',
    'trafficSource_source',
    'trafficSource_adwordsClickInfo.page',
    'trafficSource_adwordsClickInfo.adNetworkType',
    'trafficSource_adwordsClickInfo.isVideoAd',
    'trafficSource_adwordsClickInfo.slot',
]

In [None]:
gc.enable()
def load_df(csv_path='/home/baitong/pywork/RevenuePrediction/all (1)/train_v2.csv',
            columns=None, chunksize=100000):
    """Read a sessions CSV in chunks, flatten its JSON columns, keep selected columns.

    The file is read ``chunksize`` rows at a time. In every chunk the four
    JSON columns (``device``, ``geoNetwork``, ``totals``, ``trafficSource``)
    are decoded with ``json.loads`` and expanded into one column per key,
    named ``"<jsonColumn>_<key>"``. Each chunk is then reduced to the
    requested columns and all chunks are concatenated once at the end.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to load.
    columns : sequence of str, optional
        Columns to keep, in output order. Defaults to the module-level
        ``features`` list.
    chunksize : int, optional
        Number of CSV rows processed per chunk.

    Returns
    -------
    pandas.DataFrame
        One row per CSV row, exactly the requested columns, with a fresh
        0..n-1 index. A requested column that is absent from a chunk (the set
        of JSON keys can differ between chunks) is filled with NaN for that
        chunk instead of raising ``KeyError``.
    """
    # Newer pandas exposes json_normalize at the top level and deprecates the
    # pandas.io.json location; fall back to the old location for old releases.
    try:
        from pandas import json_normalize as flatten
    except ImportError:
        from pandas.io.json import json_normalize as flatten

    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    wanted = list(features if columns is None else columns)

    reader = pd.read_csv(csv_path, sep=',',
                         converters={column: json.loads for column in JSON_COLUMNS},
                         dtype={'fullVisitorId': 'str'},  # Important!! ids must stay strings
                         chunksize=chunksize)

    pieces = []
    total_rows = 0
    for df in reader:
        # Chunk indexes continue from the previous chunk; restart at 0.
        df = df.reset_index(drop=True)
        for column in JSON_COLUMNS:
            column_as_df = flatten(df[column].tolist())
            column_as_df.columns = [f"{column}_{subcolumn}" for subcolumn in column_as_df.columns]
            # Align explicitly so the index-based merge pairs row i with row i.
            column_as_df.index = df.index
            df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)

        print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
        # reindex (rather than df[wanted]) tolerates chunks lacking a sub-column.
        pieces.append(df.reindex(columns=wanted))
        total_rows += len(df)
        del df
        gc.collect()
        # Running shape of the result, same progress line as before.
        print((total_rows, len(wanted)))

    if not pieces:
        return pd.DataFrame(columns=wanted)
    # Single concat at the end: concatenating inside the loop re-copies every
    # previously loaded row on each iteration.
    return pd.concat(pieces, axis=0).reset_index(drop=True)

# Load the training file through load_df's default csv_path, then display the
# (rows, columns) shape of the result.
train = load_df()
train.shape

Loaded train_v2.csv. Shape: (100000, 59)
(100000, 33)
Loaded train_v2.csv. Shape: (100000, 60)
(200000, 33)
Loaded train_v2.csv. Shape: (100000, 59)
(300000, 33)


In [4]:
# Data directory; later cells build CSV file paths by appending to this string.
path = '/home/baitong/pywork/RevenuePrediction/all (1)'

In [6]:
# Load the test file with the same chunked loader used for the training file.
test = load_df("/home/baitong/pywork/RevenuePrediction/all (1)/test_v2.csv")

Loaded test_v2.csv. Shape: (100000, 59)
(100000, 29)
Loaded test_v2.csv. Shape: (100000, 59)
(200000, 29)
Loaded test_v2.csv. Shape: (100000, 59)
(300000, 29)
Loaded test_v2.csv. Shape: (100000, 59)
(400000, 29)
Loaded test_v2.csv. Shape: (1589, 59)
(401589, 29)


In [7]:
# Write both flattened frames next to the raw data (row index not written),
# then preview the first training rows.
for frame, suffix in ((train, "/train.csv"), (test, "/test.csv")):
    frame.to_csv(path + suffix, index=False)
train.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,visitId,visitNumber,visitStartTime,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,...,totals_newVisits,totals_pageviews,totals_transactionRevenue,trafficSource_adContent,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source
0,Organic Search,20171016,3162355547410993243,1508198450,1,1508198450,Firefox,desktop,False,Windows,...,1.0,1,,,(not set),,water bottle,organic,,google
1,Referral,20171016,8934116514970143966,1508176307,6,1508176307,Chrome,desktop,False,Chrome OS,...,,2,,,(not set),,,referral,/a/google.com/transportation/mtv-services/bike...,sites.google.com
2,Direct,20171016,7992466427990357681,1508201613,1,1508201613,Chrome,mobile,True,Android,...,1.0,2,,,(not set),True,,(none),,(direct)
3,Organic Search,20171016,9075655783635761930,1508169851,1,1508169851,Chrome,desktop,False,Windows,...,1.0,2,,,(not set),,(not provided),organic,,google
4,Organic Search,20171016,6960673291025684308,1508190552,1,1508190552,Chrome,desktop,False,Windows,...,1.0,2,,,(not set),,(not provided),organic,,google


In [11]:
# Show the (rows, columns) shape of the test and training frames side by side.
test.shape,train.shape

((401589, 29), (1708337, 29))

In [16]:
from ast import literal_eval
import functools
from multiprocessing import Pool
import logging
import logging
import time
from scipy.stats import stats

def parse(csv_path=None, nrows=None):
    """Load a sessions CSV and flatten every nested column into plain columns.

    ``device``, ``geoNetwork``, ``totals`` and ``trafficSource`` are decoded
    with ``json.loads``. ``hits`` and ``customDimensions`` are parsed with
    ``ast.literal_eval`` and only their first list element is kept. Rows
    without a ``customDimensions`` entry are dropped. Each nested column is
    then replaced by one column per key, named ``"<column>_<key>"``.

    Parameters
    ----------
    csv_path : str, optional
        CSV file to read. Defaults to ``path + '/train_v2.csv'``, resolved
        when the function is called.
    nrows : int, optional
        Maximum number of CSV rows to read; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        Flattened frame with a fresh 0..n-1 index. Rows whose ``hits`` list
        is empty are kept and get NaN in the ``hits_*`` columns.
    """
    # Newer pandas exposes json_normalize at the top level and deprecates the
    # pandas.io.json location; fall back to the old location for old releases.
    try:
        from pandas import json_normalize as flatten
    except ImportError:
        from pandas.io.json import json_normalize as flatten

    if csv_path is None:
        csv_path = path + '/train_v2.csv'

    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in JSON_COLUMNS},
                     dtype={'fullVisitorId': 'str'},  # Important!! ids must stay strings
                     nrows=nrows)

    # Both columns hold Python-literal lists; .str[0] takes the first element
    # and yields NaN for an empty list.
    df['hits'] = df['hits'].apply(literal_eval).str[0]
    df['customDimensions'] = df['customDimensions'].apply(literal_eval).str[0]

    # Dropping rows leaves gaps in the index. Reset it so the index-based
    # merges below pair every row with its own flattened values.
    df = df[pd.notnull(df['customDimensions'])].reset_index(drop=True)

    for column in JSON_COLUMNS + ['hits', 'customDimensions']:
        # A missing entry (e.g. no hits) becomes an empty record -> NaN columns.
        records = [value if isinstance(value, dict) else {} for value in df[column]]
        column_as_df = flatten(records)
        column_as_df.columns = [f"{column}_{subcolumn}" for subcolumn in column_as_df.columns]
        column_as_df.index = df.index
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    return df
    
def load_data(nrows=None):
    """Parse the train and test CSV files and return them as a pair.

    ``nrows`` is forwarded to ``parse`` for both files. The train file is
    parse's default path; the test file is ``path + "/test_v2.csv"``.
    Returns ``(train_frame, test_frame)``.
    """
    frames = (
        parse(nrows=nrows),
        parse(path+"/test_v2.csv", nrows),
    )
    return frames

In [None]:
# NOTE(review): no visible cell assigns `train_df` at top level (load_data only
# uses that name locally) - presumably `train_df, test_df = load_data()` must
# run first; confirm before relying on this cell.
train_df.head()