In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
import gc
import time
from pandas.core.common import SettingWithCopyWarning
import warnings
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from pandas.io.json import json_normalize
import json
import os
import plotly.graph_objs as go
from plotly import tools
import plotly.plotly as py
warnings.simplefilter('error', SettingWithCopyWarning)
gc.enable()
%matplotlib inline

In [2]:
def load_df(csv_path='/home/baitong/pywork/RevenuePrediction/all/train.csv', nrows=None):
    """Read a sessions CSV and flatten its JSON-encoded columns.

    The columns ``device``, ``geoNetwork``, ``totals`` and ``trafficSource``
    are decoded with ``json.loads`` while the file is read. Each of them is
    then replaced by one column per flattened JSON key, named
    ``"<column>.<key>"``.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    nrows : int or None
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The flattened frame. The file name and resulting shape are printed
        as a side effect.
    """
    json_fields = ('device', 'geoNetwork', 'totals', 'trafficSource')
    parsers = dict.fromkeys(json_fields, json.loads)

    # fullVisitorId is forced to text (flagged "Important!!" by the author) --
    # presumably so the identifier is never parsed as a number.
    frame = pd.read_csv(
        csv_path,
        converters=parsers,
        dtype={'fullVisitorId': 'str'},
        nrows=nrows,
    )

    for field in json_fields:
        # One row of flat columns per parsed JSON object, prefixed by its source column.
        expanded = json_normalize(frame[field])
        expanded.columns = [f"{field}.{key}" for key in expanded.columns]
        # Swap the raw JSON column for its expanded columns, aligned on the row index.
        remaining = frame.drop(field, axis=1)
        frame = remaining.merge(expanded, left_index=True, right_index=True)

    file_name = os.path.basename(csv_path)
    print(f"Loaded {file_name}. Shape: {frame.shape}")
    return frame


In [3]:
%%time
train = load_df()
test = load_df("/home/baitong/pywork/RevenuePrediction/all/test.csv")
train.shape, test.shape

Loaded train.csv. Shape: (903653, 55)
Loaded test.csv. Shape: (804684, 53)
CPU times: user 3min 49s, sys: 3.92 s, total: 3min 52s
Wall time: 3min 52s


In [4]:
# Find the constant columns: a column with a single distinct value (NaN counted
# as a value) gives a model nothing to learn from, so it is dropped during
# preprocessing.
const_cols = [c for c in train.columns if train[c].nunique(dropna=False) == 1]
train = train.drop(const_cols, axis=1)
# errors="ignore": a column that is constant in train but absent from test is
# skipped instead of raising KeyError.
test = test.drop(const_cols, axis=1, errors="ignore")
# Drop the column that (per the original note) does not exist in test.
# errors="ignore" keeps the cell re-runnable once the column is already gone.
train = train.drop(["trafficSource.campaignCode"], axis=1, errors="ignore")
# train["trafficSource.campaignCode"].astype("float")
train.shape, test.shape

((903653, 35), (804684, 34))

In [5]:
# Purely numeric features: keep only the columns whose dtype is numeric.
numeric_features_train = train.select_dtypes(include=['number'])
numeric_features_train.columns

Index(['date', 'visitId', 'visitNumber', 'visitStartTime'], dtype='object')

In [6]:
# Categorical (object-dtype) features; these have to be converted to numeric
# features or one-hot encoded before modelling.
# The builtin `object` is used instead of the `np.object` alias, which NumPy
# deprecated and later removed.
categorical_features_train = train.select_dtypes(include=[object])
categorical_features_train.columns

Index(['channelGrouping', 'fullVisitorId', 'sessionId', 'device.browser',
       'device.deviceCategory', 'device.operatingSystem', 'geoNetwork.city',
       'geoNetwork.continent', 'geoNetwork.country', 'geoNetwork.metro',
       'geoNetwork.networkDomain', 'geoNetwork.region',
       'geoNetwork.subContinent', 'totals.bounces', 'totals.hits',
       'totals.newVisits', 'totals.pageviews', 'totals.transactionRevenue',
       'trafficSource.adContent',
       'trafficSource.adwordsClickInfo.adNetworkType',
       'trafficSource.adwordsClickInfo.gclId',
       'trafficSource.adwordsClickInfo.isVideoAd',
       'trafficSource.adwordsClickInfo.page',
       'trafficSource.adwordsClickInfo.slot', 'trafficSource.campaign',
       'trafficSource.isTrueDirect', 'trafficSource.keyword',
       'trafficSource.medium', 'trafficSource.referralPath',
       'trafficSource.source'],
      dtype='object')

In [7]:
## Inspect missing-value information
# train.info(),test.info()

In [11]:
# The revenue column is listed among the object-dtype columns; cast it to
# float so it can be aggregated numerically.
revenue = train['totals.transactionRevenue']
train['totals.transactionRevenue'] = revenue.astype(float)

In [12]:
# train_fill.info()

In [13]:
### Plotting helper used by the analysis cells
def horizontal_bar_chart(cnt_srs, color):
    """Build a horizontal plotly bar trace from a series.

    The index of ``cnt_srs`` supplies the bar labels (y axis) and its values
    the bar lengths (x axis). Both are taken in reverse order -- presumably so
    that the first entry of the series ends up as the top bar.

    Parameters
    ----------
    cnt_srs : object exposing ``.index`` and ``.values`` (a pandas Series at
        every call site visible in this notebook)
    color : colour specification handed to the bar marker

    Returns
    -------
    The ``go.Bar`` trace, with its legend entry disabled.
    """
    labels = cnt_srs.index[::-1]
    lengths = cnt_srs.values[::-1]
    bar_style = {'color': color}
    return go.Bar(
        x=lengths,
        y=labels,
        orientation='h',
        marker=bar_style,
        showlegend=False,
    )

In [17]:
# Device plots: one row of panels per device attribute, one column per metric.
# Each entry is (group-by column, subplot-title prefix, bar colour).
device_panels = [
    ('device.browser', 'Device Browser', 'rgba(50, 171, 96, 0.6)'),
    ('device.deviceCategory', 'Device Category', 'rgba(71, 58, 131, 0.8)'),
    ('device.operatingSystem', 'Device OS', 'rgba(246, 78, 139, 0.6)'),
]
# Each entry is (aggregate column name, subplot-title suffix). The names line
# up with agg(['size', 'count', 'mean']): 'size' counts every row of a group,
# 'count' only the rows whose revenue is not null, 'mean' averages the revenue.
panel_metrics = [
    ("count", "Count"),
    ("count of non-zero revenue", "Non-zero Revenue Count"),
    ("mean", "Mean Revenue"),
]

# Create the 3 x 3 grid of subplots; titles are listed row by row.
fig = tools.make_subplots(
    rows=3, cols=3, vertical_spacing=0.04,
    subplot_titles=[f"{prefix} - {suffix}"
                    for _column, prefix, _color in device_panels
                    for _name, suffix in panel_metrics])

for row, (column, _prefix, color) in enumerate(device_panels, start=1):
    cnt_srs = train.groupby(column)['totals.transactionRevenue'].agg(['size', 'count', 'mean'])
    cnt_srs.columns = [name for name, _suffix in panel_metrics]
    # Rank the categories by number of sessions; every panel shows the top 10.
    cnt_srs = cnt_srs.sort_values(by="count", ascending=False)
    for col, (name, _suffix) in enumerate(panel_metrics, start=1):
        fig.append_trace(horizontal_bar_chart(cnt_srs[name].head(10), color), row, col)

fig['layout'].update(height=1200, width=1200, paper_bgcolor='rgb(233,233,233)', title="Device Plots")
py.iplot(fig, filename='device-plots')


This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]  [ (1,3) x3,y3 ]
[ (2,1) x4,y4 ]  [ (2,2) x5,y5 ]  [ (2,3) x6,y6 ]
[ (3,1) x7,y7 ]  [ (3,2) x8,y8 ]  [ (3,3) x9,y9 ]

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~BaiTong/0 or inside your plot.ly account where it is named 'device-plots'


In [32]:
# Geography plots: one row of panels per geographic attribute, one column per
# metric. Each entry is (group-by column, subplot-title prefix, bar colour,
# number of top categories shown).
geo_panels = [
    ('geoNetwork.continent', 'Continent', 'rgba(58, 71, 80, 0.6)', 10),
    # The second row is grouped by city, so its panels are titled "City"
    # (they were previously labelled as sub-continent / country).
    ('geoNetwork.city', 'City', 'orange', 20),
    ('geoNetwork.networkDomain', 'Network Domain', 'blue', 20),
]
# Each entry is (aggregate column name, subplot-title suffix). The names line
# up with agg(['size', 'count', 'mean']): 'size' counts every row of a group,
# 'count' only the rows whose revenue is not null, 'mean' averages the revenue.
panel_metrics = [
    ("count", "Count"),
    ("count of non-zero revenue", "Non-zero Revenue Count"),
    ("mean", "Mean Revenue"),
]

# Create the 3 x 3 grid of subplots; titles are listed row by row.
fig = tools.make_subplots(
    rows=3, cols=3, vertical_spacing=0.08, horizontal_spacing=0.15,
    subplot_titles=[f"{prefix} - {suffix}"
                    for _column, prefix, _color, _top_n in geo_panels
                    for _name, suffix in panel_metrics])

for row, (column, _prefix, color, top_n) in enumerate(geo_panels, start=1):
    cnt_srs = train.groupby(column)['totals.transactionRevenue'].agg(['size', 'count', 'mean'])
    cnt_srs.columns = [name for name, _suffix in panel_metrics]
    # Rank the categories by number of sessions and keep the top_n of them.
    cnt_srs = cnt_srs.sort_values(by="count", ascending=False)
    for col, (name, _suffix) in enumerate(panel_metrics, start=1):
        fig.append_trace(horizontal_bar_chart(cnt_srs[name].head(top_n), color), row, col)

fig['layout'].update(height=1500, width=1200, paper_bgcolor='rgb(233,233,233)', title="Geography Plots")
py.iplot(fig, filename='geo-plots')

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]  [ (1,3) x3,y3 ]
[ (2,1) x4,y4 ]  [ (2,2) x5,y5 ]  [ (2,3) x6,y6 ]
[ (3,1) x7,y7 ]  [ (3,2) x8,y8 ]  [ (3,3) x9,y9 ]



In [None]:
# train['totals.transactionRevenue'].fillna(0,inplace = True)
# 

In [None]:
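# # Fill missing values in the numeric features. The test set has no totals.transactionRevenue column, so totals.transactionRevenue is not handled here.
# # The strategy used here is to fill with 0; filling with the mean is an alternative.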
# def DealNum_missingdata(df):
#     name_col = ['totals.bounces','totals.hits','totals.newVisits','totals.pageviews','trafficSource.adwordsClickInfo.page']
#     for name in name_col:
#         df[name].fillna(0,inplace= True)
#     return df
# train =DealNum_missingdata(train)
# test =DealNum_missingdata(test)

# # train.info(),test.info()
# # The categorical features are still left unfilled at this point
# print('*')

In [None]:
# def DealCategorical_missingdata(df):
#     name_col = ['trafficSource.adwordsClickInfo.adNetworkType',
#                 'trafficSource.adContent',
#                 'trafficSource.adwordsClickInfo.isVideoAd',
#                 'trafficSource.adwordsClickInfo.slot',
#                 'trafficSource.isTrueDirect',
#                 'trafficSource.keyword',
#                 'trafficSource.referralPath']
#     for name in name_col:
#         df[name].fillna('-',inplace= True)
#     return df
# train = DealCategorical_missingdata(train)
# test = DealCategorical_missingdata(test)
# test_fill =test.drop(['trafficSource.adwordsClickInfo.gclId'],axis=1)
# train_fill = train.drop(['trafficSource.adwordsClickInfo.gclId'],axis=1)
# train_fill['totals.transactionRevenue'] = train_fill['totals.transactionRevenue'].astype("float")

In [33]:
# Survey of the categorical features: levels that occur too rarely can be
# discarded, keeping only the important levels as separate branches.
# The note above each column lists the levels that were observed; "| cut-off |"
# marks where the author truncated the list (presumably the levels after it
# are the ones considered rare -- confirm).
surveyed_columns = [
    ## Organic Search, Social, Direct, Referral, Paid Search, Affiliates, Display, (Other)
    'channelGrouping',
    ## Chrome, Safari, Firefox, Internet Explorer, Edge | cut-off | Android Webview, Opera Mini, Opera, UC Browser, YaBrowser
    ## Coc Coc, Amazon Silk, Android Browser, Mozilla Compatible Agent, MRCHROME, Maxthon, BlackBerry, Nintendo Browser...
    'device.browser',
    ## desktop, mobile, tablet
    'device.deviceCategory',
    ## False, True
    'device.isMobile',
    ## Windows, Macintosh, Android, iOS, Linux, Chrome OS, | cut-off | (not set), Windows Phone, Samsung, BlackBerry....
    'device.operatingSystem',

    ########## For the geographic information it is enough to pick the more important features.
    ########## Continent, country and city overlap heavily in the information they carry,
    ########## so fewer of these features are needed.
    ###### Americas, Africa | cut-off |
    ### Americas, Asia, Europe, Oceania, Africa, (not set)
    'geoNetwork.continent',

    ## not available in demo dataset, (not set), San Francisco-Oakland-San Jose CA,
    ## New York NY, London, Los Angeles CA, Seattle-Tacoma WA, Chicago IL, Austin TX, Washington DC (Hagerstown MD)....
    'geoNetwork.metro',
    ## (not set), unknown.unknown, comcast.net, rr.com, verizon.net, ttnet.com.tr, comcastbusiness.net,
    ## hinet.net, virginm.net, 3bb.co.th....
    'geoNetwork.networkDomain',
    ## Northern America, Southeast Asia, Southern Asia, Western Europe, Northern Europe, Eastern Asia
    ## Eastern Europe, South America, Western Asia, Southern Europe, Central America, Australasia
    ## Northern Africa, Western Africa, Caribbean, Southern Africa, Eastern Africa, Eastern Africa...
    'geoNetwork.subContinent',
]
for surveyed in surveyed_columns:
    # None of these is the cell's last expression, so the counts are computed
    # without being displayed; the findings are recorded in the notes above.
    train[surveyed].value_counts()
print('*')
# train['geoNetwork.region'].value_counts()  ### this attribute has too many distinct levels; the levels need to be split up further
### California, New York, Texas, Washington, Illinois, Ontario | cut-off |

# train['geoNetwork.country'].value_counts() ### this attribute has too many distinct levels; the levels need to be split up further
# United States, India, United Kingdom, Canada, Vietnam, Japan | cut-off |

# train['geoNetwork.city'].value_counts()[0:50]  ### this attribute has too many distinct levels; the levels need to be split up further
### Mountain View, New York, San Francisco, San Jose, Los Angeles, Chicago, Toronto, Seattle | cut-off |

*


not available in demo dataset    508229
Mountain View                     40884
(not set)                         34262
New York                          26371
San Francisco                     20329
Sunnyvale                         13086
London                            12607
San Jose                          10295
Los Angeles                        8670
Bangkok                            7709
Chicago                            7444
Ho Chi Minh City                   7342
Istanbul                           6330
Bengaluru                          5468
Toronto                            5223
Hanoi                              5032
Seattle                            5025
Sydney                             4926
Dublin                             4877
Sao Paulo                          4106
Mumbai                             4099
Chennai                            4090
Paris                              4013
Hyderabad                          3934
Austin                             3790


In [None]:
# Survey of the traffic-source columns of the test set.
# The cell used to read from `test_fill`, which is only created in a
# commented-out cell and therefore raised NameError on a fresh kernel; it now
# reads from `test` directly.
# In the notes below, '-' is the placeholder the commented-out fill step wrote
# for missing values. In `test` those entries are NaN, which value_counts()
# leaves out by default.
# test['trafficSource.adwordsClickInfo.adNetworkType'].value_counts()
### -, Google Search, Search partners
test['trafficSource.adwordsClickInfo.isVideoAd'].value_counts()  ### -, False -- is every value here False????
test['trafficSource.adwordsClickInfo.slot'].value_counts()  ### -, RHS, Top, Google Display Network
test['trafficSource.campaign'].value_counts()
### (not set), 1000557 | GA | US | en | Hybrid | GDN Text+Banner | AS,
### Data Share Promo, 1000557 | GA | US | en | Hybrid | GDN Remarketing
### AW - Dynamic Search Ads Whole Site, Smart Display Campaign,
### AW - Accessories "google + redesign/Accessories March 17" All Users Similar Audiences
test['trafficSource.isTrueDirect'].value_counts()  ### -, True -- is every value here True????
test['trafficSource.keyword'].value_counts()  ##### does not look very useful
###### -, (not provided), (User vertical targeting), (automatic matching)
##### 6qEhsCssdK0z36ri, (Remarketing/Content targeting), 1hZbAqLCbjwfgOH7....
test['trafficSource.medium'].value_counts()
### organic, referral, (none), cpc, affiliate, cpm, (not set)
test['trafficSource.referralPath'].value_counts()  ##### does not look very useful
#### -, /, /analytics/web/, /yt/about/, /yt/advertise/ .....
test['trafficSource.source'].value_counts()  #### well populated; the more important sources could be plotted further
#### google, (direct), youtube.com, mall.googleplex.com,
#### analytics.google.com, Partners, gdeals.googleplex.com
### sites.google.com, google.com, googleads.g.doubleclick.net,
### m.facebook.com, reddit.com, bing, baidu.....
print('*')