In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
import gc
import time
from pandas.core.common import SettingWithCopyWarning
import warnings
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from pandas.io.json import json_normalize
import json
import os
import plotly.graph_objs as go
from plotly import tools
import plotly.plotly as py
warnings.simplefilter('error', SettingWithCopyWarning)
gc.enable()
%matplotlib inline

In [2]:
def load_df(csv_path='/home/baitong/pywork/RevenuePrediction/all/train.csv', nrows=None):
    """Read a sessions CSV and flatten its JSON-encoded columns.

    The columns ``device``, ``geoNetwork``, ``totals`` and ``trafficSource``
    each hold a JSON object per row. Every object is expanded into one column
    per (possibly nested) key, named ``"<column>.<key>"``, and the original
    JSON column is dropped.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read. The default is the author's local
        training file; pass an explicit path on any other machine.
    nrows : int or None
        Forwarded to ``pandas.read_csv``; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The flattened frame. Its shape is also printed.
    """
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in JSON_COLUMNS},
                     # Read the id as text so it is not parsed as a number.
                     dtype={'fullVisitorId': 'str'},
                     nrows=nrows)

    # ``pandas.io.json.json_normalize`` is deprecated since pandas 1.0 in
    # favour of the top-level ``pd.json_normalize``; fall back to the old
    # location only when the new one is missing.
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        from pandas.io.json import json_normalize as normalize

    for column in JSON_COLUMNS:
        # A plain list of dicts is accepted by every pandas version and gives
        # a 0..n-1 index, matching the default index produced by read_csv.
        column_as_df = normalize(df[column].tolist())
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df


In [3]:
path = '/home/baitong/pywork/RevenuePrediction/'

In [4]:
%%time
train = load_df()
test = load_df("/home/baitong/pywork/RevenuePrediction/all/test.csv")
train.shape, test.shape

Loaded train.csv. Shape: (903653, 55)
Loaded test.csv. Shape: (804684, 53)
CPU times: user 3min 50s, sys: 4.24 s, total: 3min 54s
Wall time: 4min 3s


In [5]:
# Find the constant columns: a model cannot learn anything from a column that
# holds a single value (NaN counts as a value here), so they are dropped
# during preprocessing.
# NOTE(review): constancy is checked on train only and the same list is then
# dropped from test — confirm these columns are constant in test as well.
const_cols = [c for c in train.columns if train[c].nunique(dropna=False)==1 ]
train = train.drop(const_cols , axis=1)
test = test.drop(const_cols, axis=1)
# Drop the column that does not exist in test
train = train.drop(["trafficSource.campaignCode"], axis=1)
# train["trafficSource.campaignCode"].astype("float")
train.shape, test.shape

((903653, 35), (804684, 34))

In [6]:
# Purely numeric features (columns with a numeric dtype)
numeric_features_train = train.select_dtypes(include=[np.number])
numeric_features_train.columns

Index(['date', 'visitId', 'visitNumber', 'visitStartTime'], dtype='object')

In [7]:
# Categorical (object-dtype) features; these must be converted to numeric
# features or one-hot encoded before modelling.
# The builtin `object` is used because the `np.object` alias is deprecated
# (NumPy 1.20) and removed in NumPy 1.24; both select the same columns.
categorical_features_train = train.select_dtypes(include=[object])
categorical_features_train.columns

Index(['channelGrouping', 'fullVisitorId', 'sessionId', 'device.browser',
       'device.deviceCategory', 'device.operatingSystem', 'geoNetwork.city',
       'geoNetwork.continent', 'geoNetwork.country', 'geoNetwork.metro',
       'geoNetwork.networkDomain', 'geoNetwork.region',
       'geoNetwork.subContinent', 'totals.bounces', 'totals.hits',
       'totals.newVisits', 'totals.pageviews', 'totals.transactionRevenue',
       'trafficSource.adContent',
       'trafficSource.adwordsClickInfo.adNetworkType',
       'trafficSource.adwordsClickInfo.gclId',
       'trafficSource.adwordsClickInfo.isVideoAd',
       'trafficSource.adwordsClickInfo.page',
       'trafficSource.adwordsClickInfo.slot', 'trafficSource.campaign',
       'trafficSource.isTrueDirect', 'trafficSource.keyword',
       'trafficSource.medium', 'trafficSource.referralPath',
       'trafficSource.source'],
      dtype='object')

In [8]:
## Inspect missing-value information
# train.info(),test.info()

In [9]:
# The revenue column is listed among the object-dtype columns above; cast it
# to float so it can be aggregated numerically (missing values stay NaN).
train['totals.transactionRevenue'] = train['totals.transactionRevenue'].astype("float")

In [10]:
# train_fill.info()

In [11]:
# Plotting helper
def horizontal_bar_chart(cnt_srs, color):
    """Return a horizontal plotly bar trace for a Series.

    The Series index supplies the bar labels (y axis) and its values the bar
    lengths (x axis). Both are reversed, so the first entry of ``cnt_srs`` is
    passed last to plotly.

    Parameters
    ----------
    cnt_srs : pandas.Series
        Values to plot, indexed by category label.
    color : str
        Marker colour passed through to plotly unchanged.

    Returns
    -------
    plotly.graph_objs.Bar
        The configured trace, with the legend disabled.
    """
    bar_options = {
        'y': cnt_srs.index[::-1],
        'x': cnt_srs.values[::-1],
        'showlegend': False,
        'orientation': 'h',
        'marker': {'color': color},
    }
    return go.Bar(**bar_options)

In [12]:
# Top-10 categories of three features. Each grid row shows one feature as:
# number of visits, number of visits with a recorded revenue, mean revenue.
# One entry per grid row: (feature column, label used in the titles, bar colour).
device_panels = [
    ('device.browser', 'Device Browser', 'rgba(50, 171, 96, 0.6)'),
    ('geoNetwork.city', 'City', 'rgba(71, 58, 131, 0.8)'),
    ('device.operatingSystem', 'Device OS', 'rgba(246, 78, 139, 0.6)'),
]
metric_names = ["count", "count of non-zero revenue", "mean"]
title_suffixes = ["Count", "Non-zero Revenue Count", "Mean Revenue"]

# Creating the 3x3 subplot grid; titles are generated from the panel labels so
# they cannot drift away from the feature that is actually plotted.
fig = tools.make_subplots(rows=3, cols=3, vertical_spacing=0.04,
                          subplot_titles=[f"{panel_label} - {suffix}"
                                          for _feature, panel_label, _color in device_panels
                                          for suffix in title_suffixes])

for grid_row, (feature_col, panel_label, bar_color) in enumerate(device_panels, start=1):
    # 'size' counts every visit, 'count' only visits whose revenue is not NaN.
    cnt_srs = train.groupby(feature_col)['totals.transactionRevenue'].agg(['size', 'count', 'mean'])
    cnt_srs.columns = metric_names
    cnt_srs = cnt_srs.sort_values(by="count", ascending=False)
    for grid_col, metric_name in enumerate(metric_names, start=1):
        fig.append_trace(horizontal_bar_chart(cnt_srs[metric_name].head(10), bar_color),
                         grid_row, grid_col)

fig['layout'].update(height=1200, width=1200, paper_bgcolor='rgb(233,233,233)', title="Device Plots")
py.iplot(fig, filename='device-plots')
print('*')

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]  [ (1,3) x3,y3 ]
[ (2,1) x4,y4 ]  [ (2,2) x5,y5 ]  [ (2,3) x6,y6 ]
[ (3,1) x7,y7 ]  [ (3,2) x8,y8 ]  [ (3,3) x9,y9 ]

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~BaiTong/0 or inside your plot.ly account where it is named 'device-plots'
*


In [13]:
# Top-20 categories of three features. Each grid row shows one feature as:
# number of visits, number of visits with a recorded revenue, mean revenue.
# One entry per grid row: (feature column, label used in the titles, bar colour).
geo_panels = [
    ('geoNetwork.subContinent', 'Sub-continent', 'rgba(58, 71, 80, 0.6)'),
    ('geoNetwork.metro', 'Metro', 'orange'),
    ('trafficSource.adContent', 'Ad Content', 'blue'),
]
metric_names = ["count", "count of non-zero revenue", "mean"]
title_suffixes = ["Count", "Non-zero Revenue Count", "Mean Revenue"]

# Creating the 3x3 subplot grid; titles are generated from the panel labels so
# they always name the feature that is actually plotted.
fig = tools.make_subplots(rows=3, cols=3, vertical_spacing=0.08, horizontal_spacing=0.15,
                          subplot_titles=[f"{panel_label} - {suffix}"
                                          for _feature, panel_label, _color in geo_panels
                                          for suffix in title_suffixes])

for grid_row, (feature_col, panel_label, bar_color) in enumerate(geo_panels, start=1):
    # 'size' counts every visit, 'count' only visits whose revenue is not NaN.
    cnt_srs = train.groupby(feature_col)['totals.transactionRevenue'].agg(['size', 'count', 'mean'])
    cnt_srs.columns = metric_names
    cnt_srs = cnt_srs.sort_values(by="count", ascending=False)
    for grid_col, metric_name in enumerate(metric_names, start=1):
        fig.append_trace(horizontal_bar_chart(cnt_srs[metric_name].head(20), bar_color),
                         grid_row, grid_col)

fig['layout'].update(height=1500, width=1200, paper_bgcolor='rgb(233,233,233)', title="Geography Plots")
py.iplot(fig, filename='geo-plots')
# print('*')

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]  [ (1,3) x3,y3 ]
[ (2,1) x4,y4 ]  [ (2,2) x5,y5 ]  [ (2,3) x6,y6 ]
[ (3,1) x7,y7 ]  [ (3,2) x8,y8 ]  [ (3,3) x9,y9 ]



In [14]:

# train['trafficSource.adContent'].value_counts()
# 

In [15]:
# Analyse the categorical features: categories that are too rare can be
# discarded, keeping only the important ones as separate levels.
# In the notes below "<cutoff>" marks where the author truncated each list.
# NOTE(review): only the last expression of a cell is displayed, so when this
# cell is run as a whole the value_counts() results are computed and discarded.
train['channelGrouping'].value_counts()
## Organic Search, Social, Direct, Referral, Paid Search, Affiliates, Display, (Other)
train['device.browser'].value_counts()
## Chrome, Safari, Firefox, Internet Explorer, Edge <cutoff> Android Webview, Opera Mini, Opera, UC Browser, YaBrowser
## Coc Coc, Amazon Silk, Android Browser, Mozilla Compatible Agent, MRCHROME, Maxthon, BlackBerry, Nintendo Browser...
train['device.deviceCategory'].value_counts()## desktop, mobile, tablet
train['device.isMobile'].value_counts()## False, True
train['device.operatingSystem'].value_counts()
## Windows, Macintosh, Android, iOS, Linux, Chrome OS, <cutoff> (not set), Windows Phone, Samsung, BlackBerry....

# For the geographic information only the more important features need to be
# selected. Continent, country and city carry heavily overlapping information,
# so fewer of these features can be used.
train['geoNetwork.continent'].value_counts()# Americas, Africa <cutoff>
### Americas, Asia, Europe, Oceania, Africa, (not set)

train['geoNetwork.metro'].value_counts()
## San Francisco-Oakland-San Jose CA, New York NY, Roanoke-Lynchburg VA, <cutoff> not available in demo dataset, (not set),
## London, Los Angeles CA, Seattle-Tacoma WA, Chicago IL, Austin TX, Washington DC (Hagerstown MD)....
train['geoNetwork.networkDomain'].value_counts()
## comcastbusiness.net, <cutoff> (not set), unknown.unknown, comcast.net, rr.com, verizon.net, ttnet.com.tr,
## hinet.net, virginm.net, 3bb.co.th....
train['geoNetwork.subContinent'].value_counts()
## Northern America, <cutoff> Southeast Asia, Southern Asia, Western Europe, Northern Europe, Eastern Asia
## Eastern Europe, South America, Western Asia, Southern Europe, Central America, Australasia
## Northern Africa, Western Africa, Caribbean, Southern Africa, Eastern Africa, Eastern Africa...

train['geoNetwork.region'].value_counts()  ### This attribute has too many distinct values; its categories need to be split further
### California, New York, Texas, Washington, Illinois, Ontario <cutoff>

train['geoNetwork.country'].value_counts() ### This attribute has too many distinct values; its categories need to be split further
# United States, India, United Kingdom, Canada, Vietnam, Japan <cutoff>

train['geoNetwork.city'].value_counts()[0:50]  ### This attribute has too many distinct values; its categories need to be split further
### Mountain View, New York, San Francisco, San Jose, Los Angeles, Chicago, Toronto, Seattle <cutoff>
print('*')

*


In [16]:
# test_fill['trafficSource.adwordsClickInfo.adNetworkType'].value_counts()
### -, Google Search, Search partners
train['trafficSource.adwordsClickInfo.isVideoAd'].value_counts()### -, False — everything is False here????
# test_fill[].value_counts()
train['trafficSource.adwordsClickInfo.slot'].value_counts()### -, RHS, Top, Google Display Network
train['trafficSource.campaign'].value_counts()
### (not set), 1000557 | GA | US | en | Hybrid | GDN Text+Banner | AS,
### Data Share Promo, 1000557 | GA | US | en | Hybrid | GDN Remarketing
### AW - Dynamic Search Ads Whole Site, Smart Display Campaign,
### AW - Accessories "google + redesign/Accessories March 17" All Users Similar Audiences
train['trafficSource.isTrueDirect'].value_counts() ### -, True — everything is True here????
train['trafficSource.keyword'].value_counts()
###### Google Merchandise, +Google +Merchandise, <cutoff> could be handled with regular expressions
train['trafficSource.medium'].value_counts()
### organic, referral, cpc, cpm, <cutoff>
train['trafficSource.referralPath'].value_counts()##### Does not look very useful
#### /, /yt/about/, /mail/u/0/, <cutoff>
train['trafficSource.source'].value_counts()#### Well populated; the more important sources could be plotted further
#### google, (direct), youtube.com, mall.googleplex.com, dfa <cutoff>
print('*')

*


In [17]:
# Two traffic-source features. Each grid row shows one feature as:
# number of visits, number of visits with a recorded revenue, mean revenue.
# One entry per grid row: (feature column, label used in the titles,
# bar colour, number of top categories to draw or None for all of them).
traffic_panels = [
    ('trafficSource.keyword', 'Keyword', 'green', 30),
    ('trafficSource.campaign', 'Campaign', 'purple', None),
]
metric_names = ["count", "count of non-zero revenue", "mean"]
title_suffixes = ["Count", "Non-zero Revenue Count", "Mean Revenue"]

# Creating the 2x3 subplot grid; titles are generated from the panel labels so
# they always name the feature that is actually plotted.
fig = tools.make_subplots(rows=2, cols=3, vertical_spacing=0.08, horizontal_spacing=0.15,
                          subplot_titles=[f"{panel_label} - {suffix}"
                                          for _feature, panel_label, _color, _top in traffic_panels
                                          for suffix in title_suffixes])

for grid_row, (feature_col, panel_label, bar_color, top_n) in enumerate(traffic_panels, start=1):
    # 'size' counts every visit, 'count' only visits whose revenue is not NaN.
    cnt_srs = train.groupby(feature_col)['totals.transactionRevenue'].agg(['size', 'count', 'mean'])
    cnt_srs.columns = metric_names
    cnt_srs = cnt_srs.sort_values(by="count", ascending=False)
    for grid_col, metric_name in enumerate(metric_names, start=1):
        # iloc[:None] keeps every row; iloc[:30] is the same as head(30).
        fig.append_trace(horizontal_bar_chart(cnt_srs[metric_name].iloc[:top_n], bar_color),
                         grid_row, grid_col)

fig['layout'].update(height=1000, width=1200, paper_bgcolor='rgb(233,233,233)', title="Traffic Source Plots")
py.iplot(fig, filename='traffic-source-plots')
# print('*')

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]  [ (1,3) x3,y3 ]
[ (2,1) x4,y4 ]  [ (2,2) x5,y5 ]  [ (2,3) x6,y6 ]



In [18]:
# train['trafficSource.campaign'].value_counts()

In [19]:
# Fill the missing values of the numeric features. The test set has no
# totals.transactionRevenue column, so that column is not handled here.
# The strategy used is to fill with 0; filling with the mean is an alternative.
def DealNum_missingdata(df):
    """Replace missing values with 0 in the numeric-like session columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the five columns listed in ``name_col``.
        Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    name_col = ['totals.bounces','totals.hits','totals.newVisits','totals.pageviews','trafficSource.adwordsClickInfo.page']
    for name in name_col:
        # Assign the filled column back instead of calling
        # ``df[name].fillna(..., inplace=True)``: that form mutates a
        # temporary selection and is not guaranteed to reach ``df``.
        df[name] = df[name].fillna(0)
    return df
# Missing revenue becomes 0. The column is assigned back explicitly because an
# in-place fillna on a column selection is not guaranteed to update `train`.
train['totals.transactionRevenue'] = train['totals.transactionRevenue'].fillna(0)
train =DealNum_missingdata(train)
test =DealNum_missingdata(test)


# The categorical features have still not been filled at this point
# print('*')

In [20]:
def DealCategorical_missingdata(df):
    """Replace missing values with ``'-'`` in the categorical traffic columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the seven columns listed in ``name_col``.
        Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    name_col = ['trafficSource.adwordsClickInfo.adNetworkType',
                'trafficSource.adContent',
                'trafficSource.adwordsClickInfo.isVideoAd',
                'trafficSource.adwordsClickInfo.slot',
                'trafficSource.isTrueDirect',
                'trafficSource.keyword',
                'trafficSource.referralPath']
    for name in name_col:
        # Assign the filled column back instead of calling
        # ``df[name].fillna(..., inplace=True)``: that form mutates a
        # temporary selection and is not guaranteed to reach ``df``.
        df[name] = df[name].fillna('-')
    return df
# Fill the categorical gaps in both sets, then derive the *_fill working
# frames, which no longer carry the adwords gclId column.
train = DealCategorical_missingdata(train)
test = DealCategorical_missingdata(test)
test_fill =test.drop(['trafficSource.adwordsClickInfo.gclId'],axis=1)
train_fill = train.drop(['trafficSource.adwordsClickInfo.gclId'],axis=1)
# NOTE(review): the revenue column was already cast to float in an earlier
# cell, so this cast looks redundant — confirm before removing.
train_fill['totals.transactionRevenue'] = train_fill['totals.transactionRevenue'].astype("float")

In [21]:
# train_fill.info(),test_fill.info()

In [22]:
def chanGroup_data(df):
    """Bucket ``channelGrouping``: keep the listed channels, map the rest to ``'others'``.

    Missing values also become ``'others'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``channelGrouping`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    chanGroup_list = ['Organic Search', 'Social', 'Direct', 'Referral',
                      'Paid Search', 'Affiliates', 'Display']
    # Single-column membership test instead of a row-wise df.apply(axis=1),
    # which builds a Series for every row just to read one cell.
    channel = df['channelGrouping']
    df['channelGrouping'] = channel.where(channel.isin(chanGroup_list), 'others')
    return df
train_fill = chanGroup_data(train_fill)
train_fill['channelGrouping'].value_counts()

Organic Search    381561
Social            226117
Direct            143026
Referral          104838
Paid Search        25326
Affiliates         16403
Display             6262
others               120
Name: channelGrouping, dtype: int64

In [23]:
def browser_data(df):
    """Bucket ``device.browser``: keep the listed browsers, map the rest to ``'others'``.

    Missing values also become ``'others'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``device.browser`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    browser_list = ['Chrome', 'Safari', 'Firefox', 'Internet Explorer', 'Edge']
    # Single-column membership test instead of a row-wise df.apply(axis=1).
    browser = df['device.browser']
    df['device.browser'] = browser.where(browser.isin(browser_list), 'others')
    return df
train_fill = browser_data(train_fill)
train_fill['device.browser'].value_counts()

Chrome               620364
Safari               182245
Firefox               37069
others                34395
Internet Explorer     19375
Edge                  10205
Name: device.browser, dtype: int64

In [None]:
def deviceCategory_data(df):
    """Bucket ``device.deviceCategory``: keep desktop/mobile/tablet, map the rest to ``'others'``.

    Missing values also become ``'others'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``device.deviceCategory`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    deviceCategory_list = ['desktop', 'mobile', 'tablet']
    # Single-column membership test instead of a row-wise df.apply(axis=1).
    category = df['device.deviceCategory']
    df['device.deviceCategory'] = category.where(category.isin(deviceCategory_list), 'others')
    return df
train_fill = deviceCategory_data(train_fill)
train_fill['device.deviceCategory'].value_counts()

desktop    664479
mobile     208725
tablet      30449
Name: device.deviceCategory, dtype: int64

In [None]:
def isMobile_data(df):
    """Bucket ``device.isMobile``: keep the strings ``'False'``/``'True'``, map the rest to ``'others'``.

    The comparison is against strings, so the column must be cast to ``str``
    beforehand; real booleans do not match and become ``'others'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``device.isMobile`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    isMobile_list = ['False', 'True']
    # Single-column membership test instead of a row-wise df.apply(axis=1).
    is_mobile = df['device.isMobile']
    df['device.isMobile'] = is_mobile.where(is_mobile.isin(isMobile_list), 'others')
    return df
train_fill['device.isMobile'] = train_fill['device.isMobile'].astype("str")
train_fill = isMobile_data(train_fill)
train_fill['device.isMobile'].value_counts()

False    664530
True     239123
Name: device.isMobile, dtype: int64

In [None]:
def operatingSystem_data(df):
    """Bucket ``device.operatingSystem``: keep the listed systems, map the rest to ``'others'``.

    Missing values also become ``'others'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``device.operatingSystem`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    operatingSystem_list = ['Windows', 'Macintosh', 'Android', 'iOS', 'Linux', 'Chrome OS']
    # Single-column membership test instead of a row-wise df.apply(axis=1).
    system = df['device.operatingSystem']
    df['device.operatingSystem'] = system.where(system.isin(operatingSystem_list), 'others')
    return df
train_fill = operatingSystem_data(train_fill)
train_fill['device.operatingSystem'].value_counts()

Windows      350072
Macintosh    253938
Android      123892
iOS          107665
Linux         35034
Chrome OS     26337
others         6715
Name: device.operatingSystem, dtype: int64

In [None]:
def continent_data(df):
    """Bucket ``geoNetwork.continent``: keep the listed continents, map the rest to ``'others'``.

    Missing values also become ``'others'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``geoNetwork.continent`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    continent_list = ['Americas', 'Asia', 'Europe', 'Oceania', 'Africa']
    # Single-column membership test instead of a row-wise df.apply(axis=1).
    continent = df['geoNetwork.continent']
    df['geoNetwork.continent'] = continent.where(continent.isin(continent_list), 'others')
    return df
train_fill = continent_data(train_fill)
train_fill['geoNetwork.continent'].value_counts()

Americas    450377
Asia        223698
Europe      198311
Oceania      15054
Africa       14745
others        1468
Name: geoNetwork.continent, dtype: int64

In [None]:
def metro_data(df):
    """Recode ``geoNetwork.metro`` to a short code for three metros, ``'others'`` otherwise.

    Missing values also become ``'others'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``geoNetwork.metro`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    short_codes = {
        'San Francisco-Oakland-San Jose CA': 'CA',
        'New York NY': 'NY',
        'Roanoke-Lynchburg VA': 'VA',
    }
    # Element-wise lookup on the one column involved, instead of a row-wise
    # df.apply(axis=1) that builds a Series for every row.
    df['geoNetwork.metro'] = df['geoNetwork.metro'].map(
        lambda metro: short_codes.get(metro, 'others'))
    return df
train_fill = metro_data(train_fill)
train_fill['geoNetwork.metro'].value_counts()

others    778596
CA         95913
NY         26917
VA          2227
Name: geoNetwork.metro, dtype: int64

In [None]:
def networkDomain_data(df):
    """Recode ``geoNetwork.networkDomain``: ``'comcastbusiness.net'`` -> ``'business'``, else ``'others'``.

    Missing values also become ``'others'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``geoNetwork.networkDomain`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    short_codes = {'comcastbusiness.net': 'business'}
    # Element-wise lookup on one column instead of a row-wise df.apply(axis=1).
    df['geoNetwork.networkDomain'] = df['geoNetwork.networkDomain'].map(
        lambda domain: short_codes.get(domain, 'others'))
    return df
train_fill = networkDomain_data(train_fill)
train_fill['geoNetwork.networkDomain'].value_counts()

others      893668
business      9985
Name: geoNetwork.networkDomain, dtype: int64

In [None]:
def subContinent_data(df):
    """Recode ``geoNetwork.subContinent``: ``'Northern America'`` -> ``'NorAme'``, else ``'others'``.

    Missing values also become ``'others'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``geoNetwork.subContinent`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    short_codes = {'Northern America': 'NorAme'}
    # Element-wise lookup on one column instead of a row-wise df.apply(axis=1).
    df['geoNetwork.subContinent'] = df['geoNetwork.subContinent'].map(
        lambda sub_continent: short_codes.get(sub_continent, 'others'))
    return df
train_fill = subContinent_data(train_fill)
train_fill['geoNetwork.subContinent'].value_counts()

others    512996
NorAme    390657
Name: geoNetwork.subContinent, dtype: int64

In [None]:
def region_data(df):
    """Bucket ``geoNetwork.region``: keep the listed regions, map the rest to ``'others'``.

    Missing values also become ``'others'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``geoNetwork.region`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    _list = ['California', 'New York', 'Texas', 'Washington', 'Illinois', 'Ontario']
    # Single-column membership test instead of a row-wise df.apply(axis=1).
    region = df['geoNetwork.region']
    df['geoNetwork.region'] = region.where(region.isin(_list), 'others')
    return df
train_fill = region_data(train_fill)
train_fill['geoNetwork.region'].value_counts()

others        739752
California    107495
New York       26433
Texas           8749
Washington      7642
Illinois        7585
Ontario         5997
Name: geoNetwork.region, dtype: int64

In [None]:
def country_data(df):
    """Bucket ``geoNetwork.country``: keep the listed countries, map the rest to ``'others'``.

    Missing values also become ``'others'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``geoNetwork.country`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    _list = ['United States', 'India', 'United Kingdom', 'Canada', 'Vietnam', 'Japan']
    # Single-column membership test instead of a row-wise df.apply(axis=1).
    country = df['geoNetwork.country']
    df['geoNetwork.country'] = country.where(country.isin(_list), 'others')
    return df
train_fill = country_data(train_fill)
train_fill['geoNetwork.country'].value_counts()

others            380178
United States     364744
India              51140
United Kingdom     37393
Canada             25869
Vietnam            24598
Japan              19731
Name: geoNetwork.country, dtype: int64

In [None]:
def city_data(df):
    """Bucket ``geoNetwork.city``: keep the listed cities, map the rest to ``'others'``.

    Missing values also become ``'others'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``geoNetwork.city`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    _list = ['Mountain View', 'New York', 'San Francisco', 'San Jose',
             'Los Angeles', 'Chicago', 'Toronto', 'Seattle']
    # Single-column membership test instead of a row-wise df.apply(axis=1).
    city = df['geoNetwork.city']
    df['geoNetwork.city'] = city.where(city.isin(_list), 'others')
    return df
train_fill = city_data(train_fill)
train_fill['geoNetwork.city'].value_counts()

others           779412
Mountain View     40884
New York          26371
San Francisco     20329
San Jose          10295
Los Angeles        8670
Chicago            7444
Toronto            5223
Seattle            5025
Name: geoNetwork.city, dtype: int64

In [None]:
def adNetworkType_data(df):
    """Bucket ``trafficSource.adwordsClickInfo.adNetworkType``: keep the two listed types, else ``'others'``.

    Missing values (and the ``'-'`` fill marker) also become ``'others'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the ad network type column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    _list = ['Google Search', 'Search partners']
    column = 'trafficSource.adwordsClickInfo.adNetworkType'
    # Single-column membership test instead of a row-wise df.apply(axis=1).
    network_type = df[column]
    df[column] = network_type.where(network_type.isin(_list), 'others')
    return df
train_fill = adNetworkType_data(train_fill)
train_fill['trafficSource.adwordsClickInfo.adNetworkType'].value_counts()

others             882193
Google Search       21453
Search partners         7
Name: trafficSource.adwordsClickInfo.adNetworkType, dtype: int64

In [None]:
def isVideoAd_data(df):
    """Bucket ``trafficSource.adwordsClickInfo.isVideoAd``: keep the string ``'False'``, else ``'others'``.

    The comparison is against a string, so the column must be cast to ``str``
    beforehand.
    NOTE(review): ``'True'`` is folded into ``'others'`` as well, exactly as
    in the previous implementation where that branch was commented out —
    confirm this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the isVideoAd column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    kept_values = ['False']
    column = 'trafficSource.adwordsClickInfo.isVideoAd'
    # Single-column membership test instead of a row-wise df.apply(axis=1).
    is_video_ad = df[column]
    df[column] = is_video_ad.where(is_video_ad.isin(kept_values), 'others')
    return df
train_fill['trafficSource.adwordsClickInfo.isVideoAd'] = train_fill['trafficSource.adwordsClickInfo.isVideoAd'].astype('str')
train_fill = isVideoAd_data(train_fill)
train_fill['trafficSource.adwordsClickInfo.isVideoAd'].value_counts()

others    882193
False      21460
Name: trafficSource.adwordsClickInfo.isVideoAd, dtype: int64

In [None]:
def campaign_data(df):
    """Recode ``trafficSource.campaign`` to ``'AW1'``/``'AW2'``/``'AW3'`` for three campaigns, else ``'others'``.

    Only exact campaign names match. Missing values become ``'others'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``trafficSource.campaign`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    short_codes = {
        'AW - Dynamic Search Ads Whole Site': 'AW1',
        'AW - Accessories': 'AW2',
        'AW - Apparel': 'AW3',
    }
    # Element-wise lookup on one column instead of a row-wise df.apply(axis=1).
    df['trafficSource.campaign'] = df['trafficSource.campaign'].map(
        lambda campaign: short_codes.get(campaign, 'others'))
    return df
train_fill = campaign_data(train_fill)
train_fill['trafficSource.campaign'].value_counts()

others    882293
AW1        14244
AW2         7070
AW3           46
Name: trafficSource.campaign, dtype: int64

In [None]:
train_fill['trafficSource.adwordsClickInfo.slot'].value_counts()

-      882193
Top     20956
RHS       504
Name: trafficSource.adwordsClickInfo.slot, dtype: int64

In [None]:
def slot_data(df):
    """Bucket ``trafficSource.adwordsClickInfo.slot``: keep ``'RHS'``/``'Top'``, else ``'others'``.

    Missing values (and the ``'-'`` fill marker) also become ``'others'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the ad slot column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    _list = ['RHS', 'Top']
    column = 'trafficSource.adwordsClickInfo.slot'
    # Single-column membership test instead of a row-wise df.apply(axis=1).
    slot = df[column]
    df[column] = slot.where(slot.isin(_list), 'others')
    return df
train_fill = slot_data(train_fill)
train_fill['trafficSource.adwordsClickInfo.slot'].value_counts()

others    882193
Top        20956
RHS          504
Name: trafficSource.adwordsClickInfo.slot, dtype: int64

In [None]:
def isTrueDirect_data(df):
    """Bucket ``trafficSource.isTrueDirect``: keep the string ``'True'``, else ``'others'``.

    The comparison is against a string, so the column must be cast to ``str``
    beforehand; a real boolean ``True`` does not match.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``trafficSource.isTrueDirect`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    _list = ['True']
    # Single-column membership test instead of a row-wise df.apply(axis=1).
    is_true_direct = df['trafficSource.isTrueDirect']
    df['trafficSource.isTrueDirect'] = is_true_direct.where(is_true_direct.isin(_list), 'others')
    return df
train_fill['trafficSource.isTrueDirect'] = train_fill['trafficSource.isTrueDirect'].astype('str')
train_fill = isTrueDirect_data(train_fill)
train_fill['trafficSource.isTrueDirect'].value_counts()

others    629648
True      274005
Name: trafficSource.isTrueDirect, dtype: int64

In [None]:
def keyword_data(df):
    """Bucket ``trafficSource.keyword``: keep the two listed keywords, else ``'others'``.

    Only exact matches are kept. Missing values become ``'others'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``trafficSource.keyword`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    _list = ['Google Merchandise', '+Google +Merchandise']
    # Single-column membership test instead of a row-wise df.apply(axis=1).
    keyword = df['trafficSource.keyword']
    df['trafficSource.keyword'] = keyword.where(keyword.isin(_list), 'others')
    return df
train_fill = keyword_data(train_fill)
train_fill['trafficSource.keyword'].value_counts()

others                  901653
Google Merchandise        1648
+Google +Merchandise       352
Name: trafficSource.keyword, dtype: int64

In [None]:
def medium_data(df):
    """Bucket ``trafficSource.medium``: keep organic/referral/cpc/cpm, else ``'others'``.

    Missing values also become ``'others'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``trafficSource.medium`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    _list = ['organic', 'referral', 'cpc', 'cpm']
    # Single-column membership test instead of a row-wise df.apply(axis=1).
    medium = df['trafficSource.medium']
    df['trafficSource.medium'] = medium.where(medium.isin(_list), 'others')
    return df
train_fill = medium_data(train_fill)
train_fill['trafficSource.medium'].value_counts()

organic     381561
referral    330955
others      159549
cpc          25326
cpm           6262
Name: trafficSource.medium, dtype: int64

In [None]:
def referralPath_data(df):
    """Bucket ``trafficSource.referralPath``: keep the three listed paths, else ``'others'``.

    Only exact matches are kept. Missing values become ``'others'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``trafficSource.referralPath`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    _list = ['/', '/yt/about/', '/mail/u/0/']
    # Single-column membership test instead of a row-wise df.apply(axis=1).
    referral_path = df['trafficSource.referralPath']
    df['trafficSource.referralPath'] = referral_path.where(referral_path.isin(_list), 'others')
    return df
train_fill = referralPath_data(train_fill)
train_fill['trafficSource.referralPath'].value_counts()

others        756060
/              75523
/yt/about/     71036
/mail/u/0/      1034
Name: trafficSource.referralPath, dtype: int64

In [None]:
def source_data(df):
    """Collapse rare ``trafficSource.source`` values into ``'others'``.

    The sources 'google', '(direct)', 'youtube.com', 'mall.googleplex.com'
    and 'dfa' are kept as they are; every other value (including missing
    values) becomes ``'others'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'trafficSource.source'`` column. The column is
        overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    _list=['google','(direct)','youtube.com','mall.googleplex.com','dfa']
    col = 'trafficSource.source'
    # Vectorised replacement: avoids building one Series per row as
    # ``df.apply(..., axis=1)`` does, and also works on an empty frame.
    df[col] = df[col].where(df[col].isin(_list), 'others')
    return df
# Bucket the source column of the training frame, then display the category counts.
train_fill = source_data(train_fill)
train_fill['trafficSource.source'].value_counts()

google                 400788
youtube.com            212602
(direct)               143028
others                  75133
mall.googleplex.com     66416
dfa                      5686
Name: trafficSource.source, dtype: int64

In [None]:
def adContent_data(df):
    """Collapse rare ``trafficSource.adContent`` values into ``'others'``.

    'Google Merchandise Collection' and '{KeyWord:Google Branded Gear}' are
    kept as they are; every other value (including missing values) becomes
    ``'others'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'trafficSource.adContent'`` column. The column is
        overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    _list=['Google Merchandise Collection','{KeyWord:Google Branded Gear}']
    col = 'trafficSource.adContent'
    # Vectorised replacement: avoids building one Series per row as
    # ``df.apply(..., axis=1)`` does, and also works on an empty frame.
    df[col] = df[col].where(df[col].isin(_list), 'others')
    return df
# Bucket the ad-content column of the training frame, then display the category counts.
train_fill = adContent_data(train_fill)
train_fill['trafficSource.adContent'].value_counts()

others                           898464
Google Merchandise Collection      5122
{KeyWord:Google Branded Gear}        67
Name: trafficSource.adContent, dtype: int64

In [None]:
# Run the test frame through the categorical clean-up steps, one after another.
test = DealCategorical_missingdata(test)
test_fill = test.drop(['trafficSource.adwordsClickInfo.gclId'], axis=1)


def _as_str(column):
    """Build a pipeline step that casts ``column`` to ``str`` and returns the frame."""
    def _cast(frame):
        frame[column] = frame[column].astype('str')
        return frame
    return _cast


# Order matters: each string cast must run right before the bucketing step
# that reads the same column.
_test_steps = [
    chanGroup_data,
    browser_data,
    deviceCategory_data,
    _as_str('device.isMobile'),
    isMobile_data,
    operatingSystem_data,
    continent_data,
    metro_data,
    networkDomain_data,
    subContinent_data,
    region_data,
    country_data,
    city_data,
    adNetworkType_data,
    _as_str('trafficSource.adwordsClickInfo.isVideoAd'),
    isVideoAd_data,
    campaign_data,
    slot_data,
    _as_str('trafficSource.isTrueDirect'),
    isTrueDirect_data,
    keyword_data,
    medium_data,
    referralPath_data,
    source_data,
    adContent_data,
]
for _step in _test_steps:
    test_fill = _step(test_fill)
test_fill['trafficSource.adContent'].value_counts()

others                           803016
Google Merchandise Collection      1640
{KeyWord:Google Branded Gear}        28
Name: trafficSource.adContent, dtype: int64

In [None]:
# Compare the (rows, columns) shape of the processed test and train frames.
test_fill.shape,train_fill.shape

((804684, 33), (903653, 34))

In [None]:
# test_fill.info(),train_fill.info()

In [None]:
# train_fill.to_csv(path+'traindata_select.csv')
# test_fill.to_csv(path+'testdata_select.csv')

In [None]:
# train_fill.columns
# Derive calendar features from the visit timestamp (epoch seconds) on both frames.
_calendar_parts = (
    ('day_of_week', 'dayofweek'),
    ('hour', 'hour'),
    ('day', 'day'),
    ('month', 'month'),
)
for df in (train_fill, test_fill):
    df['date'] = pd.to_datetime(df['visitStartTime'].values, unit='s')
    for _new_col, _dt_attr in _calendar_parts:
        df[_new_col] = getattr(df['date'].dt, _dt_attr)

In [None]:
# Categorical columns that get one-hot encoded, in encoding order.
feauture_list = [
    'channelGrouping',
    # device
    'device.browser',
    'device.deviceCategory',
    'device.isMobile',
    'device.operatingSystem',
    # geography
    'geoNetwork.city',
    'geoNetwork.continent',
    'geoNetwork.country',
    'geoNetwork.metro',
    'geoNetwork.networkDomain',
    'geoNetwork.region',
    'geoNetwork.subContinent',
    # traffic source
    'trafficSource.adContent',
    'trafficSource.adwordsClickInfo.adNetworkType',
    'trafficSource.adwordsClickInfo.isVideoAd',
    'trafficSource.adwordsClickInfo.slot',
    'trafficSource.campaign',
    'trafficSource.isTrueDirect',
    'trafficSource.keyword',
    'trafficSource.medium',
    'trafficSource.referralPath',
    'trafficSource.source',
    # calendar features derived from the visit timestamp
    'day_of_week',
    'hour',
    'day',
    'month',
]

In [None]:
def data_to_dummy(df, columns=None):
    """Append one-hot (dummy) columns for the given categorical columns.

    The original columns are kept; callers drop them afterwards if needed.
    Dummy columns are named ``<column>_<value>`` and are appended in the
    order of ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    columns : list of str, optional
        Columns to encode. Defaults to the module-level ``feauture_list``.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` followed by all dummy columns.
    """
    if columns is None:
        columns = feauture_list
    dummy_frames = [pd.get_dummies(df[name], prefix=name) for name in columns]
    print(columns)
    # One concat at the end instead of re-copying the growing frame once per column.
    return pd.concat([df] + dummy_frames, axis=1)

In [None]:
# pd.get_dummies(data_train['Pclass'], prefix= 'Pclass')
# One-hot encode the training frame, then drop the raw categorical columns
# so only their dummy columns remain.
data_ = data_to_dummy(train_fill)
train_dummies = data_.drop(feauture_list,axis =1)
# data_dummies.to_csv(path+'data_dummies')

['channelGrouping', 'device.browser', 'device.deviceCategory', 'device.isMobile', 'device.operatingSystem', 'geoNetwork.city', 'geoNetwork.continent', 'geoNetwork.country', 'geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.region', 'geoNetwork.subContinent', 'trafficSource.adContent', 'trafficSource.adwordsClickInfo.adNetworkType', 'trafficSource.adwordsClickInfo.isVideoAd', 'trafficSource.adwordsClickInfo.slot', 'trafficSource.campaign', 'trafficSource.isTrueDirect', 'trafficSource.keyword', 'trafficSource.medium', 'trafficSource.referralPath', 'trafficSource.source', 'day_of_week', 'hour', 'day', 'month']


In [None]:
# Session identifiers and raw timestamps that are not fed to the model.
useless_feature = ['sessionId', 'visitId', 'visitNumber','visitStartTime','date']
train_dummies = train_dummies.drop(useless_feature,axis =1)
# Preview the first rows of the encoded training frame.
train_dummies.head()

Unnamed: 0,fullVisitorId,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.transactionRevenue,trafficSource.adwordsClickInfo.page,channelGrouping_Affiliates,channelGrouping_Direct,channelGrouping_Display,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,1131660440785968503,1,1,1,1,0.0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,377306020877927890,1,1,1,1,0.0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,3895546263509774583,1,1,1,1,0.0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,4763447161404445595,1,1,1,1,0.0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,27294437909732085,1,1,0,1,0.0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [None]:
# Persist the encoded training frame. to_csv is called with its defaults,
# so the row index is written as the first column of the file.
train_dummies.to_csv(path+'train_dummies.csv')

In [None]:
# One-hot encode the test frame, then drop the raw categorical columns.
data_ = data_to_dummy(test_fill)
test_dummies = data_.drop(feauture_list,axis =1)

In [None]:
# Drop the columns listed in useless_feature from the test frame and preview it.
test_dummies = test_dummies.drop(useless_feature,axis =1)
test_dummies.head()

In [None]:
# List the columns that exist in the test frame but not in the training frame.
print("Variables not in train but in test : ", set(test_dummies.columns).difference(set(train_dummies.columns)))

In [None]:
# Dummy columns that never occur in the test data must be all-zero indicators.
# Copying another month's column (e.g. month_8 into month_7) would flag every
# August visit as a July visit as well.
_target_col = 'totals.transactionRevenue'
for _missing_col in train_dummies.columns:
    if _missing_col != _target_col and _missing_col not in test_dummies.columns:
        test_dummies[_missing_col] = 0
print("Variables not in test but in train : ", set(train_dummies.columns).difference(set(test_dummies.columns)))

In [None]:
# Preview the first rows of the encoded test frame.
test_dummies.head()

In [None]:
# Preview the first rows of the encoded training frame.
train_dummies.head()

In [None]:
# test_dummies.to_csv(path+'test_dummies.csv')
# train_dummies.to_csv(path+'train_dummies.csv')

In [None]:
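### Correlation coefficients
# corr_matrix = train_dummies.corr()
# corr_matrix['totals.transactionRevenue'].sort_values(ascending=False)
## The largest linear correlation with the target is only 0.04, which suggests the
## relationship is non-linear; linear regression will probably not give good results.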

In [None]:
# Keep a handle on the visitor-id column of each frame.
train_Id = train_dummies['fullVisitorId']
test_Id = test_dummies['fullVisitorId']
# train_Id.values

In [None]:
# Feature frames: the encoded frames without the visitor-id column.
train_feature = train_dummies.drop('fullVisitorId',axis = 1)
test_feature = test_dummies.drop('fullVisitorId',axis = 1)

In [None]:
# Preview the first rows of the test feature frame.
test_feature.head()

In [None]:
## Feature scaling. One-hot columns should not be scaled, so only the numeric
## columns listed in name_col are standardised.
#### Open question: standardisation produces negative values - does that affect
#### linear regression?
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()
name_col = ['totals.bounces','totals.hits','totals.newVisits','totals.pageviews','trafficSource.adwordsClickInfo.page']
for col in name_col:
    # Learn mean/std from the training data only ...
    train_feature[col] = scalar.fit_transform(train_feature[[col]]).ravel()
    # ... and reuse those statistics for the test data, so both frames are
    # on the same scale (re-fitting on test would scale it differently).
    test_feature[col] = scalar.transform(test_feature[[col]]).ravel()

In [None]:
# Build the binary column revenue_status (0 = no revenue, 1 = otherwise); it is
# used as the stratification label so splits keep the buyer / non-buyer ratio.
train_feature['revenue_status'] = (train_feature['totals.transactionRevenue'] != 0).astype(int)
# Show the class balance of the new label.
train_feature['revenue_status'].value_counts()

In [None]:
### Use stratified sampling to hold out part of the training set for the final stacking step.
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, stack_index in split.split(train_feature,train_feature['revenue_status']):
    # split() yields positional row numbers, so rows are selected with .iloc;
    # .loc would only be correct when the index happens to be 0..n-1.
    strat_train_set = train_feature.iloc[train_index]
    strat_stack_set = train_feature.iloc[stack_index]

In [None]:
# Share of each revenue_status class in the held-out stacking set.
strat_stack_set['revenue_status'].value_counts()/len(strat_stack_set)

In [None]:
# Share of each revenue_status class in the full training frame, for comparison.
train_feature['revenue_status'].value_counts()/len(train_feature)
# train_feature.values

In [None]:
import lightgbm as lgb
# custom function to run light gbm model
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    """Train a LightGBM regressor with early stopping and predict with it.

    Parameters
    ----------
    train_X, train_y
        Training features and target.
    val_X, val_y
        Validation features and target; used as the early-stopping set.
    test_X
        Features to predict on after training.

    Returns
    -------
    tuple
        ``(pred_test_y, model, pred_val_y)``: predictions for ``test_X``, the
        trained booster, and predictions for ``val_X``. Both predictions use
        the best iteration found by early stopping.
    """
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 30,
        "min_child_samples" : 100,
        "learning_rate" : 0.1,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.5,
        # LightGBM's parameter is "bagging_freq"; the previous key
        # "bagging_frequency" is not recognised, which left bagging disabled
        # and made bagging_fraction a no-op.
        "bagging_freq" : 5,
        "bagging_seed" : 2018,
        "verbosity" : -1
    }

    lgtrain = lgb.Dataset(train_X, label=train_y )
    lgval = lgb.Dataset(val_X, label=val_y)
    # Up to 1000 boosting rounds; stop once the validation metric has not
    # improved for 100 rounds, logging every 100 rounds.
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=100)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    pred_val_y = model.predict(val_X, num_iteration=model.best_iteration)
    return pred_test_y, model, pred_val_y

In [None]:
from sklearn.metrics import mean_squared_error
def rsme(y,pred):
    """Return the root-mean-squared error between ``y`` and ``pred``.

    Computed as the square root of ``mean_squared_error(y, pred)``.
    """
    mse = mean_squared_error(y, pred)
    return mse ** 0.5
## Cross-validation and bagging (first tried with Linear Regression; the loop below trains LightGBM)
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
import lightgbm as lgb
# Five stratified 80/20 shuffle splits. Each round trains LightGBM on
# log1p(revenue) and reports the validation RMSE in that same log space.
split = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
for step, (train_index, val_index) in enumerate(
        split.split(train_feature, train_feature['revenue_status']), start=1):
    # split() yields positional row numbers, so rows are selected with .iloc.
    # revenue_status is only the stratification label, not a feature.
    train_set = train_feature.iloc[train_index].drop('revenue_status', axis=1)
    val_set = train_feature.iloc[val_index].drop('revenue_status', axis=1)

    train_target = train_set['totals.transactionRevenue']
    val_target = val_set['totals.transactionRevenue']

    train_set = train_set.drop('totals.transactionRevenue', axis=1)
    val_set = val_set.drop('totals.transactionRevenue', axis=1)

    pred_test, model, pred_val = run_lgb(train_set, np.log1p(train_target), val_set, np.log1p(val_target), train_set)
#     lin_reg = SVR(kernel="poly",degree=2,C=100,epsilon=0.1)
#     lin_reg.fit(train_set.values,train_target)
#     pred = lin_reg.predict(train_set)
    # The model was fitted on log1p(revenue), so pred_val is already in log
    # space: compare it with log1p(val_target) directly, without a second log1p.
    # Revenue cannot be negative, so negative predictions are clipped to 0.
    pred_val = np.clip(pred_val, 0, None)
    # Report the root of the MSE, matching the "rmse" label below.
    err = rsme(np.log1p(val_target), pred_val)
    print("traning step : ",step)
    print("val rmse : ",err)
    