In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
import math

from lightgbm.sklearn import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold
from pandas.io.json import json_normalize
import json
import gc
gc.enable()
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
def load_df(csv_path='/home/baitong/pywork/RevenuePrediction/all/train.csv', nrows=None):
    """Read a sessions CSV and expand its JSON-encoded columns into flat columns.

    The columns listed in ``JSON_COLUMNS`` are parsed with ``json.loads`` while
    reading, flattened with ``json_normalize`` and re-attached to the frame as
    ``"<column>.<subcolumn>"`` columns; the original JSON columns are dropped.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    nrows : int or None
        Number of rows to read; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The flattened frame. Its shape is also printed.
    """
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

    # `pandas.io.json.json_normalize` is the legacy import path (deprecated since
    # pandas 1.0); prefer the top-level function and fall back only on old versions.
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        from pandas.io.json import json_normalize as normalize

    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in JSON_COLUMNS},
                     dtype={'fullVisitorId': 'str'},  # Important: keep ids as text, not numbers
                     nrows=nrows)
    for column in JSON_COLUMNS:
        # A plain list of dicts yields a 0..n-1 index, which lines up with the
        # default index produced by read_csv for the index-on-index merge below.
        column_as_df = normalize(df[column].tolist())
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df


In [2]:
%%time
# Load the data: the training set from load_df's default path, the test set from an explicit path.
train_df = load_df()
test_df = load_df("/home/baitong/pywork/RevenuePrediction/all/test.csv")

Loaded train.csv. Shape: (903653, 55)
Loaded test.csv. Shape: (804684, 53)
CPU times: user 3min 51s, sys: 4.33 s, total: 3min 55s
Wall time: 3min 55s


In [3]:
#All functions

#FUNCTION FOR PROVIDING FEATURE SUMMARY
def feature_summary(df_fa):
    """Print the frame's dimensions and return a per-column profile table.

    The returned DataFrame has one row per column of ``df_fa`` with:
    null count, number of distinct values (as reported by ``unique()``),
    dtype, and the full list of distinct values under ``Sample_values``.
    For columns whose dtype name contains ``float`` or ``int`` it also holds
    a ``"max/min"`` string (both rounded to 2 decimals), mean, standard
    deviation and skewness. Cells left unset are shown as ``'-'``.
    """
    print('DataFrame shape')
    print('rows:', df_fa.shape[0])
    print('cols:', df_fa.shape[1])

    stat_names = ['Null', 'Unique_Count', 'Data_type', 'Max/Min',
                  'Mean', 'Std', 'Skewness', 'Sample_values']
    summary = pd.DataFrame(index=df_fa.columns, columns=stat_names)

    # Whole-column statistics that exist for every dtype.
    summary['Null'] = [int(df_fa[name].isnull().sum()) for name in df_fa.columns]
    summary['Unique_Count'] = [len(df_fa[name].unique()) for name in df_fa.columns]
    summary['Data_type'] = [df_fa[name].dtype for name in df_fa.columns]

    for name in df_fa.columns:
        series = df_fa[name]
        dtype_name = str(series.dtype)
        # Numeric columns are recognised by their dtype name.
        if 'float' in dtype_name or 'int' in dtype_name:
            highest = str(round(series.max(), 2))
            lowest = str(round(series.min(), 2))
            summary.at[name, 'Max/Min'] = '/'.join([highest, lowest])
            summary.at[name, 'Mean'] = series.mean()
            summary.at[name, 'Std'] = series.std()
            summary.at[name, 'Skewness'] = series.skew()
        summary.at[name, 'Sample_values'] = list(series.unique())

    return summary.fillna('-')

#FUNCTION FOR READING DICTIONARY ITEMS AND HANDLING KEYERROR
def get_val(x,col):
    """Return ``x[col]``, or ``np.nan`` when the lookup fails.

    Parameters
    ----------
    x : any subscriptable object (e.g. a dict)
    col : key or index to look up in ``x``

    Returns
    -------
    The stored value, or ``np.nan`` if subscripting raised an ordinary error
    (missing key, bad index, ``x`` not subscriptable, ...).
    """
    try:
        return x[col]
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit still propagate instead of being turned into NaN.
        return np.nan

#FUNCTION FOR CALCULATING RMSE
def rsme(y,pred):
    """Return the square root of ``mean_squared_error(y, pred)`` (i.e. the RMSE)."""
    mse = mean_squared_error(y, pred)
    return mse ** 0.5

In [4]:
# train_df.head()

In [5]:
# feature_summary(train_df)

In [6]:
# Cast the revenue column to float.
revenue = train_df["totals.transactionRevenue"].astype('float')
train_df["totals.transactionRevenue"] = revenue

# Find the constant columns (exactly one distinct value, NaN counted as a value).
# A model cannot learn anything from constant data, so they are dropped during
# preprocessing - from the training frame and, by the same names, from the test frame.
distinct_counts = train_df.nunique(dropna=False)
const_cols = list(distinct_counts.index[distinct_counts == 1])
train_df = train_df.drop(columns=const_cols)
test_df = test_df.drop(columns=const_cols)

In [7]:
# Remove 'trafficSource.campaignCode' from the training frame only.
# NOTE(review): presumably this column does not exist in the test set (train loaded
# with 55 columns, test with 53) - confirm.
train_df = train_df.drop(["trafficSource.campaignCode"], axis=1)

In [71]:
# feature_summary(train_df)
# Stack train and test row-wise under a fresh 0..n-1 index.
# NOTE(review): this cell's execution count (71) is far out of sequence, so the frames
# it saw may already have been modified by cells that appear later - confirm before
# relying on df_combi.
df_combi=pd.concat([train_df,test_df],ignore_index=True)


In [72]:
# train_tmp = train_df
# Persist the combined frame to disk (to_csv writes the index as the first column by default).
df_combi.to_csv("/home/baitong/pywork/RevenuePrediction/all/df_combi.csv")

In [10]:
# Calendar features derived from the visit start time (interpreted as epoch seconds).
train_df['date'] = pd.to_datetime(train_df['visitStartTime'], unit='s')
train_df['day_of_week'] = train_df['date'].dt.dayofweek
train_df['hour'] = train_df['date'].dt.hour
train_df['day'] = train_df['date'].dt.day
train_df['month'] = train_df['date'].dt.month

# Missing revenue means no purchase. The filled column is assigned back instead of
# calling fillna(inplace=True) on the selection: that chained in-place form is
# deprecated and leaves train_df untouched under pandas Copy-on-Write.
train_df['totals.transactionRevenue'] = train_df['totals.transactionRevenue'].fillna(0)

# 1 when the session produced any revenue, 0 otherwise.
train_df['revenue_status'] = train_df['totals.transactionRevenue'].ne(0).astype('int64')

In [11]:
# Count sessions per revenue_status value (1 = non-zero revenue, 0 = none).
train_df['revenue_status'].value_counts()

0    892138
1     11515
Name: revenue_status, dtype: int64

In [12]:
# train_df.head()
#LOWER-CASING THE VALUES OF EVERY OBJECT-DTYPE COLUMN EXCEPT THE VISITOR ID
text_cols = [name for name in train_df.columns
             if train_df[name].dtype == 'object' and name != 'fullVisitorId']
for col in text_cols:
    # str() is applied first, so non-string entries (including NaN) become text too.
    train_df[col] = train_df[col].map(lambda value: str(value).lower())

#TURNING THE LITERAL STRING 'nan' BACK INTO A REAL np.nan
train_df.replace('nan', np.nan, inplace=True)

In [13]:
# train_df.head()

In [14]:
#CONVERTING CATEGORICAL FEATURES (LESS THAN 10 UNIQUE VALUES) TO DUMMIES

# Source columns and the prefix each one gets in its dummy column names
# (the two lists are paired by position).
cat_col=['channelGrouping','device.deviceCategory','trafficSource.adwordsClickInfo.slot',
         'trafficSource.adwordsClickInfo.adNetworkType',
         'trafficSource.adwordsClickInfo.isVideoAd','trafficSource.medium',
        'geoNetwork.continent']
col_name = ['channelGrouping','deviceCategory','tsadwordsClickInfo_slot',
                  'tsadwordsClickInfo_adNetworkType',
                   'tsadwordsClickInfo_isVideoAd','tsmedium',
            'geoNetwork_continent']

# Build all indicator blocks first and concatenate once, instead of growing a frame
# inside the loop. The dtype is pinned to uint8 because get_dummies returns bool
# columns from pandas 2.0 on, and these indicators are aggregated numerically later.
dummy=pd.concat(
    [pd.get_dummies(train_df[col],prefix=name,dtype=np.uint8) for col,name in zip(cat_col,col_name)],
    axis=1)

print('Newly created dummy cols:',len(dummy.columns))

# Attach the indicators and remove the columns they were derived from.
train_df=pd.concat([train_df,dummy],axis=1).drop(cat_col,axis=1)


Newly created dummy cols: 29


In [15]:
# train_df.head()

In [16]:
#SOME BASIC DATA CLEANUP
# Every result is assigned back to its column. The previous
# train_df[col].fillna(..., inplace=True) / .replace(..., inplace=True) form acts on
# a selection; it is deprecated and has no effect on train_df under pandas Copy-on-Write.
for zero_fill_col in ['totals.newVisits','totals.bounces','trafficSource.adwordsClickInfo.page']:
    train_df[zero_fill_col]=train_df[zero_fill_col].fillna(0)

# Encode the flag as 1 for 'true' and 0 for missing.
train_df['trafficSource.isTrueDirect']=train_df['trafficSource.isTrueDirect'].replace({np.nan:0,'true':1})

In [17]:
# Discard the 'device.isMobile' column from the training frame (in place).
train_df.drop(['device.isMobile'],axis=1,inplace=True)

In [73]:
#GENERATING RANKS FOR THE REMAINING CATEGORICAL (OBJECT-DTYPE) FEATURES
#RANKS ARE GENERATED USING REVENUE PER SESSION OF EACH CATEGORY VALUE
# NOTE(review): the ranks are computed from the target over the whole training frame,
# which looks like it can leak target information into the validation folds used
# later - consider computing them inside each fold.
skip_cols=['fullVisitorId','sessionId','geoNetwork.networkDomain','trafficSource.adwordsClickInfo.gclId']
cols=[x for x in train_df.columns if x not in skip_cols]
for col in cols:
    if train_df[col].dtype=='object':
        print(['revenue_status','totals.transactionRevenue',col])

        # Missing values form their own 'others' category. Assigned back rather than
        # filled in place on the selection, which is a no-op under Copy-on-Write.
        train_df[col]=train_df[col].fillna('others')

        # Revenue per session for every distinct value of the column
        # (total revenue of the group divided by the number of rows in it).
        revenue_by_value=train_df.groupby(col)['totals.transactionRevenue']
        revenue_perc=revenue_by_value.sum()/revenue_by_value.size()

        # Lowest revenue per session gets rank 1; ties share their average rank.
        rank_by_value=revenue_perc.rank(ascending=True)

        # Swap each value for its rank. A per-column map replaces the former
        # whole-frame nested-dict replace(), which also depended on the rank being
        # at positional column 5 of a helper frame.
        train_df[col]=train_df[col].map(rank_by_value)

In [19]:
# Checkpoint: write the current training frame to disk (index included by default).
train_df.to_csv("/home/baitong/pywork/RevenuePrediction/get_dummie1.csv")

In [20]:
# Convert strings to numbers: pd.factorize returns (integer codes, distinct values);
# the codes overwrite the column and the distinct values are kept in `unique` / `unique1`.
train_df['geoNetwork.networkDomain'],unique=pd.factorize(train_df['geoNetwork.networkDomain'])
train_df['trafficSource.adwordsClickInfo.gclId'],unique1=pd.factorize(train_df['trafficSource.adwordsClickInfo.gclId'])

In [45]:
# Checkpoint: write the current training frame to disk (index included by default).
train_df.to_csv("/home/baitong/pywork/RevenuePrediction/get_factotize1.csv")

In [42]:
# Feature frame for the per-visitor aggregation: the session/visit ids, the timestamp
# column and the two factorized columns are left out.
X=train_df.drop(['sessionId','visitId','date','geoNetwork.networkDomain','trafficSource.adwordsClickInfo.gclId'],axis=1)

In [46]:
%%time
agg_func={}
agg_col=['fullVisitorId']
for col in [x for x in X.columns if x not in ['fullVisitorId']]:
    if col=='totals_transactionRevenue':
        agg_func[col]=['sum']
        agg_col.append(str(col)+'_sum')
    elif col=='revenue_status':
        agg_func[col]=['sum']
        agg_col.append(str(col)+'_sum')
    else:
        agg_func[col]=['sum','max','min','mean','var','std']
        agg_col.append(str(col)+'_sum')
        agg_col.append(str(col)+'_max')
        agg_col.append(str(col)+'_min')
        agg_col.append(str(col)+'_mean')
        agg_col.append(str(col)+'_var')
        agg_col.append(str(col)+'_std')

CPU times: user 213 µs, sys: 2 µs, total: 215 µs
Wall time: 220 µs


In [51]:
%%time
# Collapse the session rows to one row per fullVisitorId using the statistics listed
# in agg_func, then overwrite the column labels with the flat names from agg_col.
# NOTE: this relies on agg_col having been built in the same order as agg_func.
X=X.groupby(X.fullVisitorId).aggregate(agg_func).reset_index()
X.columns=agg_col

CPU times: user 18.8 s, sys: 2.06 s, total: 20.9 s
Wall time: 20.9 s


In [53]:
# Checkpoint: write the per-visitor aggregated frame to disk (index included by default).
X.to_csv("/home/baitong/pywork/RevenuePrediction/X_agg.csv")

In [55]:
%%time

#CREATING y_dummy FOR USING STRATIFIED KFOLD
y_dummy=X['revenue_status_sum'].apply(lambda x: 0 if x==0 else 1)

#TARGET FEATURE CONVERTED TO NATURAL LOG
# y=pd.Series(X['totals_transactionRevenue_sum'])
y=X['totals.transactionRevenue_sum'].apply(lambda x: np.log1p(x))

#PEPARING DATA FOR TRAINING LGBM MODEL
X=X.drop(['totals.transactionRevenue_sum','fullVisitorId','revenue_status_sum'],axis=1)

# #FINAL DATAFRAME FOR SUBMISSION
# col=['fullVisitorId','totals.transactionRevenue_sum']
# final=X_test[col] 
# final.columns=['fullVisitorId','PredictedLogRevenue']

# #FINAL TEST FEATURES USED FOR PREDICTING SUBMISSION
# X_test=X_test.drop(['fullVisitorId','totals_transactionRevenue_sum','revenue_status_sum'],axis=1)

CPU times: user 1.16 s, sys: 312 ms, total: 1.47 s
Wall time: 1.47 s


In [57]:
%%time
#LGBMRegressor. THIS REQUIRES FURTHER PARAMETER TUNINIG
model=LGBMRegressor(boosting_type='gbdt',num_leaves=31,max_depth=-1,learning_rate=0.01,n_estimators=1000,max_bin=255,subsample_for_bin=50000,
              objective=None,min_split_gain=0,min_child_weight=3,min_child_samples=10,subsample=1,subsample_freq=1,colsample_bytree=1,
              reg_alpha=0.1,reg_lambda=0,seed=17,silent=False,nthread=-1,n_jobs=-1)


k=1
splits=5
avg_score=0


skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=200)
print('\nStarting KFold iterations...')
for train_index,test_index in skf.split(X,y_dummy):
    df_X=X.iloc[train_index,:]
    df_y=y.iloc[train_index]
    val_X=X.iloc[test_index,:]
    val_y=y.iloc[test_index]

    model.fit(df_X,df_y)

    preds_x=pd.Series(model.predict(val_X))
    acc=rsme(val_y,preds_x)
    print('Iteration:',k,'  rmse:',acc)
    
#     if k==1:
#         score=acc
#         model1=model
#         preds=pd.Series(model.predict(X_test))
        
#     else:
#         preds1=pd.Series(model.predict(X_test))
#         preds=preds+preds1
#         if score>acc:
#             score=acc
#             model1=model
    avg_score=avg_score+acc        
    k=k+1
# print('\n Best score:',score,' Avg Score:',avg_score/splits)
# preds=preds/splits


Starting KFold iterations...
Iteration: 1   rmse: 0.011243488312458461
Iteration: 2   rmse: 0.011804135559095994
Iteration: 3   rmse: 0.02023732194987488
Iteration: 4   rmse: 0.011657404694332095
Iteration: 5   rmse: 0.011608726923658247


NameError: name 'score' is not defined