In [34]:
import matplotlib.pyplot as plt
import os
import json
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import random
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import classification_report
from scipy import stats


In [35]:


def load_df(csv_path='./data/train.csv', nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    #p=0.1
    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in JSON_COLUMNS},
                     dtype={'fullVisitorId': 'str'}, nrows=nrows,) # Important!!
                      #skiprows=lambda i: i > 0 and random.random() > p)

    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

train_df = load_df()
print("loaded")


Loaded train.csv. Shape: (903653, 55)
loaded


In [31]:
#pd.set_option('display.max_columns', None)

train_df["totals.transactionRevenue"] = train_df["totals.transactionRevenue"].astype('float')
train_df['shops or not'] = train_df['totals.transactionRevenue'].values > 0

print(pd.value_counts(train_df['shops or not']))

def date_format(df):
    df['date'] = pd.to_datetime(df['date'])
    df['vis_date'] = pd.to_datetime(df['visitStartTime'])
    df['sess_date_dow'] = df['vis_date'].dt.dayofweek
    df['sess_date_hours'] = df['vis_date'].dt.hour
    df['sess_date_dom'] = df['vis_date'].dt.day

date_format(train_df)

excluded_features =['socialEngagementType','totals.visits','totals.newVisits','device.browserVersion','device.flashVersion',

         'device.language','device.browserSize','device.mobileDeviceInfo','device.mobileDeviceMarketingName',

         'device.mobileDeviceModel','device.mobileInputSelector','device.operatingSystemVersion',

         'device.screenColors','device.screenResolution','device.mobileDeviceBranding','geoNetwork.cityId',

          'geoNetwork.latitude','geoNetwork.longitude','geoNetwork.networkLocation',

          'trafficSource.adwordsClickInfo.criteriaParameters',

                    # 'trafficSource.campaignCode',

            'trafficSource.campaign','trafficSource.adContent','trafficSource.adwordsClickInfo.adNetworkType',

            'trafficSource.adwordsClickInfo.gclId','trafficSource.adwordsClickInfo.isVideoAd',

            'trafficSource.adwordsClickInfo.page','trafficSource.adwordsClickInfo.slot']

A=train_df.drop(excluded_features,axis=1)

print(A.shape)

replace_null_values={'trafficSource.isTrueDirect': 'False', 'trafficSource.keyword': 'unknown',

                     'trafficSource.referralPath': 'unknown'}

B=A.fillna(value=replace_null_values)


categorical_features = [

    _f for _f in B.columns

    if (B[_f].dtype == 'object')

]



#print(categorical_features)



for f in categorical_features:

    B[f], indexer = pd.factorize(B[f])



M=B.drop(['shops or not','date','vis_date','totals.transactionRevenue'],axis=1)

y=B['shops or not']

# y=train_df['shops or not']



#28 columns

print(M.shape)

False    88953
True      1215
Name: shops or not, dtype: int64
(90168, 32)
(90168, 28)


In [5]:
# feature selection
from sklearn.ensemble import ExtraTreesClassifier
# here criterion='entropy'  for the information gain, 'gini' for the Gini impurity 
model = ExtraTreesClassifier(criterion='entropy')
model.fit(M,y)

imp_df = pd.DataFrame()
imp_df['feature'] = M.columns
imp_df['importance'] = model.feature_importances_

print(imp_df.sort_values('importance', ascending=False))

imp_df = imp_df[imp_df['importance'].astype('float') < 0.0005] 

dropFeatures = imp_df['feature'].values 
print(dropFeatures)

M=M.drop(dropFeatures,axis=1)

                       feature  importance
20   totals.transactionRevenue    0.346028
18                 totals.hits    0.171408
19            totals.pageviews    0.136107
17              totals.bounces    0.065808
11        geoNetwork.continent    0.026451
16     geoNetwork.subContinent    0.025288
0              channelGrouping    0.023333
3                      visitId    0.018897
5               visitStartTime    0.017005
1                fullVisitorId    0.016910
21  trafficSource.isTrueDirect    0.015816
2                    sessionId    0.015692
9       device.operatingSystem    0.013143
14    geoNetwork.networkDomain    0.012920
13            geoNetwork.metro    0.012673
4                  visitNumber    0.012065
23        trafficSource.medium    0.011743
15           geoNetwork.region    0.010564
10             geoNetwork.city    0.009194
8              device.isMobile    0.008935
12          geoNetwork.country    0.006646
6               device.browser    0.005671
25        t

In [32]:
print(M.shape)

(90168, 28)


In [33]:
from sklearn.model_selection import KFold, GroupKFold
from imblearn.over_sampling import SMOTE


folds = GroupKFold(n_splits=5)
oof_clf_preds = np.zeros(M.shape[0])

for fold_, (trn_, val_) in enumerate(folds.split(M, y, groups=M['fullVisitorId'])):
    trn_x, trn_y = M.iloc[trn_], y.iloc[trn_]
    val_x, val_y = M.iloc[val_], y.iloc[val_]

    #using smote     
    sm = SMOTE(random_state=2)
    trn_xSampled,trn_ySampled = sm.fit_sample(trn_x, trn_y)
 
    #using class weight for imbalanced data
    #DecTreeModel = DecisionTreeClassifier(class_weight={0:1,1:7})
    DecTreeModel = DecisionTreeClassifier()
    DecTreeModel.fit(trn_xSampled,trn_ySampled)
    oof_clf_preds[val_] = DecTreeModel.predict_proba(val_x)[:,1]

print(classification_report(y, oof_clf_preds))    
print("prob")
print(M.shape)
print(oof_clf_preds.shape)


              precision    recall  f1-score   support

       False       0.99      0.99      0.99     88953
        True       0.31      0.38      0.34      1215

   micro avg       0.98      0.98      0.98     90168
   macro avg       0.65      0.68      0.66     90168
weighted avg       0.98      0.98      0.98     90168

prob
(90168, 28)
(90168,)


In [19]:
M['non_zero_proba'] = oof_clf_preds
print(M.shape)
from sklearn.tree import DecisionTreeRegressor

oof_reg_preds = np.zeros(M.shape[0])

y1 = train_df["totals.transactionRevenue"].fillna(0)

for fold_, (trn_, val_) in enumerate(folds.split(M, y1, groups=M['fullVisitorId'])):
    trn_x, trn_y = M.iloc[trn_], y1.iloc[trn_]
    val_x, val_y = M.iloc[val_], y1.iloc[val_]
    
    DecTreeReg = DecisionTreeRegressor()
    DecTreeReg.fit(trn_x,trn_y)
    oof_reg_preds[val_] = DecTreeReg.predict(val_x)


print(stats.describe(y1))    
print(stats.describe(oof_reg_preds))


from sklearn.metrics import mean_squared_error    
mean_squared_error(np.log1p(y1), np.log1p(oof_reg_preds)) ** .5 


(90303, 30)
DescribeResult(nobs=90303, minmax=(0.0, 5614440000.0), mean=1712960.5882418081, variance=1502548889249260.5, skewness=69.13280512054052, kurtosis=7407.163354691013)
DescribeResult(nobs=90303, minmax=(0.0, 1691500000.0), mean=1613638.3065900358, variance=904373456715624.6, skewness=35.45837259559211, kurtosis=1548.9728860534713)


0.0312636900211142