In [1]:
#import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# data partition/scaling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer

# models
from sklearn.linear_model import LogisticRegression

# metric
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
# open the dataset
df = pd.read_csv('train.csv')

## Feature Engineering

In [3]:
# Converting Date type from object to datetime
df['Date'] = pd.to_datetime(df['Date'], format='%d-%b-%y')
# creating variables from month data
df["Month"]=df["Date"].map(lambda x: x.month)
df["Months_high"]=df["Month"].map(lambda x: 1 if x>7 & x<12 else 0)

In [4]:
# 0 - Returner/ 1 - New_Access/ 2 - Other
dict_visitor = {'Returner': 0, 'New_Access': 1, 'Other': 0}
df['Type_of_Visitor_new'] = df['Type_of_Visitor'].map(dict_visitor)
dict_visitor = {'Returner': 1, 'New_Access': 0, 'Other': 0}
df['Type_of_Visitor_return'] = df['Type_of_Visitor'].map(dict_visitor)

# Train Test Split

In [5]:
df_model = df[["Months_high","Type_of_Visitor_return",'GoogleAnalytics_ExitRate',  'GoogleAnalytics_PageValue',"Buy"]]

In [6]:
X = df_model.drop('Buy', axis=1)
y = df_model['Buy']

In [7]:
# split the dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.01, random_state=5, stratify=y)

# Variable power transformations after train_test_split

In [8]:
# Define metric and non-metric features
metric_features = ['GoogleAnalytics_ExitRate', 'GoogleAnalytics_PageValue']
non_metric_features = X.columns.drop(metric_features).to_list()

In [9]:
#separate numeric and non-numeric
X_train_num = X_train[metric_features]
X_train_cat = X_train[non_metric_features]
# DO IT for validation
X_val_num = X_val[metric_features]
X_val_cat = X_val[non_metric_features]

In [10]:
#use train to power transform train
power = PowerTransformer().fit(X_train_num)
X_train_num_power = power.transform(X_train_num)
X_train_num_power = pd.DataFrame(X_train_num_power, columns = X_train_num.columns).set_index(X_train_num.index)

#and for validation (using train data)
X_val_num_power = power.transform(X_val_num)
# Convert the array to a pandas dataframe
X_val_num_power = pd.DataFrame(X_val_num_power, columns = X_val_num.columns).set_index(X_val_num.index)
X_val_num_power.head(3)

X_train_power = pd.concat([X_train_num_power, X_train_cat], axis=1)
X_val_power = pd.concat([X_val_num_power, X_val_cat], axis=1)

## Balance the training set - undersampling

In [11]:
# Before undersampling
y_train.value_counts()

0    8363
1    1536
Name: Buy, dtype: int64

In [12]:
#joining y_train and x_train
under_df = pd.concat([X_train_power,y_train],axis=1)

In [13]:
#we randomly undersample our negative data until negatives our buy rate is 20% 
negative = under_df.loc[under_df.Buy==0]
positive = under_df.loc[under_df.Buy==1]
# shuffling the negative data
negative = negative.sample(frac=1, random_state=0)
# selecting 4 times the length of positive data
negative_under = negative[:int(4*len(positive))]
# concatinating dataset
df_under = pd.concat([positive,negative_under],axis=0)
#shuffling
df_under = df_under.sample(frac=1, random_state=0)

In [14]:
# After undersampling
X_train_power = df_under.drop('Buy', axis=1)
y_train_under = df_under['Buy']
y_train_under.value_counts()

0    6144
1    1536
Name: Buy, dtype: int64

# final model

In [15]:
# model with 25% test set, undersampling of length 4, and 4 variables
model_log = LogisticRegression(class_weight='balanced',solver='lbfgs',tol=.01).fit(X_train_power, y_train_under)

y_pred_train = model_log.predict(X_train_power)
y_pred_val = model_log.predict(X_val_power)
model_log.coef_

array([[-0.34881148,  1.51668393,  1.42534148, -0.52842918]])

In [16]:
#### low precision can be improved slightly by increasing your threshold, making your model less sensitive
y_pred_val_new_threshold = (model_log.predict_proba(X_val_power)[:,1]>=0.75).astype(int)
print('f1_train:', f1_score(y_train_under, y_pred_train))
print(confusion_matrix(y_val, y_pred_val_new_threshold))
print('precision:', precision_score(y_val, y_pred_val_new_threshold))
print('f1_val:', f1_score(y_val, y_pred_val_new_threshold))
print('True validation is .664, this value is so low because we training with 99% of the data for the last submission to kaggle')

f1_train: 0.7143270622286542
[[72 12]
 [ 7  9]]
precision: 0.42857142857142855
f1_val: 0.4864864864864864
True validation is .664, this value is so low because we training with 99% of the data for the last submission to kaggle


## Dataset for test

In [17]:
test_set = pd.read_csv('test.csv')

In [18]:
# select the columns for the model

# 0 - Returner/ 1 - New_Access/ 2 - Other
dict_visitor = {'Returner': 1, 'New_Access': 0, 'Other': 0}
test_set['Type_of_Visitor_return'] = test_set['Type_of_Visitor'].map(dict_visitor)

#months variable
test_set['Date'] = pd.to_datetime(test_set['Date'], format='%d-%b-%y')
test_set["Month"]=test_set["Date"].map(lambda x: x.month)
test_set["Months_high"]=test_set["Month"].map(lambda x: 1 if x>7 & x<12 else 0)

test = test_set[["Months_high","Type_of_Visitor_return",
                 'GoogleAnalytics_ExitRate', 'GoogleAnalytics_PageValue']]

In [19]:
#define numeric features
metric_features = ['GoogleAnalytics_ExitRate','GoogleAnalytics_PageValue']
non_metric_features = X.columns.drop(metric_features).to_list()
test_num = test[metric_features]
test_cat = test[non_metric_features]

#power transform - still using training dataset
test_num_power = power.transform(test_num)
test_num_power = pd.DataFrame(test_num_power, columns = test_num.columns).set_index(test_num.index)

test_power = pd.concat([test_num_power, test_cat], axis=1)
test_power.columns

Index(['GoogleAnalytics_ExitRate', 'GoogleAnalytics_PageValue', 'Months_high',
       'Type_of_Visitor_return'],
      dtype='object')

In [20]:
y_pred_new_threshold = (model_log.predict_proba(test_power)[:,1]>=0.75).astype(int)

In [21]:
df_submission = pd.concat([test_set['Access_ID'], pd.DataFrame(columns=['Buy'], data=y_pred_new_threshold)], axis=1)
df_submission['Buy'].value_counts()

0    1879
1     421
Name: Buy, dtype: int64

In [22]:
#logistic_undersampling with length 4_power_lessfeatures_newthresh
df_submission.to_csv('Group17_Version40.csv', index=False)

#final score on kaggle: 0.66046 