In [147]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
from statistics import mean, stdev

# data partition
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer

# models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LassoCV

from sklearn.feature_selection import RFE

# metric
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve

# oversampling
from imblearn.over_sampling import RandomOverSampler

# Kfold
from sklearn.model_selection import StratifiedKFold

# scale
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

## Preprocessing

In [148]:
# open the dataset
df = pd.read_csv('train.csv')

In [149]:
# data types transformation
# Converting Date type from object to datetime
df['Date'] = pd.to_datetime(df['Date'], format='%d-%b-%y')
df["Month"]=df["Date"].map(lambda x: x.month)
df["Months_high"]=df["Month"].map(lambda x: 1 if x>7 & x<12 else 0)

## New features

In [150]:
# 0 - PC/ 1 - Mobile
dict_os = {'MacOSX': 0, 'Windows': 1, 'Android': 0, 'Ubuntu': 0, 'iOS': 0, 'Other': 0, 'Fedora': 0, 'Chrome OS':0}
df['OS_window'] = df['OS'].map(dict_os)

In [151]:
# 0 - Returner/ 1 - New_Access/ 2 - Other
dict_visitor = {'Returner': 0, 'New_Access': 1, 'Other': 0}
df['Type_of_Visitor_new'] = df['Type_of_Visitor'].map(dict_visitor)
dict_visitor = {'Returner': 1, 'New_Access': 0, 'Other': 0}
df['Type_of_Visitor_return'] = df['Type_of_Visitor'].map(dict_visitor)

In [152]:
df["Type_of_Traffic_high"]=df["Type_of_Traffic"].map(lambda x: 1 if (x in [7,8,15]) else 0)
df["Type_of_Traffic_med"]=df["Type_of_Traffic"].map(lambda x: 1 if (x in [10,11,2,5]) else 0)
df["Type_of_Traffic_high"].value_counts()

0    9528
1     471
Name: Type_of_Traffic_high, dtype: int64

## Remove Outliers

In [153]:
# trashold for outliers
filters2 = (
    (df['GoogleAnalytics_PageValue']<=350)
#    &(df['GoogleAnalytics_ExitRate']<.2)
)

df_1 = df[filters2]

print('Percentage of data kept after removing outliers:', np.round(df_1.shape[0] / df.shape[0], 4))

Percentage of data kept after removing outliers: 0.9998


# Choosing final features

In [154]:
df.columns

Index(['Access_ID', 'Date', 'AccountMng_Pages', 'AccountMng_Duration',
       'FAQ_Pages', 'FAQ_Duration', 'Product_Pages', 'Product_Duration',
       'GoogleAnalytics_BounceRate', 'GoogleAnalytics_ExitRate',
       'GoogleAnalytics_PageValue', 'OS', 'Browser', 'Country',
       'Type_of_Traffic', 'Type_of_Visitor', 'Buy', 'Month', 'Months_high',
       'OS_window', 'Type_of_Visitor_new', 'Type_of_Visitor_return',
       'Type_of_Traffic_high', 'Type_of_Traffic_med'],
      dtype='object')

In [None]:
# .69 score on kaggle
# df_log3 = df[["Months_high","Type_of_Visitor_new",'GoogleAnalytics_ExitRate', 'GoogleAnalytics_PageValue',"Buy"]]
# undersampling of length 4. 
# exit rate and page value both power transformed
# with outlier threshold of 350 only for page value

In [155]:
df_log3 = df[["Months_high","Type_of_Visitor_new",'GoogleAnalytics_ExitRate',  'GoogleAnalytics_PageValue',"Buy"]]

In [108]:
#df_log4 = df[["Type_of_Visitor_new",'GoogleAnalytics_ExitRate',  'GoogleAnalytics_PageValue',"Buy"]]

In [157]:
X = df_log3.drop('Buy', axis=1)
y = df_log3['Buy']

## Balance the training set - test Oversampling

In [158]:
# split the dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=5, stratify=y)

In [159]:
# Before Oversampling
print('X:', X_train.shape)
print('y:', y_train.shape)
y_train.value_counts()

X: (7499, 4)
y: (7499,)


0    6335
1    1164
Name: Buy, dtype: int64

In [160]:
#removing outliers after train-test
under = pd.concat([X_train,y_train],axis=1)
#under = under[filters2]

In [161]:
#balance our data - will remove negatively labeled data until the number of positives and negatives are the same. 
negative = under.loc[under.Buy==0]
positive = under.loc[under.Buy==1]
negative = negative.sample(frac=1, random_state=0)
negative_under = negative[:int(4*len(positive))]
df_under = pd.concat([positive,negative_under],axis=0)
df_under = df_under.sample(frac=1, random_state=0)

In [162]:
# After undersampling
X_train_under = df_under.drop('Buy', axis=1)
y_train_under = df_under['Buy']
y_train_under.value_counts()

0    4656
1    1164
Name: Buy, dtype: int64

# Variable power transformations after train_test_split

In [163]:
# Define metric and non-metric features
metric_features = ['GoogleAnalytics_ExitRate', 'GoogleAnalytics_PageValue']
non_metric_features = X.columns.drop(metric_features).to_list()

In [164]:
#separate numeric and non-numeric
X_train_num = X_train_under[metric_features]
X_train_cat = X_train_under[non_metric_features]
# DO IT for validation
X_val_num = X_val[metric_features]
X_val_cat = X_val[non_metric_features]

In [165]:
#use train to power transform train and test
power = PowerTransformer().fit(X_train_num)
X_train_num_power = power.transform(X_train_num)
X_train_num_power = pd.DataFrame(X_train_num_power, columns = X_train_num.columns).set_index(X_train_num.index)

#and for validation
X_val_num_power = power.transform(X_val_num)
# Convert the array to a pandas dataframe
X_val_num_power = pd.DataFrame(X_val_num_power, columns = X_val_num.columns).set_index(X_val_num.index)
X_val_num_power.head(3)

X_train_power = pd.concat([X_train_num_power, X_train_cat], axis=1)
X_val_power = pd.concat([X_val_num_power, X_val_cat], axis=1)

# RFE

In [166]:
#no of features
nof_list=np.arange(1,7)            
high_score=0
#Variable to store the optimum features
nof=0           
score_list =[]
for n in range(len(nof_list)):
    model = LogisticRegression(class_weight='balanced').fit(X_train_power, y_train_under)
    rfe = RFE(model,nof_list[n])
    X_train_rfe = rfe.fit_transform(X_train_power,y_train_under)
    X_val_rfe = rfe.transform(X_val_power)
    model.fit(X_train_rfe,y_train_under)
    
    score = f1_score(y_val, (rfe.predict_proba(X_val_power)[:,1]>=0.75).astype(int))
    score_list.append(score)
    
    if(score>high_score):
        high_score = score
        nof = nof_list[n]
print("Optimum number of features: %d" %nof)
print("Score with %d features: %f" % (nof, high_score))

Optimum number of features: 3
Score with 3 features: 0.660592


In [167]:
model = LogisticRegression(class_weight='balanced').fit(X_train_power, y_train_under)
rfe = RFE(estimator = model, n_features_to_select = 3)
X_rfe = rfe.fit_transform(X = X_train_power, y = y_train_under)

selected_features = pd.Series(rfe.support_, index = X_train_under.columns)
selected_features
#GoogleAnalytics_PageValue, Months_high, Type_of_Visitor_new

Months_high                  False
Type_of_Visitor_new           True
GoogleAnalytics_ExitRate      True
GoogleAnalytics_PageValue     True
dtype: bool

# final model

In [168]:
#with 25% test set, undersampling of length 4, and 4 variables (df_log3)
model_log6 = LogisticRegression(class_weight='balanced').fit(X_train_power, y_train_under)

y_pred_train = model_log6.predict(X_train_power)
y_pred_val = model_log6.predict(X_val_power)
model_log6.coef_

array([[-0.34745727,  1.56551344,  1.27649493,  0.51907734]])

In [169]:
#low precision can be improved by increasing your threshold, making your model less sensitive
y_pred_val_new_threshold = (model_log6.predict_proba(X_val_power)[:,1]>=0.7).astype(int)
print('f1_train:', f1_score(y_train_under, y_pred_train))
print(confusion_matrix(y_val, y_pred_val_new_threshold))
print('precision:', precision_score(y_val, y_pred_val_new_threshold))
print('f1_val:', f1_score(y_val, y_pred_val_new_threshold))

f1_train: 0.7101174687381584
[[1900  212]
 [  91  297]]
precision: 0.5834970530451866
f1_val: 0.6622073578595317


## Dataset for test

In [170]:
test_set = pd.read_csv('test.csv')
# test_set.set_index('Access_ID', inplace = True)

In [171]:
# select the columns for the model

# 0 - PC/ 1 - Mobile
dict_os = {'MacOSX': 0, 'Windows': 1, 'Android': 0, 'Ubuntu': 0, 'iOS': 0, 'Other': 0, 'Fedora': 0, 'Chrome OS':0}
test_set['OS_window'] = test_set['OS'].map(dict_os)

# 0 - Returner/ 1 - New_Access/ 2 - Other
dict_visitor = {'Returner': 0, 'New_Access': 1, 'Other': 0}
test_set['Type_of_Visitor_new'] = test_set['Type_of_Visitor'].map(dict_visitor)
dict_visitor = {'Returner': 1, 'New_Access': 0, 'Other': 0}
test_set['Type_of_Visitor_return'] = test_set['Type_of_Visitor'].map(dict_visitor)

#months variable
test_set['Date'] = pd.to_datetime(test_set['Date'], format='%d-%b-%y')
test_set["Month"]=test_set["Date"].map(lambda x: x.month)
test_set["Months_high"]=test_set["Month"].map(lambda x: 1 if x>7 & x<12 else 0)

#type of traffic
test_set["Type_of_Traffic_high"]=test_set["Type_of_Traffic"].map(lambda x: 1 if (x in [7,8,15]) else 0)
test_set["Type_of_Traffic_med"]=test_set["Type_of_Traffic"].map(lambda x: 1 if (x in [10,11,2,5]) else 0)

test = test_set[['Product_Pages','GoogleAnalytics_ExitRate',
       'GoogleAnalytics_PageValue','Months_high',
       'OS_window', 'Type_of_Visitor_new', 'Type_of_Visitor_return',
       'Type_of_Traffic_high', 'Type_of_Traffic_med']]

In [172]:
#power transform
metric_features = ['GoogleAnalytics_ExitRate','GoogleAnalytics_PageValue']
non_metric_features = X.columns.drop(metric_features).to_list()
test_num = test[metric_features]
test_cat = test[non_metric_features]

In [173]:
test_num_power = power.transform(test_num)
test_num_power = pd.DataFrame(test_num_power, columns = test_num.columns).set_index(test_num.index)

test_power = pd.concat([test_num_power, test_cat], axis=1)

In [174]:
test_power.columns

Index(['GoogleAnalytics_ExitRate', 'GoogleAnalytics_PageValue', 'Months_high',
       'Type_of_Visitor_new'],
      dtype='object')

In [175]:
y_pred_new_threshold = (model_log6.predict_proba(test_power)[:,1]>=0.7).astype(int)

In [176]:
df_submission = pd.concat([test_set['Access_ID'], pd.DataFrame(columns=['Buy'], data=y_pred_new_threshold)], axis=1)

In [177]:
df_submission['Buy'].value_counts()

0    1853
1     447
Name: Buy, dtype: int64

In [146]:
#logistic_under with length 4_power_lessfeatures_newfeats_newthresh
df_submission.to_csv('Group17_Version26.csv', index=False)

#version 29, log9 at .78, RFE 3 variables, almost all training data: kaggle score .663
#version 28, log9 at .704349, RFE 3 variables, almost all training data: kaggle score .658
#version 27, log7 at .704349, kaggle score .678
#version 26, log6 at .7, kaggle score .69