In [1]:
import os
import sys
import random
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import scipy.stats as scs
import seaborn as sb
import dvc.api
import mlflow
import mlflow.sklearn
warnings.filterwarnings("ignore")

In [2]:
# Extend the import search path so the helper modules that live under
# ../scripts/ can be imported from this notebook.
sys.path.append('.')
sys.path.insert(1, '../scripts/')

import defaults as defs
import dataCleaner as dc
import dataVisualizer as dv
import abTestHelper as th
import mlHelper as mlh

# All four helpers are constructed with the same label, so keep it in one place.
# NOTE(review): the label presumably identifies the calling notebook in the
# helpers' log output -- confirm against the helper constructors.
helper_label = 'ml hypothesis testing notebook'

# Instantiate the helper objects used throughout the notebook.
cleaner = dc.dataCleaner(helper_label)
visualizer = dv.dataVisualizer(helper_label)
abTestHelper = th.abTestHelper(helper_label)
mlHelper = mlh.machineLearningHelper(helper_label)

logger <Logger dataCleaner (INFO)> created at path: ../logs/cleaner_root.log
Data cleaner in action
logger <Logger dataVisualizer (INFO)> created at path: ../logs/visualizer_root.log
Data visualizer in action
logger <Logger abTestHelper (INFO)> created at path: ../logs/hypothesis_test_root.log
Hypothesis test helper in action
logger <Logger mlHelper (INFO)> created at path: ../logs/ml_preprocess_root.log
Data preprocessor in action


In [5]:
# Read the data set through DVC so that the exact revision used is pinned.
version = 'v1'

# Resolve the storage URL of the tracked file at the requested revision.
data_url = dvc.api.get_url(path=defs.path,
                           repo=defs.repo,
                           rev=version)

# Extra tokens that pandas should parse as missing values. The original list
# named 'undefined' twice; the redundant entry is removed (parsing is unchanged).
missing_values = ["n/a", "na", "undefined", "?", "NA"]
df = pd.read_csv(data_url, na_values=missing_values)
df

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,8,Generic Smartphone,6,Chrome Mobile,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,10,Generic Smartphone,6,Chrome Mobile,0,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1
3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,15,Samsung SM-A705FN,6,Facebook,0,0
4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0
...,...,...,...,...,...,...,...,...,...
8072,ffea24ec-cec1-43fb-b1d1-8f93828c2be2,exposed,2020-07-05,7,Generic Smartphone,6,Chrome Mobile,0,0
8073,ffea3210-2c3e-426f-a77d-0aa72e73b20f,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0
8074,ffeaa0f1-1d72-4ba9-afb4-314b3b00a7c7,control,2020-07-04,9,Generic Smartphone,6,Chrome Mobile,0,0
8075,ffeeed62-3f7c-4a6e-8ba7-95d303d40969,exposed,2020-07-05,15,Samsung SM-A515F,6,Samsung Internet,0,0


# ML hypothesis testing

## Preprocessing 

In [6]:
# Select the MLflow experiment that tracks this preprocessing pass.
mlflow.set_experiment('data preparation')

# Re-executing this cell while a run is still open makes start_run() raise
# ("run ... is already active"). Close any run left over from a previous
# execution first so the cell can be re-run safely.
if mlflow.active_run() is not None:
    mlflow.end_run()
mlflow.start_run()

# Record where the data came from and the shape of the raw input.
mlflow.log_param('data url', data_url)
mlflow.log_param('data version', version)
mlflow.log_param('input_rows', df.shape[0])
mlflow.log_param('input_cols', df.shape[1])

### Prepare data for ML

* Drop non-BIO participants (rows where both `yes` and `no` are 0)

In [8]:
# Remove every row in which the participant answered neither "yes" nor "no".
# The query selects those rows; their index labels are then dropped from df.
df = df.drop(index=df.query('yes == 0 and no ==0').index)
df

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1
16,008aafdf-deef-4482-8fec-d98e3da054da,exposed,2020-07-04,16,Generic Smartphone,6,Chrome Mobile,1,0
20,00a1384a-5118-4d1b-925b-6cdada50318d,exposed,2020-07-06,8,Generic Smartphone,6,Chrome Mobile,0,1
23,00b6fadb-10bd-49e3-a778-290da82f7a8d,control,2020-07-08,4,Samsung SM-A202F,6,Facebook,1,0
27,00ebf4a8-060f-4b99-93ac-c62724399483,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,1
...,...,...,...,...,...,...,...,...,...
8059,ffa08ff9-a132-4051-aef5-01a9c79367bc,exposed,2020-07-05,21,Generic Smartphone,6,Chrome Mobile,1,0
8063,ffb176df-ecd2-45d3-b05f-05b173a093a7,exposed,2020-07-04,1,Generic Smartphone,6,Chrome Mobile,1,0
8064,ffb79718-6f25-4896-b6b3-e58b80a6e147,control,2020-07-09,7,Generic Smartphone,6,Chrome Mobile,0,1
8069,ffca1153-c182-4f32-9e90-2a6008417497,control,2020-07-10,16,Generic Smartphone,6,Chrome Mobile,0,1


* Drop auction id

In [9]:
# Columns handed to the cleaner for removal.
# NOTE(review): auction_id looks like a per-row identifier with no predictive
# value -- confirm it is unique before relying on that.
unwanted_columns = ['auction_id']
df = cleaner.remove_unwanted_cols(df, unwanted_columns)
df

Unnamed: 0,experiment,date,hour,device_make,platform_os,browser,yes,no
2,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1
16,exposed,2020-07-04,16,Generic Smartphone,6,Chrome Mobile,1,0
20,exposed,2020-07-06,8,Generic Smartphone,6,Chrome Mobile,0,1
23,control,2020-07-08,4,Samsung SM-A202F,6,Facebook,1,0
27,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,1
...,...,...,...,...,...,...,...,...
8059,exposed,2020-07-05,21,Generic Smartphone,6,Chrome Mobile,1,0
8063,exposed,2020-07-04,1,Generic Smartphone,6,Chrome Mobile,1,0
8064,control,2020-07-09,7,Generic Smartphone,6,Chrome Mobile,0,1
8069,control,2020-07-10,16,Generic Smartphone,6,Chrome Mobile,0,1


* Adding awareness column to the data set (target feature)

In [10]:
# Derive the target label row by row. The helper already accepts a row, so it
# is passed to apply() directly instead of being wrapped in a lambda.
# A new frame is built (assign + reset_index) rather than mutating df in place;
# reset_index(drop=True) renumbers the rows 0..n-1 and discards the old labels.
df = (
    df
    .assign(awareness=df.apply(mlHelper.label_awareness, axis=1))
    .reset_index(drop=True)
)
df

Unnamed: 0,experiment,date,hour,device_make,platform_os,browser,yes,no,awareness
0,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1,0
1,exposed,2020-07-04,16,Generic Smartphone,6,Chrome Mobile,1,0,1
2,exposed,2020-07-06,8,Generic Smartphone,6,Chrome Mobile,0,1,0
3,control,2020-07-08,4,Samsung SM-A202F,6,Facebook,1,0,1
4,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,1,0
...,...,...,...,...,...,...,...,...,...
1238,exposed,2020-07-05,21,Generic Smartphone,6,Chrome Mobile,1,0,1
1239,exposed,2020-07-04,1,Generic Smartphone,6,Chrome Mobile,1,0,1
1240,control,2020-07-09,7,Generic Smartphone,6,Chrome Mobile,0,1,0
1241,control,2020-07-10,16,Generic Smartphone,6,Chrome Mobile,0,1,0


* Save the data set with the `awareness` (target) column added

In [11]:
# Persist the labelled data and attach it to the active MLflow run.
# Several cells in this notebook write to the same defs.local_path, and
# log_artifact stores a file under its base name, so without a dedicated
# artifact_path a later snapshot would replace this one inside the run.
df.to_csv(defs.local_path)
mlflow.log_artifact(defs.local_path, artifact_path='labelled')

* Transform categorical variables to numerical variables

In [12]:
# change categorical variables to numerical value
categorical_columns = (df.dtypes == object)
categorical_columns = categorical_columns[categorical_columns == True].index
df = mlHelper.encode_to_numeric(data = df, columns = categorical_columns)
df

Unnamed: 0,experiment,date,hour,device_make,platform_os,browser,yes,no,awareness
0,1,2,2,4,6,2,0,1,0
1,1,1,16,13,6,1,1,0,1
2,1,3,8,13,6,1,0,1,0
3,0,5,4,43,6,4,1,0,1
4,0,0,15,13,6,1,0,1,0
...,...,...,...,...,...,...,...,...,...
1238,1,2,21,13,6,1,1,0,1
1239,1,1,1,13,6,1,1,0,1
1240,0,6,7,13,6,1,0,1,0
1241,0,7,16,13,6,1,0,1,0


* Save the transformed data

In [13]:
# Persist the numerically encoded data and attach it to the active MLflow run.
# Several cells in this notebook write to the same defs.local_path, and
# log_artifact stores a file under its base name, so a dedicated artifact_path
# keeps this snapshot from replacing (or being replaced by) the others.
df.to_csv(defs.local_path)
mlflow.log_artifact(defs.local_path, artifact_path='encoded')

* Split the data into browser and platform OS subsets

* Browser

In [14]:
# Feature subset for the browser model: shared features + browser + target.
browser_df = df[["experiment", "hour", "date", 'device_make', 'browser', 'awareness']]

# Persist the subset and attach it to the active MLflow run. The file is
# written to the same defs.local_path as the other snapshots, so it is logged
# under its own artifact_path to avoid overwriting them inside the run.
browser_df.to_csv(defs.local_path)
mlflow.log_artifact(defs.local_path, artifact_path='browser')
browser_df

Unnamed: 0,experiment,hour,date,device_make,browser,awareness
0,1,2,2,4,2,0
1,1,16,1,13,1,1
2,1,8,3,13,1,0
3,0,4,5,43,4,1
4,0,15,0,13,1,0
...,...,...,...,...,...,...
1238,1,21,2,13,1,1
1239,1,1,1,13,1,1
1240,0,7,6,13,1,0
1241,0,16,7,13,1,0


* Platform OS

In [15]:
# Feature subset for the platform model: shared features + platform_os + target.
platform_df = df[["experiment", "hour", "date", 'device_make', 'platform_os', 'awareness']]

# Persist the subset and attach it to the active MLflow run. The file is
# written to the same defs.local_path as the other snapshots, so it is logged
# under its own artifact_path to avoid overwriting them inside the run.
platform_df.to_csv(defs.local_path)
mlflow.log_artifact(defs.local_path, artifact_path='platform_os')
platform_df

Unnamed: 0,experiment,hour,date,device_make,platform_os,awareness
0,1,2,2,4,6,0
1,1,16,1,13,6,1
2,1,8,3,13,6,0
3,0,4,5,43,6,1
4,0,15,0,13,6,0
...,...,...,...,...,...,...
1238,1,21,2,13,6,1
1239,1,1,1,13,6,1
1240,0,7,6,13,6,0
1241,0,16,7,13,6,0


In [16]:
# Close the MLflow run opened by mlflow.start_run() in the preprocessing setup cell.
mlflow.end_run()

## Modeling