In [53]:
# Importing Pandas an Numpy Libraries to use on manipulating our Data
import pandas as pd
import numpy as np

# To Preproccesing our data
from sklearn.preprocessing import LabelEncoder

# To fill missing values
from sklearn.impute import SimpleImputer

# To Split our train data
from sklearn.model_selection import train_test_split

# To Visualize Data
import matplotlib.pyplot as plt
import seaborn as sns

# To Train our data

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB

# To evaluate end result we have
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score



df = pd.read_csv("../data/AdSmartABdata.csv")

In [54]:
df.head()

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,8,Generic Smartphone,6,Chrome Mobile,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,10,Generic Smartphone,6,Chrome Mobile,0,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1
3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,15,Samsung SM-A705FN,6,Facebook,0,0
4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0


In [55]:
df.describe()

Unnamed: 0,hour,platform_os,yes,no
count,8077.0,8077.0,8077.0,8077.0
mean,11.61508,5.947134,0.070818,0.083075
std,5.734879,0.224333,0.256537,0.276013
min,0.0,5.0,0.0,0.0
25%,7.0,6.0,0.0,0.0
50%,13.0,6.0,0.0,0.0
75%,15.0,6.0,0.0,0.0
max,23.0,7.0,1.0,1.0


In [56]:
# Prints Summary of Categorical Data
df.describe(include=[np.object])

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  df.describe(include=[np.object])


Unnamed: 0,auction_id,experiment,date,device_make,browser
count,8077,8077,8077,8077,8077
unique,8077,2,8,270,15
top,3bef7c2e-2ce3-488d-bc62-fa533ef72725,control,2020-07-03,Generic Smartphone,Chrome Mobile
freq,1,4071,2015,4743,4554


In [62]:
categorical = []
numerical = []
for col in df.columns:
  if df[col].dtype == object:
    categorical.append(col)
  elif df[col].dtype in ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']:
    numerical.append(col)

In [63]:
features = categorical + numerical 
data = df[features]
data.head()

Unnamed: 0,auction_id,experiment,date,device_make,browser,hour,platform_os,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,Generic Smartphone,Chrome Mobile,8,6,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,Generic Smartphone,Chrome Mobile,10,6,0,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,E5823,Chrome Mobile WebView,2,6,0,1
3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,Samsung SM-A705FN,Facebook,15,6,0,0
4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,Generic Smartphone,Chrome Mobile,15,6,0,0


Preprocessing
Outlier detection

In [64]:
from scipy import stats
# check if they exist
def iqr_outlier_test(data, col):
  Q1 = np.percentile(data[col], 25, interpolation = 'midpoint')  
  Q2 = np.percentile(data[col], 50, interpolation = 'midpoint')  
  Q3 = np.percentile(data[col], 75, interpolation = 'midpoint')  
  IQR = stats.iqr(data[col], interpolation = 'midpoint') 
  o = (data[col] < (Q1 - 1.5 * IQR)) |(data[col] > (Q3 + 1.5 * IQR))
  m = o.unique()
  return m

In [30]:
for col in df[numerical].columns:
  print(col, '-', iqr_outlier_test(df, col))


hour - [False]
platform_os - [False  True]
yes - [False  True]
no - [False  True]


3 numerical features have outliers.They cannot be treated since they are cateorical features.

### Splitting the columns for one hot encoding and label encoding 

In [65]:
import datetime
df.date = pd.to_datetime(df["date"])  #convert date to datetime object

In [66]:
# Get column names have less than 10 more than 2 unique values
to_one_hot_encoding = [col for col in categorical if data[col].nunique() <= 10 and data[col].nunique() > 2]

# Get Categorical Column names thoose are not in "to_one_hot_encoding"
to_label_encoding = [col for col in categorical if not col in to_one_hot_encoding]

print("To One Hot Encoding:", to_one_hot_encoding)
print("To Label Encoding:", to_label_encoding)

To One Hot Encoding: ['date']
To Label Encoding: ['auction_id', 'experiment', 'device_make', 'browser']


In [36]:
data.isnull().sum()

auction_id     0
experiment     0
device_make    0
browser        0
hour           0
platform_os    0
yes            0
no             0
dtype: int64

In [42]:
# Feature generation
df['date'] = pd.to_datetime(df['date'],format='%Y-%m-%d')  # convert date to datetime object
# df['day']=df['date'].dt.day                       #extract the day
df['dayofweek_num']=df['date'].dt.dayofweek       # extract the day of the week

In [43]:
df = df.drop(['date'], axis = 1)  #drop  the date col
df = df.drop(['auction_id'], axis = 1)  #drop  the auction_id col
df.tail(5)

Unnamed: 0,experiment,hour,device_make,platform_os,browser,yes,no,dayofweek_num
8072,exposed,7,Generic Smartphone,6,Chrome Mobile,0,0,6
8073,control,15,Generic Smartphone,6,Chrome Mobile,0,0,4
8074,control,9,Generic Smartphone,6,Chrome Mobile,0,0,5
8075,exposed,15,Samsung SM-A515F,6,Samsung Internet,0,0,6
8076,control,14,Samsung SM-G960F,6,Facebook,0,0,4


In [67]:
print(data.shape)
data.dtypes

(8077, 9)


auction_id     object
experiment     object
date           object
device_make    object
browser        object
hour            int64
platform_os     int64
yes             int64
no              int64
dtype: object

In [68]:
# get the location of the 3 categorical columns
features = df.copy()
indices = []
for col in ['browser', 'experiment', 'device_make']:
    k = features.columns.get_loc(col)
    indices.append(k)
    
indices

[4, 1, 3]

In [69]:
# Encoding categorical variables using Label Encoder
columns = indices
for col in columns:
    x = features.iloc[:, col].values
    x = x.reshape(-1,1)
    encoder = LabelEncoder()
    encoder = encoder.fit(x)
    x = encoder.transform(x)
    features.iloc[:, col] = x 

  return f(*args, **kwargs)


In [70]:
print(features.shape)
features.head()

(8077, 9)


Unnamed: 0,auction_id,experiment,date,device_make,browser,hour,platform_os,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,1,2020-07-10,46,2,8,6,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,1,2020-07-07,46,2,10,6,0,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,1,2020-07-05,29,3,2,6,0,1
3,00187412-2932-4542-a8ef-3633901c98d9,0,2020-07-03,137,6,15,6,0,0
4,001a7785-d3fe-4e11-a344-c8735acacc2c,0,2020-07-03,46,2,15,6,0,0


In [71]:
features['target'] = 0
features.loc[features['yes'] ==1, 'target'] = 1
features.loc[features['no'] ==1, 'target'] = 2
features = features.drop(['yes', 'no'], axis = 1)
# features = features[features.target != 0]
# features.loc[features['target'] ==2, 'target'] = 0
print(features.shape)
features.target.value_counts()

features.head()

(8077, 8)


Unnamed: 0,auction_id,experiment,date,device_make,browser,hour,platform_os,target
0,0008ef63-77a7-448b-bd1e-075f42c55e39,1,2020-07-10,46,2,8,6,0
1,000eabc5-17ce-4137-8efe-44734d914446,1,2020-07-07,46,2,10,6,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,1,2020-07-05,29,3,2,6,2
3,00187412-2932-4542-a8ef-3633901c98d9,0,2020-07-03,137,6,15,6,0
4,001a7785-d3fe-4e11-a344-c8735acacc2c,0,2020-07-03,46,2,15,6,0


In [72]:
# dependent and independent variables
x = features.drop(['target'], axis = 1)
y = features[['target']]

In [73]:
# split dataset to train and test sets (90:10)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = .1, random_state = 0)
print('x train', x_train.shape)
print('y train', y_train.shape)
print('x test', x_test.shape)
print('y test', y_test.shape)

x train (7269, 7)
y train (7269, 1)
x test (808, 7)
y test (808, 1)


In [74]:
# get the validation set from the train set (70:20)

# the % changes to 22 to be representative of the 20 expected originally
x_train, x_val, y_train, y_val = train_test_split(x_train,y_train, test_size = .22, random_state = 0)
print('x train', x_train.shape)
print('y train', y_train.shape)
print('x validation', x_val.shape)
print('y validation', y_val.shape)
print('x test', x_test.shape)
print('y test', y_test.shape)

x train (5669, 7)
y train (5669, 1)
x validation (1600, 7)
y validation (1600, 1)
x test (808, 7)
y test (808, 1)


Logistic Regression

In [81]:
log = LogisticRegression()
log.fit(x_train, y_train)

ValueError: could not convert string to float: '90eaf6b8-e48c-40f1-a286-762a9b64ba9a'

ValueError: could not convert string to float: '90eaf6b8-e48c-40f1-a286-762a9b64ba9a'