# Preprocessing für ML models
In this notebook some additional preprocessing steps for the machine learning data is done.

In [1]:
import os

import time
from time import gmtime, strftime

import pandas as pd
import numpy as np
import math
import json
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'





### Read in the datasets.
First I need to read the prepared csv files to create the machine learning datasets.  Again I need the received dataframe and addtional the profile and portfolio datasets.

In [2]:
profile = pd.read_csv('data/clean_profile.csv')
portfolio = pd.read_csv('data/clean_portfolio.csv')
full_df = pd.read_csv('data/full.csv', low_memory=False)
received = pd.read_csv('data/received.csv', index_col=0)



In [3]:
received.columns
columns = ['person_id', 'ticks', 'received', 'viewed','completed',  'bogo', 'discount', 'informational']
received.columns.name='index'
received.loc[:, columns].head()

index,person_id,ticks,received,viewed,completed,bogo,discount,informational
0,68be06ca386d4c31939f3a4f0e3dd783,discount_2_10_7,1,1,-1,0.0,1.0,0.0
2,e2127556f4f64592b11af22de27a7932,discount_2_10_7,1,1,-1,0.0,1.0,0.0
4,8ec6ce2a7e7949b1bf142def7d0e0586,discount_2_10_7,1,1,-1,0.0,1.0,0.0
6,68617ca6246f4fbc85e91a2a49552598,discount_2_10_7,1,1,-1,0.0,1.0,0.0
8,389bc3fa690240e798340f5a15918d5c,discount_2_10_7,1,1,1,0.0,1.0,0.0


### Create a grouped dataframe for binary classification of completed labels

To create the machine learning dataset, there are the following process steps necessary. First I do this process for the question a customer completed an offer or not. 

* Extract the completed offers from the received dataframe and group it by customer and offer. 
* Extract the uncompleted offers from the received dataframe and group it by customer and offer. 
* Merge the dataframes by person id.
* Create a total column by sum up
* Create a ratio column by divide completed values by total values
* Create a binary target column from ratio column. Where ratio >= 0.5 binary target is 1, where ratio < 0.5 binary target is 0.

In [4]:
# group the received dataframe with column completed == 1 by customer and offer
df_completed = received[received.completed == 1].groupby(['person_id', 'ticks']).size().reset_index(name='completed_count')
# group the received dataframe with column completed == -1 by customer and offer
df_uncompleted = received[received.completed == -1].groupby(['person_id', 'ticks']).size().reset_index(name='uncompleted_count')

# merge the completed and uncompleted dataframes, use outer option because not all customers received all offers.
customer_offer = df_completed.merge(df_uncompleted, left_on=['person_id', 'ticks'], right_on=['person_id', 'ticks'], how='outer').fillna(0)

# Create a total column by sum(completed_count, uncompleted_count)
customer_offer['total'] = customer_offer.loc[:,['completed_count', 'uncompleted_count']].sum(axis=1)

# Create a ratio column by completed_count / sum(completed_count, uncompleted_count)
customer_offer['ratio'] = customer_offer.completed_count / customer_offer.loc[:,['completed_count', 'uncompleted_count']].sum(axis=1)

# Create a binary classification column
idx_0 = customer_offer[customer_offer['ratio'] < 0.5].index
idx_1 = customer_offer[customer_offer['ratio'] >= 0.5].index


customer_offer['binary_target'] = 2
customer_offer.loc[idx_0, 'binary_target'] = 0
customer_offer.loc[idx_1, 'binary_target'] = 1



# use pivot fuction to get an better overview
customer_offer_pivot = customer_offer.pivot(index='person_id', columns='ticks', values='binary_target').fillna('-')

In [11]:
customer_offer_pivot.iloc[:,0][customer_offer_pivot.iloc[:,0] == '-']

person_id
0009655768c64bdeb2e877511632db8f    -
00116118485d4dfda04fdbaba9a87b5c    -
0011e0d4e6b944f998e987f904e8c1e5    -
0020ccbbb6d84e358d3414a3ff76cffd    -
003d66b6608740288d6cc97a6903f4f0    -
                                   ..
ffeaa02452ef451082a0361c3ca62ef5    -
ffecb1f8543f4bf7bade023de366d6bf    -
fff3ba4757bd42088c044ca26d73817a    -
fffad4f4828548d1b5583907f2e9906b    -
ffff82501cea40309d5fdd7edcca4a07    -
Name: bogo_10_10_5, Length: 10664, dtype: object

In [5]:
print(customer_offer.shape)

customer_offer.columns.name='index'
customer_offer.head(3)

(63288, 7)


index,person_id,ticks,completed_count,uncompleted_count,total,ratio,binary_target
0,0009655768c64bdeb2e877511632db8f,bogo_5_5_5,1.0,0.0,1.0,1.0,1
1,0009655768c64bdeb2e877511632db8f,discount_2_10_10,1.0,0.0,1.0,1.0,1
2,0009655768c64bdeb2e877511632db8f,discount_2_10_7,1.0,0.0,1.0,1.0,1


This is the baseline machine learning input. In total there are 63288 unique customer offer combinations with the binary target the customer completed the offer or not. In the following steps the customer and offer features will be merged into the dataframe.

In [6]:
customer_offer.groupby('binary_target').size().reset_index(name='counts')

Unnamed: 0,binary_target,counts
0,0,26134
1,1,37154


In total I have 26134 not completed offers and 37154 completed offers. No let's merge the features in the dataframe

In [7]:

customer_offer_features = customer_offer.merge(profile, left_on='person_id', right_on='person_id')\
                                        .merge(portfolio, left_on='ticks', right_on='ticks')\

In [8]:
customer_offer_features.head(6)

Unnamed: 0,person_id,ticks,completed_count,uncompleted_count,total,ratio,binary_target,gender,age,became_member_on,...,duration,offer_type,offer_id,email,mobile,social,web,bogo,discount,informational
0,0009655768c64bdeb2e877511632db8f,bogo_5_5_5,1.0,0.0,1.0,1.0,1,M,33.0,2017-04-21,...,120,bogo,f19421c1d4aa40978ebb69ca19b0e20d,1,1,1,1,1,0,0
1,0020ccbbb6d84e358d3414a3ff76cffd,bogo_5_5_5,1.0,0.0,1.0,1.0,1,F,24.0,2016-11-11,...,120,bogo,f19421c1d4aa40978ebb69ca19b0e20d,1,1,1,1,1,0,0
2,004b041fbfe44859945daa2c7f79ee64,bogo_5_5_5,1.0,0.0,1.0,1.0,1,F,55.0,2018-05-08,...,120,bogo,f19421c1d4aa40978ebb69ca19b0e20d,1,1,1,1,1,0,0
3,004c5799adbf42868b9cff0396190900,bogo_5_5_5,2.0,0.0,2.0,1.0,1,M,54.0,2016-03-31,...,120,bogo,f19421c1d4aa40978ebb69ca19b0e20d,1,1,1,1,1,0,0
4,0091d2b6a5ea4defaa8393e4e816db60,bogo_5_5_5,2.0,0.0,2.0,1.0,1,F,62.0,2016-06-17,...,120,bogo,f19421c1d4aa40978ebb69ca19b0e20d,1,1,1,1,1,0,0
5,0092a132ead946ceb30d11a1ed513d20,bogo_5_5_5,0.0,1.0,1.0,0.0,0,U,54.393524,2018-05-02,...,120,bogo,f19421c1d4aa40978ebb69ca19b0e20d,1,1,1,1,1,0,0


Now we have a dataframe with customer offer combinations, the corresponding labels and all available feature from customers and offers. Before we create a machine learning model we have to define which features we want to use. 

The following features will be part of the machine learning model.
* Customer Features
    * age, income, gender
* Offer Features
    * offer_type, email, mobile, social, web, reward, difficulty, duration, member_since_days

There are some more features available. Based on customers, this are date features like year, month, day. 
This additional customer features I do not want to use, because my machine learning model should be also usable for new customers. 
From event side it would be possible to consider the viewed label as feature for completed, but this feature I want to avoid and build an own machine learning model to predict the viewed label.

In [9]:
# Define feature columns
raw_features = ['age', 'income', 'F', 'M', 'O', 'U', 'reward', 'difficulty', 'duration', 'member_since_days',
                'bogo', 'discount', 'informational', 'email', 'mobile', 'social', 'web'] # one hot encoded features


labels = ['binary_target']




### Create a machine learning dataset with the given features

In [65]:
# Use the loc method to get the correct feature space
ml_data = customer_offer_features.loc[:, raw_features].copy()
ml_labels = customer_offer_features.loc[:, labels].copy()

In [20]:
ml_data.head(3)

Unnamed: 0,age,income,F,M,O,U,reward,difficulty,duration,member_since_days,bogo,discount,informational,email,mobile,social,web
0,33.0,72000.0,0,1,0,0,5,5,120,1630,1,0,0,1,1,1,1
1,24.0,60000.0,1,0,0,0,5,5,120,1791,1,0,0,1,1,1,1
2,55.0,74000.0,1,0,0,0,5,5,120,1248,1,0,0,1,1,1,1


In [21]:
ml_labels.head(3)

Unnamed: 0,binary_target
0,1
1,1
2,1


### Normalize columns for income, age

Now we have our machine learning dataset one more preprocessing step is necessary.
A good practice standard for machine learning data is to normalize continuous values.

In [66]:
from sklearn.preprocessing import MinMaxScaler

In [67]:
# create scaler object
scaler = MinMaxScaler()

# define data for scaler object
data = ml_data.loc[:,['income', 'age']]

# normalize the data with scaler object fit transform
normalized = scaler.fit_transform(data)


In [68]:
# assign normalized data to original columns
ml_data.loc[:,['income', 'age']] = normalized

In [71]:
ml_data.head(3)

Unnamed: 0,age,income,F,M,O,U,member_since_days,email,mobile,social,web
0,0.180723,0.466667,0,1,0,0,1630,1,1,1,1
1,0.072289,0.333333,1,0,0,0,1791,1,1,1,1
2,0.445783,0.488889,1,0,0,0,1248,1,1,1,1


Now the data is ready for export into a csv file for the machine learning process.

In [72]:
ml_data.to_csv('data/features_completed.csv', index=True)
ml_labels.to_csv('data/labels_completed.csv', index=True)


### Create the machine learning dataset to predict an offer viewed or not 

To create the dataset for viewed prediction, we remove the following offer affected features which are only relevant when the offer is already viewed.
* reward, difficulty, duration, bogo, discount, informational

The rest is the same process as for completed dataset.

### Create a grouped dataframe for binary classification of viewed labels

* Extract the viewed offers from the received dataframe and group it by customer and offer. 
* Extract the unviewed offers from the received dataframe and group it by customer and offer. 
* Merge the dataframes by person id.
* Create a total column by sum up
* Create a ratio column by divide completed values by total values
* Create a binary target column from ratio column. Where ratio >= 0.5 binary target is 1, where ratio < 0.5 binary target is 0.

In [73]:
# group the received dataframe with column completed == 1 by customer and offer
df_viewed = received[received.viewed == 1].groupby(['person_id', 'ticks']).size().reset_index(name='viewed_count')
# group the received dataframe with column completed == -1 by customer and offer
df_unviewed = received[received.viewed == -1].groupby(['person_id', 'ticks']).size().reset_index(name='unviewed_count')

# merge the completed and uncompleted dataframes, use outer option because not all customers received all offers.
customer_offer_viewed = df_viewed.merge(df_unviewed, left_on=['person_id', 'ticks'], right_on=['person_id', 'ticks'], how='outer').fillna(0)

# Create a total column by sum(completed_count, uncompleted_count)
customer_offer_viewed['total'] = customer_offer_viewed.loc[:,['viewed_count', 'unviewed_count']].sum(axis=1)

# Create a ratio column by completed_count / sum(completed_count, uncompleted_count)
customer_offer_viewed['ratio'] = customer_offer_viewed.viewed_count / customer_offer_viewed.loc[:,['viewed_count', 'unviewed_count']].sum(axis=1)

# Create a binary classification column
idx_0 = customer_offer_viewed[customer_offer_viewed['ratio'] < 0.5].index
idx_1 = customer_offer_viewed[customer_offer_viewed['ratio'] >= 0.5].index


customer_offer_viewed['binary_target'] = 2
customer_offer_viewed.loc[idx_0, 'binary_target'] = 0
customer_offer_viewed.loc[idx_1, 'binary_target'] = 1



# use pivot fuction to get an better overview
customer_offer_viewed_pivot = customer_offer_viewed.pivot(index='person_id', columns='ticks', values='binary_target')

In [74]:
customer_offer_viewed.head(6)

Unnamed: 0,person_id,ticks,viewed_count,unviewed_count,total,ratio,binary_target
0,0009655768c64bdeb2e877511632db8f,bogo_5_5_5,1.0,0.0,1.0,1.0,1
1,0009655768c64bdeb2e877511632db8f,discount_2_10_10,1.0,0.0,1.0,1.0,1
2,0009655768c64bdeb2e877511632db8f,informational_0_0_3,1.0,0.0,1.0,1.0,1
3,0009655768c64bdeb2e877511632db8f,informational_0_0_4,1.0,0.0,1.0,1.0,1
4,00116118485d4dfda04fdbaba9a87b5c,bogo_5_5_5,2.0,0.0,2.0,1.0,1
5,0011e0d4e6b944f998e987f904e8c1e5,bogo_5_5_7,1.0,0.0,1.0,1.0,1


Now I merge the customer and offer features in the dataframe

In [76]:

customer_offer_viewed_features = customer_offer_viewed.merge(profile, left_on='person_id', right_on='person_id')\
                                        .merge(portfolio, left_on='ticks', right_on='ticks')\

My raw features are reduced to the completed dataset. From offers only the channels for offers received are relevant.

In [43]:
# Define feature columns
raw_features = ['age', 'income', 'F', 'M', 'O', 'U', 'member_since_days', 
                'email', 'mobile', 'social', 'web'] 

labels = ['binary_target']





### Create a machine learning dataset with the given features

In [44]:
ml_data_viewed = customer_offer_viewed_features.loc[:, raw_features].copy()
ml_labels_viewed = customer_offer_viewed_features.loc[:, labels].copy()

In [45]:
ml_data_viewed.head(3)

Unnamed: 0,age,income,F,M,O,U,member_since_days,email,mobile,social,web
0,33.0,72000.0,0,1,0,0,1630,1,1,1,1
1,54.393524,65404.991568,0,0,0,1,1261,1,1,1,1
2,24.0,60000.0,1,0,0,0,1791,1,1,1,1


In [46]:
ml_labels_viewed.head(3)

Unnamed: 0,binary_target
0,1
1,1
2,1


### Normalize columns for income, age

A good practice standard for machine learning data is to normalize continuous values.

In [47]:
from sklearn.preprocessing import MinMaxScaler

In [48]:
# create scaler object
scaler = MinMaxScaler()

# define data for scaler object
data = ml_data_viewed.loc[:,['income', 'age']]

# normalize the data with scaler object fit transform
normalized = scaler.fit_transform(data)


In [49]:
# assign normalized data to original columns
ml_data_viewed.loc[:,['income', 'age']] = normalized

In [77]:
ml_data_viewed.head(3)

Unnamed: 0,age,income,F,M,O,U,member_since_days,email,mobile,social,web
0,0.180723,0.466667,0,1,0,0,1630,1,1,1,1
1,0.438476,0.393389,0,0,0,1,1261,1,1,1,1
2,0.072289,0.333333,1,0,0,0,1791,1,1,1,1


In [51]:
ml_data_viewed.to_csv('data/features_viewed.csv', index=True)
ml_labels_viewed.to_csv('data/labels_viewed.csv', index=True)
