In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

In [2]:
# load the dataset
df1 = pd.read_csv('current_infant_brand.csv')
df1

Unnamed: 0,INDIVIDUAL_ID,cur_brand,first_buy_brand,babyage,ZONE,breastfeed,enroll_type2,enroll_age,open_rate0,click_rate0,redem_rate0
0,1.000000e+12,,,300,1,1,CoReg,-59,0.093750,0.000000,0.0
1,1.000000e+12,,,870,1,2,Self Enrolled,-140,0.043956,0.000000,0.0
2,1.000000e+12,3,3,120,1,1,Self Enrolled,-247,0.000000,0.000000,0.0
3,1.000000e+12,1,1,150,1,3,Self Enrolled,-169,0.052632,0.000000,0.6
4,1.000000e+12,,,570,1,1,Self Enrolled,-98,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...
29995,1.000060e+12,,1,360,1,1,Self Enrolled,205,0.125000,0.062500,0.0
29996,1.000060e+12,1,1,90,1,2,CoReg,-213,0.441176,0.058824,0.5
29997,1.000060e+12,,1,60,1,2,Self Enrolled,-212,0.333333,0.133333,0.0
29998,1.000060e+12,,3,300,3,1,CoReg,235,0.071429,0.000000,0.0


In [3]:
# drop the original individual id
df1 = df1.drop(columns='INDIVIDUAL_ID')
df1

Unnamed: 0,cur_brand,first_buy_brand,babyage,ZONE,breastfeed,enroll_type2,enroll_age,open_rate0,click_rate0,redem_rate0
0,,,300,1,1,CoReg,-59,0.093750,0.000000,0.0
1,,,870,1,2,Self Enrolled,-140,0.043956,0.000000,0.0
2,3,3,120,1,1,Self Enrolled,-247,0.000000,0.000000,0.0
3,1,1,150,1,3,Self Enrolled,-169,0.052632,0.000000,0.6
4,,,570,1,1,Self Enrolled,-98,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...
29995,,1,360,1,1,Self Enrolled,205,0.125000,0.062500,0.0
29996,1,1,90,1,2,CoReg,-213,0.441176,0.058824,0.5
29997,,1,60,1,2,Self Enrolled,-212,0.333333,0.133333,0.0
29998,,3,300,3,1,CoReg,235,0.071429,0.000000,0.0


In [4]:
# clean the "current brand" column
df1['cur_brand'].replace(' ', np.nan, inplace=True)
df1.dropna(subset=['cur_brand'], inplace=True)

for i in range(df1.shape[0]):
    if (df1.iloc[i, 0] != '1'):
      df1.iloc[i, 0] = 0
    else:
      df1.iloc[i, 0] = 1

df1

Unnamed: 0,cur_brand,first_buy_brand,babyage,ZONE,breastfeed,enroll_type2,enroll_age,open_rate0,click_rate0,redem_rate0
2,0,3,120,1,1,Self Enrolled,-247,0.000000,0.000000,0.000000
3,1,1,150,1,3,Self Enrolled,-169,0.052632,0.000000,0.600000
7,0,3,330,1,2,Self Enrolled,-64,0.714286,0.285714,0.000000
9,1,1,60,1,2,CoReg,-31,0.911765,0.205882,0.666667
10,0,,1050,1,1,Self Enrolled,-49,0.100000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
29989,1,3,180,3,2,Self Enrolled,-39,0.289474,0.105263,0.000000
29990,1,1,0,1,3,CoReg,-241,0.782609,0.043478,0.500000
29991,1,2,30,1,2,Self Enrolled,-128,0.271186,0.050847,0.500000
29992,1,1,30,1,2,Self Enrolled,-37,0.210526,0.078947,0.000000


In [5]:
# clean the "first purchase brand" column
df1['first_buy_brand'].replace(' ', np.nan, inplace=True)
df1.dropna(subset=['first_buy_brand'], inplace=True)

for i in range(df1.shape[0]):
    if (df1.iloc[i, 1] != '1'):
      df1.iloc[i, 1] = 0
    else:
      df1.iloc[i, 1] = 1

df1

Unnamed: 0,cur_brand,first_buy_brand,babyage,ZONE,breastfeed,enroll_type2,enroll_age,open_rate0,click_rate0,redem_rate0
2,0,0,120,1,1,Self Enrolled,-247,0.000000,0.000000,0.000000
3,1,1,150,1,3,Self Enrolled,-169,0.052632,0.000000,0.600000
7,0,0,330,1,2,Self Enrolled,-64,0.714286,0.285714,0.000000
9,1,1,60,1,2,CoReg,-31,0.911765,0.205882,0.666667
12,1,1,30,1,1,Self Enrolled,9,0.250000,0.071429,0.250000
...,...,...,...,...,...,...,...,...,...,...
29989,1,0,180,3,2,Self Enrolled,-39,0.289474,0.105263,0.000000
29990,1,1,0,1,3,CoReg,-241,0.782609,0.043478,0.500000
29991,1,0,30,1,2,Self Enrolled,-128,0.271186,0.050847,0.500000
29992,1,1,30,1,2,Self Enrolled,-37,0.210526,0.078947,0.000000


In [6]:
# clean the "hospital zone" column 
df1['ZONE'].replace(' ', np.nan, inplace=True)
df1.dropna(subset=['ZONE'], inplace=True)
df1['ZONE'] = df1['ZONE'].apply(pd.to_numeric)

for i in range(df1.shape[0]):
    if (df1.iloc[i, 3] != 1):
      df1.iloc[i, 3] = 0
      
df1

Unnamed: 0,cur_brand,first_buy_brand,babyage,ZONE,breastfeed,enroll_type2,enroll_age,open_rate0,click_rate0,redem_rate0
2,0,0,120,1,1,Self Enrolled,-247,0.000000,0.000000,0.000000
3,1,1,150,1,3,Self Enrolled,-169,0.052632,0.000000,0.600000
7,0,0,330,1,2,Self Enrolled,-64,0.714286,0.285714,0.000000
9,1,1,60,1,2,CoReg,-31,0.911765,0.205882,0.666667
12,1,1,30,1,1,Self Enrolled,9,0.250000,0.071429,0.250000
...,...,...,...,...,...,...,...,...,...,...
29989,1,0,180,0,2,Self Enrolled,-39,0.289474,0.105263,0.000000
29990,1,1,0,1,3,CoReg,-241,0.782609,0.043478,0.500000
29991,1,0,30,1,2,Self Enrolled,-128,0.271186,0.050847,0.500000
29992,1,1,30,1,2,Self Enrolled,-37,0.210526,0.078947,0.000000


In [7]:
# clean the "breastfeed type" column
for i in range(df1.shape[0]):
    if (df1.iloc[i, 4] == 2) or (df1.iloc[i, 4] == 3):
      df1.iloc[i, 4] = 1
    else:
      df1.iloc[i, 4] = 0
 
df1

Unnamed: 0,cur_brand,first_buy_brand,babyage,ZONE,breastfeed,enroll_type2,enroll_age,open_rate0,click_rate0,redem_rate0
2,0,0,120,1,0,Self Enrolled,-247,0.000000,0.000000,0.000000
3,1,1,150,1,1,Self Enrolled,-169,0.052632,0.000000,0.600000
7,0,0,330,1,1,Self Enrolled,-64,0.714286,0.285714,0.000000
9,1,1,60,1,1,CoReg,-31,0.911765,0.205882,0.666667
12,1,1,30,1,0,Self Enrolled,9,0.250000,0.071429,0.250000
...,...,...,...,...,...,...,...,...,...,...
29989,1,0,180,0,1,Self Enrolled,-39,0.289474,0.105263,0.000000
29990,1,1,0,1,1,CoReg,-241,0.782609,0.043478,0.500000
29991,1,0,30,1,1,Self Enrolled,-128,0.271186,0.050847,0.500000
29992,1,1,30,1,1,Self Enrolled,-37,0.210526,0.078947,0.000000


In [8]:
# clean the "enrollment type column" 
for i in range(df1.shape[0]):
    if (df1.iloc[i, 5] == 'Self Enrolled'):
      df1.iloc[i, 5] = 1
    else:
      df1.iloc[i, 5] = 0
 
df1

Unnamed: 0,cur_brand,first_buy_brand,babyage,ZONE,breastfeed,enroll_type2,enroll_age,open_rate0,click_rate0,redem_rate0
2,0,0,120,1,0,1,-247,0.000000,0.000000,0.000000
3,1,1,150,1,1,1,-169,0.052632,0.000000,0.600000
7,0,0,330,1,1,1,-64,0.714286,0.285714,0.000000
9,1,1,60,1,1,0,-31,0.911765,0.205882,0.666667
12,1,1,30,1,0,1,9,0.250000,0.071429,0.250000
...,...,...,...,...,...,...,...,...,...,...
29989,1,0,180,0,1,1,-39,0.289474,0.105263,0.000000
29990,1,1,0,1,1,0,-241,0.782609,0.043478,0.500000
29991,1,0,30,1,1,1,-128,0.271186,0.050847,0.500000
29992,1,1,30,1,1,1,-37,0.210526,0.078947,0.000000


In [9]:
df1.reset_index(drop=True)

Unnamed: 0,cur_brand,first_buy_brand,babyage,ZONE,breastfeed,enroll_type2,enroll_age,open_rate0,click_rate0,redem_rate0
0,0,0,120,1,0,1,-247,0.000000,0.000000,0.000000
1,1,1,150,1,1,1,-169,0.052632,0.000000,0.600000
2,0,0,330,1,1,1,-64,0.714286,0.285714,0.000000
3,1,1,60,1,1,0,-31,0.911765,0.205882,0.666667
4,1,1,30,1,0,1,9,0.250000,0.071429,0.250000
...,...,...,...,...,...,...,...,...,...,...
12888,1,0,180,0,1,1,-39,0.289474,0.105263,0.000000
12889,1,1,0,1,1,0,-241,0.782609,0.043478,0.500000
12890,1,0,30,1,1,1,-128,0.271186,0.050847,0.500000
12891,1,1,30,1,1,1,-37,0.210526,0.078947,0.000000


In [10]:
df1.to_csv('current_infant_brand_clean.csv', index=False)

## 20211228

In [2]:
# load the dataset
df = pd.read_csv('current_infant_brand.csv')
df

Unnamed: 0,INDIVIDUAL_ID,cur_brand,first_buy_brand,babyage,ZONE,breastfeed,enroll_type2,enroll_age,open_rate0,click_rate0,redem_rate0
0,1.000000e+12,,,300,1,1,CoReg,-59,0.093750,0.000000,0.0
1,1.000000e+12,,,870,1,2,Self Enrolled,-140,0.043956,0.000000,0.0
2,1.000000e+12,3,3,120,1,1,Self Enrolled,-247,0.000000,0.000000,0.0
3,1.000000e+12,1,1,150,1,3,Self Enrolled,-169,0.052632,0.000000,0.6
4,1.000000e+12,,,570,1,1,Self Enrolled,-98,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...
29995,1.000060e+12,,1,360,1,1,Self Enrolled,205,0.125000,0.062500,0.0
29996,1.000060e+12,1,1,90,1,2,CoReg,-213,0.441176,0.058824,0.5
29997,1.000060e+12,,1,60,1,2,Self Enrolled,-212,0.333333,0.133333,0.0
29998,1.000060e+12,,3,300,3,1,CoReg,235,0.071429,0.000000,0.0


In [3]:
# drop the original individual id
df = df.drop(columns='INDIVIDUAL_ID')
df

Unnamed: 0,cur_brand,first_buy_brand,babyage,ZONE,breastfeed,enroll_type2,enroll_age,open_rate0,click_rate0,redem_rate0
0,,,300,1,1,CoReg,-59,0.093750,0.000000,0.0
1,,,870,1,2,Self Enrolled,-140,0.043956,0.000000,0.0
2,3,3,120,1,1,Self Enrolled,-247,0.000000,0.000000,0.0
3,1,1,150,1,3,Self Enrolled,-169,0.052632,0.000000,0.6
4,,,570,1,1,Self Enrolled,-98,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...
29995,,1,360,1,1,Self Enrolled,205,0.125000,0.062500,0.0
29996,1,1,90,1,2,CoReg,-213,0.441176,0.058824,0.5
29997,,1,60,1,2,Self Enrolled,-212,0.333333,0.133333,0.0
29998,,3,300,3,1,CoReg,235,0.071429,0.000000,0.0


In [5]:
# clean the "current brand" column

# remove null
df['cur_brand'].replace(' ', np.nan, inplace=True)
df.dropna(subset=['cur_brand'], inplace=True)

# set 1 as 1 (our brand), other values as zero (other brand)
df['cur_brand'] = df['cur_brand'].apply(lambda x: 1 if x == 1 else 0)

Unnamed: 0,cur_brand,first_buy_brand,babyage,ZONE,breastfeed,enroll_type2,enroll_age,open_rate0,click_rate0,redem_rate0
2,0,3,120,1,1,Self Enrolled,-247,0.000000,0.000000,0.000000
3,0,1,150,1,3,Self Enrolled,-169,0.052632,0.000000,0.600000
7,0,3,330,1,2,Self Enrolled,-64,0.714286,0.285714,0.000000
9,0,1,60,1,2,CoReg,-31,0.911765,0.205882,0.666667
10,0,,1050,1,1,Self Enrolled,-49,0.100000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
29989,0,3,180,3,2,Self Enrolled,-39,0.289474,0.105263,0.000000
29990,0,1,0,1,3,CoReg,-241,0.782609,0.043478,0.500000
29991,0,2,30,1,2,Self Enrolled,-128,0.271186,0.050847,0.500000
29992,0,1,30,1,2,Self Enrolled,-37,0.210526,0.078947,0.000000


In [7]:
# clean the "first purchase brand" column

# remove null
df['first_buy_brand'].replace(' ', np.nan, inplace=True)
df.dropna(subset=['first_buy_brand'], inplace=True)

# set 1 as 1 (our brand), other values as zero (other brand)
df['first_buy_brand'] = df['first_buy_brand'].apply(lambda x: 1 if x == 1 else 0)

df

Unnamed: 0,cur_brand,first_buy_brand,babyage,ZONE,breastfeed,enroll_type2,enroll_age,open_rate0,click_rate0,redem_rate0
2,0,0,120,1,1,Self Enrolled,-247,0.000000,0.000000,0.000000
3,0,0,150,1,3,Self Enrolled,-169,0.052632,0.000000,0.600000
7,0,0,330,1,2,Self Enrolled,-64,0.714286,0.285714,0.000000
9,0,0,60,1,2,CoReg,-31,0.911765,0.205882,0.666667
12,0,0,30,1,1,Self Enrolled,9,0.250000,0.071429,0.250000
...,...,...,...,...,...,...,...,...,...,...
29989,0,0,180,3,2,Self Enrolled,-39,0.289474,0.105263,0.000000
29990,0,0,0,1,3,CoReg,-241,0.782609,0.043478,0.500000
29991,0,0,30,1,2,Self Enrolled,-128,0.271186,0.050847,0.500000
29992,0,0,30,1,2,Self Enrolled,-37,0.210526,0.078947,0.000000


In [8]:
# clean the "hospital zone" column 

# remove null
df['ZONE'].replace(' ', np.nan, inplace=True)
df.dropna(subset=['ZONE'], inplace=True)
df['ZONE'] = df['ZONE'].apply(pd.to_numeric)

df

#for i in range(df1.shape[0]):
 #   if (df1.iloc[i, 3] != 1):
  #    df1.iloc[i, 3] = 0


Unnamed: 0,cur_brand,first_buy_brand,babyage,ZONE,breastfeed,enroll_type2,enroll_age,open_rate0,click_rate0,redem_rate0
2,0,0,120,1,1,Self Enrolled,-247,0.000000,0.000000,0.000000
3,0,0,150,1,3,Self Enrolled,-169,0.052632,0.000000,0.600000
7,0,0,330,1,2,Self Enrolled,-64,0.714286,0.285714,0.000000
9,0,0,60,1,2,CoReg,-31,0.911765,0.205882,0.666667
12,0,0,30,1,1,Self Enrolled,9,0.250000,0.071429,0.250000
...,...,...,...,...,...,...,...,...,...,...
29989,0,0,180,3,2,Self Enrolled,-39,0.289474,0.105263,0.000000
29990,0,0,0,1,3,CoReg,-241,0.782609,0.043478,0.500000
29991,0,0,30,1,2,Self Enrolled,-128,0.271186,0.050847,0.500000
29992,0,0,30,1,2,Self Enrolled,-37,0.210526,0.078947,0.000000


In [13]:
# create dummy variables
df1 = pd.get_dummies(df, prefix='breastfeed_', columns=['breastfeed'])
df1 = df1.rename({'breastfeed__1': 'breastfed','breastfeed__2': 'breastfed_and_formula',
                  'breastfeed__3': 'formula', 'breastfeed__4': 'neither'}, axis=1)
df1

Unnamed: 0,cur_brand,first_buy_brand,babyage,ZONE,enroll_type2,enroll_age,open_rate0,click_rate0,redem_rate0,breastfed,breastfed_and_formula,formula,neither
2,0,0,120,1,Self Enrolled,-247,0.000000,0.000000,0.000000,1,0,0,0
3,0,0,150,1,Self Enrolled,-169,0.052632,0.000000,0.600000,0,0,1,0
7,0,0,330,1,Self Enrolled,-64,0.714286,0.285714,0.000000,0,1,0,0
9,0,0,60,1,CoReg,-31,0.911765,0.205882,0.666667,0,1,0,0
12,0,0,30,1,Self Enrolled,9,0.250000,0.071429,0.250000,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29989,0,0,180,3,Self Enrolled,-39,0.289474,0.105263,0.000000,0,1,0,0
29990,0,0,0,1,CoReg,-241,0.782609,0.043478,0.500000,0,0,1,0
29991,0,0,30,1,Self Enrolled,-128,0.271186,0.050847,0.500000,0,1,0,0
29992,0,0,30,1,Self Enrolled,-37,0.210526,0.078947,0.000000,0,1,0,0
