In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from itertools import product
pd.get_option("display.max_columns")
warnings.filterwarnings('ignore')
sns.set(rc={'figure.figsize':(10,5)})

## 1. Import data

In [2]:
df = pd.read_csv('df_eda')
pd.options.display.max_columns = None
df

Unnamed: 0,cur_brand2,pre_brand2,zone2,edu2,pro2,enroll_type2,enroll_age,age_period,open_rate,ctr,or_all,ctr_all,cur_brand3
0,MJN,MJN Solutions,Others,Elementary School,Ontario,Self Enrolled,831,Stage 3,1.0,0.5,0.63,0.06,1
1,Nestle,Parent's Choice Stage 1,Others,High School,Ontario,Self Enrolled,831,Stage 3,0.0,0.0,0.09,0.03,0
2,MJN,MJN Stage1,MJN,Elementary School,Alberta,Self Enrolled,519,Stage 3,0.0,0.0,0.00,0.00,1
3,MJN,Abbott Stage1,MJN,Elementary School,British Columbia,Self Enrolled,655,Stage 3,0.0,0.0,0.00,0.00,1
4,MJN,MJN Stage1,MJN,Elementary School,Nova Scotia,Self Enrolled,-1226,Stage 0,0.0,0.0,0.04,0.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1902,Others,Abbott Stage1,Others,Elementary School,Saskatchewan,CoReg,167,Stage 1,0.0,0.0,0.06,0.00,0
1903,Abbott,Abbott Solutions,Others,Elementary School,Quebec,Self Enrolled,191,Stage 2,0.0,0.0,0.50,0.17,0
1904,MJN,MJN Stage1,MJN,High School,British Columbia,Self Enrolled,149,Stage 1,0.0,0.0,0.47,0.03,1
1905,MJN,MJN Stage2,MJN,Elementary School,Ontario,Self Enrolled,239,Stage 2,0.0,0.0,0.04,0.04,1


In [3]:
df = df.drop(columns=['cur_brand2','open_rate','ctr','enroll_age'])

In [4]:
df

Unnamed: 0,pre_brand2,zone2,edu2,pro2,enroll_type2,age_period,or_all,ctr_all,cur_brand3
0,MJN Solutions,Others,Elementary School,Ontario,Self Enrolled,Stage 3,0.63,0.06,1
1,Parent's Choice Stage 1,Others,High School,Ontario,Self Enrolled,Stage 3,0.09,0.03,0
2,MJN Stage1,MJN,Elementary School,Alberta,Self Enrolled,Stage 3,0.00,0.00,1
3,Abbott Stage1,MJN,Elementary School,British Columbia,Self Enrolled,Stage 3,0.00,0.00,1
4,MJN Stage1,MJN,Elementary School,Nova Scotia,Self Enrolled,Stage 0,0.04,0.00,1
...,...,...,...,...,...,...,...,...,...
1902,Abbott Stage1,Others,Elementary School,Saskatchewan,CoReg,Stage 1,0.06,0.00,0
1903,Abbott Solutions,Others,Elementary School,Quebec,Self Enrolled,Stage 2,0.50,0.17,0
1904,MJN Stage1,MJN,High School,British Columbia,Self Enrolled,Stage 1,0.47,0.03,1
1905,MJN Stage2,MJN,Elementary School,Ontario,Self Enrolled,Stage 2,0.04,0.04,1


## 2. Create Dummy Variables

In [5]:
df['pre_brand2'].unique()

array(['MJN Solutions', "Parent's Choice Stage 1", 'MJN Stage1',
       'Abbott Stage1', 'Nestle Stage 1', "Parent's Choice Stage 2",
       'Abbott Solutions', 'Nestle Solutions', 'Nestle Stage 2',
       'Abbott Stage2', 'MJN Stage2', 'Kirkland Stage 1',
       'Abbott Specialty', "President's Choice Stage 2",
       'Kirkland Solutions', 'Others', 'MJN Specialty'], dtype=object)

In [6]:
# create dummy variables for "pre_brand"
df1 = pd.get_dummies(df, prefix='pre_brand2_', columns=['pre_brand2'])
df1 = df1.rename({'pre_brand_11': 'MJN_Stage1','pre_brand_12': 'MJN_Stage12', 
                  'pre_brand_13': 'MJN_Solutions','pre_brand_14': 'MJN_Specialty', 
                  'pre_brand_21': 'Nestle_Stage1','pre_brand_22': 'Nestle_Stage2', 
                  'pre_brand_23': 'Nestle_Solutions','pre_brand_31': 'Abbott_Stage1', 
                  'pre_brand_32': 'Abbott_Stage2','pre_brand_33': 'Abbott_Solutions',
                  'pre_brand_34': 'Abbott_Specialty','pre_brand_41': 'Kirkland_Stage1',
                  'pre_brand_42': 'Kirkland_Solutions','pre_brand_51': "Parents_Choice_Stage1",
                  "Parent's Choice Stage 2": "Parents_Choice_Stage2",'pre_brand_61': "Presidents_Choice_Stage2",
                  'pre_brand_71': "Others"
                 }, axis=1)

In [7]:
display(df1)

Unnamed: 0,zone2,edu2,pro2,enroll_type2,age_period,or_all,ctr_all,cur_brand3,pre_brand2__Abbott Solutions,pre_brand2__Abbott Specialty,pre_brand2__Abbott Stage1,pre_brand2__Abbott Stage2,pre_brand2__Kirkland Solutions,pre_brand2__Kirkland Stage 1,pre_brand2__MJN Solutions,pre_brand2__MJN Specialty,pre_brand2__MJN Stage1,pre_brand2__MJN Stage2,pre_brand2__Nestle Solutions,pre_brand2__Nestle Stage 1,pre_brand2__Nestle Stage 2,pre_brand2__Others,pre_brand2__Parent's Choice Stage 1,pre_brand2__Parent's Choice Stage 2,pre_brand2__President's Choice Stage 2
0,Others,Elementary School,Ontario,Self Enrolled,Stage 3,0.63,0.06,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,Others,High School,Ontario,Self Enrolled,Stage 3,0.09,0.03,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,MJN,Elementary School,Alberta,Self Enrolled,Stage 3,0.00,0.00,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,MJN,Elementary School,British Columbia,Self Enrolled,Stage 3,0.00,0.00,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,MJN,Elementary School,Nova Scotia,Self Enrolled,Stage 0,0.04,0.00,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1902,Others,Elementary School,Saskatchewan,CoReg,Stage 1,0.06,0.00,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1903,Others,Elementary School,Quebec,Self Enrolled,Stage 2,0.50,0.17,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1904,MJN,High School,British Columbia,Self Enrolled,Stage 1,0.47,0.03,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1905,MJN,Elementary School,Ontario,Self Enrolled,Stage 2,0.04,0.04,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [8]:
print(df1)

       zone2               edu2              pro2   enroll_type2 age_period  \
0     Others  Elementary School           Ontario  Self Enrolled    Stage 3   
1     Others        High School           Ontario  Self Enrolled    Stage 3   
2        MJN  Elementary School           Alberta  Self Enrolled    Stage 3   
3        MJN  Elementary School  British Columbia  Self Enrolled    Stage 3   
4        MJN  Elementary School       Nova Scotia  Self Enrolled    Stage 0   
...      ...                ...               ...            ...        ...   
1902  Others  Elementary School      Saskatchewan          CoReg    Stage 1   
1903  Others  Elementary School            Quebec  Self Enrolled    Stage 2   
1904     MJN        High School  British Columbia  Self Enrolled    Stage 1   
1905     MJN  Elementary School           Ontario  Self Enrolled    Stage 2   
1906     MJN  Elementary School           Ontario  Self Enrolled    Stage 0   

      or_all  ctr_all  cur_brand3  pre_brand2__Abbo

In [9]:
# set the "zone" column
df1['zone2'] = df1['zone2'].apply(lambda x: 1 if x == "MJN" else 0)
df1

Unnamed: 0,zone2,edu2,pro2,enroll_type2,age_period,or_all,ctr_all,cur_brand3,pre_brand2__Abbott Solutions,pre_brand2__Abbott Specialty,pre_brand2__Abbott Stage1,pre_brand2__Abbott Stage2,pre_brand2__Kirkland Solutions,pre_brand2__Kirkland Stage 1,pre_brand2__MJN Solutions,pre_brand2__MJN Specialty,pre_brand2__MJN Stage1,pre_brand2__MJN Stage2,pre_brand2__Nestle Solutions,pre_brand2__Nestle Stage 1,pre_brand2__Nestle Stage 2,pre_brand2__Others,pre_brand2__Parent's Choice Stage 1,pre_brand2__Parent's Choice Stage 2,pre_brand2__President's Choice Stage 2
0,0,Elementary School,Ontario,Self Enrolled,Stage 3,0.63,0.06,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,High School,Ontario,Self Enrolled,Stage 3,0.09,0.03,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,1,Elementary School,Alberta,Self Enrolled,Stage 3,0.00,0.00,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,1,Elementary School,British Columbia,Self Enrolled,Stage 3,0.00,0.00,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,Elementary School,Nova Scotia,Self Enrolled,Stage 0,0.04,0.00,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1902,0,Elementary School,Saskatchewan,CoReg,Stage 1,0.06,0.00,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1903,0,Elementary School,Quebec,Self Enrolled,Stage 2,0.50,0.17,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1904,1,High School,British Columbia,Self Enrolled,Stage 1,0.47,0.03,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1905,1,Elementary School,Ontario,Self Enrolled,Stage 2,0.04,0.04,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [10]:
df1['edu2'].unique()

array(['Elementary School', 'High School', 'Others'], dtype=object)

In [11]:
# create dummy variables for "edu2"
df1 = pd.get_dummies(df1, prefix='edu_', columns=['edu2'])
df1 = df1.rename({'edu1': 'elementary_school', 'edu2': 'high_school', 'edu3': 'others'}, axis=1)
df1

Unnamed: 0,zone2,pro2,enroll_type2,age_period,or_all,ctr_all,cur_brand3,pre_brand2__Abbott Solutions,pre_brand2__Abbott Specialty,pre_brand2__Abbott Stage1,pre_brand2__Abbott Stage2,pre_brand2__Kirkland Solutions,pre_brand2__Kirkland Stage 1,pre_brand2__MJN Solutions,pre_brand2__MJN Specialty,pre_brand2__MJN Stage1,pre_brand2__MJN Stage2,pre_brand2__Nestle Solutions,pre_brand2__Nestle Stage 1,pre_brand2__Nestle Stage 2,pre_brand2__Others,pre_brand2__Parent's Choice Stage 1,pre_brand2__Parent's Choice Stage 2,pre_brand2__President's Choice Stage 2,edu__Elementary School,edu__High School,edu__Others
0,0,Ontario,Self Enrolled,Stage 3,0.63,0.06,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,Ontario,Self Enrolled,Stage 3,0.09,0.03,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
2,1,Alberta,Self Enrolled,Stage 3,0.00,0.00,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
3,1,British Columbia,Self Enrolled,Stage 3,0.00,0.00,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,1,Nova Scotia,Self Enrolled,Stage 0,0.04,0.00,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1902,0,Saskatchewan,CoReg,Stage 1,0.06,0.00,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1903,0,Quebec,Self Enrolled,Stage 2,0.50,0.17,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1904,1,British Columbia,Self Enrolled,Stage 1,0.47,0.03,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
1905,1,Ontario,Self Enrolled,Stage 2,0.04,0.04,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0


In [12]:
df1['enroll_type2'].unique()

array(['Self Enrolled', 'CoReg', 'Purchased', 'Others'], dtype=object)

In [13]:
# create dummy variables for "enroll type"
df1 = pd.get_dummies(df1, prefix='enroll_type_', columns=['enroll_type2'])
df1 = df1.rename({'enroll_type1': 'self_enrolled', 'enroll_type2': 'coreg',
                  'enroll_type3': 'purchased','enroll_type4': 'others'}, axis=1)
df1

Unnamed: 0,zone2,pro2,age_period,or_all,ctr_all,cur_brand3,pre_brand2__Abbott Solutions,pre_brand2__Abbott Specialty,pre_brand2__Abbott Stage1,pre_brand2__Abbott Stage2,pre_brand2__Kirkland Solutions,pre_brand2__Kirkland Stage 1,pre_brand2__MJN Solutions,pre_brand2__MJN Specialty,pre_brand2__MJN Stage1,pre_brand2__MJN Stage2,pre_brand2__Nestle Solutions,pre_brand2__Nestle Stage 1,pre_brand2__Nestle Stage 2,pre_brand2__Others,pre_brand2__Parent's Choice Stage 1,pre_brand2__Parent's Choice Stage 2,pre_brand2__President's Choice Stage 2,edu__Elementary School,edu__High School,edu__Others,enroll_type__CoReg,enroll_type__Others,enroll_type__Purchased,enroll_type__Self Enrolled
0,0,Ontario,Stage 3,0.63,0.06,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
1,0,Ontario,Stage 3,0.09,0.03,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1
2,1,Alberta,Stage 3,0.00,0.00,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
3,1,British Columbia,Stage 3,0.00,0.00,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
4,1,Nova Scotia,Stage 0,0.04,0.00,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1902,0,Saskatchewan,Stage 1,0.06,0.00,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
1903,0,Quebec,Stage 2,0.50,0.17,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
1904,1,British Columbia,Stage 1,0.47,0.03,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
1905,1,Ontario,Stage 2,0.04,0.04,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1


In [14]:
# create dummy variables for "age_period"
df1 = pd.get_dummies(df1, prefix='age_', columns=['age_period'])
df1 = df1.rename({'age0': 'stage0', 'age1': 'stage1',
                  'age2': 'stage2', 'age4': 'stage4'}, axis=1)
df1

Unnamed: 0,zone2,pro2,or_all,ctr_all,cur_brand3,pre_brand2__Abbott Solutions,pre_brand2__Abbott Specialty,pre_brand2__Abbott Stage1,pre_brand2__Abbott Stage2,pre_brand2__Kirkland Solutions,pre_brand2__Kirkland Stage 1,pre_brand2__MJN Solutions,pre_brand2__MJN Specialty,pre_brand2__MJN Stage1,pre_brand2__MJN Stage2,pre_brand2__Nestle Solutions,pre_brand2__Nestle Stage 1,pre_brand2__Nestle Stage 2,pre_brand2__Others,pre_brand2__Parent's Choice Stage 1,pre_brand2__Parent's Choice Stage 2,pre_brand2__President's Choice Stage 2,edu__Elementary School,edu__High School,edu__Others,enroll_type__CoReg,enroll_type__Others,enroll_type__Purchased,enroll_type__Self Enrolled,age__Stage 0,age__Stage 1,age__Stage 2,age__Stage 3
0,0,Ontario,0.63,0.06,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1
1,0,Ontario,0.09,0.03,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1
2,1,Alberta,0.00,0.00,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1
3,1,British Columbia,0.00,0.00,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1
4,1,Nova Scotia,0.04,0.00,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1902,0,Saskatchewan,0.06,0.00,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0
1903,0,Quebec,0.50,0.17,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0
1904,1,British Columbia,0.47,0.03,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0
1905,1,Ontario,0.04,0.04,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0


In [15]:
df1['pro2'].unique()

array(['Ontario', 'Alberta', 'British Columbia', 'Nova Scotia',
       'New Brunswick', 'Quebec', 'Manitoba', 'Saskatchewan',
       'Newfoundland & Labrador', 'Northwest Territories', 'Others',
       'Prince Edward Island', 'Yukon'], dtype=object)

In [16]:
# create dummy variables for "age_period"
df1 = pd.get_dummies(df1, prefix='prov_', columns=['pro2'])
df1 = df1.rename({'pro1': 'ON', 'pro2': 'AB', 'pro3': 'BC',
                  'pro4': 'NS', 'pro5': 'NB', 'pro6': 'QC',
                  'pro7': 'MB', 'pro8': 'SK', 'pro9': 'NL',
                  'pro7': 'NT', 'pro8': 'PE', 'pro9': 'YT',                  
                  'pro10': 'Others'}, axis=1)
df1

Unnamed: 0,zone2,or_all,ctr_all,cur_brand3,pre_brand2__Abbott Solutions,pre_brand2__Abbott Specialty,pre_brand2__Abbott Stage1,pre_brand2__Abbott Stage2,pre_brand2__Kirkland Solutions,pre_brand2__Kirkland Stage 1,pre_brand2__MJN Solutions,pre_brand2__MJN Specialty,pre_brand2__MJN Stage1,pre_brand2__MJN Stage2,pre_brand2__Nestle Solutions,pre_brand2__Nestle Stage 1,pre_brand2__Nestle Stage 2,pre_brand2__Others,pre_brand2__Parent's Choice Stage 1,pre_brand2__Parent's Choice Stage 2,pre_brand2__President's Choice Stage 2,edu__Elementary School,edu__High School,edu__Others,enroll_type__CoReg,enroll_type__Others,enroll_type__Purchased,enroll_type__Self Enrolled,age__Stage 0,age__Stage 1,age__Stage 2,age__Stage 3,prov__Alberta,prov__British Columbia,prov__Manitoba,prov__New Brunswick,prov__Newfoundland & Labrador,prov__Northwest Territories,prov__Nova Scotia,prov__Ontario,prov__Others,prov__Prince Edward Island,prov__Quebec,prov__Saskatchewan,prov__Yukon
0,0,0.63,0.06,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0.09,0.03,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
2,1,0.00,0.00,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0.00,0.00,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
4,1,0.04,0.00,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1902,0,0.06,0.00,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1903,0,0.50,0.17,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1904,1,0.47,0.03,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1905,1,0.04,0.04,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [17]:
print(df1)

      zone2  or_all  ctr_all  cur_brand3  pre_brand2__Abbott Solutions  \
0         0    0.63     0.06           1                             0   
1         0    0.09     0.03           0                             0   
2         1    0.00     0.00           1                             0   
3         1    0.00     0.00           1                             0   
4         1    0.04     0.00           1                             0   
...     ...     ...      ...         ...                           ...   
1902      0    0.06     0.00           0                             0   
1903      0    0.50     0.17           0                             1   
1904      1    0.47     0.03           1                             0   
1905      1    0.04     0.04           1                             0   
1906      1    0.70     0.03           1                             0   

      pre_brand2__Abbott Specialty  pre_brand2__Abbott Stage1  \
0                                0            

### 3. ML

In [18]:
!pip3 install sklearn

You should consider upgrading via the '/Users/ben/Desktop/py_proj/stage1_2_retention/venv/bin/python -m pip install --upgrade pip' command.[0m


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import metrics

In [24]:
df1

Unnamed: 0,zone2,or_all,ctr_all,cur_brand3,pre_brand2__Abbott Solutions,pre_brand2__Abbott Specialty,pre_brand2__Abbott Stage1,pre_brand2__Abbott Stage2,pre_brand2__Kirkland Solutions,pre_brand2__Kirkland Stage 1,pre_brand2__MJN Solutions,pre_brand2__MJN Specialty,pre_brand2__MJN Stage1,pre_brand2__MJN Stage2,pre_brand2__Nestle Solutions,pre_brand2__Nestle Stage 1,pre_brand2__Nestle Stage 2,pre_brand2__Others,pre_brand2__Parent's Choice Stage 1,pre_brand2__Parent's Choice Stage 2,pre_brand2__President's Choice Stage 2,edu__Elementary School,edu__High School,edu__Others,enroll_type__CoReg,enroll_type__Others,enroll_type__Purchased,enroll_type__Self Enrolled,age__Stage 0,age__Stage 1,age__Stage 2,age__Stage 3,prov__Alberta,prov__British Columbia,prov__Manitoba,prov__New Brunswick,prov__Newfoundland & Labrador,prov__Northwest Territories,prov__Nova Scotia,prov__Ontario,prov__Others,prov__Prince Edward Island,prov__Quebec,prov__Saskatchewan,prov__Yukon
0,0,0.63,0.06,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0.09,0.03,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
2,1,0.00,0.00,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0.00,0.00,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
4,1,0.04,0.00,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1902,0,0.06,0.00,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1903,0,0.50,0.17,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1904,1,0.47,0.03,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1905,1,0.04,0.04,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [25]:
# train test split
# drop the "first_buy_brand since" it is highly correlated to cur_brand

df_x = df1.drop(columns= ['cur_brand3'])
X_train, X_test, y_train, y_test = train_test_split(df_x, 
                                                   df1['cur_brand3'],
                                                    test_size = 0.2,
                                                    stratify=df1['cur_brand3'], 
                                                    random_state=0)

In [26]:
clf_lr = LogisticRegression()

# cross validation
scores = cross_val_score(clf_lr, X_train, y_train, scoring='accuracy', cv=5)
print(scores)

# make cross validated predictions
y_pred = cross_val_predict(clf_lr, X_train, y_train, cv=5)
print(r2_score(y_train, y_pred))
print("%0.4f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

[0.89508197 0.90163934 0.9147541  0.91147541 0.89508197]
0.6048832848048167
0.9036 accuracy with a standard deviation of 0.01


In [29]:
clf_lr.fit(X_train, y_train)
# predict 
y_pred_lr = clf_lr.predict(X_test)
print(metrics.classification_report(y_true=y_test, y_pred=y_pred_lr))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90       221
           1       0.87      0.85      0.86       161

    accuracy                           0.88       382
   macro avg       0.88      0.88      0.88       382
weighted avg       0.88      0.88      0.88       382

