# Email Marketing Campaigns Optimization Model with NLP


### Jupyter Notebook 9/10

#### Importing Classes from the module `campaigns`

In [1]:
import pickle
import sys
import warnings

sys.path.append('..')
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import CountVectorizer

# Get files
from campaigns.getinputdata import ClassifiedCampaignsFileGetInfo, CleanClassifiedCampaignsFileGetInfo

# Preprocessing
from campaigns.modeling.preprocessing import ClassifiedCampaignsGetPipeline

# Texts Preprocessing
from campaigns.modeling.nlp import MainModelTextsPreprocessor, MainModelFeatureExtractor

# Feature Engineering
from campaigns.modeling.preprocessing import BenchmarkCreator, Benchmarker

# Texts Classification
from campaigns.modeling.classifiers import LogisticRegressionClassifier

# Evaluation
from campaigns.modeling.evaluation import MainModelEvaluator

### 4.3. Creating new variables: `open_rate_benchmark`, `ctr_benchmark`, `ctor_benchmark`, `open_rate_result`, `ctr_result`, `ctor_result`.



#### Getting Input

In [2]:
# Instantiate the loader for 'es.csv' under sample_01/output_08.
# The resulting object exposes `campaigns_w_vertical_df`, which the next
# code cell passes to BenchmarkCreator.

campaigns = ClassifiedCampaignsFileGetInfo('../../../datasets/output/sample_01/output_08/','es.csv')

#### Creating new variables

In [3]:
# Wrap the loaded dataframe in a BenchmarkCreator.
# NOTE: this rebinds `campaigns` (until now the file loader). Re-running this
# cell on its own wraps the BenchmarkCreator's own dataframe rather than a
# freshly loaded one, so re-run the loader cell first.

campaigns = BenchmarkCreator(campaigns.campaigns_w_vertical_df)

In [4]:
# Creating aux dataframe
# In the rows displayed below it holds open_rate, ctr and ctor values for each
# (vertical, customer_cat) combination.

aux = campaigns.benchmarks_aux_creator()
aux

Unnamed: 0,vertical,customer_cat,open_rate,ctr,ctor
0,0,1,0.52,0.24,0.45
1,0,2,0.62,0.31,0.5
2,0,3,0.67,0.3,0.43
3,0,4,0.74,0.35,0.46
4,0,5,0.82,0.44,0.53
5,1,1,0.45,0.12,0.31
6,1,2,0.61,0.26,0.41
7,1,3,0.73,0.34,0.48
8,1,4,0.81,0.39,0.48
9,1,5,0.84,0.47,0.55


#### Assigning its corresponding benchmark to each campaign

In [5]:
# Creating new variables: open_rate_benchmark, ctr_benchmark, ctor_benchmark
# The `aux` table is passed in as the benchmark source; the new columns show up
# on campaigns.campaigns_w_vertical_df in the next cell.

campaigns.benchmark_variable_creator(aux)

In [6]:
# Show each campaign's observed rates next to the benchmarks assigned to it.
benchmark_view_columns = [
    'sender',
    'open_rate', 'ctr', 'ctor',
    'open_rate_benchmark', 'ctr_benchmark', 'ctor_benchmark',
]
campaigns.campaigns_w_vertical_df[benchmark_view_columns]

Unnamed: 0,sender,open_rate,ctr,ctor,open_rate_benchmark,ctr_benchmark,ctor_benchmark
8,22216,0.44,0.12,0.28,0.76,0.40,0.52
13,862,0.41,0.14,0.38,0.70,0.38,0.54
18,83446,0.44,0.00,0.00,0.84,0.42,0.49
19,22216,0.56,0.12,0.22,0.76,0.40,0.52
23,66313,0.69,0.64,0.94,0.83,0.48,0.57
...,...,...,...,...,...,...,...
69700,61373,0.74,0.51,0.68,0.74,0.39,0.50
69701,60561,0.78,0.43,0.55,0.71,0.40,0.55
69702,20656,0.79,0.34,0.43,0.81,0.39,0.47
69703,46196,0.63,0.18,0.28,0.71,0.40,0.55


#### Detecting campaigns with low performance

- `True`: the campaign performs badly, i.e. its rate falls below its benchmark.
- `False`: the campaign does not perform badly (it meets or exceeds its benchmark).

In [7]:
# Hand the benchmark-enriched dataframe to a Benchmarker.
# `campaigns` is rebound again: from here on it refers to the Benchmarker.

campaigns = Benchmarker(campaigns.campaigns_w_vertical_df)

In [8]:
# Creating new variables: 'open_rate_result', 'ctr_result', 'ctor_result'
# Every metric is benchmarked against its own '<metric>_benchmark' column.

for metric in ('open_rate', 'ctr', 'ctor'):
    campaigns.campaign_benchmarker(metric, metric + '_benchmark')

In [9]:
# Show rates, benchmarks and the resulting low-performance flags side by side.
result_view_columns = [
    'sender', 'vertical',
    'open_rate', 'ctr', 'ctor',
    'open_rate_benchmark', 'ctr_benchmark', 'ctor_benchmark',
    'open_rate_result', 'ctr_result', 'ctor_result',
]
campaigns.campaigns_w_vertical_df[result_view_columns]

Unnamed: 0,sender,vertical,open_rate,ctr,ctor,open_rate_benchmark,ctr_benchmark,ctor_benchmark,open_rate_result,ctr_result,ctor_result
8,22216,5,0.44,0.12,0.28,0.76,0.40,0.52,True,True,True
13,862,6,0.41,0.14,0.38,0.70,0.38,0.54,True,True,True
18,83446,7,0.44,0.00,0.00,0.84,0.42,0.49,True,True,True
19,22216,5,0.56,0.12,0.22,0.76,0.40,0.52,True,True,True
23,66313,11,0.69,0.64,0.94,0.83,0.48,0.57,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...
69700,61373,11,0.74,0.51,0.68,0.74,0.39,0.50,False,False,False
69701,60561,3,0.78,0.43,0.55,0.71,0.40,0.55,False,False,False
69702,20656,8,0.79,0.34,0.43,0.81,0.39,0.47,True,True,True
69703,46196,4,0.63,0.18,0.28,0.71,0.40,0.55,True,True,True


#### Obtaining information from campaigns with higher scores

In [10]:
# Per metric: group by vertical, filter by rate, sort, keep the top n, add names
# (summary carried over from the original cell comment; the work itself
# happens inside get_top_messages).

ntop = 50

open_rate_top, ctr_top, ctor_top = (
    campaigns.get_top_messages(metric, ntop)
    for metric in ('open_rate', 'ctr', 'ctor')
)

In [11]:
# ctor_top_keywords = campaigns.keywords_list_creator(ctor_top, ntop, 'wellness', 'message')

In [12]:
# ctor_top_keywords

In [13]:
# Hand-written keyword list; the automatic extraction through
# keywords_list_creator is commented out in the cells above.
top_keywords_list = ['Navidad', 'Año Nuevo', 'Magos']

In [14]:
len(top_keywords_list)

3

In [15]:
# Send the keyword list to the wellness-sector "Ctor Result" keywords file.
# NOTE(review): whether this appends or overwrites is decided inside
# add_keywords_to_file — confirm before re-running the cell.
campaigns.add_keywords_to_file('../../../keywords/Wellness, health and personal care sectors/Ctor Result/keywords.txt', top_keywords_list)

### 4.4. Feature Selection with `ColumnTransformer`.

**Multiobjective problem** with three targets to predict: `open_rate_result`, `ctr_result` & `ctor_result`.

Steps:

- Dropping empty rows and normalizing texts from `subject` and `message` variables: dropping null values & stemming.


- Combining several feature extraction mechanisms and transformations into a single transformer in a scikit-learn pipeline using `ColumnTransformer` to predict target 01: `open_rate_result`.


- Packing everything in a script to predict target 02 & target 03: `ctr_result` and `ctor_result`.

#### Dropping empty rows and normalizing texts (stemming)

In [16]:
# Rebind `campaigns` to a text preprocessor over the benchmarked dataframe,
# configured with 'spanish' and remove_unlabeled=False.
campaigns = MainModelTextsPreprocessor(campaigns.campaigns_w_vertical_df, 'spanish', remove_unlabeled=False)

In [17]:
campaigns.preprocess_message(flag_stemming=True)

In [18]:
campaigns.campaigns_w_vertical_df.columns

Index(['sender', 'subject', 'date_sent', 'total_sent', 'customer_cat', 'opens',
       'clicks', 'message', 'campaign_id', 'open_rate', 'ctr', 'ctor',
       'clean_subject', 'clean_message', 'vertical', 'open_rate_benchmark',
       'ctr_benchmark', 'ctor_benchmark', 'open_rate_result', 'ctr_result',
       'ctor_result'],
      dtype='object')

In [19]:
# Saving file
# Written to sample_01/output_09/es.csv; the next cell reads that same location.

campaigns.save_final_df('../../../datasets/output/sample_01/output_09/', 'es.csv')

In [20]:
# Getting input
# Read sample_01/output_09/es.csv (the location written by the save cell)
# through CleanClassifiedCampaignsFileGetInfo; `campaigns` now refers to this loader.

campaigns = CleanClassifiedCampaignsFileGetInfo('../../../datasets/output/sample_01/output_09/','es.csv')

#### Creating a Scikit-learn Pipeline

In [21]:
# Build the pipeline helper for target 01 (open_rate_result) from the reloaded
# dataframe, with the cleaned subject text and the vertical as features.

campaigns_df = campaigns.campaigns_w_vertical_df
campaigns = ClassifiedCampaignsGetPipeline(
    campaigns_df,
    features=campaigns_df[['clean_subject', 'vertical']],
    target=campaigns_df['open_rate_result'],
)

# Sanity-check dimensionality and size of the feature frame ...
print('Feature dimensions: ', campaigns.features.ndim)
print('Number of messages: ', campaigns.features.shape)

# ... and of the target series.
print('Target 01 dimensions: ', campaigns.target.ndim)
print('Number of messages: ', campaigns.target.shape)

Feature dimensions:  2
Number of messages:  (79629, 2)
Target 01 dimensions:  1
Number of messages:  (79629,)


#### Feature Selection

- Performing one-hot encoding for `vertical`.

- Using countvectorizer to extract features from the text column.

Steps:

- Get categorical and text transformers.
- Splitting in train and test datasets.
- Applying the transformers to features using `ColumnTransformer`.

#### Getting transformers

In [22]:
# Getting the transformer for the categorical variables
# (per the repr shown below: a constant-fill SimpleImputer followed by a
# OneHotEncoder with handle_unknown='ignore')

cat_transformer = campaigns.get_cat_transformer()
cat_transformer

Pipeline(steps=[('cat_imputer', SimpleImputer(strategy='constant')),
                ('cat_ohe', OneHotEncoder(handle_unknown='ignore'))])

In [23]:
# Getting the transformer for the text variables
# (per the repr shown below: a single bag-of-words step, CountVectorizer()
# with no non-default arguments displayed)

text_transformer = campaigns.get_text_transformer()
text_transformer

Pipeline(steps=[('text_bow', CountVectorizer())])

#### Applying the transformers to features using `ColumnTransformer`

In [24]:
# Getting the column_transformer
# 'clean_subject' names the text column; per the repr shown below the
# categorical transformer is applied to ['vertical'] and the text transformer
# to 'clean_subject'.

ct = campaigns.get_column_transformer(cat_transformer, text_transformer, 'clean_subject')
ct

ColumnTransformer(transformers=[('cat',
                                 Pipeline(steps=[('cat_imputer',
                                                  SimpleImputer(strategy='constant')),
                                                 ('cat_ohe',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['vertical']),
                                ('text',
                                 Pipeline(steps=[('text_bow',
                                                  CountVectorizer())]),
                                 'clean_subject')])

#### Splitting the dataset and fitting ColumnTransformer to the Pipeline

In [25]:
# Split features/target into train and test partitions.
features_train, features_test, target_train, target_test = campaigns.train_test_splitter()

# Report the container type of every partition ...
for partition in (features_train, features_test, target_train, target_test):
    print(type(partition))

# ... then the shapes of each (features, target) pair.
for features_part, target_part in ((features_train, target_train), (features_test, target_test)):
    print(features_part.shape, target_part.shape)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
(71666, 2) (71666,)
(7963, 2) (7963,)


In [26]:
# Assemble the full pipeline around the column transformer; per the repr shown
# below it is the 'feature_engineer' ColumnTransformer followed by an 'LR'
# LogisticRegression step.
pipeline = campaigns.get_pipeline(ct)
pipeline

Pipeline(steps=[('feature_engineer',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('cat_imputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('cat_ohe',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['vertical']),
                                                 ('text',
                                                  Pipeline(steps=[('text_bow',
                                                                   CountVectorizer())]),
                                                  'clean_subject')])),
                ('LR', LogisticRegression())])

### 5. Predicting Targets

#### Fit and Predict

In [27]:
# Fit the whole pipeline on the training split.
# NOTE(review): scikit-learn's Pipeline.fit returns the pipeline itself, so
# `model` and `pipeline` presumably name the same fitted object — confirm that
# get_pipeline returns a plain sklearn Pipeline.
model = pipeline.fit(features_train, target_train)

In [28]:
model

Pipeline(steps=[('feature_engineer',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('cat_imputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('cat_ohe',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['vertical']),
                                                 ('text',
                                                  Pipeline(steps=[('text_bow',
                                                                   CountVectorizer())]),
                                                  'clean_subject')])),
                ('LR', LogisticRegression())])

In [29]:
# Predict with the fitted estimator returned by fit() — the object that is
# displayed, evaluated and pickled under the name `model` — rather than going
# back through the `pipeline` name.
predictions = model.predict(features_test)

In [30]:
predictions

array([ True, False, False, ..., False, False, False])

### 6. Evaluating Performance

- We want to avoid treating true 0s as 1s.
- We want a classifier which predicts more 0s.

- Eval. criteria:

    - Predictions.mean shouldn't be greater than target_test.mean
    - Improve **precision** score.
    

In [31]:
# Wrap the held-out labels and the predictions in the project's evaluator; the
# following cells read its predictions_mean / target_test_mean attributes and
# call its score helpers.
evaluation = MainModelEvaluator(target_test, predictions)

In [32]:
# Evaluation criterion stated above: this should be False, i.e. the mean of the
# predictions must not exceed the mean of the test labels.
evaluation.predictions_mean > evaluation.target_test_mean

False

In [33]:
evaluation.predictions_mean

0.40261208087404243

In [34]:
evaluation.target_test_mean

0.4450583950772322

- **Accuracy**:

A first approximation could be the % of examples that we got right.

In [35]:
# Compute the score first, then report it (the printed value is on a 0-100 scale).
accuracy_pct = evaluation.get_accuracy_score(evaluation.target_test)
print('accuracy %s' % accuracy_pct)

accuracy 71.34245887228433


- **Confusion Matrix**:

In [36]:
# Unpack the four confusion-matrix counts. The order matches the printed
# matrix read row by row: [[tn, fp], [fn, tp]].
tn, fp, fn, tp = evaluation.get_confusion_matrix(evaluation.target_test)


Confusion Matrix

 [[3447  972]
 [1310 2234]]


In [37]:
tn

3447

In [38]:
tp

2234

In [39]:
fn

1310

In [40]:
fp

972

- Overall, we want to improve **precision score**:

In [41]:
evaluation.get_recall_score(evaluation.target_test)

63.036117381489845

In [42]:
evaluation.get_precision_score(evaluation.target_test)

69.68184653774173

#### Saving the model

In [43]:
# Persist the fitted target-01 pipeline to the shared pipelines directory.
# (`pickle` is imported together with the other modules in the first cell.)
with open('../../../datasets/pipelines/main_model_target_01_pipeline', 'wb') as f:
    pickle.dump(model, f)

In [44]:
# Reload example (kept commented out). The path matches the one written by the
# save cell. Only unpickle files you created yourself: pickle.load can execute
# arbitrary code from an untrusted file.

# model = None

# with open('../../../datasets/pipelines/main_model_target_01_pipeline', 'rb') as f:
#     model = pickle.load(f)

#### Predicting and evaluating performance of target 02 and target 03: `ctr_result` and `ctor_result`

In [45]:
# Switch to the scripts directory for the `!python` cells below.
# Explicit `%cd` magic: works even when automagic is disabled or a variable
# named `cd` exists.
# NOTE: this changes the kernel's working directory, so the relative paths used
# by earlier cells resolve differently if those cells are re-run afterwards,
# and running this cell twice moves one level too far.
%cd ../03_Scripts/

/Users/danielaperezduro/Desktop/TFM/emailmarketingwnlp/notebooks/03_Scripts


#### Target 2: `ctr_result`

In [46]:
# Run the second-model script for target 02 (ctr_result) with the kernel's own
# interpreter, so it sees the same environment as this notebook rather than
# whichever `python` happens to be first on PATH.
!"{sys.executable}" 09_second_model_pipeline.py --input_root ../../../datasets/output/sample_01/output_09/ --input_file es.csv --features clean_message vertical --target ctr_result --text_column clean_message --pickle_file ../../../datasets/pipelines/main_model_target_02_pipeline

Predictions mean:  0.4047469546653271
Target Test:  0.4361421574783373
Predictions mean is not greater than target test mean
accuracy 77.42057013688309

Confusion Matrix

 [[3716  774]
 [1024 2449]]
True Negatives:  3716
True Positives:  2449
False Negatives  1024
False Positives:  774
Recall Score:  70.51540454938093
Precision Score:  75.98510704312751


#### Target 3: `ctor_result`

In [47]:
# Run the second-model script for target 03 (ctor_result) with the kernel's own
# interpreter, so it sees the same environment as this notebook rather than
# whichever `python` happens to be first on PATH.
!"{sys.executable}" 09_second_model_pipeline.py --input_root ../../../datasets/output/sample_01/output_09/ --input_file es.csv --features clean_message vertical --target ctor_result --text_column clean_message --pickle_file ../../../datasets/pipelines/main_model_target_03_pipeline

Predictions mean:  0.36192389802838126
Target Test:  0.410398091171669
Predictions mean is not greater than target test mean
accuracy 77.04382770312696

Confusion Matrix

 [[3974  721]
 [1107 2161]]
True Negatives:  3974
True Positives:  2161
False Negatives  1107
False Positives:  721
Recall Score:  66.12607099143207
Precision Score:  74.98265093684941
