In [1]:
import sys
sys.path.append("..")
import pandas as pd
import numpy as np
import warnings
import mlflow
from modeling.config import TRACKING_URI, EXPERIMENT_NAME
from pandas_profiling import ProfileReport


# Read json files

In [2]:
# Load the cleaned alerts export into a DataFrame.
alert_df = pd.read_json("../data/alerts_cleaned.json")

In [3]:
# Load the mapping between notification label ids and alert surrogate ids.
corazon_df = pd.read_json("../data/notification-labels-to-alert-surrogate-ids.json")

In [4]:
# Load the notification events.
notification_df = pd.read_json("../data/notifications.json")

# Data Frame Cleaning

## Text processing

In [5]:
!pip install -U spacy
!pip install -U pip setuptools wheel
!python -m spacy download es_core_news_sm
!pip install spacymoji
!python -m spacy download es_core_news_lg
import spacy
from spacymoji import Emoji
import string
import re
import es_core_news_lg
nlp = es_core_news_lg.load()

Collecting es-core-news-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.1.0/es_core_news_sm-3.1.0-py3-none-any.whl (13.7 MB)
[K     |████████████████████████████████| 13.7 MB 1.9 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
Collecting es-core-news-lg==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_lg-3.1.0/es_core_news_lg-3.1.0-py3-none-any.whl (568.4 MB)
[K     |████████████████████████████████| 568.4 MB 40 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_lg')


In [6]:
import emoji
def get_emoji(x):
    """Return the emoji tokens found in ``x``, in order of appearance.

    The text is tokenized with the notebook-level spaCy pipeline ``nlp`` and
    every token whose surface form is a known emoji is collected.

    Args:
        x: Text to scan.

    Returns:
        list[str]: One entry per emoji token (duplicates are kept).
    """
    # The emoji package changed its lookup table across major versions:
    #   < 1.0  : UNICODE_EMOJI maps emoji -> name
    #   1.x    : UNICODE_EMOJI maps language code -> {emoji -> name}
    #   >= 2.0 : UNICODE_EMOJI was removed; EMOJI_DATA maps emoji -> metadata
    # A plain `token in emoji.UNICODE_EMOJI` therefore never matches on 1.x and
    # raises AttributeError on 2.x, so pick whichever emoji-keyed table exists.
    known_emoji = getattr(emoji, "EMOJI_DATA", None)
    if known_emoji is None:
        table = emoji.UNICODE_EMOJI
        known_emoji = table.get("en", table)
    return [token.orth_ for token in nlp(x) if token.orth_ in known_emoji]

In [7]:
def rid_emoji(x,y):
    """Rebuild ``x`` without the tokens contained in ``y``.

    ``x`` is tokenized with the notebook-level spaCy pipeline ``nlp``; every
    token whose surface form is not in ``y`` is kept, and the kept tokens are
    joined back together with single spaces.
    """
    kept_tokens = [token.orth_ for token in nlp(x) if token.orth_ not in y]
    return ' '.join(kept_tokens)

In [8]:
def slang_sep(x):
    """Split ``x`` into ``[slang, information]`` at its first ``?``/``!`` marks.

    * No ``?`` or ``!`` in the text: the slang part is empty and the whole
      text is the information part.
    * More than one ``?``: the first two pieces are glued together as the
      slang part and everything after the second separator is the information
      part.
    * Otherwise: the text is split once at the first ``?`` or ``!``.

    Args:
        x: Text to split.

    Returns:
        list[str]: Always exactly two strings, ``[slang, information]``.
    """
    if '?' not in x and '!' not in x:
        return ['', x]
    # `[?!]` rather than `[?|!]`: inside a character class `|` is a literal
    # pipe, so the old pattern also split on "|".
    if x.count('?') > 1:
        # maxsplit=2 keeps the whole remainder in the last piece; splitting on
        # every "?" and returning only the third piece dropped the rest.
        first, second, remainder = re.split(r'[?!]', x, maxsplit=2)
        return [first + second, remainder]
    return re.split(r'[?!]', x, maxsplit=1)


In [9]:
def label_text (x,y):
    """Count the tokens of ``x`` whose spaCy ``tag_`` equals ``y``.

    ``x`` is tokenized with the notebook-level spaCy pipeline ``nlp``.
    """
    return sum(1 for token in nlp(x) if token.tag_ == y)

In [10]:
def text_rev (x):
    """Print, for each token of ``x``, a list of its spaCy attributes.

    The printed list holds text, lemma, coarse POS, tag, dependency label,
    shape, and the ``is_alpha`` / ``is_stop`` flags, in that order.
    """
    for tok in nlp(x):
        attributes = [
            tok.text,
            tok.lemma_,
            tok.pos_,
            tok.tag_,
            tok.dep_,
            tok.shape_,
            tok.is_alpha,
            tok.is_stop,
        ]
        print(attributes)

In [11]:
def words_count (x):
    """Return the number of spaCy tokens in ``x``."""
    doc = nlp(x)
    return len(doc)
    

In [12]:
# Basic text features of each alert description: character length, the emoji
# it contains, how many there are, and the description with emoji removed.
alert_df['description'] = alert_df['description'].astype('string')
alert_df['lenght'] = alert_df['description'].str.len()
alert_df["emoji"] = alert_df["description"].apply(get_emoji)
alert_df["emoji_size"] = alert_df["emoji"].str.len()
alert_df["txt_description"] = alert_df.apply(
    lambda row: rid_emoji(row.description, row.emoji),
    axis=1,
)

In [13]:
# Split the emoji-free description into its slang and information parts;
# slang_sep returns a two-item list, which is expanded into two columns.
alert_df["slang"] = alert_df["txt_description"].apply(slang_sep)
alert_df[['slang','information']] = pd.DataFrame(
    alert_df["slang"].tolist(),
    index=alert_df.index,
)
# The intermediate emoji-free text is no longer needed.
alert_df = alert_df.drop(columns=["txt_description"])

In [14]:
# Character count of the slang part, then one token-count column per tag.
alert_df["slang_char"] = alert_df["slang"].str.len()
for _column, _tag in {
    "slang_verb": 'VERB',
    "slang_pron": 'PROPN',
    "slang_adp": 'ADP',
    "slang_noun": 'NOUN',
    "slang_num": 'NUM',
    "slang_punt": 'PUNCT',
    "slang_det": 'DET',
}.items():
    alert_df[_column] = alert_df["slang"].apply(label_text, args=(_tag,))

In [15]:
# Character and token counts of the information part, then one token-count
# column per tag.
alert_df["info_char"] = alert_df["information"].str.len()
alert_df["info_words"] = alert_df["information"].apply(words_count)
for _column, _tag in {
    "info_verb": 'VERB',
    "info_pron": 'PROPN',
    "info_adp": 'ADP',
    "info_noun": 'NOUN',
    "info_num": 'NUM',
    "info_punt": 'PUNCT',
    "info_det": 'DET',
}.items():
    alert_df[_column] = alert_df["information"].apply(label_text, args=(_tag,))

## Sentiment analysis

In [16]:
#!pip install transformers

In [17]:
#!pip3 install torch torchvision torchaudio

In [18]:
import transformers
import torch
from transformers import pipeline

# Task, model and tokenizer must agree. The previous call built an "ner"
# (token-classification) pipeline on a sentiment (sequence-classification)
# checkpoint and paired it with the tokenizer of a different model, whose
# vocabulary does not match the model's embeddings.
# NOTE(review): the section is titled "Sentiment analysis", so the sentiment
# checkpoint is kept and the task/tokenizer are aligned to it; the variable
# name is unchanged so existing references keep working.
nlp_ner = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
    tokenizer="nlptown/bert-base-multilingual-uncased-sentiment",
)
#text = 'Mis amigos están pensando viajar a Londres este verano'

#nlp_ner(text)

## Merging alert df with agency dummy variables

In [19]:
# One-hot encode the agency of every alert row, then add the indicators up per
# surrogate_id so each alert ends up with a single row of agency counts.
alert2 = pd.get_dummies(alert_df[['surrogate_id', 'agency']].set_index('surrogate_id'))
grouped_alert = alert2.groupby(by='surrogate_id').sum()
# Keep one row per surrogate_id (without the raw agency column) and attach the
# aggregated agency columns to it.
alert_df = (
    alert_df
    .drop(columns=['agency'])
    .drop_duplicates(subset='surrogate_id', ignore_index=True)
    .merge(grouped_alert, on='surrogate_id')
)

## Merging notification df with corazon df

In [20]:
# De-duplicate both inputs (the label mapping without its 'day' column), then
# left-join the label mapping onto the notifications.
notification_df = notification_df.drop_duplicates(ignore_index=True)
corazon_df = corazon_df.drop(columns=['day']).drop_duplicates(ignore_index=True)
real_merge = pd.merge(
    notification_df,
    corazon_df,
    how="left",
    left_on="join_key_value",
    right_on="notification_label_id",
)

## Merging all df together

In [21]:
# Sanity check: (rows, columns) of the notification/label merge.
real_merge.shape

(2274889, 8)

In [22]:
# Split the merged events by join_field, because each kind is joined to the
# alerts through a different key.
label_merger = real_merge[real_merge["join_field"] == "label"]
alert_id_merger = real_merge[real_merge["join_field"] == "alertId"]

In [23]:
# Label-based events: join to the alerts through the corazon surrogate id.
clean_df = pd.merge(
    label_merger,
    alert_df,
    how="left",
    left_on="corazon_surrogate_id",
    right_on="surrogate_id",
)

In [24]:
# alertId-based events: the join key is the alert's document_id.
clean_df_3 = pd.merge(
    alert_id_merger,
    alert_df,
    how="left",
    left_on="join_key_value",
    right_on="document_id",
)

In [25]:
# Label events that found no alert (null description): keep only their event
# columns so they can be retried against document_id.
clean_df_2 = clean_df.loc[
    clean_df['description'].isnull(),
    ["event_date","event_timestamp", "event_name","user_id","join_field","join_key_value"],
]
# The matched label events stay in clean_df.
clean_df = clean_df.dropna(subset=['description'])
# Second attempt: match the leftovers on document_id and keep the hits.
clean_df_4 = pd.merge(
    clean_df_2,
    alert_df,
    how="left",
    left_on="join_key_value",
    right_on="document_id",
)
clean_df_4 = clean_df_4.dropna(subset=['created_at'])

In [26]:
# Discard alertId events that did not match any alert.
clean_df_3 = clean_df_3.dropna(subset=['description'])

In [27]:
# Stack the three matched event sets row-wise (concat's default axis).
almost_finished_df = pd.concat([clean_df, clean_df_4])
finished_df = pd.concat([almost_finished_df, clean_df_3])

## Dropping irrelevant columns

In [28]:
# Remove the join bookkeeping columns that are not needed after merging.
export_df = finished_df.drop(
    columns=[
        'event_date',
        'join_field',
        'join_key_value',
        'is_global',
        'corazon_surrogate_id',
        'notification_label_id',
    ]
)

## Fixing the event_name column:

In [29]:
# Normalise the event name. The result is assigned back to the column because
# calling replace(..., inplace=True) on `export_df[col]` mutates an
# intermediate object: pandas warns about it and, with Copy-on-Write, the
# frame itself is left unchanged.
export_df["event_name"] = export_df["event_name"].replace(
    "notification_receive", "notification_received"
)

In [30]:
# Normalise the event name. The result is assigned back to the column because
# calling replace(..., inplace=True) on `export_df[col]` mutates an
# intermediate object: pandas warns about it and, with Copy-on-Write, the
# frame itself is left unchanged.
export_df["event_name"] = export_df["event_name"].replace(
    "notification_open", "notification_opened"
)

## Feature engineering

In [31]:
# Scale the raw value by 1000 and read it as nanoseconds since the epoch
# (presumably the raw values are microseconds; confirm against the source).
# Not idempotent: running this cell twice rescales an already converted column.
export_df["event_timestamp"] = pd.to_datetime(
    export_df["event_timestamp"].mul(1000),
    unit="ns",
)

In [32]:
# 1 when the event happened before the alert was closed, 0 when it did not,
# and NaN for "notification_received" rows (no interaction with it yet).
export_df["notif_viewed_ontime"] = [
    np.nan if event == "notification_received" else (1 if seen_at < closed else 0)
    for event, seen_at, closed in zip(
        export_df["event_name"],
        export_df["event_timestamp"],
        export_df["closed_at"],
    )
]

In [33]:
# Keep only rows with a defined on-time flag, renumber the index, and store
# the flag as an integer.
export_df = export_df.dropna(subset=['notif_viewed_ontime']).reset_index(drop=True)
export_df["notif_viewed_ontime"] = export_df["notif_viewed_ontime"].astype(int)

In [34]:
# Reaction time in minutes: measured from created_at when the notification
# was viewed on time, otherwise from closed_at.
# total_seconds() is used instead of .seconds because .seconds is only the
# seconds *component* of the timedelta (0..86399) and silently discards whole
# days, so any reaction longer than a day was under-reported.
export_df["reaction_time"] = [
    float(
        pd.Timedelta(event_ts - (created if on_time == 1 else closed)).total_seconds() / 60
    )
    for on_time, event_ts, created, closed in zip(
        export_df["notif_viewed_ontime"],
        export_df["event_timestamp"],
        export_df["created_at"],
        export_df["closed_at"],
    )
]

In [35]:
# Binary "opened" flag derived from the event name: opening, viewing
# alternatives or sharing count as opened; receiving or dismissing do not.
# Any other event name maps to NaN, which makes the integer cast fail.
export_df['opened'] = export_df['event_name'].map(
    {
        'notification_opened': 1,
        'notification_view_alternatives': 1,
        'notification_share': 1,
        'notification_received': 0,
        'notification_dismiss': 0,
    }
).astype(int)

In [36]:
# Keep only users that have at least 3 events.
gpbyuser = export_df.groupby(by=['user_id'])
# numeric_only=True: the frame also holds datetime, string and list columns;
# summing those raises TypeError on pandas >= 2.0 (and otherwise wastes time
# concatenating text). Only the index and the event count are used below.
gpbyuser2 = gpbyuser.sum(numeric_only=True)
gpbyuser2['count1'] = gpbyuser.size()
gpbyuser2 = gpbyuser2[gpbyuser2['count1'] >= 3]
real_users = gpbyuser2.index.tolist()
export_df = export_df[export_df['user_id'].isin(real_users)]

In [37]:
# Preparation for the target: the share of events per alert (document_id)
# that count as opened.
gpbyincident = export_df.groupby(by=['document_id'])
# numeric_only=True: the frame also holds datetime, string and list columns;
# summing those raises TypeError on pandas >= 2.0. Only the numeric 'opened'
# total and the group size are needed here.
gpbyincident2 = gpbyincident.sum(numeric_only=True)
gpbyincident2['count1'] = gpbyincident.size()
gpbyincident2['opened_rate'] = gpbyincident2['opened'] / gpbyincident2['count1']
# Series indexed by document_id, ready to be merged back onto the events.
merge_column = gpbyincident2['opened_rate'].copy()

In [38]:
# Attach each alert's opened_rate to its events (document_id against the
# Series index). Not idempotent: a second run adds suffixed duplicate columns.
export_df = pd.merge(
    export_df,
    merge_column,
    left_on="document_id",
    right_index=True,
)

In [39]:
# Binary target: 1 when more than 5% of an alert's events were opens, else 0.
export_df["interesting_message"] = [
    1 if rate > 0.05 else 0 for rate in export_df["opened_rate"]
]

In [40]:
# Inspect the columns available at this point of the pipeline.
export_df.columns

Index(['event_timestamp', 'event_name', 'user_id', 'document_id',
       'surrogate_id', 'created_at', 'published_at', 'closed_at', 'cause',
       'effect', 'description', 'area_of_effect_coordinates_latitude',
       'area_of_effect_coordinates_longitude', 'lenght', 'emoji', 'emoji_size',
       'slang', 'information', 'slang_char', 'slang_verb', 'slang_pron',
       'slang_adp', 'slang_noun', 'slang_num', 'slang_punt', 'slang_det',
       'info_char', 'info_words', 'info_verb', 'info_pron', 'info_adp',
       'info_noun', 'info_num', 'info_punt', 'info_det',
       'agency_GewRJAw5tUmC4Ku4AX1-SQ', 'agency_GtvOEQAFZ0GtU6u4AXwvPg',
       'agency_HE59N3RXM0q5vKu4AXlQZg', 'agency_JUR9bFXmVkWDHqu4AXaY0g',
       'agency_JfA8Bw8Zp024Kqu4AXiSpQ', 'agency_MgUq5b9mOEunx6u4AXt_BA',
       'agency_NuuRQ2I1Q0a50Kv-AVKlLA', 'agency_V2AIQQKgmUO3VazvAOA-Cw',
       'agency_jLjibFoim0iwWau4AWoEdQ', 'agency_pky7jovXYkaw-awAAMrQ3g',
       'agency_zCy9zG00HEqGeKu4AWZYNQ', 'notif_viewed_ontime', 'rea

In [41]:
# Replace the agency dummy-column ids with readable transport-system names.
export_df = export_df.rename(
    columns={
        'agency_GewRJAw5tUmC4Ku4AX1-SQ': 'Sendero Segura',
        'agency_GtvOEQAFZ0GtU6u4AXwvPg': 'Red de Transporte de Pasajeros',
        'agency_HE59N3RXM0q5vKu4AXlQZg': 'Mexibús',
        'agency_JUR9bFXmVkWDHqu4AXaY0g': 'Metro',
        'agency_JfA8Bw8Zp024Kqu4AXiSpQ': 'Metrobús',
        'agency_MgUq5b9mOEunx6u4AXt_BA': 'Mexicable',
        'agency_NuuRQ2I1Q0a50Kv-AVKlLA': 'Trolebús',
        'agency_V2AIQQKgmUO3VazvAOA-Cw': 'Cablebús',
        'agency_jLjibFoim0iwWau4AWoEdQ': 'Tren Suburbano',
        'agency_pky7jovXYkaw-awAAMrQ3g': 'Tren Ligero',
        'agency_zCy9zG00HEqGeKu4AWZYNQ': 'Camión, Microbús, Combi',
    }
)

In [42]:
# One-hot encode cause and effect (first level dropped), then remove the
# identifiers, timestamps, coordinates, raw text and intermediate columns that
# are not part of the exported feature set.
export_df = pd.get_dummies(data=export_df, columns=["cause","effect"], drop_first=True)
export_df = export_df.drop(
    columns=[
        'event_timestamp',
        'event_name',
        'user_id',
        'document_id',
        'surrogate_id',
        'created_at',
        'published_at',
        'closed_at',
        'notif_viewed_ontime',
        'reaction_time',
        'area_of_effect_coordinates_latitude',
        'area_of_effect_coordinates_longitude',
        'opened',
        'opened_rate',
        'description',
    ]
)

## Exporting the df to a Feather file, so that later work can skip the cleaning part.

In [43]:
# Renumber the rows 0..n-1 and discard the old index before exporting.
export_df = export_df.reset_index(drop=True)

In [44]:
#export_df.to_json("../data/cleaned_data.json",orient='columns')

In [45]:
!pip3 install pyarrow
import pyarrow.feather as feather
feather.write_feather(export_df, "../data/cleaned_data.feather")



In [46]:
# Reload the exported table from disk, replacing the in-memory frame with the
# version read back from the Feather file.
export_df = feather.read_feather("../data/cleaned_data.feather")